1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN 3; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7 4; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8 5; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9 6; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 7; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 8; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 9 10define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { 11; GCN-LABEL: test_load_store: 12; GCN: ; %bb.0: 13; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14; GCN-NEXT: s_mov_b32 s6, 0 15; GCN-NEXT: s_mov_b32 s7, 0xf000 16; GCN-NEXT: s_mov_b32 s4, s6 17; GCN-NEXT: s_mov_b32 s5, s6 18; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 19; GCN-NEXT: s_waitcnt vmcnt(0) 20; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 21; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 22; GCN-NEXT: s_setpc_b64 s[30:31] 23; 24; GFX7-LABEL: test_load_store: 25; GFX7: ; %bb.0: 26; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27; GFX7-NEXT: s_mov_b32 s6, 0 28; GFX7-NEXT: s_mov_b32 s7, 0xf000 29; GFX7-NEXT: s_mov_b32 s4, s6 30; GFX7-NEXT: s_mov_b32 s5, s6 31; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 32; GFX7-NEXT: s_waitcnt vmcnt(0) 33; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 34; GFX7-NEXT: s_waitcnt vmcnt(0) 35; GFX7-NEXT: s_setpc_b64 s[30:31] 36; 37; GFX8-LABEL: test_load_store: 38; GFX8: ; %bb.0: 39; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40; GFX8-NEXT: flat_load_ushort v0, v[0:1] 41; GFX8-NEXT: s_waitcnt vmcnt(0) 42; GFX8-NEXT: flat_store_short v[2:3], v0 43; GFX8-NEXT: s_waitcnt vmcnt(0) 44; 
GFX8-NEXT: s_setpc_b64 s[30:31] 45; 46; GFX9-LABEL: test_load_store: 47; GFX9: ; %bb.0: 48; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 49; GFX9-NEXT: global_load_ushort v0, v[0:1], off 50; GFX9-NEXT: s_waitcnt vmcnt(0) 51; GFX9-NEXT: global_store_short v[2:3], v0, off 52; GFX9-NEXT: s_waitcnt vmcnt(0) 53; GFX9-NEXT: s_setpc_b64 s[30:31] 54; 55; GFX10-LABEL: test_load_store: 56; GFX10: ; %bb.0: 57; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 58; GFX10-NEXT: global_load_ushort v0, v[0:1], off 59; GFX10-NEXT: s_waitcnt vmcnt(0) 60; GFX10-NEXT: global_store_short v[2:3], v0, off 61; GFX10-NEXT: s_setpc_b64 s[30:31] 62; 63; GFX11-LABEL: test_load_store: 64; GFX11: ; %bb.0: 65; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 66; GFX11-NEXT: global_load_u16 v0, v[0:1], off 67; GFX11-NEXT: s_waitcnt vmcnt(0) 68; GFX11-NEXT: global_store_b16 v[2:3], v0, off 69; GFX11-NEXT: s_setpc_b64 s[30:31] 70 %val = load bfloat, ptr addrspace(1) %in 71 store bfloat %val, ptr addrspace(1) %out 72 ret void 73} 74 75define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) { 76; GCN-LABEL: v_load_global_v2bf16: 77; GCN: ; %bb.0: 78; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 79; GCN-NEXT: s_mov_b32 s6, 0 80; GCN-NEXT: s_mov_b32 s7, 0xf000 81; GCN-NEXT: s_mov_b32 s4, s6 82; GCN-NEXT: s_mov_b32 s5, s6 83; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 84; GCN-NEXT: s_waitcnt vmcnt(0) 85; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 86; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 87; GCN-NEXT: s_setpc_b64 s[30:31] 88; 89; GFX7-LABEL: v_load_global_v2bf16: 90; GFX7: ; %bb.0: 91; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 92; GFX7-NEXT: s_mov_b32 s6, 0 93; GFX7-NEXT: s_mov_b32 s7, 0xf000 94; GFX7-NEXT: s_mov_b32 s4, s6 95; GFX7-NEXT: s_mov_b32 s5, s6 96; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 97; GFX7-NEXT: s_waitcnt vmcnt(0) 98; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 99; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 100; GFX7-NEXT: 
s_setpc_b64 s[30:31] 101; 102; GFX8-LABEL: v_load_global_v2bf16: 103; GFX8: ; %bb.0: 104; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 105; GFX8-NEXT: flat_load_dword v0, v[0:1] 106; GFX8-NEXT: s_waitcnt vmcnt(0) 107; GFX8-NEXT: s_setpc_b64 s[30:31] 108; 109; GFX9-LABEL: v_load_global_v2bf16: 110; GFX9: ; %bb.0: 111; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 112; GFX9-NEXT: global_load_dword v0, v[0:1], off 113; GFX9-NEXT: s_waitcnt vmcnt(0) 114; GFX9-NEXT: s_setpc_b64 s[30:31] 115; 116; GFX10-LABEL: v_load_global_v2bf16: 117; GFX10: ; %bb.0: 118; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 119; GFX10-NEXT: global_load_dword v0, v[0:1], off 120; GFX10-NEXT: s_waitcnt vmcnt(0) 121; GFX10-NEXT: s_setpc_b64 s[30:31] 122; 123; GFX11-LABEL: v_load_global_v2bf16: 124; GFX11: ; %bb.0: 125; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 126; GFX11-NEXT: global_load_b32 v0, v[0:1], off 127; GFX11-NEXT: s_waitcnt vmcnt(0) 128; GFX11-NEXT: s_setpc_b64 s[30:31] 129 %load = load <2 x bfloat>, ptr addrspace(1) %ptr 130 ret <2 x bfloat> %load 131} 132 133define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) { 134; GCN-LABEL: v_load_global_v3bf16: 135; GCN: ; %bb.0: 136; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 137; GCN-NEXT: s_mov_b32 s6, 0 138; GCN-NEXT: s_mov_b32 s7, 0xf000 139; GCN-NEXT: s_mov_b32 s4, s6 140; GCN-NEXT: s_mov_b32 s5, s6 141; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 142; GCN-NEXT: s_waitcnt vmcnt(0) 143; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 144; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 145; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 146; GCN-NEXT: s_setpc_b64 s[30:31] 147; 148; GFX7-LABEL: v_load_global_v3bf16: 149; GFX7: ; %bb.0: 150; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 151; GFX7-NEXT: s_mov_b32 s6, 0 152; GFX7-NEXT: s_mov_b32 s7, 0xf000 153; GFX7-NEXT: s_mov_b32 s4, s6 154; GFX7-NEXT: s_mov_b32 s5, s6 155; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 156; 
GFX7-NEXT: s_waitcnt vmcnt(0) 157; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 158; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 159; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 160; GFX7-NEXT: s_setpc_b64 s[30:31] 161; 162; GFX8-LABEL: v_load_global_v3bf16: 163; GFX8: ; %bb.0: 164; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 165; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 166; GFX8-NEXT: s_waitcnt vmcnt(0) 167; GFX8-NEXT: s_setpc_b64 s[30:31] 168; 169; GFX9-LABEL: v_load_global_v3bf16: 170; GFX9: ; %bb.0: 171; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 172; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 173; GFX9-NEXT: s_waitcnt vmcnt(0) 174; GFX9-NEXT: s_setpc_b64 s[30:31] 175; 176; GFX10-LABEL: v_load_global_v3bf16: 177; GFX10: ; %bb.0: 178; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 179; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 180; GFX10-NEXT: s_waitcnt vmcnt(0) 181; GFX10-NEXT: s_setpc_b64 s[30:31] 182; 183; GFX11-LABEL: v_load_global_v3bf16: 184; GFX11: ; %bb.0: 185; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 186; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 187; GFX11-NEXT: s_waitcnt vmcnt(0) 188; GFX11-NEXT: s_setpc_b64 s[30:31] 189 %load = load <3 x bfloat>, ptr addrspace(1) %ptr 190 ret <3 x bfloat> %load 191} 192 193define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) { 194; GCN-LABEL: v_load_global_v4bf16: 195; GCN: ; %bb.0: 196; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 197; GCN-NEXT: s_mov_b32 s6, 0 198; GCN-NEXT: s_mov_b32 s7, 0xf000 199; GCN-NEXT: s_mov_b32 s4, s6 200; GCN-NEXT: s_mov_b32 s5, s6 201; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 202; GCN-NEXT: s_waitcnt vmcnt(0) 203; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 204; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 205; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 206; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 207; GCN-NEXT: s_setpc_b64 s[30:31] 208; 209; GFX7-LABEL: v_load_global_v4bf16: 210; GFX7: ; %bb.0: 211; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 212; GFX7-NEXT: s_mov_b32 s6, 0 213; GFX7-NEXT: s_mov_b32 s7, 0xf000 214; GFX7-NEXT: s_mov_b32 s4, s6 215; GFX7-NEXT: s_mov_b32 s5, s6 216; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 217; GFX7-NEXT: s_waitcnt vmcnt(0) 218; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 219; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 220; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 221; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 222; GFX7-NEXT: s_setpc_b64 s[30:31] 223; 224; GFX8-LABEL: v_load_global_v4bf16: 225; GFX8: ; %bb.0: 226; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 227; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 228; GFX8-NEXT: s_waitcnt vmcnt(0) 229; GFX8-NEXT: s_setpc_b64 s[30:31] 230; 231; GFX9-LABEL: v_load_global_v4bf16: 232; GFX9: ; %bb.0: 233; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 234; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 235; GFX9-NEXT: s_waitcnt vmcnt(0) 236; GFX9-NEXT: s_setpc_b64 s[30:31] 237; 238; GFX10-LABEL: v_load_global_v4bf16: 239; GFX10: ; %bb.0: 240; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 241; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 242; GFX10-NEXT: s_waitcnt vmcnt(0) 243; GFX10-NEXT: s_setpc_b64 s[30:31] 244; 245; GFX11-LABEL: v_load_global_v4bf16: 246; GFX11: ; %bb.0: 247; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 248; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 249; GFX11-NEXT: s_waitcnt vmcnt(0) 250; GFX11-NEXT: s_setpc_b64 s[30:31] 251 %load = load <4 x bfloat>, ptr addrspace(1) %ptr 252 ret <4 x bfloat> %load 253} 254 255define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) { 256; GCN-LABEL: v_load_global_v6bf16: 257; GCN: ; %bb.0: 258; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 259; GCN-NEXT: s_mov_b32 s6, 0 260; GCN-NEXT: s_mov_b32 s7, 0xf000 261; GCN-NEXT: s_mov_b32 s4, s6 262; GCN-NEXT: s_mov_b32 s5, s6 263; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 264; GCN-NEXT: 
s_waitcnt vmcnt(0) 265; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 266; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 267; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 268; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 269; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 270; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 271; GCN-NEXT: s_setpc_b64 s[30:31] 272; 273; GFX7-LABEL: v_load_global_v6bf16: 274; GFX7: ; %bb.0: 275; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 276; GFX7-NEXT: s_mov_b32 s6, 0 277; GFX7-NEXT: s_mov_b32 s7, 0xf000 278; GFX7-NEXT: s_mov_b32 s4, s6 279; GFX7-NEXT: s_mov_b32 s5, s6 280; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64 281; GFX7-NEXT: s_waitcnt vmcnt(0) 282; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 283; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 284; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 285; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 286; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 287; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 288; GFX7-NEXT: s_setpc_b64 s[30:31] 289; 290; GFX8-LABEL: v_load_global_v6bf16: 291; GFX8: ; %bb.0: 292; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 293; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1] 294; GFX8-NEXT: s_waitcnt vmcnt(0) 295; GFX8-NEXT: s_setpc_b64 s[30:31] 296; 297; GFX9-LABEL: v_load_global_v6bf16: 298; GFX9: ; %bb.0: 299; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 300; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off 301; GFX9-NEXT: s_waitcnt vmcnt(0) 302; GFX9-NEXT: s_setpc_b64 s[30:31] 303; 304; GFX10-LABEL: v_load_global_v6bf16: 305; GFX10: ; %bb.0: 306; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 307; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off 308; GFX10-NEXT: s_waitcnt vmcnt(0) 309; GFX10-NEXT: s_setpc_b64 s[30:31] 310; 311; GFX11-LABEL: v_load_global_v6bf16: 312; GFX11: ; %bb.0: 313; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 314; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off 315; GFX11-NEXT: s_waitcnt vmcnt(0) 316; GFX11-NEXT: s_setpc_b64 
s[30:31] 317 %load = load <6 x bfloat>, ptr addrspace(1) %ptr 318 ret <6 x bfloat> %load 319} 320 321define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) { 322; GCN-LABEL: v_load_global_v8bf16: 323; GCN: ; %bb.0: 324; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 325; GCN-NEXT: s_mov_b32 s6, 0 326; GCN-NEXT: s_mov_b32 s7, 0xf000 327; GCN-NEXT: s_mov_b32 s4, s6 328; GCN-NEXT: s_mov_b32 s5, s6 329; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 330; GCN-NEXT: s_waitcnt vmcnt(0) 331; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 332; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 333; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 334; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 335; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 336; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 337; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 338; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 339; GCN-NEXT: s_setpc_b64 s[30:31] 340; 341; GFX7-LABEL: v_load_global_v8bf16: 342; GFX7: ; %bb.0: 343; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 344; GFX7-NEXT: s_mov_b32 s6, 0 345; GFX7-NEXT: s_mov_b32 s7, 0xf000 346; GFX7-NEXT: s_mov_b32 s4, s6 347; GFX7-NEXT: s_mov_b32 s5, s6 348; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 349; GFX7-NEXT: s_waitcnt vmcnt(0) 350; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 351; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 352; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 353; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 354; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 355; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 356; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 357; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 358; GFX7-NEXT: s_setpc_b64 s[30:31] 359; 360; GFX8-LABEL: v_load_global_v8bf16: 361; GFX8: ; %bb.0: 362; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 363; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 364; GFX8-NEXT: s_waitcnt vmcnt(0) 365; GFX8-NEXT: s_setpc_b64 s[30:31] 366; 367; GFX9-LABEL: v_load_global_v8bf16: 368; GFX9: ; %bb.0: 369; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 370; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 371; GFX9-NEXT: s_waitcnt vmcnt(0) 372; GFX9-NEXT: s_setpc_b64 s[30:31] 373; 374; GFX10-LABEL: v_load_global_v8bf16: 375; GFX10: ; %bb.0: 376; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 377; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 378; GFX10-NEXT: s_waitcnt vmcnt(0) 379; GFX10-NEXT: s_setpc_b64 s[30:31] 380; 381; GFX11-LABEL: v_load_global_v8bf16: 382; GFX11: ; %bb.0: 383; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 384; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off 385; GFX11-NEXT: s_waitcnt vmcnt(0) 386; GFX11-NEXT: s_setpc_b64 s[30:31] 387 %load = load <8 x bfloat>, ptr addrspace(1) %ptr 388 ret <8 x bfloat> %load 389} 390 391define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { 392; GCN-LABEL: v_load_global_v16bf16: 393; GCN: ; %bb.0: 394; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 395; GCN-NEXT: s_mov_b32 s6, 0 396; GCN-NEXT: s_mov_b32 s7, 0xf000 397; GCN-NEXT: s_mov_b32 s4, s6 398; GCN-NEXT: s_mov_b32 s5, s6 399; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 400; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 401; GCN-NEXT: s_waitcnt vmcnt(1) 402; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 403; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 404; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 405; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 406; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 407; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 408; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 409; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 410; GCN-NEXT: s_waitcnt vmcnt(0) 411; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 412; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 413; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 414; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 415; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 416; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 417; GCN-NEXT: v_lshlrev_b32_e32 v14, 
16, v15 418; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 419; GCN-NEXT: s_setpc_b64 s[30:31] 420; 421; GFX7-LABEL: v_load_global_v16bf16: 422; GFX7: ; %bb.0: 423; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 424; GFX7-NEXT: s_mov_b32 s6, 0 425; GFX7-NEXT: s_mov_b32 s7, 0xf000 426; GFX7-NEXT: s_mov_b32 s4, s6 427; GFX7-NEXT: s_mov_b32 s5, s6 428; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 429; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 430; GFX7-NEXT: s_waitcnt vmcnt(1) 431; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 432; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 433; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 434; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 435; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 436; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 437; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 438; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 439; GFX7-NEXT: s_waitcnt vmcnt(0) 440; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 441; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 442; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 443; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 444; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 445; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 446; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 447; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 448; GFX7-NEXT: s_setpc_b64 s[30:31] 449; 450; GFX8-LABEL: v_load_global_v16bf16: 451; GFX8: ; %bb.0: 452; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 453; GFX8-NEXT: v_mov_b32_e32 v5, v1 454; GFX8-NEXT: v_mov_b32_e32 v4, v0 455; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[4:5] 456; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v4 457; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 458; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 459; GFX8-NEXT: s_waitcnt vmcnt(0) 460; GFX8-NEXT: s_setpc_b64 s[30:31] 461; 462; GFX9-LABEL: v_load_global_v16bf16: 463; GFX9: ; %bb.0: 464; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 465; GFX9-NEXT: v_mov_b32_e32 v9, v1 466; 
GFX9-NEXT: v_mov_b32_e32 v8, v0 467; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off 468; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 469; GFX9-NEXT: s_waitcnt vmcnt(0) 470; GFX9-NEXT: s_setpc_b64 s[30:31] 471; 472; GFX10-LABEL: v_load_global_v16bf16: 473; GFX10: ; %bb.0: 474; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 475; GFX10-NEXT: v_mov_b32_e32 v9, v1 476; GFX10-NEXT: v_mov_b32_e32 v8, v0 477; GFX10-NEXT: s_clause 0x1 478; GFX10-NEXT: global_load_dwordx4 v[0:3], v[8:9], off 479; GFX10-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 480; GFX10-NEXT: s_waitcnt vmcnt(0) 481; GFX10-NEXT: s_setpc_b64 s[30:31] 482; 483; GFX11-LABEL: v_load_global_v16bf16: 484; GFX11: ; %bb.0: 485; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 486; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 487; GFX11-NEXT: s_clause 0x1 488; GFX11-NEXT: global_load_b128 v[0:3], v[4:5], off 489; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16 490; GFX11-NEXT: s_waitcnt vmcnt(0) 491; GFX11-NEXT: s_setpc_b64 s[30:31] 492 %load = load <16 x bfloat>, ptr addrspace(1) %ptr 493 ret <16 x bfloat> %load 494} 495 496define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { 497; GCN-LABEL: v_load_global_v32bf16: 498; GCN: ; %bb.0: 499; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 500; GCN-NEXT: s_mov_b32 s6, 0 501; GCN-NEXT: s_mov_b32 s7, 0xf000 502; GCN-NEXT: s_mov_b32 s4, s6 503; GCN-NEXT: s_mov_b32 s5, s6 504; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 505; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 506; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 507; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 508; GCN-NEXT: s_waitcnt vmcnt(3) 509; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 510; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 511; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 512; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 
513; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 514; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 515; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 516; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 517; GCN-NEXT: s_waitcnt vmcnt(2) 518; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 519; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 520; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 521; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 522; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 523; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 524; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 525; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 526; GCN-NEXT: s_waitcnt vmcnt(1) 527; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20 528; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 529; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21 530; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 531; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 532; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 533; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 534; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 535; GCN-NEXT: s_waitcnt vmcnt(0) 536; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28 537; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 538; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29 539; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 540; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 541; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 542; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 543; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 544; GCN-NEXT: s_setpc_b64 s[30:31] 545; 546; GFX7-LABEL: v_load_global_v32bf16: 547; GFX7: ; %bb.0: 548; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 549; GFX7-NEXT: s_mov_b32 s6, 0 550; GFX7-NEXT: s_mov_b32 s7, 0xf000 551; GFX7-NEXT: s_mov_b32 s4, s6 552; GFX7-NEXT: s_mov_b32 s5, s6 553; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 554; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 555; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 556; GFX7-NEXT: buffer_load_dwordx4 
v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 557; GFX7-NEXT: s_waitcnt vmcnt(3) 558; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 559; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 560; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 561; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 562; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 563; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 564; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 565; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 566; GFX7-NEXT: s_waitcnt vmcnt(2) 567; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 568; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 569; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 570; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 571; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 572; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 573; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 574; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 575; GFX7-NEXT: s_waitcnt vmcnt(1) 576; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20 577; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 578; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21 579; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 580; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22 581; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 582; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 583; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 584; GFX7-NEXT: s_waitcnt vmcnt(0) 585; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28 586; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 587; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29 588; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 589; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30 590; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 591; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 592; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 593; GFX7-NEXT: s_setpc_b64 s[30:31] 594; 595; GFX8-LABEL: v_load_global_v32bf16: 596; GFX8: ; %bb.0: 597; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 598; GFX8-NEXT: v_mov_b32_e32 v12, v0 599; GFX8-NEXT: v_mov_b32_e32 v13, v1 600; GFX8-NEXT: v_add_u32_e32 
v4, vcc, 16, v12 601; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v13, vcc 602; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v12 603; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc 604; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[12:13] 605; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v12 606; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc 607; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 608; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 609; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] 610; GFX8-NEXT: s_waitcnt vmcnt(0) 611; GFX8-NEXT: s_setpc_b64 s[30:31] 612; 613; GFX9-LABEL: v_load_global_v32bf16: 614; GFX9: ; %bb.0: 615; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 616; GFX9-NEXT: v_mov_b32_e32 v17, v1 617; GFX9-NEXT: v_mov_b32_e32 v16, v0 618; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off 619; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16 620; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32 621; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48 622; GFX9-NEXT: s_waitcnt vmcnt(0) 623; GFX9-NEXT: s_setpc_b64 s[30:31] 624; 625; GFX10-LABEL: v_load_global_v32bf16: 626; GFX10: ; %bb.0: 627; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 628; GFX10-NEXT: v_mov_b32_e32 v17, v1 629; GFX10-NEXT: v_mov_b32_e32 v16, v0 630; GFX10-NEXT: s_clause 0x3 631; GFX10-NEXT: global_load_dwordx4 v[0:3], v[16:17], off 632; GFX10-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16 633; GFX10-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32 634; GFX10-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48 635; GFX10-NEXT: s_waitcnt vmcnt(0) 636; GFX10-NEXT: s_setpc_b64 s[30:31] 637; 638; GFX11-LABEL: v_load_global_v32bf16: 639; GFX11: ; %bb.0: 640; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 641; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0 642; GFX11-NEXT: s_clause 0x3 643; GFX11-NEXT: global_load_b128 v[0:3], v[12:13], off 644; GFX11-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16 645; 
GFX11-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32 646; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48 647; GFX11-NEXT: s_waitcnt vmcnt(0) 648; GFX11-NEXT: s_setpc_b64 s[30:31] 649 %load = load <32 x bfloat>, ptr addrspace(1) %ptr 650 ret <32 x bfloat> %load 651} 652 653define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { 654; GCN-LABEL: v_load_global_v64bf16: 655; GCN: ; %bb.0: 656; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 657; GCN-NEXT: s_mov_b32 s7, 0xf000 658; GCN-NEXT: s_mov_b32 s6, 0 659; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0 660; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0 661; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0 662; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 663; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0 664; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0 665; GCN-NEXT: s_mov_b32 s4, s6 666; GCN-NEXT: s_mov_b32 s5, s6 667; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 668; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0 669; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0 670; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 671; GCN-NEXT: s_waitcnt vmcnt(0) 672; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen 673; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen 674; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen 675; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen 676; GCN-NEXT: s_waitcnt expcnt(0) 677; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 678; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 679; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 680; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 681; GCN-NEXT: s_waitcnt vmcnt(0) 682; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen 683; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen 684; GCN-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen 685; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen 686; GCN-NEXT: s_waitcnt expcnt(0) 687; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], 
s[4:7], 0 addr64 offset:80 688; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 689; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0 690; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 691; GCN-NEXT: s_waitcnt vmcnt(0) 692; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen 693; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen 694; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen 695; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen 696; GCN-NEXT: s_waitcnt expcnt(0) 697; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 698; GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0 699; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 700; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 701; GCN-NEXT: s_waitcnt vmcnt(0) 702; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen 703; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen 704; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen 705; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen 706; GCN-NEXT: s_waitcnt expcnt(0) 707; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32 708; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48 709; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 710; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 711; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16 712; GCN-NEXT: s_waitcnt vmcnt(2) 713; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen 714; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0 715; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen 716; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0 717; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen 718; GCN-NEXT: s_waitcnt expcnt(0) 719; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0 720; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen 721; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0 722; GCN-NEXT: s_waitcnt expcnt(0) 723; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 724; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 725; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, 
v0 726; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0 727; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen 728; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0 729; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen 730; GCN-NEXT: s_waitcnt expcnt(0) 731; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0 732; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen 733; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 734; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen 735; GCN-NEXT: s_waitcnt expcnt(0) 736; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 737; GCN-NEXT: s_waitcnt vmcnt(8) 738; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen 739; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen 740; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen 741; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen 742; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen 743; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen 744; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen 745; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen 746; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 747; GCN-NEXT: s_setpc_b64 s[30:31] 748; 749; GFX7-LABEL: v_load_global_v64bf16: 750; GFX7: ; %bb.0: 751; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 752; GFX7-NEXT: s_mov_b32 s6, 0 753; GFX7-NEXT: s_mov_b32 s7, 0xf000 754; GFX7-NEXT: s_mov_b32 s4, s6 755; GFX7-NEXT: s_mov_b32 s5, s6 756; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 757; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0 758; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0 759; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0 760; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 761; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0 762; GFX7-NEXT: s_waitcnt vmcnt(0) 763; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen 764; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen 765; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen 766; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen 767; GFX7-NEXT: 
buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 768; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 769; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 770; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x64, v0 771; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x60, v0 772; GFX7-NEXT: s_waitcnt vmcnt(0) 773; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen 774; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen 775; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen 776; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen 777; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 778; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 779; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 780; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0 781; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0 782; GFX7-NEXT: s_waitcnt vmcnt(0) 783; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen 784; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen 785; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen 786; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen 787; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 788; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 789; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 790; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 791; GFX7-NEXT: v_add_i32_e32 v10, vcc, 64, v0 792; GFX7-NEXT: s_waitcnt vmcnt(0) 793; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen 794; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen 795; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen 796; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen 797; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 798; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32 799; GFX7-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16 800; GFX7-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 801; GFX7-NEXT: v_add_i32_e32 v1, vcc, 60, v0 802; GFX7-NEXT: 
v_add_i32_e32 v2, vcc, 56, v0 803; GFX7-NEXT: s_waitcnt vmcnt(3) 804; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen 805; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0 806; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen 807; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 808; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen 809; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0 810; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen 811; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0 812; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 813; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0 814; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0 815; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0 816; GFX7-NEXT: s_waitcnt vmcnt(6) 817; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen 818; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 819; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen 820; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0 821; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen 822; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 823; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen 824; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 825; GFX7-NEXT: s_waitcnt vmcnt(9) 826; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen 827; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen 828; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen 829; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen 830; GFX7-NEXT: s_waitcnt vmcnt(12) 831; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen 832; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen 833; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen 834; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen 835; GFX7-NEXT: s_waitcnt vmcnt(0) 836; GFX7-NEXT: s_setpc_b64 s[30:31] 837; 838; GFX8-LABEL: v_load_global_v64bf16: 839; GFX8: ; %bb.0: 840; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 841; GFX8-NEXT: v_mov_b32_e32 v28, v0 842; GFX8-NEXT: v_mov_b32_e32 v29, v1 843; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v28 844; 
GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v29, vcc 845; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v28 846; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v29, vcc 847; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v28 848; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v29, vcc 849; GFX8-NEXT: v_add_u32_e32 v16, vcc, 64, v28 850; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v29, vcc 851; GFX8-NEXT: s_movk_i32 s4, 0x50 852; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v28 853; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v29, vcc 854; GFX8-NEXT: s_movk_i32 s4, 0x60 855; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v28 856; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc 857; GFX8-NEXT: s_movk_i32 s4, 0x70 858; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29] 859; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] 860; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 861; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc 862; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 863; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 864; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17] 865; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21] 866; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] 867; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29] 868; GFX8-NEXT: s_waitcnt vmcnt(0) 869; GFX8-NEXT: s_setpc_b64 s[30:31] 870; 871; GFX9-LABEL: v_load_global_v64bf16: 872; GFX9: ; %bb.0: 873; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 874; GFX9-NEXT: v_mov_b32_e32 v29, v1 875; GFX9-NEXT: v_mov_b32_e32 v28, v0 876; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off 877; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16 878; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 879; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48 880; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64 881; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80 882; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 883; GFX9-NEXT: s_nop 0 884; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], 
off offset:112 885; GFX9-NEXT: s_waitcnt vmcnt(0) 886; GFX9-NEXT: s_setpc_b64 s[30:31] 887; 888; GFX10-LABEL: v_load_global_v64bf16: 889; GFX10: ; %bb.0: 890; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 891; GFX10-NEXT: v_mov_b32_e32 v33, v1 892; GFX10-NEXT: v_mov_b32_e32 v32, v0 893; GFX10-NEXT: s_clause 0x7 894; GFX10-NEXT: global_load_dwordx4 v[0:3], v[32:33], off 895; GFX10-NEXT: global_load_dwordx4 v[4:7], v[32:33], off offset:16 896; GFX10-NEXT: global_load_dwordx4 v[8:11], v[32:33], off offset:32 897; GFX10-NEXT: global_load_dwordx4 v[12:15], v[32:33], off offset:48 898; GFX10-NEXT: global_load_dwordx4 v[16:19], v[32:33], off offset:64 899; GFX10-NEXT: global_load_dwordx4 v[20:23], v[32:33], off offset:80 900; GFX10-NEXT: global_load_dwordx4 v[24:27], v[32:33], off offset:96 901; GFX10-NEXT: global_load_dwordx4 v[28:31], v[32:33], off offset:112 902; GFX10-NEXT: s_waitcnt vmcnt(0) 903; GFX10-NEXT: s_setpc_b64 s[30:31] 904; 905; GFX11-LABEL: v_load_global_v64bf16: 906; GFX11: ; %bb.0: 907; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 908; GFX11-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0 909; GFX11-NEXT: s_clause 0x7 910; GFX11-NEXT: global_load_b128 v[0:3], v[28:29], off 911; GFX11-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16 912; GFX11-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32 913; GFX11-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48 914; GFX11-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64 915; GFX11-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80 916; GFX11-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96 917; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112 918; GFX11-NEXT: s_waitcnt vmcnt(0) 919; GFX11-NEXT: s_setpc_b64 s[30:31] 920 %load = load <64 x bfloat>, ptr addrspace(1) %ptr 921 ret <64 x bfloat> %load 922} 923 924define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { 925; GCN-LABEL: v_store_global_v2bf16: 926; GCN: ; 
%bb.0: 927; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 928; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 929; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 930; GCN-NEXT: s_mov_b32 s6, 0 931; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 932; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 933; GCN-NEXT: s_mov_b32 s7, 0xf000 934; GCN-NEXT: s_mov_b32 s4, s6 935; GCN-NEXT: s_mov_b32 s5, s6 936; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 937; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 938; GCN-NEXT: s_setpc_b64 s[30:31] 939; 940; GFX7-LABEL: v_store_global_v2bf16: 941; GFX7: ; %bb.0: 942; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 943; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 944; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 945; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 946; GFX7-NEXT: s_mov_b32 s6, 0 947; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 948; GFX7-NEXT: s_mov_b32 s7, 0xf000 949; GFX7-NEXT: s_mov_b32 s4, s6 950; GFX7-NEXT: s_mov_b32 s5, s6 951; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 952; GFX7-NEXT: s_waitcnt vmcnt(0) 953; GFX7-NEXT: s_setpc_b64 s[30:31] 954; 955; GFX8-LABEL: v_store_global_v2bf16: 956; GFX8: ; %bb.0: 957; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 958; GFX8-NEXT: flat_store_dword v[1:2], v0 959; GFX8-NEXT: s_waitcnt vmcnt(0) 960; GFX8-NEXT: s_setpc_b64 s[30:31] 961; 962; GFX9-LABEL: v_store_global_v2bf16: 963; GFX9: ; %bb.0: 964; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 965; GFX9-NEXT: global_store_dword v[1:2], v0, off 966; GFX9-NEXT: s_waitcnt vmcnt(0) 967; GFX9-NEXT: s_setpc_b64 s[30:31] 968; 969; GFX10-LABEL: v_store_global_v2bf16: 970; GFX10: ; %bb.0: 971; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 972; GFX10-NEXT: global_store_dword v[1:2], v0, off 973; GFX10-NEXT: s_setpc_b64 s[30:31] 974; 975; GFX11-LABEL: v_store_global_v2bf16: 976; GFX11: ; %bb.0: 977; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 978; GFX11-NEXT: global_store_b32 v[1:2], v0, off 979; GFX11-NEXT: s_setpc_b64 s[30:31] 980 store <2 x 
bfloat> %val, ptr addrspace(1) %ptr 981 ret void 982} 983 984define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) { 985; GCN-LABEL: v_store_global_v3bf16: 986; GCN: ; %bb.0: 987; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 988; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 989; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 990; GCN-NEXT: s_mov_b32 s7, 0xf000 991; GCN-NEXT: s_mov_b32 s6, 0 992; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 993; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 994; GCN-NEXT: s_mov_b32 s4, s6 995; GCN-NEXT: s_mov_b32 s5, s6 996; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 997; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 998; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 999; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 1000; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1001; GCN-NEXT: s_setpc_b64 s[30:31] 1002; 1003; GFX7-LABEL: v_store_global_v3bf16: 1004; GFX7: ; %bb.0: 1005; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1006; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 1007; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1008; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 1009; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 1010; GFX7-NEXT: s_mov_b32 s6, 0 1011; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 1012; GFX7-NEXT: s_mov_b32 s7, 0xf000 1013; GFX7-NEXT: s_mov_b32 s4, s6 1014; GFX7-NEXT: s_mov_b32 s5, s6 1015; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1016; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4 1017; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 1018; GFX7-NEXT: s_waitcnt vmcnt(0) 1019; GFX7-NEXT: s_setpc_b64 s[30:31] 1020; 1021; GFX8-LABEL: v_store_global_v3bf16: 1022; GFX8: ; %bb.0: 1023; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1024; GFX8-NEXT: flat_store_dword v[2:3], v0 1025; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2 1026; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1027; GFX8-NEXT: flat_store_short v[2:3], v1 1028; GFX8-NEXT: s_waitcnt vmcnt(0) 1029; GFX8-NEXT: s_setpc_b64 s[30:31] 1030; 
1031; GFX9-LABEL: v_store_global_v3bf16: 1032; GFX9: ; %bb.0: 1033; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1034; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4 1035; GFX9-NEXT: global_store_dword v[2:3], v0, off 1036; GFX9-NEXT: s_waitcnt vmcnt(0) 1037; GFX9-NEXT: s_setpc_b64 s[30:31] 1038; 1039; GFX10-LABEL: v_store_global_v3bf16: 1040; GFX10: ; %bb.0: 1041; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1042; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4 1043; GFX10-NEXT: global_store_dword v[2:3], v0, off 1044; GFX10-NEXT: s_setpc_b64 s[30:31] 1045; 1046; GFX11-LABEL: v_store_global_v3bf16: 1047; GFX11: ; %bb.0: 1048; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1049; GFX11-NEXT: s_clause 0x1 1050; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4 1051; GFX11-NEXT: global_store_b32 v[2:3], v0, off 1052; GFX11-NEXT: s_setpc_b64 s[30:31] 1053 store <3 x bfloat> %val, ptr addrspace(1) %ptr 1054 ret void 1055} 1056 1057define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) { 1058; GCN-LABEL: v_store_global_v4bf16: 1059; GCN: ; %bb.0: 1060; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1061; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 1062; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 1063; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 1064; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 1065; GCN-NEXT: s_mov_b32 s6, 0 1066; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1067; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 1068; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 1069; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 1070; GCN-NEXT: s_mov_b32 s7, 0xf000 1071; GCN-NEXT: s_mov_b32 s4, s6 1072; GCN-NEXT: s_mov_b32 s5, s6 1073; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 1074; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1075; GCN-NEXT: s_setpc_b64 s[30:31] 1076; 1077; GFX7-LABEL: v_store_global_v4bf16: 1078; GFX7: ; %bb.0: 1079; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1080; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 1081; GFX7-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 1082; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1083; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 1084; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1085; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 1086; GFX7-NEXT: s_mov_b32 s6, 0 1087; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 1088; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 1089; GFX7-NEXT: s_mov_b32 s7, 0xf000 1090; GFX7-NEXT: s_mov_b32 s4, s6 1091; GFX7-NEXT: s_mov_b32 s5, s6 1092; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64 1093; GFX7-NEXT: s_waitcnt vmcnt(0) 1094; GFX7-NEXT: s_setpc_b64 s[30:31] 1095; 1096; GFX8-LABEL: v_store_global_v4bf16: 1097; GFX8: ; %bb.0: 1098; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1099; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1100; GFX8-NEXT: s_waitcnt vmcnt(0) 1101; GFX8-NEXT: s_setpc_b64 s[30:31] 1102; 1103; GFX9-LABEL: v_store_global_v4bf16: 1104; GFX9: ; %bb.0: 1105; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1106; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1107; GFX9-NEXT: s_waitcnt vmcnt(0) 1108; GFX9-NEXT: s_setpc_b64 s[30:31] 1109; 1110; GFX10-LABEL: v_store_global_v4bf16: 1111; GFX10: ; %bb.0: 1112; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1113; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1114; GFX10-NEXT: s_setpc_b64 s[30:31] 1115; 1116; GFX11-LABEL: v_store_global_v4bf16: 1117; GFX11: ; %bb.0: 1118; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1119; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 1120; GFX11-NEXT: s_setpc_b64 s[30:31] 1121 store <4 x bfloat> %val, ptr addrspace(1) %ptr 1122 ret void 1123} 1124 1125define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) { 1126; GCN-LABEL: v_store_global_v8bf16: 1127; GCN: ; %bb.0: 1128; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1129; GCN-NEXT: s_mov_b32 s7, 0xf000 1130; GCN-NEXT: s_mov_b32 s6, 0 1131; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 1132; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 1133; GCN-NEXT: v_mul_f32_e32 v5, 
1.0, v5 1134; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 1135; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 1136; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 1137; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 1138; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 1139; GCN-NEXT: s_mov_b32 s4, s6 1140; GCN-NEXT: s_mov_b32 s5, s6 1141; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 1142; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 1143; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 1144; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 1145; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16 1146; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 1147; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16 1148; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16 1149; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 1150; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1151; GCN-NEXT: s_setpc_b64 s[30:31] 1152; 1153; GFX7-LABEL: v_store_global_v8bf16: 1154; GFX7: ; %bb.0: 1155; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1156; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 1157; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 1158; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 1159; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 1160; GFX7-NEXT: s_mov_b32 s6, 0 1161; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1162; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 1163; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 1164; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 1165; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1166; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 1167; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1168; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 1169; GFX7-NEXT: s_mov_b32 s7, 0xf000 1170; GFX7-NEXT: s_mov_b32 s4, s6 1171; GFX7-NEXT: s_mov_b32 s5, s6 1172; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 1173; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 1174; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 1175; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 1176; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64 1177; GFX7-NEXT: s_waitcnt vmcnt(0) 1178; GFX7-NEXT: s_setpc_b64 s[30:31] 1179; 1180; GFX8-LABEL: v_store_global_v8bf16: 1181; GFX8: ; %bb.0: 
1182; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1183; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1184; GFX8-NEXT: s_waitcnt vmcnt(0) 1185; GFX8-NEXT: s_setpc_b64 s[30:31] 1186; 1187; GFX9-LABEL: v_store_global_v8bf16: 1188; GFX9: ; %bb.0: 1189; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1190; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1191; GFX9-NEXT: s_waitcnt vmcnt(0) 1192; GFX9-NEXT: s_setpc_b64 s[30:31] 1193; 1194; GFX10-LABEL: v_store_global_v8bf16: 1195; GFX10: ; %bb.0: 1196; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1197; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 1198; GFX10-NEXT: s_setpc_b64 s[30:31] 1199; 1200; GFX11-LABEL: v_store_global_v8bf16: 1201; GFX11: ; %bb.0: 1202; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1203; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 1204; GFX11-NEXT: s_setpc_b64 s[30:31] 1205 store <8 x bfloat> %val, ptr addrspace(1) %ptr 1206 ret void 1207} 1208 1209define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) { 1210; GCN-LABEL: v_store_global_v16bf16: 1211; GCN: ; %bb.0: 1212; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1213; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 1214; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 1215; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 1216; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 1217; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 1218; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 1219; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 1220; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 1221; GCN-NEXT: s_mov_b32 s7, 0xf000 1222; GCN-NEXT: s_mov_b32 s6, 0 1223; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15 1224; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 1225; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 1226; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 1227; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 1228; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 1229; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 1230; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 1231; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1232; GCN-NEXT: v_lshrrev_b32_e32 
v5, 16, v5 1233; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 1234; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 1235; GCN-NEXT: s_mov_b32 s4, s6 1236; GCN-NEXT: s_mov_b32 s5, s6 1237; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2 1238; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 1239; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 1240; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 1241; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 1242; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 1243; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16 1244; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16 1245; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16 1246; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16 1247; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16 1248; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16 1249; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 1250; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 1251; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1252; GCN-NEXT: s_setpc_b64 s[30:31] 1253; 1254; GFX7-LABEL: v_store_global_v16bf16: 1255; GFX7: ; %bb.0: 1256; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1257; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 1258; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 1259; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 1260; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 1261; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 1262; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1263; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 1264; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1265; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 1266; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 1267; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 1268; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 1269; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 1270; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1271; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 1272; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 1273; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 1274; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1275; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 1276; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 1277; 
GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 1278; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1279; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 1280; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 1281; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 1282; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 1283; GFX7-NEXT: s_mov_b32 s6, 0 1284; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1285; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 1286; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1287; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 1288; GFX7-NEXT: s_mov_b32 s7, 0xf000 1289; GFX7-NEXT: s_mov_b32 s4, s6 1290; GFX7-NEXT: s_mov_b32 s5, s6 1291; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 1292; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 1293; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 1294; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 1295; GFX7-NEXT: s_waitcnt vmcnt(0) 1296; GFX7-NEXT: s_setpc_b64 s[30:31] 1297; 1298; GFX8-LABEL: v_store_global_v16bf16: 1299; GFX8: ; %bb.0: 1300; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1301; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 1302; GFX8-NEXT: s_nop 0 1303; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8 1304; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc 1305; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] 1306; GFX8-NEXT: s_waitcnt vmcnt(0) 1307; GFX8-NEXT: s_setpc_b64 s[30:31] 1308; 1309; GFX9-LABEL: v_store_global_v16bf16: 1310; GFX9: ; %bb.0: 1311; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1312; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 1313; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 1314; GFX9-NEXT: s_waitcnt vmcnt(0) 1315; GFX9-NEXT: s_setpc_b64 s[30:31] 1316; 1317; GFX10-LABEL: v_store_global_v16bf16: 1318; GFX10: ; %bb.0: 1319; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1320; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 1321; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 1322; GFX10-NEXT: s_setpc_b64 s[30:31] 1323; 1324; GFX11-LABEL: 
v_store_global_v16bf16: 1325; GFX11: ; %bb.0: 1326; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1327; GFX11-NEXT: s_clause 0x1 1328; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 1329; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off 1330; GFX11-NEXT: s_setpc_b64 s[30:31] 1331 store <16 x bfloat> %val, ptr addrspace(1) %ptr 1332 ret void 1333} 1334 1335define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { 1336; GCN-LABEL: v_store_global_v32bf16: 1337; GCN: ; %bb.0: 1338; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1339; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 1340; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 1341; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 1342; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 1343; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 1344; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 1345; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 1346; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 1347; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 1348; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 1349; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 1350; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 1351; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 1352; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 1353; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 1354; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 1355; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 1356; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 1357; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 1358; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 1359; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1360; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5 1361; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 1362; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16 1363; GCN-NEXT: s_mov_b32 s6, 0 1364; GCN-NEXT: s_mov_b32 s7, 0xf000 1365; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 1366; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 1367; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1 1368; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0 1369; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 1370; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 1371; 
GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 1372; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 1373; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 1374; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 1375; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 1376; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 1377; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29 1378; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 1379; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27 1380; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 1381; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 1382; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 1383; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 1384; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16 1385; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 1386; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 1387; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 1388; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 1389; GCN-NEXT: s_mov_b32 s4, s6 1390; GCN-NEXT: s_mov_b32 s5, s6 1391; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 1392; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15 1393; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 1394; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 1395; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9 1396; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8 1397; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 1398; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 1399; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16 1400; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16 1401; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16 1402; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16 1403; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16 1404; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16 1405; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16 1406; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16 1407; GCN-NEXT: s_waitcnt vmcnt(1) 1408; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32 1409; GCN-NEXT: s_waitcnt vmcnt(1) 1410; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26 1411; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 1412; GCN-NEXT: s_waitcnt expcnt(0) 1413; GCN-NEXT: 
v_lshrrev_b32_e32 v6, 16, v13 1414; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16 1415; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48 1416; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 1417; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1418; GCN-NEXT: s_setpc_b64 s[30:31] 1419; 1420; GFX7-LABEL: v_store_global_v32bf16: 1421; GFX7: ; %bb.0: 1422; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1423; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 1424; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 1425; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 1426; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1427; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 1428; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1429; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 1430; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 1431; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 1432; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 1433; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 1434; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 1435; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 1436; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 1437; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 1438; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 1439; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 1440; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1441; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1442; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16 1443; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5 1444; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 1445; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 1446; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 1447; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 1448; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 1449; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 1450; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 1451; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 1452; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11 1453; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 1454; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1455; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 1456; 
GFX7-NEXT: v_alignbit_b32 v11, v7, v10, 16 1457; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 1458; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 1459; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 1460; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v30 1461; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 1462; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27 1463; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16 1464; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 1465; GFX7-NEXT: s_mov_b32 s6, 0 1466; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16 1467; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 1468; GFX7-NEXT: s_mov_b32 s7, 0xf000 1469; GFX7-NEXT: s_mov_b32 s4, s6 1470; GFX7-NEXT: s_mov_b32 s5, s6 1471; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16 1472; GFX7-NEXT: s_waitcnt vmcnt(2) 1473; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v14 1474; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1475; GFX7-NEXT: v_alignbit_b32 v28, v7, v6, 16 1476; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9 1477; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 1478; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16 1479; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 1480; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 1481; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22 1482; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16 1483; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 1484; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21 1485; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 1486; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18 1487; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 1488; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20 1489; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16 1490; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 1491; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16 1492; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 1493; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 1494; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 1495; GFX7-NEXT: s_waitcnt vmcnt(0) 1496; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48 1497; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 1498; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 
addr64 offset:16 1499; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 1500; GFX7-NEXT: s_waitcnt vmcnt(0) 1501; GFX7-NEXT: s_setpc_b64 s[30:31] 1502; 1503; GFX8-LABEL: v_store_global_v32bf16: 1504; GFX8: ; %bb.0: 1505; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1506; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] 1507; GFX8-NEXT: s_nop 0 1508; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v16 1509; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc 1510; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] 1511; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v16 1512; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc 1513; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 1514; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v16 1515; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc 1516; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] 1517; GFX8-NEXT: s_waitcnt vmcnt(0) 1518; GFX8-NEXT: s_setpc_b64 s[30:31] 1519; 1520; GFX9-LABEL: v_store_global_v32bf16: 1521; GFX9: ; %bb.0: 1522; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1523; GFX9-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48 1524; GFX9-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32 1525; GFX9-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16 1526; GFX9-NEXT: global_store_dwordx4 v[16:17], v[0:3], off 1527; GFX9-NEXT: s_waitcnt vmcnt(0) 1528; GFX9-NEXT: s_setpc_b64 s[30:31] 1529; 1530; GFX10-LABEL: v_store_global_v32bf16: 1531; GFX10: ; %bb.0: 1532; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1533; GFX10-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48 1534; GFX10-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32 1535; GFX10-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16 1536; GFX10-NEXT: global_store_dwordx4 v[16:17], v[0:3], off 1537; GFX10-NEXT: s_setpc_b64 s[30:31] 1538; 1539; GFX11-LABEL: v_store_global_v32bf16: 1540; GFX11: ; %bb.0: 1541; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1542; GFX11-NEXT: s_clause 0x3 1543; GFX11-NEXT: 
global_store_b128 v[16:17], v[12:15], off offset:48 1544; GFX11-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32 1545; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16 1546; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off 1547; GFX11-NEXT: s_setpc_b64 s[30:31] 1548 store <32 x bfloat> %val, ptr addrspace(1) %ptr 1549 ret void 1550} 1551 1552define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { 1553; GCN-LABEL: v_store_global_v64bf16: 1554; GCN: ; %bb.0: 1555; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1556; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 1557; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 1558; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 1559; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 1560; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 1561; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 1562; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 1563; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 1564; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 1565; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 1566; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 1567; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 1568; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 1569; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 1570; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 1571; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 1572; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 1573; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 1574; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 1575; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 1576; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 1577; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 1578; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 1579; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16 1580; GCN-NEXT: s_mov_b32 s6, 0 1581; GCN-NEXT: s_mov_b32 s7, 0xf000 1582; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 1583; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 1584; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 1585; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 1586; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 1587; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 
1588; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 1589; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 1590; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 1591; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 1592; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 1593; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 1594; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29 1595; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28 1596; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 1597; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 1598; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 1599; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 1600; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1601; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 1602; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 1603; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 1604; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2 1605; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 1606; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 1607; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 1608; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 1609; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 1610; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16 1611; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16 1612; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16 1613; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16 1614; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 1615; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 1616; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 1617; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 1618; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 1619; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 1620; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 1621; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 1622; GCN-NEXT: s_mov_b32 s4, s6 1623; GCN-NEXT: s_mov_b32 s5, s6 1624; GCN-NEXT: s_waitcnt vmcnt(6) 1625; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32 1626; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16 1627; GCN-NEXT: s_waitcnt 
expcnt(0) 1628; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 1629; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 1630; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 1631; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 1632; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 1633; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 1634; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 1635; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 1636; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25 1637; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 1638; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30 1639; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 1640; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 1641; GCN-NEXT: s_waitcnt vmcnt(14) 1642; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 1643; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 1644; GCN-NEXT: s_waitcnt vmcnt(13) 1645; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 1646; GCN-NEXT: s_waitcnt vmcnt(12) 1647; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 1648; GCN-NEXT: s_waitcnt vmcnt(11) 1649; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 1650; GCN-NEXT: s_waitcnt vmcnt(10) 1651; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 1652; GCN-NEXT: s_waitcnt vmcnt(7) 1653; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 1654; GCN-NEXT: s_waitcnt vmcnt(6) 1655; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11 1656; GCN-NEXT: s_waitcnt vmcnt(5) 1657; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12 1658; GCN-NEXT: s_waitcnt vmcnt(4) 1659; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13 1660; GCN-NEXT: s_waitcnt vmcnt(3) 1661; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18 1662; GCN-NEXT: s_waitcnt vmcnt(2) 1663; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 1664; GCN-NEXT: s_waitcnt vmcnt(1) 1665; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20 1666; GCN-NEXT: s_waitcnt vmcnt(0) 1667; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 1668; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1669; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 1670; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 1671; GCN-NEXT: 
v_lshrrev_b32_e32 v10, 16, v10 1672; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11 1673; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12 1674; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13 1675; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16 1676; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16 1677; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16 1678; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16 1679; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16 1680; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16 1681; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16 1682; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 1683; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 1684; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 1685; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32 1686; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 1687; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 1688; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 1689; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 1690; GCN-NEXT: s_waitcnt vmcnt(7) 1691; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 1692; GCN-NEXT: s_waitcnt vmcnt(6) 1693; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 1694; GCN-NEXT: s_waitcnt vmcnt(5) 1695; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 1696; GCN-NEXT: s_waitcnt vmcnt(4) 1697; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 1698; GCN-NEXT: s_waitcnt vmcnt(3) 1699; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 1700; GCN-NEXT: s_waitcnt vmcnt(2) 1701; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 1702; GCN-NEXT: s_waitcnt vmcnt(1) 1703; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 1704; GCN-NEXT: s_waitcnt vmcnt(0) 1705; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 1706; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1707; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 1708; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 1709; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21 1710; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16 1711; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16 1712; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16 1713; 
GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16 1714; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 1715; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 1716; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 1717; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 1718; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 1719; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 1720; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 1721; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48 1722; GCN-NEXT: s_waitcnt vmcnt(7) 1723; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 1724; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23 1725; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 1726; GCN-NEXT: s_waitcnt vmcnt(6) 1727; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 1728; GCN-NEXT: s_waitcnt vmcnt(5) 1729; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 1730; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 1731; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 1732; GCN-NEXT: s_waitcnt vmcnt(4) 1733; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 1734; GCN-NEXT: s_waitcnt vmcnt(3) 1735; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 1736; GCN-NEXT: s_waitcnt vmcnt(2) 1737; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27 1738; GCN-NEXT: s_waitcnt vmcnt(1) 1739; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 1740; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 1741; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 1742; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16 1743; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16 1744; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 1745; GCN-NEXT: s_waitcnt vmcnt(1) 1746; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29 1747; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 1748; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 1749; GCN-NEXT: s_waitcnt vmcnt(2) 1750; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 1751; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 1752; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16 1753; GCN-NEXT: s_waitcnt vmcnt(1) 1754; 
GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 1755; GCN-NEXT: s_waitcnt vmcnt(0) 1756; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 1757; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 1758; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16 1759; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112 1760; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96 1761; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80 1762; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64 1763; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48 1764; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 1765; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1766; GCN-NEXT: s_setpc_b64 s[30:31] 1767; 1768; GFX7-LABEL: v_store_global_v64bf16: 1769; GFX7: ; %bb.0: 1770; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1771; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 1772; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 1773; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 1774; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:116 1775; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 1776; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 1777; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 1778; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 1779; GFX7-NEXT: s_mov_b32 s6, 0 1780; GFX7-NEXT: s_mov_b32 s7, 0xf000 1781; GFX7-NEXT: s_mov_b32 s4, s6 1782; GFX7-NEXT: s_mov_b32 s5, s6 1783; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 1784; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 1785; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1786; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 1787; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1788; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 1789; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 1790; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 1791; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 1792; 
GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 1793; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1794; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 1795; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 1796; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1797; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 1798; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29 1799; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 1800; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 1801; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 1802; GFX7-NEXT: s_waitcnt vmcnt(7) 1803; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 1804; GFX7-NEXT: s_waitcnt vmcnt(6) 1805; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 1806; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 1807; GFX7-NEXT: s_waitcnt vmcnt(5) 1808; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 1809; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16 1810; GFX7-NEXT: s_waitcnt vmcnt(3) 1811; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37 1812; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34 1813; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1814; GFX7-NEXT: s_waitcnt vmcnt(2) 1815; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38 1816; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 1817; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16 1818; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16 1819; GFX7-NEXT: s_waitcnt vmcnt(1) 1820; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39 1821; GFX7-NEXT: s_waitcnt vmcnt(0) 1822; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v48 1823; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 1824; GFX7-NEXT: v_alignbit_b32 v33, v31, v32, 16 1825; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 1826; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 1827; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 1828; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92 1829; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 1830; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 1831; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 1832; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 1833; GFX7-NEXT: 
s_waitcnt vmcnt(6) 1834; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112 1835; GFX7-NEXT: s_waitcnt vmcnt(6) 1836; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37 1837; GFX7-NEXT: s_waitcnt vmcnt(5) 1838; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38 1839; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1840; GFX7-NEXT: s_waitcnt vmcnt(4) 1841; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39 1842; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 1843; GFX7-NEXT: s_waitcnt vmcnt(2) 1844; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49 1845; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48 1846; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 1847; GFX7-NEXT: s_waitcnt vmcnt(1) 1848; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50 1849; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1850; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 1851; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 1852; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 1853; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 1854; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 1855; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 1856; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 1857; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 1858; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48 1859; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 1860; GFX7-NEXT: s_waitcnt vmcnt(7) 1861; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 1862; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1863; GFX7-NEXT: s_waitcnt vmcnt(6) 1864; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 1865; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 1866; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 1867; GFX7-NEXT: s_waitcnt vmcnt(3) 1868; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 1869; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 1870; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1871; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 1872; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 1873; 
GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 1874; GFX7-NEXT: s_waitcnt vmcnt(2) 1875; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 1876; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 1877; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1878; GFX7-NEXT: s_waitcnt vmcnt(1) 1879; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 1880; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 1881; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 1882; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 1883; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 1884; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 1885; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 1886; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 1887; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 1888; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 1889; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 1890; GFX7-NEXT: s_waitcnt vmcnt(7) 1891; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 1892; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1893; GFX7-NEXT: s_waitcnt vmcnt(6) 1894; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 1895; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 1896; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80 1897; GFX7-NEXT: s_waitcnt vmcnt(3) 1898; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 1899; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 1900; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1901; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 1902; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 1903; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 1904; GFX7-NEXT: s_waitcnt vmcnt(2) 1905; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 1906; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 1907; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1908; GFX7-NEXT: s_waitcnt vmcnt(1) 1909; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 1910; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 1911; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 1912; GFX7-NEXT: buffer_load_dword v33, off, 
s[0:3], s32 offset:8 1913; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 1914; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 1915; GFX7-NEXT: s_waitcnt vmcnt(2) 1916; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 1917; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 1918; GFX7-NEXT: s_waitcnt vmcnt(1) 1919; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 1920; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 1921; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64 1922; GFX7-NEXT: s_nop 0 1923; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5 1924; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 1925; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 1926; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 1927; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 1928; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1929; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 1930; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 1931; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 1932; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1933; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 1934; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16 1935; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 1936; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 1937; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 1938; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1939; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 1940; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 1941; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21 1942; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 1943; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 1944; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 1945; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 1946; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1947; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 1948; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 1949; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16 1950; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17 1951; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1952; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16 1953; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16 1954; GFX7-NEXT: s_waitcnt vmcnt(1) 1955; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, 
v38 1956; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1957; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 1958; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16 1959; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27 1960; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1961; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26 1962; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16 1963; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16 1964; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25 1965; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1966; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24 1967; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 1968; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 1969; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48 1970; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32 1971; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16 1972; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[31:32], s[4:7], 0 addr64 1973; GFX7-NEXT: s_waitcnt vmcnt(0) 1974; GFX7-NEXT: s_setpc_b64 s[30:31] 1975; 1976; GFX8-LABEL: v_store_global_v64bf16: 1977; GFX8: ; %bb.0: 1978; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1979; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 1980; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 1981; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 1982; GFX8-NEXT: s_movk_i32 s4, 0x70 1983; GFX8-NEXT: s_movk_i32 s5, 0x50 1984; GFX8-NEXT: s_waitcnt vmcnt(2) 1985; GFX8-NEXT: v_add_u32_e32 v34, vcc, s4, v32 1986; GFX8-NEXT: s_waitcnt vmcnt(1) 1987; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc 1988; GFX8-NEXT: s_movk_i32 s4, 0x60 1989; GFX8-NEXT: s_waitcnt vmcnt(0) 1990; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[28:31] 1991; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3] 1992; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v32 1993; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc 1994; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v32 1995; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc 1996; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, 
v32 1997; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v33, vcc 1998; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27] 1999; GFX8-NEXT: s_nop 0 2000; GFX8-NEXT: v_add_u32_e32 v24, vcc, 48, v32 2001; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v33, vcc 2002; GFX8-NEXT: v_add_u32_e32 v26, vcc, 32, v32 2003; GFX8-NEXT: v_addc_u32_e32 v27, vcc, 0, v33, vcc 2004; GFX8-NEXT: v_add_u32_e32 v28, vcc, 16, v32 2005; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc 2006; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] 2007; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] 2008; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[12:15] 2009; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[8:11] 2010; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[4:7] 2011; GFX8-NEXT: s_waitcnt vmcnt(0) 2012; GFX8-NEXT: s_setpc_b64 s[30:31] 2013; 2014; GFX9-LABEL: v_store_global_v64bf16: 2015; GFX9: ; %bb.0: 2016; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2017; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 2018; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 2019; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 2020; GFX9-NEXT: s_waitcnt vmcnt(0) 2021; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 2022; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 2023; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 2024; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 2025; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 2026; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 2027; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 2028; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off 2029; GFX9-NEXT: s_waitcnt vmcnt(0) 2030; GFX9-NEXT: s_setpc_b64 s[30:31] 2031; 2032; GFX10-LABEL: v_store_global_v64bf16: 2033; GFX10: ; %bb.0: 2034; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2035; GFX10-NEXT: s_clause 0x2 2036; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:8 2037; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 2038; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 2039; GFX10-NEXT: s_waitcnt vmcnt(0) 2040; GFX10-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 2041; GFX10-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 2042; GFX10-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 2043; GFX10-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 2044; GFX10-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 2045; GFX10-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 2046; GFX10-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 2047; GFX10-NEXT: global_store_dwordx4 v[32:33], v[0:3], off 2048; GFX10-NEXT: s_setpc_b64 s[30:31] 2049; 2050; GFX11-LABEL: v_store_global_v64bf16: 2051; GFX11: ; %bb.0: 2052; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2053; GFX11-NEXT: s_clause 0x2 2054; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 2055; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 2056; GFX11-NEXT: scratch_load_b32 v31, off, s32 2057; GFX11-NEXT: s_waitcnt vmcnt(0) 2058; GFX11-NEXT: s_clause 0x7 2059; GFX11-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112 2060; GFX11-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96 2061; GFX11-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80 2062; GFX11-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64 2063; GFX11-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48 2064; GFX11-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32 2065; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16 2066; GFX11-NEXT: global_store_b128 v[32:33], v[0:3], off 2067; GFX11-NEXT: s_setpc_b64 s[30:31] 2068 store <64 x bfloat> %val, ptr addrspace(1) %ptr 2069 ret void 2070} 2071 2072define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) { 2073; GCN-LABEL: test_store_fpimm: 2074; GCN: ; %bb.0: 2075; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2076; GCN-NEXT: s_mov_b32 s7, 0xf000 2077; GCN-NEXT: s_mov_b32 s6, 0 2078; GCN-NEXT: v_mov_b32_e32 v4, 0x3f80 2079; GCN-NEXT: v_mov_b32_e32 v5, 0x4228 2080; GCN-NEXT: s_mov_b32 s4, s6 2081; GCN-NEXT: s_mov_b32 s5, s6 2082; GCN-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64 2083; GCN-NEXT: buffer_store_short v5, v[2:3], s[4:7], 0 addr64 2084; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2085; GCN-NEXT: s_setpc_b64 s[30:31] 2086; 2087; GFX7-LABEL: test_store_fpimm: 2088; GFX7: ; %bb.0: 2089; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2090; GFX7-NEXT: s_mov_b32 s6, 0 2091; GFX7-NEXT: s_mov_b32 s7, 0xf000 2092; GFX7-NEXT: s_mov_b32 s4, s6 2093; GFX7-NEXT: s_mov_b32 s5, s6 2094; GFX7-NEXT: v_mov_b32_e32 v4, 0x3f80 2095; GFX7-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64 2096; GFX7-NEXT: v_mov_b32_e32 v0, 0x4228 2097; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 2098; GFX7-NEXT: s_waitcnt vmcnt(0) 2099; GFX7-NEXT: s_setpc_b64 s[30:31] 2100; 2101; GFX8-LABEL: test_store_fpimm: 2102; GFX8: ; %bb.0: 2103; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2104; GFX8-NEXT: v_mov_b32_e32 v4, 0x3f80 2105; GFX8-NEXT: flat_store_short v[0:1], v4 2106; GFX8-NEXT: v_mov_b32_e32 v0, 0x4228 2107; GFX8-NEXT: flat_store_short v[2:3], v0 2108; GFX8-NEXT: s_waitcnt vmcnt(0) 2109; GFX8-NEXT: s_setpc_b64 s[30:31] 2110; 2111; GFX9-LABEL: test_store_fpimm: 2112; GFX9: ; %bb.0: 2113; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2114; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f80 2115; GFX9-NEXT: global_store_short v[0:1], v4, off 2116; GFX9-NEXT: v_mov_b32_e32 v0, 0x4228 2117; GFX9-NEXT: global_store_short v[2:3], v0, off 2118; GFX9-NEXT: s_waitcnt vmcnt(0) 2119; GFX9-NEXT: s_setpc_b64 s[30:31] 2120; 2121; GFX10-LABEL: test_store_fpimm: 2122; GFX10: ; %bb.0: 2123; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2124; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 2125; GFX10-NEXT: v_mov_b32_e32 v5, 0x4228 2126; GFX10-NEXT: 
global_store_short v[0:1], v4, off 2127; GFX10-NEXT: global_store_short v[2:3], v5, off 2128; GFX10-NEXT: s_setpc_b64 s[30:31] 2129; 2130; GFX11-LABEL: test_store_fpimm: 2131; GFX11: ; %bb.0: 2132; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2133; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80 2134; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228 2135; GFX11-NEXT: global_store_b16 v[0:1], v4, off 2136; GFX11-NEXT: global_store_b16 v[2:3], v5, off 2137; GFX11-NEXT: s_setpc_b64 s[30:31] 2138 store bfloat 1.0, ptr addrspace(1) %ptr0 2139 store bfloat 42.0, ptr addrspace(1) %ptr1 2140 ret void 2141} 2142 2143define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { 2144; GCN-LABEL: test_load_store_f32_to_bf16: 2145; GCN: ; %bb.0: 2146; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2147; GCN-NEXT: s_mov_b32 s6, 0 2148; GCN-NEXT: s_mov_b32 s7, 0xf000 2149; GCN-NEXT: s_mov_b32 s4, s6 2150; GCN-NEXT: s_mov_b32 s5, s6 2151; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2152; GCN-NEXT: s_waitcnt vmcnt(0) 2153; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 2154; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2155; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 2156; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2157; GCN-NEXT: s_setpc_b64 s[30:31] 2158; 2159; GFX7-LABEL: test_load_store_f32_to_bf16: 2160; GFX7: ; %bb.0: 2161; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2162; GFX7-NEXT: s_mov_b32 s6, 0 2163; GFX7-NEXT: s_mov_b32 s7, 0xf000 2164; GFX7-NEXT: s_mov_b32 s4, s6 2165; GFX7-NEXT: s_mov_b32 s5, s6 2166; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2167; GFX7-NEXT: s_waitcnt vmcnt(0) 2168; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 2169; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2170; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 2171; GFX7-NEXT: s_waitcnt vmcnt(0) 2172; GFX7-NEXT: s_setpc_b64 s[30:31] 2173; 2174; GFX8-LABEL: test_load_store_f32_to_bf16: 2175; GFX8: ; %bb.0: 2176; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 2177; GFX8-NEXT: flat_load_dword v0, v[0:1] 2178; GFX8-NEXT: s_waitcnt vmcnt(0) 2179; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 2180; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 2181; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 2182; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 2183; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 2184; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc 2185; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2186; GFX8-NEXT: flat_store_short v[2:3], v0 2187; GFX8-NEXT: s_waitcnt vmcnt(0) 2188; GFX8-NEXT: s_setpc_b64 s[30:31] 2189; 2190; GFX9-LABEL: test_load_store_f32_to_bf16: 2191; GFX9: ; %bb.0: 2192; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2193; GFX9-NEXT: global_load_dword v0, v[0:1], off 2194; GFX9-NEXT: s_movk_i32 s4, 0x7fff 2195; GFX9-NEXT: s_waitcnt vmcnt(0) 2196; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 2197; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 2198; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 2199; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 2200; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc 2201; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off 2202; GFX9-NEXT: s_waitcnt vmcnt(0) 2203; GFX9-NEXT: s_setpc_b64 s[30:31] 2204; 2205; GFX10-LABEL: test_load_store_f32_to_bf16: 2206; GFX10: ; %bb.0: 2207; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2208; GFX10-NEXT: global_load_dword v0, v[0:1], off 2209; GFX10-NEXT: s_waitcnt vmcnt(0) 2210; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 2211; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 2212; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 2213; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 2214; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo 2215; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off 2216; GFX10-NEXT: s_setpc_b64 s[30:31] 2217; 2218; GFX11-LABEL: test_load_store_f32_to_bf16: 2219; GFX11: ; %bb.0: 2220; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2221; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2222; GFX11-NEXT: s_waitcnt vmcnt(0) 2223; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 2224; GFX11-NEXT: 
v_or_b32_e32 v4, 0x400000, v0 2225; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 2226; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2227; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 2228; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo 2229; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off 2230; GFX11-NEXT: s_setpc_b64 s[30:31] 2231 %val = load float, ptr addrspace(1) %in 2232 %val.bf16 = fptrunc float %val to bfloat 2233 store bfloat %val.bf16, ptr addrspace(1) %out 2234 ret void 2235} 2236 2237define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { 2238; GCN-LABEL: test_load_store_f64_to_bf16: 2239; GCN: ; %bb.0: 2240; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2241; GCN-NEXT: s_mov_b32 s6, 0 2242; GCN-NEXT: s_mov_b32 s7, 0xf000 2243; GCN-NEXT: s_mov_b32 s4, s6 2244; GCN-NEXT: s_mov_b32 s5, s6 2245; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 2246; GCN-NEXT: s_waitcnt vmcnt(0) 2247; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] 2248; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2249; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 2250; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2251; GCN-NEXT: s_setpc_b64 s[30:31] 2252; 2253; GFX7-LABEL: test_load_store_f64_to_bf16: 2254; GFX7: ; %bb.0: 2255; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2256; GFX7-NEXT: s_mov_b32 s6, 0 2257; GFX7-NEXT: s_mov_b32 s7, 0xf000 2258; GFX7-NEXT: s_mov_b32 s4, s6 2259; GFX7-NEXT: s_mov_b32 s5, s6 2260; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 2261; GFX7-NEXT: s_waitcnt vmcnt(0) 2262; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] 2263; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2264; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 2265; GFX7-NEXT: s_waitcnt vmcnt(0) 2266; GFX7-NEXT: s_setpc_b64 s[30:31] 2267; 2268; GFX8-LABEL: test_load_store_f64_to_bf16: 2269; GFX8: ; %bb.0: 2270; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2271; GFX8-NEXT: flat_load_dwordx2 
v[0:1], v[0:1] 2272; GFX8-NEXT: s_waitcnt vmcnt(0) 2273; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| 2274; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1 2275; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 2276; GFX8-NEXT: v_and_b32_e32 v8, 1, v6 2277; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 2278; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, v[4:5] 2279; GFX8-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5] 2280; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[4:5] 2281; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v6, v4 2282; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc 2283; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 2284; GFX8-NEXT: v_or_b32_e32 v5, v4, v7 2285; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1 2286; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 2287; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 2288; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] 2289; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 2290; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 2291; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2292; GFX8-NEXT: flat_store_short v[2:3], v0 2293; GFX8-NEXT: s_waitcnt vmcnt(0) 2294; GFX8-NEXT: s_setpc_b64 s[30:31] 2295; 2296; GFX9-LABEL: test_load_store_f64_to_bf16: 2297; GFX9: ; %bb.0: 2298; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2299; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 2300; GFX9-NEXT: s_brev_b32 s8, 1 2301; GFX9-NEXT: s_movk_i32 s9, 0x7fff 2302; GFX9-NEXT: s_waitcnt vmcnt(0) 2303; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| 2304; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 2305; GFX9-NEXT: v_and_b32_e32 v7, 1, v6 2306; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 2307; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5] 2308; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5] 2309; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] 2310; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 2311; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc 2312; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 2313; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] 2314; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4 2315; GFX9-NEXT: 
v_bfe_u32 v4, v4, 16, 1 2316; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9 2317; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 2318; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 2319; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off 2320; GFX9-NEXT: s_waitcnt vmcnt(0) 2321; GFX9-NEXT: s_setpc_b64 s[30:31] 2322; 2323; GFX10-LABEL: test_load_store_f64_to_bf16: 2324; GFX10: ; %bb.0: 2325; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2326; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 2327; GFX10-NEXT: s_waitcnt vmcnt(0) 2328; GFX10-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| 2329; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 2330; GFX10-NEXT: v_and_b32_e32 v7, 1, v6 2331; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 2332; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5] 2333; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5] 2334; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5 2335; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo 2336; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 2337; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo 2338; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] 2339; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4 2340; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1 2341; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff 2342; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5 2343; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo 2344; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off 2345; GFX10-NEXT: s_setpc_b64 s[30:31] 2346; 2347; GFX11-LABEL: test_load_store_f64_to_bf16: 2348; GFX11: ; %bb.0: 2349; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2350; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 2351; GFX11-NEXT: s_waitcnt vmcnt(0) 2352; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| 2353; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 2354; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 2355; GFX11-NEXT: v_and_b32_e32 v7, 1, v6 2356; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 2357; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) 
| instid1(VALU_DEP_2) 2358; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5] 2359; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5] 2360; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1 2361; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2362; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo 2363; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4 2364; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2365; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo 2366; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] 2367; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4 2368; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1 2369; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 2370; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff 2371; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5 2372; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo 2373; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off 2374; GFX11-NEXT: s_setpc_b64 s[30:31] 2375 %val = load double, ptr addrspace(1) %in 2376 %val.bf16 = fptrunc double %val to bfloat 2377 store bfloat %val.bf16, ptr addrspace(1) %out 2378 ret void 2379} 2380 2381define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { 2382; GCN-LABEL: test_load_store_bf16_to_f32: 2383; GCN: ; %bb.0: 2384; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2385; GCN-NEXT: s_mov_b32 s6, 0 2386; GCN-NEXT: s_mov_b32 s7, 0xf000 2387; GCN-NEXT: s_mov_b32 s4, s6 2388; GCN-NEXT: s_mov_b32 s5, s6 2389; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 2390; GCN-NEXT: s_waitcnt vmcnt(0) 2391; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2392; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 2393; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2394; GCN-NEXT: s_setpc_b64 s[30:31] 2395; 2396; GFX7-LABEL: test_load_store_bf16_to_f32: 2397; GFX7: ; %bb.0: 2398; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2399; GFX7-NEXT: s_mov_b32 s6, 0 2400; GFX7-NEXT: 
s_mov_b32 s7, 0xf000 2401; GFX7-NEXT: s_mov_b32 s4, s6 2402; GFX7-NEXT: s_mov_b32 s5, s6 2403; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 2404; GFX7-NEXT: s_waitcnt vmcnt(0) 2405; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2406; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 2407; GFX7-NEXT: s_waitcnt vmcnt(0) 2408; GFX7-NEXT: s_setpc_b64 s[30:31] 2409; 2410; GFX8-LABEL: test_load_store_bf16_to_f32: 2411; GFX8: ; %bb.0: 2412; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2413; GFX8-NEXT: flat_load_ushort v0, v[0:1] 2414; GFX8-NEXT: s_waitcnt vmcnt(0) 2415; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2416; GFX8-NEXT: flat_store_dword v[2:3], v0 2417; GFX8-NEXT: s_waitcnt vmcnt(0) 2418; GFX8-NEXT: s_setpc_b64 s[30:31] 2419; 2420; GFX9-LABEL: test_load_store_bf16_to_f32: 2421; GFX9: ; %bb.0: 2422; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2423; GFX9-NEXT: global_load_ushort v0, v[0:1], off 2424; GFX9-NEXT: s_waitcnt vmcnt(0) 2425; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2426; GFX9-NEXT: global_store_dword v[2:3], v0, off 2427; GFX9-NEXT: s_waitcnt vmcnt(0) 2428; GFX9-NEXT: s_setpc_b64 s[30:31] 2429; 2430; GFX10-LABEL: test_load_store_bf16_to_f32: 2431; GFX10: ; %bb.0: 2432; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2433; GFX10-NEXT: global_load_ushort v0, v[0:1], off 2434; GFX10-NEXT: s_waitcnt vmcnt(0) 2435; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2436; GFX10-NEXT: global_store_dword v[2:3], v0, off 2437; GFX10-NEXT: s_setpc_b64 s[30:31] 2438; 2439; GFX11-LABEL: test_load_store_bf16_to_f32: 2440; GFX11: ; %bb.0: 2441; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2442; GFX11-NEXT: global_load_u16 v0, v[0:1], off 2443; GFX11-NEXT: s_waitcnt vmcnt(0) 2444; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2445; GFX11-NEXT: global_store_b32 v[2:3], v0, off 2446; GFX11-NEXT: s_setpc_b64 s[30:31] 2447 %val = load bfloat, ptr addrspace(1) %in 2448 %val.f32 = fpext bfloat %val to float 2449 store float %val.f32, ptr addrspace(1) 
%out 2450 ret void 2451} 2452 2453define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) { 2454; GCN-LABEL: test_load_store_bf16_to_f64: 2455; GCN: ; %bb.0: 2456; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2457; GCN-NEXT: s_mov_b32 s6, 0 2458; GCN-NEXT: s_mov_b32 s7, 0xf000 2459; GCN-NEXT: s_mov_b32 s4, s6 2460; GCN-NEXT: s_mov_b32 s5, s6 2461; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 2462; GCN-NEXT: s_waitcnt vmcnt(0) 2463; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2464; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2465; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 2466; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2467; GCN-NEXT: s_setpc_b64 s[30:31] 2468; 2469; GFX7-LABEL: test_load_store_bf16_to_f64: 2470; GFX7: ; %bb.0: 2471; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2472; GFX7-NEXT: s_mov_b32 s6, 0 2473; GFX7-NEXT: s_mov_b32 s7, 0xf000 2474; GFX7-NEXT: s_mov_b32 s4, s6 2475; GFX7-NEXT: s_mov_b32 s5, s6 2476; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 2477; GFX7-NEXT: s_waitcnt vmcnt(0) 2478; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2479; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2480; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 2481; GFX7-NEXT: s_waitcnt vmcnt(0) 2482; GFX7-NEXT: s_setpc_b64 s[30:31] 2483; 2484; GFX8-LABEL: test_load_store_bf16_to_f64: 2485; GFX8: ; %bb.0: 2486; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2487; GFX8-NEXT: flat_load_ushort v0, v[0:1] 2488; GFX8-NEXT: s_waitcnt vmcnt(0) 2489; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2490; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2491; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2492; GFX8-NEXT: s_waitcnt vmcnt(0) 2493; GFX8-NEXT: s_setpc_b64 s[30:31] 2494; 2495; GFX9-LABEL: test_load_store_bf16_to_f64: 2496; GFX9: ; %bb.0: 2497; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2498; GFX9-NEXT: global_load_ushort v0, v[0:1], off 2499; GFX9-NEXT: s_waitcnt vmcnt(0) 2500; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 2501; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2502; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 2503; GFX9-NEXT: s_waitcnt vmcnt(0) 2504; GFX9-NEXT: s_setpc_b64 s[30:31] 2505; 2506; GFX10-LABEL: test_load_store_bf16_to_f64: 2507; GFX10: ; %bb.0: 2508; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2509; GFX10-NEXT: global_load_ushort v0, v[0:1], off 2510; GFX10-NEXT: s_waitcnt vmcnt(0) 2511; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2512; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2513; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 2514; GFX10-NEXT: s_setpc_b64 s[30:31] 2515; 2516; GFX11-LABEL: test_load_store_bf16_to_f64: 2517; GFX11: ; %bb.0: 2518; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2519; GFX11-NEXT: global_load_u16 v0, v[0:1], off 2520; GFX11-NEXT: s_waitcnt vmcnt(0) 2521; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2522; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2523; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 2524; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 2525; GFX11-NEXT: s_setpc_b64 s[30:31] 2526 %val = load bfloat, ptr addrspace(1) %in 2527 %val.f64 = fpext bfloat %val to double 2528 store double %val.f64, ptr addrspace(1) %out 2529 ret void 2530} 2531 2532define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { 2533; GCN-LABEL: test_load_store_v2bf16: 2534; GCN: ; %bb.0: 2535; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2536; GCN-NEXT: s_mov_b32 s6, 0 2537; GCN-NEXT: s_mov_b32 s7, 0xf000 2538; GCN-NEXT: s_mov_b32 s4, s6 2539; GCN-NEXT: s_mov_b32 s5, s6 2540; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2541; GCN-NEXT: s_waitcnt vmcnt(0) 2542; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 2543; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2544; GCN-NEXT: s_setpc_b64 s[30:31] 2545; 2546; GFX7-LABEL: test_load_store_v2bf16: 2547; GFX7: ; %bb.0: 2548; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2549; GFX7-NEXT: s_mov_b32 s6, 0 2550; 
GFX7-NEXT: s_mov_b32 s7, 0xf000 2551; GFX7-NEXT: s_mov_b32 s4, s6 2552; GFX7-NEXT: s_mov_b32 s5, s6 2553; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2554; GFX7-NEXT: s_waitcnt vmcnt(0) 2555; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 2556; GFX7-NEXT: s_waitcnt vmcnt(0) 2557; GFX7-NEXT: s_setpc_b64 s[30:31] 2558; 2559; GFX8-LABEL: test_load_store_v2bf16: 2560; GFX8: ; %bb.0: 2561; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2562; GFX8-NEXT: flat_load_dword v0, v[0:1] 2563; GFX8-NEXT: s_waitcnt vmcnt(0) 2564; GFX8-NEXT: flat_store_dword v[2:3], v0 2565; GFX8-NEXT: s_waitcnt vmcnt(0) 2566; GFX8-NEXT: s_setpc_b64 s[30:31] 2567; 2568; GFX9-LABEL: test_load_store_v2bf16: 2569; GFX9: ; %bb.0: 2570; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2571; GFX9-NEXT: global_load_dword v0, v[0:1], off 2572; GFX9-NEXT: s_waitcnt vmcnt(0) 2573; GFX9-NEXT: global_store_dword v[2:3], v0, off 2574; GFX9-NEXT: s_waitcnt vmcnt(0) 2575; GFX9-NEXT: s_setpc_b64 s[30:31] 2576; 2577; GFX10-LABEL: test_load_store_v2bf16: 2578; GFX10: ; %bb.0: 2579; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2580; GFX10-NEXT: global_load_dword v0, v[0:1], off 2581; GFX10-NEXT: s_waitcnt vmcnt(0) 2582; GFX10-NEXT: global_store_dword v[2:3], v0, off 2583; GFX10-NEXT: s_setpc_b64 s[30:31] 2584; 2585; GFX11-LABEL: test_load_store_v2bf16: 2586; GFX11: ; %bb.0: 2587; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2588; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2589; GFX11-NEXT: s_waitcnt vmcnt(0) 2590; GFX11-NEXT: global_store_b32 v[2:3], v0, off 2591; GFX11-NEXT: s_setpc_b64 s[30:31] 2592 %val = load <2 x bfloat>, ptr addrspace(1) %in 2593 store <2 x bfloat> %val, ptr addrspace(1) %out 2594 ret void 2595} 2596 2597define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { 2598; GCN-LABEL: test_load_store_v4bf16: 2599; GCN: ; %bb.0: 2600; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2601; GCN-NEXT: s_mov_b32 s6, 0 2602; GCN-NEXT: 
s_mov_b32 s7, 0xf000 2603; GCN-NEXT: s_mov_b32 s4, s6 2604; GCN-NEXT: s_mov_b32 s5, s6 2605; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 2606; GCN-NEXT: s_waitcnt vmcnt(0) 2607; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 2608; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2609; GCN-NEXT: s_setpc_b64 s[30:31] 2610; 2611; GFX7-LABEL: test_load_store_v4bf16: 2612; GFX7: ; %bb.0: 2613; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2614; GFX7-NEXT: s_mov_b32 s6, 0 2615; GFX7-NEXT: s_mov_b32 s7, 0xf000 2616; GFX7-NEXT: s_mov_b32 s4, s6 2617; GFX7-NEXT: s_mov_b32 s5, s6 2618; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 2619; GFX7-NEXT: s_waitcnt vmcnt(0) 2620; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 2621; GFX7-NEXT: s_waitcnt vmcnt(0) 2622; GFX7-NEXT: s_setpc_b64 s[30:31] 2623; 2624; GFX8-LABEL: test_load_store_v4bf16: 2625; GFX8: ; %bb.0: 2626; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2627; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2628; GFX8-NEXT: s_waitcnt vmcnt(0) 2629; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2630; GFX8-NEXT: s_waitcnt vmcnt(0) 2631; GFX8-NEXT: s_setpc_b64 s[30:31] 2632; 2633; GFX9-LABEL: test_load_store_v4bf16: 2634; GFX9: ; %bb.0: 2635; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2636; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 2637; GFX9-NEXT: s_waitcnt vmcnt(0) 2638; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 2639; GFX9-NEXT: s_waitcnt vmcnt(0) 2640; GFX9-NEXT: s_setpc_b64 s[30:31] 2641; 2642; GFX10-LABEL: test_load_store_v4bf16: 2643; GFX10: ; %bb.0: 2644; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2645; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 2646; GFX10-NEXT: s_waitcnt vmcnt(0) 2647; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 2648; GFX10-NEXT: s_setpc_b64 s[30:31] 2649; 2650; GFX11-LABEL: test_load_store_v4bf16: 2651; GFX11: ; %bb.0: 2652; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
2653; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 2654; GFX11-NEXT: s_waitcnt vmcnt(0) 2655; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 2656; GFX11-NEXT: s_setpc_b64 s[30:31] 2657 %val = load <4 x bfloat>, ptr addrspace(1) %in 2658 store <4 x bfloat> %val, ptr addrspace(1) %out 2659 ret void 2660} 2661 2662define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { 2663; GCN-LABEL: test_load_store_v8bf16: 2664; GCN: ; %bb.0: 2665; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2666; GCN-NEXT: s_mov_b32 s6, 0 2667; GCN-NEXT: s_mov_b32 s7, 0xf000 2668; GCN-NEXT: s_mov_b32 s4, s6 2669; GCN-NEXT: s_mov_b32 s5, s6 2670; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 2671; GCN-NEXT: s_waitcnt vmcnt(0) 2672; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 2673; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2674; GCN-NEXT: s_setpc_b64 s[30:31] 2675; 2676; GFX7-LABEL: test_load_store_v8bf16: 2677; GFX7: ; %bb.0: 2678; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2679; GFX7-NEXT: s_mov_b32 s6, 0 2680; GFX7-NEXT: s_mov_b32 s7, 0xf000 2681; GFX7-NEXT: s_mov_b32 s4, s6 2682; GFX7-NEXT: s_mov_b32 s5, s6 2683; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 2684; GFX7-NEXT: s_waitcnt vmcnt(0) 2685; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 2686; GFX7-NEXT: s_waitcnt vmcnt(0) 2687; GFX7-NEXT: s_setpc_b64 s[30:31] 2688; 2689; GFX8-LABEL: test_load_store_v8bf16: 2690; GFX8: ; %bb.0: 2691; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2692; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 2693; GFX8-NEXT: s_waitcnt vmcnt(0) 2694; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] 2695; GFX8-NEXT: s_waitcnt vmcnt(0) 2696; GFX8-NEXT: s_setpc_b64 s[30:31] 2697; 2698; GFX9-LABEL: test_load_store_v8bf16: 2699; GFX9: ; %bb.0: 2700; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2701; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 2702; GFX9-NEXT: s_waitcnt vmcnt(0) 2703; GFX9-NEXT: 
global_store_dwordx4 v[2:3], v[4:7], off 2704; GFX9-NEXT: s_waitcnt vmcnt(0) 2705; GFX9-NEXT: s_setpc_b64 s[30:31] 2706; 2707; GFX10-LABEL: test_load_store_v8bf16: 2708; GFX10: ; %bb.0: 2709; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2710; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 2711; GFX10-NEXT: s_waitcnt vmcnt(0) 2712; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off 2713; GFX10-NEXT: s_setpc_b64 s[30:31] 2714; 2715; GFX11-LABEL: test_load_store_v8bf16: 2716; GFX11: ; %bb.0: 2717; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2718; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off 2719; GFX11-NEXT: s_waitcnt vmcnt(0) 2720; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off 2721; GFX11-NEXT: s_setpc_b64 s[30:31] 2722 %val = load <8 x bfloat>, ptr addrspace(1) %in 2723 store <8 x bfloat> %val, ptr addrspace(1) %out 2724 ret void 2725} 2726 2727define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { 2728; GCN-LABEL: test_load_store_v16bf16: 2729; GCN: ; %bb.0: 2730; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2731; GCN-NEXT: s_mov_b32 s6, 0 2732; GCN-NEXT: s_mov_b32 s7, 0xf000 2733; GCN-NEXT: s_mov_b32 s4, s6 2734; GCN-NEXT: s_mov_b32 s5, s6 2735; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 2736; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 2737; GCN-NEXT: s_waitcnt vmcnt(1) 2738; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 2739; GCN-NEXT: s_waitcnt vmcnt(1) 2740; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 2741; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2742; GCN-NEXT: s_setpc_b64 s[30:31] 2743; 2744; GFX7-LABEL: test_load_store_v16bf16: 2745; GFX7: ; %bb.0: 2746; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2747; GFX7-NEXT: s_mov_b32 s6, 0 2748; GFX7-NEXT: s_mov_b32 s7, 0xf000 2749; GFX7-NEXT: s_mov_b32 s4, s6 2750; GFX7-NEXT: s_mov_b32 s5, s6 2751; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], 
s[4:7], 0 addr64 offset:16 2752; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 2753; GFX7-NEXT: s_waitcnt vmcnt(1) 2754; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 2755; GFX7-NEXT: s_waitcnt vmcnt(1) 2756; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 2757; GFX7-NEXT: s_waitcnt vmcnt(0) 2758; GFX7-NEXT: s_setpc_b64 s[30:31] 2759; 2760; GFX8-LABEL: test_load_store_v16bf16: 2761; GFX8: ; %bb.0: 2762; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2763; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v0 2764; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc 2765; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 2766; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 2767; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 2768; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 2769; GFX8-NEXT: s_waitcnt vmcnt(1) 2770; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] 2771; GFX8-NEXT: s_waitcnt vmcnt(1) 2772; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 2773; GFX8-NEXT: s_waitcnt vmcnt(0) 2774; GFX8-NEXT: s_setpc_b64 s[30:31] 2775; 2776; GFX9-LABEL: test_load_store_v16bf16: 2777; GFX9: ; %bb.0: 2778; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2779; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 2780; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off 2781; GFX9-NEXT: s_waitcnt vmcnt(1) 2782; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 2783; GFX9-NEXT: s_waitcnt vmcnt(1) 2784; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off 2785; GFX9-NEXT: s_waitcnt vmcnt(0) 2786; GFX9-NEXT: s_setpc_b64 s[30:31] 2787; 2788; GFX10-LABEL: test_load_store_v16bf16: 2789; GFX10: ; %bb.0: 2790; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2791; GFX10-NEXT: s_clause 0x1 2792; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 2793; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off 2794; GFX10-NEXT: s_waitcnt vmcnt(1) 2795; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 2796; 
GFX10-NEXT: s_waitcnt vmcnt(0) 2797; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off 2798; GFX10-NEXT: s_setpc_b64 s[30:31] 2799; 2800; GFX11-LABEL: test_load_store_v16bf16: 2801; GFX11: ; %bb.0: 2802; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2803; GFX11-NEXT: s_clause 0x1 2804; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 2805; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off 2806; GFX11-NEXT: s_waitcnt vmcnt(1) 2807; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16 2808; GFX11-NEXT: s_waitcnt vmcnt(0) 2809; GFX11-NEXT: global_store_b128 v[2:3], v[8:11], off 2810; GFX11-NEXT: s_setpc_b64 s[30:31] 2811 %val = load <16 x bfloat>, ptr addrspace(1) %in 2812 store <16 x bfloat> %val, ptr addrspace(1) %out 2813 ret void 2814} 2815 2816define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { 2817; GCN-LABEL: test_arg_store: 2818; GCN: ; %bb.0: 2819; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2820; GCN-NEXT: s_mov_b32 s7, 0xf000 2821; GCN-NEXT: s_mov_b32 s6, 0 2822; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 2823; GCN-NEXT: s_mov_b32 s4, s6 2824; GCN-NEXT: s_mov_b32 s5, s6 2825; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2826; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 2827; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2828; GCN-NEXT: s_setpc_b64 s[30:31] 2829; 2830; GFX7-LABEL: test_arg_store: 2831; GFX7: ; %bb.0: 2832; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2833; GFX7-NEXT: s_mov_b32 s6, 0 2834; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 2835; GFX7-NEXT: s_mov_b32 s7, 0xf000 2836; GFX7-NEXT: s_mov_b32 s4, s6 2837; GFX7-NEXT: s_mov_b32 s5, s6 2838; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2839; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 2840; GFX7-NEXT: s_waitcnt vmcnt(0) 2841; GFX7-NEXT: s_setpc_b64 s[30:31] 2842; 2843; GFX8-LABEL: test_arg_store: 2844; GFX8: ; %bb.0: 2845; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2846; GFX8-NEXT: flat_store_short v[1:2], v0 2847; GFX8-NEXT: s_waitcnt 
vmcnt(0) 2848; GFX8-NEXT: s_setpc_b64 s[30:31] 2849; 2850; GFX9-LABEL: test_arg_store: 2851; GFX9: ; %bb.0: 2852; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2853; GFX9-NEXT: global_store_short v[1:2], v0, off 2854; GFX9-NEXT: s_waitcnt vmcnt(0) 2855; GFX9-NEXT: s_setpc_b64 s[30:31] 2856; 2857; GFX10-LABEL: test_arg_store: 2858; GFX10: ; %bb.0: 2859; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2860; GFX10-NEXT: global_store_short v[1:2], v0, off 2861; GFX10-NEXT: s_setpc_b64 s[30:31] 2862; 2863; GFX11-LABEL: test_arg_store: 2864; GFX11: ; %bb.0: 2865; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2866; GFX11-NEXT: global_store_b16 v[1:2], v0, off 2867; GFX11-NEXT: s_setpc_b64 s[30:31] 2868 store bfloat %in, ptr addrspace(1) %out 2869 ret void 2870} 2871 2872define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { 2873; GCN-LABEL: test_arg_store_v2bf16: 2874; GCN: ; %bb.0: 2875; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2876; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 2877; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 2878; GCN-NEXT: s_mov_b32 s6, 0 2879; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2880; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 2881; GCN-NEXT: s_mov_b32 s7, 0xf000 2882; GCN-NEXT: s_mov_b32 s4, s6 2883; GCN-NEXT: s_mov_b32 s5, s6 2884; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 2885; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2886; GCN-NEXT: s_setpc_b64 s[30:31] 2887; 2888; GFX7-LABEL: test_arg_store_v2bf16: 2889; GFX7: ; %bb.0: 2890; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2891; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 2892; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2893; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 2894; GFX7-NEXT: s_mov_b32 s6, 0 2895; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 2896; GFX7-NEXT: s_mov_b32 s7, 0xf000 2897; GFX7-NEXT: s_mov_b32 s4, s6 2898; GFX7-NEXT: s_mov_b32 s5, s6 2899; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 2900; GFX7-NEXT: s_waitcnt vmcnt(0) 2901; GFX7-NEXT: 
s_setpc_b64 s[30:31] 2902; 2903; GFX8-LABEL: test_arg_store_v2bf16: 2904; GFX8: ; %bb.0: 2905; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2906; GFX8-NEXT: flat_store_dword v[1:2], v0 2907; GFX8-NEXT: s_waitcnt vmcnt(0) 2908; GFX8-NEXT: s_setpc_b64 s[30:31] 2909; 2910; GFX9-LABEL: test_arg_store_v2bf16: 2911; GFX9: ; %bb.0: 2912; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2913; GFX9-NEXT: global_store_dword v[1:2], v0, off 2914; GFX9-NEXT: s_waitcnt vmcnt(0) 2915; GFX9-NEXT: s_setpc_b64 s[30:31] 2916; 2917; GFX10-LABEL: test_arg_store_v2bf16: 2918; GFX10: ; %bb.0: 2919; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2920; GFX10-NEXT: global_store_dword v[1:2], v0, off 2921; GFX10-NEXT: s_setpc_b64 s[30:31] 2922; 2923; GFX11-LABEL: test_arg_store_v2bf16: 2924; GFX11: ; %bb.0: 2925; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2926; GFX11-NEXT: global_store_b32 v[1:2], v0, off 2927; GFX11-NEXT: s_setpc_b64 s[30:31] 2928 store <2 x bfloat> %in, ptr addrspace(1) %out 2929 ret void 2930} 2931 2932define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { 2933; GCN-LABEL: test_arg_store_v3bf16: 2934; GCN: ; %bb.0: 2935; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2936; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 2937; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 2938; GCN-NEXT: s_mov_b32 s7, 0xf000 2939; GCN-NEXT: s_mov_b32 s6, 0 2940; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 2941; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2942; GCN-NEXT: s_mov_b32 s4, s6 2943; GCN-NEXT: s_mov_b32 s5, s6 2944; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 2945; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 2946; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 2947; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 2948; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2949; GCN-NEXT: s_setpc_b64 s[30:31] 2950; 2951; GFX7-LABEL: test_arg_store_v3bf16: 2952; GFX7: ; %bb.0: 2953; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2954; GFX7-NEXT: v_mul_f32_e32 
v1, 1.0, v1 2955; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2956; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 2957; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 2958; GFX7-NEXT: s_mov_b32 s6, 0 2959; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 2960; GFX7-NEXT: s_mov_b32 s7, 0xf000 2961; GFX7-NEXT: s_mov_b32 s4, s6 2962; GFX7-NEXT: s_mov_b32 s5, s6 2963; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2964; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4 2965; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 2966; GFX7-NEXT: s_waitcnt vmcnt(0) 2967; GFX7-NEXT: s_setpc_b64 s[30:31] 2968; 2969; GFX8-LABEL: test_arg_store_v3bf16: 2970; GFX8: ; %bb.0: 2971; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2972; GFX8-NEXT: flat_store_dword v[2:3], v0 2973; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2 2974; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2975; GFX8-NEXT: flat_store_short v[2:3], v1 2976; GFX8-NEXT: s_waitcnt vmcnt(0) 2977; GFX8-NEXT: s_setpc_b64 s[30:31] 2978; 2979; GFX9-LABEL: test_arg_store_v3bf16: 2980; GFX9: ; %bb.0: 2981; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2982; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4 2983; GFX9-NEXT: global_store_dword v[2:3], v0, off 2984; GFX9-NEXT: s_waitcnt vmcnt(0) 2985; GFX9-NEXT: s_setpc_b64 s[30:31] 2986; 2987; GFX10-LABEL: test_arg_store_v3bf16: 2988; GFX10: ; %bb.0: 2989; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2990; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4 2991; GFX10-NEXT: global_store_dword v[2:3], v0, off 2992; GFX10-NEXT: s_setpc_b64 s[30:31] 2993; 2994; GFX11-LABEL: test_arg_store_v3bf16: 2995; GFX11: ; %bb.0: 2996; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2997; GFX11-NEXT: s_clause 0x1 2998; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4 2999; GFX11-NEXT: global_store_b32 v[2:3], v0, off 3000; GFX11-NEXT: s_setpc_b64 s[30:31] 3001 store <3 x bfloat> %in, ptr addrspace(1) %out 3002 ret void 3003} 3004 3005define void @test_arg_store_v4bf16(<4 x 
bfloat> %in, ptr addrspace(1) %out) { 3006; GCN-LABEL: test_arg_store_v4bf16: 3007; GCN: ; %bb.0: 3008; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3009; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 3010; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 3011; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 3012; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 3013; GCN-NEXT: s_mov_b32 s6, 0 3014; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 3015; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 3016; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 3017; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 3018; GCN-NEXT: s_mov_b32 s7, 0xf000 3019; GCN-NEXT: s_mov_b32 s4, s6 3020; GCN-NEXT: s_mov_b32 s5, s6 3021; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 3022; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3023; GCN-NEXT: s_setpc_b64 s[30:31] 3024; 3025; GFX7-LABEL: test_arg_store_v4bf16: 3026; GFX7: ; %bb.0: 3027; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3028; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 3029; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 3030; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 3031; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 3032; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3033; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 3034; GFX7-NEXT: s_mov_b32 s6, 0 3035; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 3036; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 3037; GFX7-NEXT: s_mov_b32 s7, 0xf000 3038; GFX7-NEXT: s_mov_b32 s4, s6 3039; GFX7-NEXT: s_mov_b32 s5, s6 3040; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64 3041; GFX7-NEXT: s_waitcnt vmcnt(0) 3042; GFX7-NEXT: s_setpc_b64 s[30:31] 3043; 3044; GFX8-LABEL: test_arg_store_v4bf16: 3045; GFX8: ; %bb.0: 3046; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3047; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 3048; GFX8-NEXT: s_waitcnt vmcnt(0) 3049; GFX8-NEXT: s_setpc_b64 s[30:31] 3050; 3051; GFX9-LABEL: test_arg_store_v4bf16: 3052; GFX9: ; %bb.0: 3053; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3054; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 3055; 
GFX9-NEXT: s_waitcnt vmcnt(0) 3056; GFX9-NEXT: s_setpc_b64 s[30:31] 3057; 3058; GFX10-LABEL: test_arg_store_v4bf16: 3059; GFX10: ; %bb.0: 3060; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3061; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 3062; GFX10-NEXT: s_setpc_b64 s[30:31] 3063; 3064; GFX11-LABEL: test_arg_store_v4bf16: 3065; GFX11: ; %bb.0: 3066; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3067; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 3068; GFX11-NEXT: s_setpc_b64 s[30:31] 3069 store <4 x bfloat> %in, ptr addrspace(1) %out 3070 ret void 3071} 3072 3073define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { 3074; GCN-LABEL: test_arg_store_v8bf16: 3075; GCN: ; %bb.0: 3076; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3077; GCN-NEXT: s_mov_b32 s7, 0xf000 3078; GCN-NEXT: s_mov_b32 s6, 0 3079; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 3080; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 3081; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 3082; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 3083; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 3084; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 3085; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 3086; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 3087; GCN-NEXT: s_mov_b32 s4, s6 3088; GCN-NEXT: s_mov_b32 s5, s6 3089; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 3090; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 3091; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 3092; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 3093; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16 3094; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 3095; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16 3096; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16 3097; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 3098; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3099; GCN-NEXT: s_setpc_b64 s[30:31] 3100; 3101; GFX7-LABEL: test_arg_store_v8bf16: 3102; GFX7: ; %bb.0: 3103; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3104; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 3105; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 3106; 
GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 3107; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 3108; GFX7-NEXT: s_mov_b32 s6, 0 3109; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 3110; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 3111; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 3112; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 3113; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 3114; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 3115; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3116; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 3117; GFX7-NEXT: s_mov_b32 s7, 0xf000 3118; GFX7-NEXT: s_mov_b32 s4, s6 3119; GFX7-NEXT: s_mov_b32 s5, s6 3120; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 3121; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 3122; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 3123; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 3124; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64 3125; GFX7-NEXT: s_waitcnt vmcnt(0) 3126; GFX7-NEXT: s_setpc_b64 s[30:31] 3127; 3128; GFX8-LABEL: test_arg_store_v8bf16: 3129; GFX8: ; %bb.0: 3130; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3131; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3132; GFX8-NEXT: s_waitcnt vmcnt(0) 3133; GFX8-NEXT: s_setpc_b64 s[30:31] 3134; 3135; GFX9-LABEL: test_arg_store_v8bf16: 3136; GFX9: ; %bb.0: 3137; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3138; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 3139; GFX9-NEXT: s_waitcnt vmcnt(0) 3140; GFX9-NEXT: s_setpc_b64 s[30:31] 3141; 3142; GFX10-LABEL: test_arg_store_v8bf16: 3143; GFX10: ; %bb.0: 3144; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3145; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 3146; GFX10-NEXT: s_setpc_b64 s[30:31] 3147; 3148; GFX11-LABEL: test_arg_store_v8bf16: 3149; GFX11: ; %bb.0: 3150; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3151; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 3152; GFX11-NEXT: s_setpc_b64 s[30:31] 3153 store <8 x bfloat> %in, ptr addrspace(1) %out 3154 ret void 3155} 3156 3157define void @test_arg_store_v16bf16(<16 x bfloat> %in, 
ptr addrspace(1) %out) { 3158; GCN-LABEL: test_arg_store_v16bf16: 3159; GCN: ; %bb.0: 3160; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3161; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 3162; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 3163; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 3164; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 3165; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 3166; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 3167; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 3168; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 3169; GCN-NEXT: s_mov_b32 s7, 0xf000 3170; GCN-NEXT: s_mov_b32 s6, 0 3171; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15 3172; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 3173; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 3174; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 3175; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 3176; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 3177; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 3178; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 3179; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 3180; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 3181; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 3182; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 3183; GCN-NEXT: s_mov_b32 s4, s6 3184; GCN-NEXT: s_mov_b32 s5, s6 3185; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2 3186; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 3187; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 3188; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 3189; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 3190; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 3191; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16 3192; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16 3193; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16 3194; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16 3195; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16 3196; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16 3197; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 3198; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 3199; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3200; GCN-NEXT: s_setpc_b64 s[30:31] 3201; 3202; GFX7-LABEL: test_arg_store_v16bf16: 3203; 
GFX7: ; %bb.0: 3204; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3205; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 3206; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 3207; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 3208; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 3209; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 3210; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 3211; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 3212; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3213; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 3214; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 3215; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 3216; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 3217; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 3218; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 3219; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 3220; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 3221; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 3222; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 3223; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 3224; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 3225; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 3226; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 3227; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 3228; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 3229; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 3230; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 3231; GFX7-NEXT: s_mov_b32 s6, 0 3232; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 3233; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 3234; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 3235; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 3236; GFX7-NEXT: s_mov_b32 s7, 0xf000 3237; GFX7-NEXT: s_mov_b32 s4, s6 3238; GFX7-NEXT: s_mov_b32 s5, s6 3239; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 3240; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 3241; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 3242; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 3243; GFX7-NEXT: s_waitcnt vmcnt(0) 3244; GFX7-NEXT: s_setpc_b64 s[30:31] 3245; 3246; GFX8-LABEL: test_arg_store_v16bf16: 3247; GFX8: ; %bb.0: 3248; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 3249; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3250; GFX8-NEXT: s_nop 0 3251; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8 3252; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc 3253; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] 3254; GFX8-NEXT: s_waitcnt vmcnt(0) 3255; GFX8-NEXT: s_setpc_b64 s[30:31] 3256; 3257; GFX9-LABEL: test_arg_store_v16bf16: 3258; GFX9: ; %bb.0: 3259; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3260; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 3261; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3262; GFX9-NEXT: s_waitcnt vmcnt(0) 3263; GFX9-NEXT: s_setpc_b64 s[30:31] 3264; 3265; GFX10-LABEL: test_arg_store_v16bf16: 3266; GFX10: ; %bb.0: 3267; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3268; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 3269; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off 3270; GFX10-NEXT: s_setpc_b64 s[30:31] 3271; 3272; GFX11-LABEL: test_arg_store_v16bf16: 3273; GFX11: ; %bb.0: 3274; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3275; GFX11-NEXT: s_clause 0x1 3276; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 3277; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off 3278; GFX11-NEXT: s_setpc_b64 s[30:31] 3279 store <16 x bfloat> %in, ptr addrspace(1) %out 3280 ret void 3281} 3282; Stores a bfloat received as an inreg argument under the amdgpu_gfx calling convention to the addrspace(1) pointer %out; the expected output reads the value from s4. 3283define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) { 3284; GCN-LABEL: test_inreg_arg_store: 3285; GCN: ; %bb.0: 3286; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3287; GCN-NEXT: s_mov_b32 s39, 0xf000 3288; GCN-NEXT: s_mov_b32 s38, 0 3289; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s4 3290; GCN-NEXT: s_mov_b32 s36, s38 3291; GCN-NEXT: s_mov_b32 s37, s38 3292; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 3293; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 3294; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3295; GCN-NEXT: s_setpc_b64 s[30:31] 3296; 3297; GFX7-LABEL: test_inreg_arg_store: 3298; GFX7: ; %bb.0: 3299; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3300; GFX7-NEXT: s_mov_b32 s38, 0 3301; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 3302; GFX7-NEXT: s_mov_b32 s39, 0xf000 3303; GFX7-NEXT: s_mov_b32 s36, s38 3304; GFX7-NEXT: s_mov_b32 s37, s38 3305; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 3306; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 3307; GFX7-NEXT: s_waitcnt vmcnt(0) 3308; GFX7-NEXT: s_setpc_b64 s[30:31] 3309; 3310; GFX8-LABEL: test_inreg_arg_store: 3311; GFX8: ; %bb.0: 3312; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3313; GFX8-NEXT: v_mov_b32_e32 v2, s4 3314; GFX8-NEXT: flat_store_short v[0:1], v2 3315; GFX8-NEXT: s_waitcnt vmcnt(0) 3316; GFX8-NEXT: s_setpc_b64 s[30:31] 3317; 3318; GFX9-LABEL: test_inreg_arg_store: 3319; GFX9: ; %bb.0: 3320; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3321; GFX9-NEXT: v_mov_b32_e32 v2, s4 3322; GFX9-NEXT: global_store_short v[0:1], v2, off 3323; GFX9-NEXT: s_waitcnt vmcnt(0) 3324; GFX9-NEXT: s_setpc_b64 s[30:31] 3325; 3326; GFX10-LABEL: test_inreg_arg_store: 3327; GFX10: ; %bb.0: 3328; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3329; GFX10-NEXT: v_mov_b32_e32 v2, s4 3330; GFX10-NEXT: global_store_short v[0:1], v2, off 3331; GFX10-NEXT: s_setpc_b64 s[30:31] 3332; 3333; GFX11-LABEL: test_inreg_arg_store: 3334; GFX11: ; %bb.0: 3335; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3336; GFX11-NEXT: v_mov_b32_e32 v2, s4 3337; GFX11-NEXT: global_store_b16 v[0:1], v2, off 3338; GFX11-NEXT: s_setpc_b64 s[30:31] 3339 store bfloat %in, ptr addrspace(1) %out 3340 ret void 3341} 3342; Stores %val through the byval(bfloat) addrspace(5) pointer %bv, then loads it back and returns it; the expected output has a single 16-bit store relative to s32 and no reload. 3343define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { 3344; GCN-LABEL: test_byval: 3345; GCN: ; %bb.0: 3346; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3347; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0 3348; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3349; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32 3350; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3351; GCN-NEXT: s_setpc_b64 s[30:31] 3352; 3353; 
GFX7-LABEL: test_byval: 3354; GFX7: ; %bb.0: 3355; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3356; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v0 3357; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3358; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32 3359; GFX7-NEXT: s_waitcnt vmcnt(0) 3360; GFX7-NEXT: s_setpc_b64 s[30:31] 3361; 3362; GFX8-LABEL: test_byval: 3363; GFX8: ; %bb.0: 3364; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3365; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32 3366; GFX8-NEXT: s_waitcnt vmcnt(0) 3367; GFX8-NEXT: s_setpc_b64 s[30:31] 3368; 3369; GFX9-LABEL: test_byval: 3370; GFX9: ; %bb.0: 3371; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3372; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 3373; GFX9-NEXT: s_waitcnt vmcnt(0) 3374; GFX9-NEXT: s_setpc_b64 s[30:31] 3375; 3376; GFX10-LABEL: test_byval: 3377; GFX10: ; %bb.0: 3378; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3379; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32 3380; GFX10-NEXT: s_setpc_b64 s[30:31] 3381; 3382; GFX11-LABEL: test_byval: 3383; GFX11: ; %bb.0: 3384; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3385; GFX11-NEXT: scratch_store_b16 off, v0, s32 3386; GFX11-NEXT: s_setpc_b64 s[30:31] 3387 store bfloat %val, ptr addrspace(5) %bv 3388 %retval = load bfloat, ptr addrspace(5) %bv 3389 ret bfloat %retval 3390} 3391; Stores %val through the sret(bfloat) addrspace(5) pointer %sret and returns void. 3392define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { 3393; GCN-LABEL: test_sret: 3394; GCN: ; %bb.0: 3395; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3396; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 3397; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3398; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen 3399; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3400; GCN-NEXT: s_setpc_b64 s[30:31] 3401; 3402; GFX7-LABEL: test_sret: 3403; GFX7: ; %bb.0: 3404; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3405; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 3406; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3407; GFX7-NEXT: 
buffer_store_short v1, v0, s[0:3], 0 offen 3408; GFX7-NEXT: s_waitcnt vmcnt(0) 3409; GFX7-NEXT: s_setpc_b64 s[30:31] 3410; 3411; GFX8-LABEL: test_sret: 3412; GFX8: ; %bb.0: 3413; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3414; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen 3415; GFX8-NEXT: s_waitcnt vmcnt(0) 3416; GFX8-NEXT: s_setpc_b64 s[30:31] 3417; 3418; GFX9-LABEL: test_sret: 3419; GFX9: ; %bb.0: 3420; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3421; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen 3422; GFX9-NEXT: s_waitcnt vmcnt(0) 3423; GFX9-NEXT: s_setpc_b64 s[30:31] 3424; 3425; GFX10-LABEL: test_sret: 3426; GFX10: ; %bb.0: 3427; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3428; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen 3429; GFX10-NEXT: s_setpc_b64 s[30:31] 3430; 3431; GFX11-LABEL: test_sret: 3432; GFX11: ; %bb.0: 3433; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3434; GFX11-NEXT: scratch_store_b16 v0, v1, off 3435; GFX11-NEXT: s_setpc_b64 s[30:31] 3436 store bfloat %val, ptr addrspace(5) %sret 3437 ret void 3438} 3439; Loads a bfloat from %in, bitcasts it to i16 and stores the i16 to %out (both addrspace(1) pointers); the expected output is a 16-bit load followed by a 16-bit store. 3440define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { 3441; GCN-LABEL: test_bitcast_from_bfloat: 3442; GCN: ; %bb.0: 3443; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3444; GCN-NEXT: s_mov_b32 s6, 0 3445; GCN-NEXT: s_mov_b32 s7, 0xf000 3446; GCN-NEXT: s_mov_b32 s4, s6 3447; GCN-NEXT: s_mov_b32 s5, s6 3448; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 3449; GCN-NEXT: s_waitcnt vmcnt(0) 3450; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 3451; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3452; GCN-NEXT: s_setpc_b64 s[30:31] 3453; 3454; GFX7-LABEL: test_bitcast_from_bfloat: 3455; GFX7: ; %bb.0: 3456; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3457; GFX7-NEXT: s_mov_b32 s6, 0 3458; GFX7-NEXT: s_mov_b32 s7, 0xf000 3459; GFX7-NEXT: s_mov_b32 s4, s6 3460; GFX7-NEXT: s_mov_b32 s5, s6 3461; GFX7-NEXT: buffer_load_ushort v0, 
v[0:1], s[4:7], 0 addr64 3462; GFX7-NEXT: s_waitcnt vmcnt(0) 3463; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 3464; GFX7-NEXT: s_waitcnt vmcnt(0) 3465; GFX7-NEXT: s_setpc_b64 s[30:31] 3466; 3467; GFX8-LABEL: test_bitcast_from_bfloat: 3468; GFX8: ; %bb.0: 3469; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3470; GFX8-NEXT: flat_load_ushort v0, v[0:1] 3471; GFX8-NEXT: s_waitcnt vmcnt(0) 3472; GFX8-NEXT: flat_store_short v[2:3], v0 3473; GFX8-NEXT: s_waitcnt vmcnt(0) 3474; GFX8-NEXT: s_setpc_b64 s[30:31] 3475; 3476; GFX9-LABEL: test_bitcast_from_bfloat: 3477; GFX9: ; %bb.0: 3478; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3479; GFX9-NEXT: global_load_ushort v0, v[0:1], off 3480; GFX9-NEXT: s_waitcnt vmcnt(0) 3481; GFX9-NEXT: global_store_short v[2:3], v0, off 3482; GFX9-NEXT: s_waitcnt vmcnt(0) 3483; GFX9-NEXT: s_setpc_b64 s[30:31] 3484; 3485; GFX10-LABEL: test_bitcast_from_bfloat: 3486; GFX10: ; %bb.0: 3487; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3488; GFX10-NEXT: global_load_ushort v0, v[0:1], off 3489; GFX10-NEXT: s_waitcnt vmcnt(0) 3490; GFX10-NEXT: global_store_short v[2:3], v0, off 3491; GFX10-NEXT: s_setpc_b64 s[30:31] 3492; 3493; GFX11-LABEL: test_bitcast_from_bfloat: 3494; GFX11: ; %bb.0: 3495; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3496; GFX11-NEXT: global_load_u16 v0, v[0:1], off 3497; GFX11-NEXT: s_waitcnt vmcnt(0) 3498; GFX11-NEXT: global_store_b16 v[2:3], v0, off 3499; GFX11-NEXT: s_setpc_b64 s[30:31] 3500 %val = load bfloat, ptr addrspace(1) %in 3501 %val_int = bitcast bfloat %val to i16 3502 store i16 %val_int, ptr addrspace(1) %out 3503 ret void 3504} 3505; Loads an i16 from %in, bitcasts it to bfloat and stores the bfloat to %out. The parameter order of this function is (%out, %in). 3506define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) { 3507; GCN-LABEL: test_bitcast_to_bfloat: 3508; GCN: ; %bb.0: 3509; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3510; GCN-NEXT: s_mov_b32 s6, 0 3511; GCN-NEXT: s_mov_b32 s7, 0xf000 3512; GCN-NEXT: s_mov_b32 s4, s6 3513; GCN-NEXT: s_mov_b32 s5, s6 3514; 
GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 3515; GCN-NEXT: s_waitcnt vmcnt(0) 3516; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 3517; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3518; GCN-NEXT: s_setpc_b64 s[30:31] 3519; 3520; GFX7-LABEL: test_bitcast_to_bfloat: 3521; GFX7: ; %bb.0: 3522; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3523; GFX7-NEXT: s_mov_b32 s6, 0 3524; GFX7-NEXT: s_mov_b32 s7, 0xf000 3525; GFX7-NEXT: s_mov_b32 s4, s6 3526; GFX7-NEXT: s_mov_b32 s5, s6 3527; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 3528; GFX7-NEXT: s_waitcnt vmcnt(0) 3529; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 3530; GFX7-NEXT: s_waitcnt vmcnt(0) 3531; GFX7-NEXT: s_setpc_b64 s[30:31] 3532; 3533; GFX8-LABEL: test_bitcast_to_bfloat: 3534; GFX8: ; %bb.0: 3535; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3536; GFX8-NEXT: flat_load_ushort v2, v[2:3] 3537; GFX8-NEXT: s_waitcnt vmcnt(0) 3538; GFX8-NEXT: flat_store_short v[0:1], v2 3539; GFX8-NEXT: s_waitcnt vmcnt(0) 3540; GFX8-NEXT: s_setpc_b64 s[30:31] 3541; 3542; GFX9-LABEL: test_bitcast_to_bfloat: 3543; GFX9: ; %bb.0: 3544; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3545; GFX9-NEXT: global_load_ushort v2, v[2:3], off 3546; GFX9-NEXT: s_waitcnt vmcnt(0) 3547; GFX9-NEXT: global_store_short v[0:1], v2, off 3548; GFX9-NEXT: s_waitcnt vmcnt(0) 3549; GFX9-NEXT: s_setpc_b64 s[30:31] 3550; 3551; GFX10-LABEL: test_bitcast_to_bfloat: 3552; GFX10: ; %bb.0: 3553; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3554; GFX10-NEXT: global_load_ushort v2, v[2:3], off 3555; GFX10-NEXT: s_waitcnt vmcnt(0) 3556; GFX10-NEXT: global_store_short v[0:1], v2, off 3557; GFX10-NEXT: s_setpc_b64 s[30:31] 3558; 3559; GFX11-LABEL: test_bitcast_to_bfloat: 3560; GFX11: ; %bb.0: 3561; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3562; GFX11-NEXT: global_load_u16 v2, v[2:3], off 3563; GFX11-NEXT: s_waitcnt vmcnt(0) 3564; GFX11-NEXT: global_store_b16 v[0:1], v2, off 3565; 
GFX11-NEXT: s_setpc_b64 s[30:31] 3566 %val = load i16, ptr addrspace(1) %in 3567 %val_fp = bitcast i16 %val to bfloat 3568 store bfloat %val_fp, ptr addrspace(1) %out 3569 ret void 3570} 3571; Returns the bfloat argument unchanged; the expected output on every target is only the entry s_waitcnt followed by the return. 3572define bfloat @test_ret(bfloat %in) { 3573; GCN-LABEL: test_ret: 3574; GCN: ; %bb.0: ; %entry 3575; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3576; GCN-NEXT: s_setpc_b64 s[30:31] 3577; 3578; GFX7-LABEL: test_ret: 3579; GFX7: ; %bb.0: ; %entry 3580; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3581; GFX7-NEXT: s_setpc_b64 s[30:31] 3582; 3583; GFX8-LABEL: test_ret: 3584; GFX8: ; %bb.0: ; %entry 3585; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3586; GFX8-NEXT: s_setpc_b64 s[30:31] 3587; 3588; GFX9-LABEL: test_ret: 3589; GFX9: ; %bb.0: ; %entry 3590; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3591; GFX9-NEXT: s_setpc_b64 s[30:31] 3592; 3593; GFX10-LABEL: test_ret: 3594; GFX10: ; %bb.0: ; %entry 3595; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3596; GFX10-NEXT: s_setpc_b64 s[30:31] 3597; 3598; GFX11-LABEL: test_ret: 3599; GFX11: ; %bb.0: ; %entry 3600; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3601; GFX11-NEXT: s_setpc_b64 s[30:31] 3602entry: 3603 ret bfloat %in 3604} 3605; Returns the <2 x bfloat> argument unchanged; no instructions are expected beyond the entry s_waitcnt and the return. 3606define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) { 3607; GCN-LABEL: test_ret_v2bf16: 3608; GCN: ; %bb.0: ; %entry 3609; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3610; GCN-NEXT: s_setpc_b64 s[30:31] 3611; 3612; GFX7-LABEL: test_ret_v2bf16: 3613; GFX7: ; %bb.0: ; %entry 3614; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3615; GFX7-NEXT: s_setpc_b64 s[30:31] 3616; 3617; GFX8-LABEL: test_ret_v2bf16: 3618; GFX8: ; %bb.0: ; %entry 3619; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3620; GFX8-NEXT: s_setpc_b64 s[30:31] 3621; 3622; GFX9-LABEL: test_ret_v2bf16: 3623; GFX9: ; %bb.0: ; %entry 3624; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3625; GFX9-NEXT: s_setpc_b64 s[30:31] 3626; 3627; GFX10-LABEL: test_ret_v2bf16: 3628; GFX10: ; 
%bb.0: ; %entry 3629; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3630; GFX10-NEXT: s_setpc_b64 s[30:31] 3631; 3632; GFX11-LABEL: test_ret_v2bf16: 3633; GFX11: ; %bb.0: ; %entry 3634; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3635; GFX11-NEXT: s_setpc_b64 s[30:31] 3636entry: 3637 ret <2 x bfloat> %in 3638} 3639; Returns the <3 x bfloat> argument unchanged; no instructions are expected beyond the entry s_waitcnt and the return. 3640define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { 3641; GCN-LABEL: test_ret_v3bf16: 3642; GCN: ; %bb.0: ; %entry 3643; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3644; GCN-NEXT: s_setpc_b64 s[30:31] 3645; 3646; GFX7-LABEL: test_ret_v3bf16: 3647; GFX7: ; %bb.0: ; %entry 3648; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3649; GFX7-NEXT: s_setpc_b64 s[30:31] 3650; 3651; GFX8-LABEL: test_ret_v3bf16: 3652; GFX8: ; %bb.0: ; %entry 3653; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3654; GFX8-NEXT: s_setpc_b64 s[30:31] 3655; 3656; GFX9-LABEL: test_ret_v3bf16: 3657; GFX9: ; %bb.0: ; %entry 3658; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3659; GFX9-NEXT: s_setpc_b64 s[30:31] 3660; 3661; GFX10-LABEL: test_ret_v3bf16: 3662; GFX10: ; %bb.0: ; %entry 3663; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3664; GFX10-NEXT: s_setpc_b64 s[30:31] 3665; 3666; GFX11-LABEL: test_ret_v3bf16: 3667; GFX11: ; %bb.0: ; %entry 3668; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3669; GFX11-NEXT: s_setpc_b64 s[30:31] 3670entry: 3671 ret <3 x bfloat> %in 3672} 3673; Returns the <4 x bfloat> argument unchanged; no instructions are expected beyond the entry s_waitcnt and the return. 3674define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) { 3675; GCN-LABEL: test_ret_v4bf16: 3676; GCN: ; %bb.0: ; %entry 3677; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3678; GCN-NEXT: s_setpc_b64 s[30:31] 3679; 3680; GFX7-LABEL: test_ret_v4bf16: 3681; GFX7: ; %bb.0: ; %entry 3682; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3683; GFX7-NEXT: s_setpc_b64 s[30:31] 3684; 3685; GFX8-LABEL: test_ret_v4bf16: 3686; GFX8: ; %bb.0: ; %entry 3687; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3688; GFX8-NEXT: s_setpc_b64 s[30:31] 3689; 3690; 
GFX9-LABEL: test_ret_v4bf16: 3691; GFX9: ; %bb.0: ; %entry 3692; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3693; GFX9-NEXT: s_setpc_b64 s[30:31] 3694; 3695; GFX10-LABEL: test_ret_v4bf16: 3696; GFX10: ; %bb.0: ; %entry 3697; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3698; GFX10-NEXT: s_setpc_b64 s[30:31] 3699; 3700; GFX11-LABEL: test_ret_v4bf16: 3701; GFX11: ; %bb.0: ; %entry 3702; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3703; GFX11-NEXT: s_setpc_b64 s[30:31] 3704entry: 3705 ret <4 x bfloat> %in 3706} 3707; Returns the <8 x bfloat> argument unchanged; no instructions are expected beyond the entry s_waitcnt and the return. 3708define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) { 3709; GCN-LABEL: test_ret_v8bf16: 3710; GCN: ; %bb.0: ; %entry 3711; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3712; GCN-NEXT: s_setpc_b64 s[30:31] 3713; 3714; GFX7-LABEL: test_ret_v8bf16: 3715; GFX7: ; %bb.0: ; %entry 3716; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3717; GFX7-NEXT: s_setpc_b64 s[30:31] 3718; 3719; GFX8-LABEL: test_ret_v8bf16: 3720; GFX8: ; %bb.0: ; %entry 3721; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3722; GFX8-NEXT: s_setpc_b64 s[30:31] 3723; 3724; GFX9-LABEL: test_ret_v8bf16: 3725; GFX9: ; %bb.0: ; %entry 3726; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3727; GFX9-NEXT: s_setpc_b64 s[30:31] 3728; 3729; GFX10-LABEL: test_ret_v8bf16: 3730; GFX10: ; %bb.0: ; %entry 3731; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3732; GFX10-NEXT: s_setpc_b64 s[30:31] 3733; 3734; GFX11-LABEL: test_ret_v8bf16: 3735; GFX11: ; %bb.0: ; %entry 3736; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3737; GFX11-NEXT: s_setpc_b64 s[30:31] 3738entry: 3739 ret <8 x bfloat> %in 3740} 3741; Returns the <16 x bfloat> argument unchanged; no instructions are expected beyond the entry s_waitcnt and the return. 3742define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) { 3743; GCN-LABEL: test_ret_v16bf16: 3744; GCN: ; %bb.0: ; %entry 3745; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3746; GCN-NEXT: s_setpc_b64 s[30:31] 3747; 3748; GFX7-LABEL: test_ret_v16bf16: 3749; GFX7: ; %bb.0: ; %entry 3750; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
3751; GFX7-NEXT: s_setpc_b64 s[30:31] 3752; 3753; GFX8-LABEL: test_ret_v16bf16: 3754; GFX8: ; %bb.0: ; %entry 3755; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3756; GFX8-NEXT: s_setpc_b64 s[30:31] 3757; 3758; GFX9-LABEL: test_ret_v16bf16: 3759; GFX9: ; %bb.0: ; %entry 3760; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3761; GFX9-NEXT: s_setpc_b64 s[30:31] 3762; 3763; GFX10-LABEL: test_ret_v16bf16: 3764; GFX10: ; %bb.0: ; %entry 3765; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3766; GFX10-NEXT: s_setpc_b64 s[30:31] 3767; 3768; GFX11-LABEL: test_ret_v16bf16: 3769; GFX11: ; %bb.0: ; %entry 3770; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3771; GFX11-NEXT: s_setpc_b64 s[30:31] 3772entry: 3773 ret <16 x bfloat> %in 3774} 3775; Calls @test_arg_store with %in as a function returning bfloat and volatile-stores the result to the addrspace(5) pointer %out. NOTE(review): the callee definition is not visible in this part of the file - confirm the one-argument, bfloat-returning call signature is intentional. 3776define void @test_call(bfloat %in, ptr addrspace(5) %out) { 3777; GCN-LABEL: test_call: 3778; GCN: ; %bb.0: ; %entry 3779; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3780; GCN-NEXT: s_mov_b32 s18, s33 3781; GCN-NEXT: s_mov_b32 s33, s32 3782; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 3783; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill 3784; GCN-NEXT: s_mov_b64 exec, s[16:17] 3785; GCN-NEXT: s_addk_i32 s32, 0x400 3786; GCN-NEXT: s_waitcnt expcnt(0) 3787; GCN-NEXT: v_writelane_b32 v2, s30, 0 3788; GCN-NEXT: v_writelane_b32 v2, s31, 1 3789; GCN-NEXT: s_getpc_b64 s[16:17] 3790; GCN-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 3791; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 3792; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 3793; GCN-NEXT: s_waitcnt lgkmcnt(0) 3794; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] 3795; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 3796; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 3797; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen 3798; GCN-NEXT: s_waitcnt vmcnt(0) 3799; GCN-NEXT: v_readlane_b32 s31, v2, 1 3800; GCN-NEXT: v_readlane_b32 s30, v2, 0 3801; GCN-NEXT: s_mov_b32 s32, s33 3802; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 
3803; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload 3804; GCN-NEXT: s_mov_b64 exec, s[4:5] 3805; GCN-NEXT: s_mov_b32 s33, s18 3806; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3807; GCN-NEXT: s_setpc_b64 s[30:31] 3808; 3809; GFX7-LABEL: test_call: 3810; GFX7: ; %bb.0: ; %entry 3811; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3812; GFX7-NEXT: s_mov_b32 s18, s33 3813; GFX7-NEXT: s_mov_b32 s33, s32 3814; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 3815; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill 3816; GFX7-NEXT: s_mov_b64 exec, s[16:17] 3817; GFX7-NEXT: s_addk_i32 s32, 0x400 3818; GFX7-NEXT: s_getpc_b64 s[16:17] 3819; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 3820; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 3821; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 3822; GFX7-NEXT: v_writelane_b32 v2, s30, 0 3823; GFX7-NEXT: v_writelane_b32 v2, s31, 1 3824; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3825; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] 3826; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 3827; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 3828; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen 3829; GFX7-NEXT: s_waitcnt vmcnt(0) 3830; GFX7-NEXT: v_readlane_b32 s31, v2, 1 3831; GFX7-NEXT: v_readlane_b32 s30, v2, 0 3832; GFX7-NEXT: s_mov_b32 s32, s33 3833; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 3834; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload 3835; GFX7-NEXT: s_mov_b64 exec, s[4:5] 3836; GFX7-NEXT: s_mov_b32 s33, s18 3837; GFX7-NEXT: s_waitcnt vmcnt(0) 3838; GFX7-NEXT: s_setpc_b64 s[30:31] 3839; 3840; GFX8-LABEL: test_call: 3841; GFX8: ; %bb.0: ; %entry 3842; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3843; GFX8-NEXT: s_mov_b32 s18, s33 3844; GFX8-NEXT: s_mov_b32 s33, s32 3845; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 3846; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill 3847; GFX8-NEXT: s_mov_b64 exec, s[16:17] 3848; 
GFX8-NEXT: s_addk_i32 s32, 0x400 3849; GFX8-NEXT: s_getpc_b64 s[16:17] 3850; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 3851; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 3852; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 3853; GFX8-NEXT: v_writelane_b32 v2, s30, 0 3854; GFX8-NEXT: v_writelane_b32 v2, s31, 1 3855; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3856; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] 3857; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen 3858; GFX8-NEXT: s_waitcnt vmcnt(0) 3859; GFX8-NEXT: v_readlane_b32 s31, v2, 1 3860; GFX8-NEXT: v_readlane_b32 s30, v2, 0 3861; GFX8-NEXT: s_mov_b32 s32, s33 3862; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 3863; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload 3864; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3865; GFX8-NEXT: s_mov_b32 s33, s18 3866; GFX8-NEXT: s_waitcnt vmcnt(0) 3867; GFX8-NEXT: s_setpc_b64 s[30:31] 3868; 3869; GFX9-LABEL: test_call: 3870; GFX9: ; %bb.0: ; %entry 3871; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3872; GFX9-NEXT: s_mov_b32 s18, s33 3873; GFX9-NEXT: s_mov_b32 s33, s32 3874; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 3875; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill 3876; GFX9-NEXT: s_mov_b64 exec, s[16:17] 3877; GFX9-NEXT: s_addk_i32 s32, 0x400 3878; GFX9-NEXT: s_getpc_b64 s[16:17] 3879; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 3880; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 3881; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 3882; GFX9-NEXT: v_writelane_b32 v2, s30, 0 3883; GFX9-NEXT: v_writelane_b32 v2, s31, 1 3884; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3885; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] 3886; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen 3887; GFX9-NEXT: s_waitcnt vmcnt(0) 3888; GFX9-NEXT: v_readlane_b32 s31, v2, 1 3889; GFX9-NEXT: v_readlane_b32 s30, v2, 0 3890; GFX9-NEXT: s_mov_b32 s32, s33 3891; GFX9-NEXT: s_xor_saveexec_b64 
s[4:5], -1 3892; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload 3893; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3894; GFX9-NEXT: s_mov_b32 s33, s18 3895; GFX9-NEXT: s_waitcnt vmcnt(0) 3896; GFX9-NEXT: s_setpc_b64 s[30:31] 3897; 3898; GFX10-LABEL: test_call: 3899; GFX10: ; %bb.0: ; %entry 3900; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3901; GFX10-NEXT: s_mov_b32 s18, s33 3902; GFX10-NEXT: s_mov_b32 s33, s32 3903; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 3904; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill 3905; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3906; GFX10-NEXT: s_mov_b32 exec_lo, s16 3907; GFX10-NEXT: s_addk_i32 s32, 0x200 3908; GFX10-NEXT: s_getpc_b64 s[16:17] 3909; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 3910; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 3911; GFX10-NEXT: v_writelane_b32 v2, s30, 0 3912; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 3913; GFX10-NEXT: v_writelane_b32 v2, s31, 1 3914; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3915; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] 3916; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen 3917; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3918; GFX10-NEXT: v_readlane_b32 s31, v2, 1 3919; GFX10-NEXT: v_readlane_b32 s30, v2, 0 3920; GFX10-NEXT: s_mov_b32 s32, s33 3921; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 3922; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload 3923; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3924; GFX10-NEXT: s_mov_b32 exec_lo, s4 3925; GFX10-NEXT: s_mov_b32 s33, s18 3926; GFX10-NEXT: s_waitcnt vmcnt(0) 3927; GFX10-NEXT: s_setpc_b64 s[30:31] 3928; 3929; GFX11-LABEL: test_call: 3930; GFX11: ; %bb.0: ; %entry 3931; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3932; GFX11-NEXT: s_mov_b32 s2, s33 3933; GFX11-NEXT: s_mov_b32 s33, s32 3934; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 3935; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill 3936; GFX11-NEXT: s_mov_b32 
exec_lo, s0 3937; GFX11-NEXT: s_add_i32 s32, s32, 16 3938; GFX11-NEXT: s_getpc_b64 s[0:1] 3939; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4 3940; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12 3941; GFX11-NEXT: v_writelane_b32 v2, s30, 0 3942; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 3943; GFX11-NEXT: v_writelane_b32 v2, s31, 1 3944; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3945; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] 3946; GFX11-NEXT: scratch_store_b16 v1, v0, off dlc 3947; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3948; GFX11-NEXT: v_readlane_b32 s31, v2, 1 3949; GFX11-NEXT: v_readlane_b32 s30, v2, 0 3950; GFX11-NEXT: s_mov_b32 s32, s33 3951; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 3952; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload 3953; GFX11-NEXT: s_mov_b32 exec_lo, s0 3954; GFX11-NEXT: s_mov_b32 s33, s2 3955; GFX11-NEXT: s_waitcnt vmcnt(0) 3956; GFX11-NEXT: s_setpc_b64 s[30:31] 3957entry: 3958 %result = call bfloat @test_arg_store(bfloat %in) 3959 store volatile bfloat %result, ptr addrspace(5) %out 3960 ret void 3961} 3962 3963define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { 3964; GCN-LABEL: test_call_v2bf16: 3965; GCN: ; %bb.0: ; %entry 3966; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3967; GCN-NEXT: s_mov_b32 s18, s33 3968; GCN-NEXT: s_mov_b32 s33, s32 3969; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 3970; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill 3971; GCN-NEXT: s_mov_b64 exec, s[16:17] 3972; GCN-NEXT: s_addk_i32 s32, 0x400 3973; GCN-NEXT: s_waitcnt expcnt(0) 3974; GCN-NEXT: v_writelane_b32 v4, s30, 0 3975; GCN-NEXT: v_writelane_b32 v4, s31, 1 3976; GCN-NEXT: s_getpc_b64 s[16:17] 3977; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 3978; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 3979; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 3980; GCN-NEXT: s_waitcnt lgkmcnt(0) 3981; GCN-NEXT: s_swappc_b64 
s[30:31], s[16:17] 3982; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 3983; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 3984; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2 3985; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 3986; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 3987; GCN-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen 3988; GCN-NEXT: s_waitcnt vmcnt(0) 3989; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen 3990; GCN-NEXT: s_waitcnt vmcnt(0) 3991; GCN-NEXT: v_readlane_b32 s31, v4, 1 3992; GCN-NEXT: v_readlane_b32 s30, v4, 0 3993; GCN-NEXT: s_mov_b32 s32, s33 3994; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 3995; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload 3996; GCN-NEXT: s_mov_b64 exec, s[4:5] 3997; GCN-NEXT: s_mov_b32 s33, s18 3998; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3999; GCN-NEXT: s_setpc_b64 s[30:31] 4000; 4001; GFX7-LABEL: test_call_v2bf16: 4002; GFX7: ; %bb.0: ; %entry 4003; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4004; GFX7-NEXT: s_mov_b32 s18, s33 4005; GFX7-NEXT: s_mov_b32 s33, s32 4006; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 4007; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill 4008; GFX7-NEXT: s_mov_b64 exec, s[16:17] 4009; GFX7-NEXT: s_addk_i32 s32, 0x400 4010; GFX7-NEXT: s_getpc_b64 s[16:17] 4011; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4012; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4013; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4014; GFX7-NEXT: v_writelane_b32 v4, s30, 0 4015; GFX7-NEXT: v_writelane_b32 v4, s31, 1 4016; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4017; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] 4018; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 4019; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 4020; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 4021; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2 4022; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 4023; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen 4024; GFX7-NEXT: s_waitcnt vmcnt(0) 4025; GFX7-NEXT: 
buffer_store_short v0, v2, s[0:3], 0 offen 4026; GFX7-NEXT: s_waitcnt vmcnt(0) 4027; GFX7-NEXT: v_readlane_b32 s31, v4, 1 4028; GFX7-NEXT: v_readlane_b32 s30, v4, 0 4029; GFX7-NEXT: s_mov_b32 s32, s33 4030; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 4031; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload 4032; GFX7-NEXT: s_mov_b64 exec, s[4:5] 4033; GFX7-NEXT: s_mov_b32 s33, s18 4034; GFX7-NEXT: s_waitcnt vmcnt(0) 4035; GFX7-NEXT: s_setpc_b64 s[30:31] 4036; 4037; GFX8-LABEL: test_call_v2bf16: 4038; GFX8: ; %bb.0: ; %entry 4039; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4040; GFX8-NEXT: s_mov_b32 s18, s33 4041; GFX8-NEXT: s_mov_b32 s33, s32 4042; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 4043; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill 4044; GFX8-NEXT: s_mov_b64 exec, s[16:17] 4045; GFX8-NEXT: s_addk_i32 s32, 0x400 4046; GFX8-NEXT: s_getpc_b64 s[16:17] 4047; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4048; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4049; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4050; GFX8-NEXT: v_writelane_b32 v2, s30, 0 4051; GFX8-NEXT: v_writelane_b32 v2, s31, 1 4052; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4053; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] 4054; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen 4055; GFX8-NEXT: s_waitcnt vmcnt(0) 4056; GFX8-NEXT: v_readlane_b32 s31, v2, 1 4057; GFX8-NEXT: v_readlane_b32 s30, v2, 0 4058; GFX8-NEXT: s_mov_b32 s32, s33 4059; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 4060; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload 4061; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4062; GFX8-NEXT: s_mov_b32 s33, s18 4063; GFX8-NEXT: s_waitcnt vmcnt(0) 4064; GFX8-NEXT: s_setpc_b64 s[30:31] 4065; 4066; GFX9-LABEL: test_call_v2bf16: 4067; GFX9: ; %bb.0: ; %entry 4068; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4069; GFX9-NEXT: s_mov_b32 s18, s33 4070; GFX9-NEXT: s_mov_b32 s33, 
s32 4071; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 4072; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill 4073; GFX9-NEXT: s_mov_b64 exec, s[16:17] 4074; GFX9-NEXT: s_addk_i32 s32, 0x400 4075; GFX9-NEXT: s_getpc_b64 s[16:17] 4076; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4077; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4078; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4079; GFX9-NEXT: v_writelane_b32 v2, s30, 0 4080; GFX9-NEXT: v_writelane_b32 v2, s31, 1 4081; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4082; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] 4083; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen 4084; GFX9-NEXT: s_waitcnt vmcnt(0) 4085; GFX9-NEXT: v_readlane_b32 s31, v2, 1 4086; GFX9-NEXT: v_readlane_b32 s30, v2, 0 4087; GFX9-NEXT: s_mov_b32 s32, s33 4088; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 4089; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload 4090; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4091; GFX9-NEXT: s_mov_b32 s33, s18 4092; GFX9-NEXT: s_waitcnt vmcnt(0) 4093; GFX9-NEXT: s_setpc_b64 s[30:31] 4094; 4095; GFX10-LABEL: test_call_v2bf16: 4096; GFX10: ; %bb.0: ; %entry 4097; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4098; GFX10-NEXT: s_mov_b32 s18, s33 4099; GFX10-NEXT: s_mov_b32 s33, s32 4100; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 4101; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill 4102; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4103; GFX10-NEXT: s_mov_b32 exec_lo, s16 4104; GFX10-NEXT: s_addk_i32 s32, 0x200 4105; GFX10-NEXT: s_getpc_b64 s[16:17] 4106; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4107; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4108; GFX10-NEXT: v_writelane_b32 v2, s30, 0 4109; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4110; GFX10-NEXT: v_writelane_b32 v2, s31, 1 4111; GFX10-NEXT: s_waitcnt lgkmcnt(0) 4112; GFX10-NEXT: s_swappc_b64 
s[30:31], s[16:17] 4113; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen 4114; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4115; GFX10-NEXT: v_readlane_b32 s31, v2, 1 4116; GFX10-NEXT: v_readlane_b32 s30, v2, 0 4117; GFX10-NEXT: s_mov_b32 s32, s33 4118; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 4119; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload 4120; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4121; GFX10-NEXT: s_mov_b32 exec_lo, s4 4122; GFX10-NEXT: s_mov_b32 s33, s18 4123; GFX10-NEXT: s_waitcnt vmcnt(0) 4124; GFX10-NEXT: s_setpc_b64 s[30:31] 4125; 4126; GFX11-LABEL: test_call_v2bf16: 4127; GFX11: ; %bb.0: ; %entry 4128; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4129; GFX11-NEXT: s_mov_b32 s2, s33 4130; GFX11-NEXT: s_mov_b32 s33, s32 4131; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 4132; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill 4133; GFX11-NEXT: s_mov_b32 exec_lo, s0 4134; GFX11-NEXT: s_add_i32 s32, s32, 16 4135; GFX11-NEXT: s_getpc_b64 s[0:1] 4136; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 4137; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 4138; GFX11-NEXT: v_writelane_b32 v2, s30, 0 4139; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 4140; GFX11-NEXT: v_writelane_b32 v2, s31, 1 4141; GFX11-NEXT: s_waitcnt lgkmcnt(0) 4142; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] 4143; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc 4144; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4145; GFX11-NEXT: v_readlane_b32 s31, v2, 1 4146; GFX11-NEXT: v_readlane_b32 s30, v2, 0 4147; GFX11-NEXT: s_mov_b32 s32, s33 4148; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 4149; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload 4150; GFX11-NEXT: s_mov_b32 exec_lo, s0 4151; GFX11-NEXT: s_mov_b32 s33, s2 4152; GFX11-NEXT: s_waitcnt vmcnt(0) 4153; GFX11-NEXT: s_setpc_b64 s[30:31] 4154entry: 4155 %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in) 4156 store volatile <2 x bfloat> %result, 
ptr addrspace(5) %out 4157 ret void 4158} 4159 4160define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { 4161; GCN-LABEL: test_call_v3bf16: 4162; GCN: ; %bb.0: ; %entry 4163; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4164; GCN-NEXT: s_mov_b32 s18, s33 4165; GCN-NEXT: s_mov_b32 s33, s32 4166; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 4167; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill 4168; GCN-NEXT: s_mov_b64 exec, s[16:17] 4169; GCN-NEXT: s_addk_i32 s32, 0x400 4170; GCN-NEXT: s_waitcnt expcnt(0) 4171; GCN-NEXT: v_writelane_b32 v5, s30, 0 4172; GCN-NEXT: v_writelane_b32 v5, s31, 1 4173; GCN-NEXT: s_getpc_b64 s[16:17] 4174; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4175; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4176; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4177; GCN-NEXT: s_waitcnt lgkmcnt(0) 4178; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] 4179; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 4180; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 4181; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 4182; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3 4183; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 4184; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 4185; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 4186; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen 4187; GCN-NEXT: s_waitcnt vmcnt(0) 4188; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen 4189; GCN-NEXT: s_waitcnt vmcnt(0) 4190; GCN-NEXT: v_readlane_b32 s31, v5, 1 4191; GCN-NEXT: v_readlane_b32 s30, v5, 0 4192; GCN-NEXT: s_mov_b32 s32, s33 4193; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 4194; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload 4195; GCN-NEXT: s_mov_b64 exec, s[4:5] 4196; GCN-NEXT: s_mov_b32 s33, s18 4197; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4198; GCN-NEXT: s_setpc_b64 s[30:31] 4199; 4200; GFX7-LABEL: test_call_v3bf16: 4201; GFX7: ; %bb.0: ; %entry 4202; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4203; 
GFX7-NEXT: s_mov_b32 s18, s33 4204; GFX7-NEXT: s_mov_b32 s33, s32 4205; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 4206; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill 4207; GFX7-NEXT: s_mov_b64 exec, s[16:17] 4208; GFX7-NEXT: s_addk_i32 s32, 0x400 4209; GFX7-NEXT: s_getpc_b64 s[16:17] 4210; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4211; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4212; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4213; GFX7-NEXT: v_writelane_b32 v4, s30, 0 4214; GFX7-NEXT: v_writelane_b32 v4, s31, 1 4215; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4216; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] 4217; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 4218; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 4219; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 4220; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 4221; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 4222; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 4223; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3 4224; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen 4225; GFX7-NEXT: s_waitcnt vmcnt(0) 4226; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen 4227; GFX7-NEXT: s_waitcnt vmcnt(0) 4228; GFX7-NEXT: v_readlane_b32 s31, v4, 1 4229; GFX7-NEXT: v_readlane_b32 s30, v4, 0 4230; GFX7-NEXT: s_mov_b32 s32, s33 4231; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 4232; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload 4233; GFX7-NEXT: s_mov_b64 exec, s[4:5] 4234; GFX7-NEXT: s_mov_b32 s33, s18 4235; GFX7-NEXT: s_waitcnt vmcnt(0) 4236; GFX7-NEXT: s_setpc_b64 s[30:31] 4237; 4238; GFX8-LABEL: test_call_v3bf16: 4239; GFX8: ; %bb.0: ; %entry 4240; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4241; GFX8-NEXT: s_mov_b32 s18, s33 4242; GFX8-NEXT: s_mov_b32 s33, s32 4243; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 4244; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill 4245; GFX8-NEXT: s_mov_b64 exec, s[16:17] 4246; GFX8-NEXT: s_addk_i32 
s32, 0x400 4247; GFX8-NEXT: s_getpc_b64 s[16:17] 4248; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4249; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4250; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4251; GFX8-NEXT: v_writelane_b32 v4, s30, 0 4252; GFX8-NEXT: v_writelane_b32 v4, s31, 1 4253; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4254; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] 4255; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 4256; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen 4257; GFX8-NEXT: s_waitcnt vmcnt(0) 4258; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4259; GFX8-NEXT: s_waitcnt vmcnt(0) 4260; GFX8-NEXT: v_readlane_b32 s31, v4, 1 4261; GFX8-NEXT: v_readlane_b32 s30, v4, 0 4262; GFX8-NEXT: s_mov_b32 s32, s33 4263; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 4264; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload 4265; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4266; GFX8-NEXT: s_mov_b32 s33, s18 4267; GFX8-NEXT: s_waitcnt vmcnt(0) 4268; GFX8-NEXT: s_setpc_b64 s[30:31] 4269; 4270; GFX9-LABEL: test_call_v3bf16: 4271; GFX9: ; %bb.0: ; %entry 4272; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4273; GFX9-NEXT: s_mov_b32 s18, s33 4274; GFX9-NEXT: s_mov_b32 s33, s32 4275; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 4276; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill 4277; GFX9-NEXT: s_mov_b64 exec, s[16:17] 4278; GFX9-NEXT: s_addk_i32 s32, 0x400 4279; GFX9-NEXT: s_getpc_b64 s[16:17] 4280; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4281; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4282; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4283; GFX9-NEXT: v_writelane_b32 v3, s30, 0 4284; GFX9-NEXT: v_writelane_b32 v3, s31, 1 4285; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4286; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] 4287; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 4288; GFX9-NEXT: s_waitcnt 
vmcnt(0) 4289; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4290; GFX9-NEXT: s_waitcnt vmcnt(0) 4291; GFX9-NEXT: v_readlane_b32 s31, v3, 1 4292; GFX9-NEXT: v_readlane_b32 s30, v3, 0 4293; GFX9-NEXT: s_mov_b32 s32, s33 4294; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 4295; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload 4296; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4297; GFX9-NEXT: s_mov_b32 s33, s18 4298; GFX9-NEXT: s_waitcnt vmcnt(0) 4299; GFX9-NEXT: s_setpc_b64 s[30:31] 4300; 4301; GFX10-LABEL: test_call_v3bf16: 4302; GFX10: ; %bb.0: ; %entry 4303; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4304; GFX10-NEXT: s_mov_b32 s18, s33 4305; GFX10-NEXT: s_mov_b32 s33, s32 4306; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 4307; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill 4308; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4309; GFX10-NEXT: s_mov_b32 exec_lo, s16 4310; GFX10-NEXT: s_addk_i32 s32, 0x200 4311; GFX10-NEXT: s_getpc_b64 s[16:17] 4312; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4313; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4314; GFX10-NEXT: v_writelane_b32 v3, s30, 0 4315; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4316; GFX10-NEXT: v_writelane_b32 v3, s31, 1 4317; GFX10-NEXT: s_waitcnt lgkmcnt(0) 4318; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] 4319; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 4320; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4321; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4322; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4323; GFX10-NEXT: v_readlane_b32 s31, v3, 1 4324; GFX10-NEXT: v_readlane_b32 s30, v3, 0 4325; GFX10-NEXT: s_mov_b32 s32, s33 4326; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 4327; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload 4328; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4329; GFX10-NEXT: s_mov_b32 exec_lo, s4 4330; GFX10-NEXT: s_mov_b32 s33, s18 4331; GFX10-NEXT: 
s_waitcnt vmcnt(0) 4332; GFX10-NEXT: s_setpc_b64 s[30:31] 4333; 4334; GFX11-LABEL: test_call_v3bf16: 4335; GFX11: ; %bb.0: ; %entry 4336; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4337; GFX11-NEXT: s_mov_b32 s2, s33 4338; GFX11-NEXT: s_mov_b32 s33, s32 4339; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 4340; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill 4341; GFX11-NEXT: s_mov_b32 exec_lo, s0 4342; GFX11-NEXT: s_add_i32 s32, s32, 16 4343; GFX11-NEXT: s_getpc_b64 s[0:1] 4344; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 4345; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 4346; GFX11-NEXT: v_writelane_b32 v3, s30, 0 4347; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 4348; GFX11-NEXT: v_writelane_b32 v3, s31, 1 4349; GFX11-NEXT: s_waitcnt lgkmcnt(0) 4350; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] 4351; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc 4352; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4353; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc 4354; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4355; GFX11-NEXT: v_readlane_b32 s31, v3, 1 4356; GFX11-NEXT: v_readlane_b32 s30, v3, 0 4357; GFX11-NEXT: s_mov_b32 s32, s33 4358; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 4359; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload 4360; GFX11-NEXT: s_mov_b32 exec_lo, s0 4361; GFX11-NEXT: s_mov_b32 s33, s2 4362; GFX11-NEXT: s_waitcnt vmcnt(0) 4363; GFX11-NEXT: s_setpc_b64 s[30:31] 4364entry: 4365 %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in) 4366 store volatile <3 x bfloat> %result, ptr addrspace(5) %out 4367 ret void 4368} 4369 4370define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { 4371; GCN-LABEL: test_call_v4bf16: 4372; GCN: ; %bb.0: ; %entry 4373; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4374; GCN-NEXT: s_mov_b32 s18, s33 4375; GCN-NEXT: s_mov_b32 s33, s32 4376; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 4377; GCN-NEXT: buffer_store_dword v8, off, 
s[0:3], s33 ; 4-byte Folded Spill 4378; GCN-NEXT: s_mov_b64 exec, s[16:17] 4379; GCN-NEXT: s_addk_i32 s32, 0x400 4380; GCN-NEXT: s_waitcnt expcnt(0) 4381; GCN-NEXT: v_writelane_b32 v8, s30, 0 4382; GCN-NEXT: v_writelane_b32 v8, s31, 1 4383; GCN-NEXT: s_getpc_b64 s[16:17] 4384; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4385; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4386; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4387; GCN-NEXT: s_waitcnt lgkmcnt(0) 4388; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] 4389; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 4390; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 4391; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 4392; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 4393; GCN-NEXT: v_add_i32_e32 v5, vcc, 6, v4 4394; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4 4395; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v4 4396; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 4397; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 4398; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 4399; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 4400; GCN-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen 4401; GCN-NEXT: s_waitcnt vmcnt(0) 4402; GCN-NEXT: buffer_store_short v2, v6, s[0:3], 0 offen 4403; GCN-NEXT: s_waitcnt vmcnt(0) 4404; GCN-NEXT: buffer_store_short v1, v7, s[0:3], 0 offen 4405; GCN-NEXT: s_waitcnt vmcnt(0) 4406; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen 4407; GCN-NEXT: s_waitcnt vmcnt(0) 4408; GCN-NEXT: v_readlane_b32 s31, v8, 1 4409; GCN-NEXT: v_readlane_b32 s30, v8, 0 4410; GCN-NEXT: s_mov_b32 s32, s33 4411; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 4412; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload 4413; GCN-NEXT: s_mov_b64 exec, s[4:5] 4414; GCN-NEXT: s_mov_b32 s33, s18 4415; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4416; GCN-NEXT: s_setpc_b64 s[30:31] 4417; 4418; GFX7-LABEL: test_call_v4bf16: 4419; GFX7: ; %bb.0: ; %entry 4420; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4421; GFX7-NEXT: s_mov_b32 s18, s33 4422; 
GFX7-NEXT: s_mov_b32 s33, s32 4423; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 4424; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill 4425; GFX7-NEXT: s_mov_b64 exec, s[16:17] 4426; GFX7-NEXT: s_addk_i32 s32, 0x400 4427; GFX7-NEXT: s_getpc_b64 s[16:17] 4428; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4429; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4430; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4431; GFX7-NEXT: v_writelane_b32 v6, s30, 0 4432; GFX7-NEXT: v_writelane_b32 v6, s31, 1 4433; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4434; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] 4435; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 4436; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 4437; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 4438; GFX7-NEXT: v_add_i32_e32 v5, vcc, 6, v4 4439; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 4440; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 4441; GFX7-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen 4442; GFX7-NEXT: s_waitcnt vmcnt(0) 4443; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4 4444; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 4445; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 4446; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen 4447; GFX7-NEXT: s_waitcnt vmcnt(0) 4448; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4 4449; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 4450; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen 4451; GFX7-NEXT: s_waitcnt vmcnt(0) 4452; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen 4453; GFX7-NEXT: s_waitcnt vmcnt(0) 4454; GFX7-NEXT: v_readlane_b32 s31, v6, 1 4455; GFX7-NEXT: v_readlane_b32 s30, v6, 0 4456; GFX7-NEXT: s_mov_b32 s32, s33 4457; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 4458; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload 4459; GFX7-NEXT: s_mov_b64 exec, s[4:5] 4460; GFX7-NEXT: s_mov_b32 s33, s18 4461; GFX7-NEXT: s_waitcnt vmcnt(0) 4462; GFX7-NEXT: s_setpc_b64 s[30:31] 4463; 4464; GFX8-LABEL: test_call_v4bf16: 4465; GFX8: ; %bb.0: 
; %entry 4466; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4467; GFX8-NEXT: s_mov_b32 s18, s33 4468; GFX8-NEXT: s_mov_b32 s33, s32 4469; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 4470; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill 4471; GFX8-NEXT: s_mov_b64 exec, s[16:17] 4472; GFX8-NEXT: s_addk_i32 s32, 0x400 4473; GFX8-NEXT: s_getpc_b64 s[16:17] 4474; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4475; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4476; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4477; GFX8-NEXT: v_writelane_b32 v4, s30, 0 4478; GFX8-NEXT: v_writelane_b32 v4, s31, 1 4479; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4480; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] 4481; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 4482; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen 4483; GFX8-NEXT: s_waitcnt vmcnt(0) 4484; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4485; GFX8-NEXT: s_waitcnt vmcnt(0) 4486; GFX8-NEXT: v_readlane_b32 s31, v4, 1 4487; GFX8-NEXT: v_readlane_b32 s30, v4, 0 4488; GFX8-NEXT: s_mov_b32 s32, s33 4489; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 4490; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload 4491; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4492; GFX8-NEXT: s_mov_b32 s33, s18 4493; GFX8-NEXT: s_waitcnt vmcnt(0) 4494; GFX8-NEXT: s_setpc_b64 s[30:31] 4495; 4496; GFX9-LABEL: test_call_v4bf16: 4497; GFX9: ; %bb.0: ; %entry 4498; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4499; GFX9-NEXT: s_mov_b32 s18, s33 4500; GFX9-NEXT: s_mov_b32 s33, s32 4501; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 4502; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill 4503; GFX9-NEXT: s_mov_b64 exec, s[16:17] 4504; GFX9-NEXT: s_addk_i32 s32, 0x400 4505; GFX9-NEXT: s_getpc_b64 s[16:17] 4506; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4507; GFX9-NEXT: s_addc_u32 s17, s17, 
test_arg_store_v2bf16@gotpcrel32@hi+12 4508; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4509; GFX9-NEXT: v_writelane_b32 v3, s30, 0 4510; GFX9-NEXT: v_writelane_b32 v3, s31, 1 4511; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4512; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] 4513; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 4514; GFX9-NEXT: s_waitcnt vmcnt(0) 4515; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4516; GFX9-NEXT: s_waitcnt vmcnt(0) 4517; GFX9-NEXT: v_readlane_b32 s31, v3, 1 4518; GFX9-NEXT: v_readlane_b32 s30, v3, 0 4519; GFX9-NEXT: s_mov_b32 s32, s33 4520; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 4521; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload 4522; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4523; GFX9-NEXT: s_mov_b32 s33, s18 4524; GFX9-NEXT: s_waitcnt vmcnt(0) 4525; GFX9-NEXT: s_setpc_b64 s[30:31] 4526; 4527; GFX10-LABEL: test_call_v4bf16: 4528; GFX10: ; %bb.0: ; %entry 4529; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4530; GFX10-NEXT: s_mov_b32 s18, s33 4531; GFX10-NEXT: s_mov_b32 s33, s32 4532; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 4533; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill 4534; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4535; GFX10-NEXT: s_mov_b32 exec_lo, s16 4536; GFX10-NEXT: s_addk_i32 s32, 0x200 4537; GFX10-NEXT: s_getpc_b64 s[16:17] 4538; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4539; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4540; GFX10-NEXT: v_writelane_b32 v3, s30, 0 4541; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4542; GFX10-NEXT: v_writelane_b32 v3, s31, 1 4543; GFX10-NEXT: s_waitcnt lgkmcnt(0) 4544; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] 4545; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 4546; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4547; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4548; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4549; GFX10-NEXT: 
v_readlane_b32 s31, v3, 1 4550; GFX10-NEXT: v_readlane_b32 s30, v3, 0 4551; GFX10-NEXT: s_mov_b32 s32, s33 4552; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 4553; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload 4554; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4555; GFX10-NEXT: s_mov_b32 exec_lo, s4 4556; GFX10-NEXT: s_mov_b32 s33, s18 4557; GFX10-NEXT: s_waitcnt vmcnt(0) 4558; GFX10-NEXT: s_setpc_b64 s[30:31] 4559; 4560; GFX11-LABEL: test_call_v4bf16: 4561; GFX11: ; %bb.0: ; %entry 4562; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4563; GFX11-NEXT: s_mov_b32 s2, s33 4564; GFX11-NEXT: s_mov_b32 s33, s32 4565; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 4566; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill 4567; GFX11-NEXT: s_mov_b32 exec_lo, s0 4568; GFX11-NEXT: s_add_i32 s32, s32, 16 4569; GFX11-NEXT: s_getpc_b64 s[0:1] 4570; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 4571; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 4572; GFX11-NEXT: v_writelane_b32 v3, s30, 0 4573; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 4574; GFX11-NEXT: v_writelane_b32 v3, s31, 1 4575; GFX11-NEXT: s_waitcnt lgkmcnt(0) 4576; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] 4577; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off dlc 4578; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4579; GFX11-NEXT: v_readlane_b32 s31, v3, 1 4580; GFX11-NEXT: v_readlane_b32 s30, v3, 0 4581; GFX11-NEXT: s_mov_b32 s32, s33 4582; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 4583; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload 4584; GFX11-NEXT: s_mov_b32 exec_lo, s0 4585; GFX11-NEXT: s_mov_b32 s33, s2 4586; GFX11-NEXT: s_waitcnt vmcnt(0) 4587; GFX11-NEXT: s_setpc_b64 s[30:31] 4588entry: 4589 %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in) 4590 store volatile <4 x bfloat> %result, ptr addrspace(5) %out 4591 ret void 4592} 4593 4594define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { 4595; GCN-LABEL: 
test_call_v8bf16: 4596; GCN: ; %bb.0: ; %entry 4597; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4598; GCN-NEXT: s_mov_b32 s18, s33 4599; GCN-NEXT: s_mov_b32 s33, s32 4600; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 4601; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill 4602; GCN-NEXT: s_mov_b64 exec, s[16:17] 4603; GCN-NEXT: s_addk_i32 s32, 0x400 4604; GCN-NEXT: s_waitcnt expcnt(0) 4605; GCN-NEXT: v_writelane_b32 v16, s30, 0 4606; GCN-NEXT: v_writelane_b32 v16, s31, 1 4607; GCN-NEXT: s_getpc_b64 s[16:17] 4608; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4609; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4610; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4611; GCN-NEXT: s_waitcnt lgkmcnt(0) 4612; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] 4613; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 4614; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 4615; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 4616; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 4617; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 4618; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 4619; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 4620; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 4621; GCN-NEXT: v_add_i32_e32 v9, vcc, 14, v8 4622; GCN-NEXT: v_add_i32_e32 v10, vcc, 12, v8 4623; GCN-NEXT: v_add_i32_e32 v11, vcc, 10, v8 4624; GCN-NEXT: v_add_i32_e32 v12, vcc, 8, v8 4625; GCN-NEXT: v_add_i32_e32 v13, vcc, 6, v8 4626; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v8 4627; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v8 4628; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 4629; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 4630; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 4631; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 4632; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 4633; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 4634; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 4635; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 4636; GCN-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen 4637; GCN-NEXT: s_waitcnt vmcnt(0) 4638; GCN-NEXT: buffer_store_short v6, v10, s[0:3], 0 
offen 4639; GCN-NEXT: s_waitcnt vmcnt(0) 4640; GCN-NEXT: buffer_store_short v5, v11, s[0:3], 0 offen 4641; GCN-NEXT: s_waitcnt vmcnt(0) 4642; GCN-NEXT: buffer_store_short v4, v12, s[0:3], 0 offen 4643; GCN-NEXT: s_waitcnt vmcnt(0) 4644; GCN-NEXT: buffer_store_short v3, v13, s[0:3], 0 offen 4645; GCN-NEXT: s_waitcnt vmcnt(0) 4646; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen 4647; GCN-NEXT: s_waitcnt vmcnt(0) 4648; GCN-NEXT: buffer_store_short v1, v15, s[0:3], 0 offen 4649; GCN-NEXT: s_waitcnt vmcnt(0) 4650; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen 4651; GCN-NEXT: s_waitcnt vmcnt(0) 4652; GCN-NEXT: v_readlane_b32 s31, v16, 1 4653; GCN-NEXT: v_readlane_b32 s30, v16, 0 4654; GCN-NEXT: s_mov_b32 s32, s33 4655; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 4656; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload 4657; GCN-NEXT: s_mov_b64 exec, s[4:5] 4658; GCN-NEXT: s_mov_b32 s33, s18 4659; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4660; GCN-NEXT: s_setpc_b64 s[30:31] 4661; 4662; GFX7-LABEL: test_call_v8bf16: 4663; GFX7: ; %bb.0: ; %entry 4664; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4665; GFX7-NEXT: s_mov_b32 s18, s33 4666; GFX7-NEXT: s_mov_b32 s33, s32 4667; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 4668; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill 4669; GFX7-NEXT: s_mov_b64 exec, s[16:17] 4670; GFX7-NEXT: s_addk_i32 s32, 0x400 4671; GFX7-NEXT: s_getpc_b64 s[16:17] 4672; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4673; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4674; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4675; GFX7-NEXT: v_writelane_b32 v10, s30, 0 4676; GFX7-NEXT: v_writelane_b32 v10, s31, 1 4677; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4678; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] 4679; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 4680; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 4681; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 4682; GFX7-NEXT: 
v_add_i32_e32 v9, vcc, 14, v8 4683; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 4684; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 4685; GFX7-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen 4686; GFX7-NEXT: s_waitcnt vmcnt(0) 4687; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8 4688; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 4689; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 4690; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen 4691; GFX7-NEXT: s_waitcnt vmcnt(0) 4692; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8 4693; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 4694; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 4695; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen 4696; GFX7-NEXT: s_waitcnt vmcnt(0) 4697; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8 4698; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 4699; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 4700; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen 4701; GFX7-NEXT: s_waitcnt vmcnt(0) 4702; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8 4703; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 4704; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 4705; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen 4706; GFX7-NEXT: s_waitcnt vmcnt(0) 4707; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8 4708; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 4709; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 4710; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen 4711; GFX7-NEXT: s_waitcnt vmcnt(0) 4712; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8 4713; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 4714; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen 4715; GFX7-NEXT: s_waitcnt vmcnt(0) 4716; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen 4717; GFX7-NEXT: s_waitcnt vmcnt(0) 4718; GFX7-NEXT: v_readlane_b32 s31, v10, 1 4719; GFX7-NEXT: v_readlane_b32 s30, v10, 0 4720; GFX7-NEXT: s_mov_b32 s32, s33 4721; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 4722; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload 4723; GFX7-NEXT: s_mov_b64 exec, s[4:5] 4724; GFX7-NEXT: s_mov_b32 s33, s18 4725; GFX7-NEXT: 
s_waitcnt vmcnt(0) 4726; GFX7-NEXT: s_setpc_b64 s[30:31] 4727; 4728; GFX8-LABEL: test_call_v8bf16: 4729; GFX8: ; %bb.0: ; %entry 4730; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4731; GFX8-NEXT: s_mov_b32 s18, s33 4732; GFX8-NEXT: s_mov_b32 s33, s32 4733; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 4734; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill 4735; GFX8-NEXT: s_mov_b64 exec, s[16:17] 4736; GFX8-NEXT: s_addk_i32 s32, 0x400 4737; GFX8-NEXT: s_getpc_b64 s[16:17] 4738; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4739; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4740; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4741; GFX8-NEXT: v_writelane_b32 v6, s30, 0 4742; GFX8-NEXT: v_writelane_b32 v6, s31, 1 4743; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4744; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] 4745; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4 4746; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen 4747; GFX8-NEXT: s_waitcnt vmcnt(0) 4748; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4 4749; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen 4750; GFX8-NEXT: s_waitcnt vmcnt(0) 4751; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 4752; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 4753; GFX8-NEXT: s_waitcnt vmcnt(0) 4754; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4755; GFX8-NEXT: s_waitcnt vmcnt(0) 4756; GFX8-NEXT: v_readlane_b32 s31, v6, 1 4757; GFX8-NEXT: v_readlane_b32 s30, v6, 0 4758; GFX8-NEXT: s_mov_b32 s32, s33 4759; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 4760; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload 4761; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4762; GFX8-NEXT: s_mov_b32 s33, s18 4763; GFX8-NEXT: s_waitcnt vmcnt(0) 4764; GFX8-NEXT: s_setpc_b64 s[30:31] 4765; 4766; GFX9-LABEL: test_call_v8bf16: 4767; GFX9: ; %bb.0: ; %entry 4768; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4769; GFX9-NEXT: s_mov_b32 s18, s33 4770; GFX9-NEXT: 
s_mov_b32 s33, s32 4771; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 4772; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill 4773; GFX9-NEXT: s_mov_b64 exec, s[16:17] 4774; GFX9-NEXT: s_addk_i32 s32, 0x400 4775; GFX9-NEXT: s_getpc_b64 s[16:17] 4776; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4777; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4778; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4779; GFX9-NEXT: v_writelane_b32 v5, s30, 0 4780; GFX9-NEXT: v_writelane_b32 v5, s31, 1 4781; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4782; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] 4783; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 4784; GFX9-NEXT: s_waitcnt vmcnt(0) 4785; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 4786; GFX9-NEXT: s_waitcnt vmcnt(0) 4787; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 4788; GFX9-NEXT: s_waitcnt vmcnt(0) 4789; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4790; GFX9-NEXT: s_waitcnt vmcnt(0) 4791; GFX9-NEXT: v_readlane_b32 s31, v5, 1 4792; GFX9-NEXT: v_readlane_b32 s30, v5, 0 4793; GFX9-NEXT: s_mov_b32 s32, s33 4794; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 4795; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload 4796; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4797; GFX9-NEXT: s_mov_b32 s33, s18 4798; GFX9-NEXT: s_waitcnt vmcnt(0) 4799; GFX9-NEXT: s_setpc_b64 s[30:31] 4800; 4801; GFX10-LABEL: test_call_v8bf16: 4802; GFX10: ; %bb.0: ; %entry 4803; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4804; GFX10-NEXT: s_mov_b32 s18, s33 4805; GFX10-NEXT: s_mov_b32 s33, s32 4806; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 4807; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill 4808; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4809; GFX10-NEXT: s_mov_b32 exec_lo, s16 4810; GFX10-NEXT: s_addk_i32 s32, 0x200 4811; GFX10-NEXT: s_getpc_b64 s[16:17] 4812; GFX10-NEXT: s_add_u32 s16, s16, 
test_arg_store_v2bf16@gotpcrel32@lo+4 4813; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4814; GFX10-NEXT: v_writelane_b32 v5, s30, 0 4815; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4816; GFX10-NEXT: v_writelane_b32 v5, s31, 1 4817; GFX10-NEXT: s_waitcnt lgkmcnt(0) 4818; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] 4819; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 4820; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4821; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 4822; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4823; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 4824; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4825; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4826; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4827; GFX10-NEXT: v_readlane_b32 s31, v5, 1 4828; GFX10-NEXT: v_readlane_b32 s30, v5, 0 4829; GFX10-NEXT: s_mov_b32 s32, s33 4830; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 4831; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload 4832; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4833; GFX10-NEXT: s_mov_b32 exec_lo, s4 4834; GFX10-NEXT: s_mov_b32 s33, s18 4835; GFX10-NEXT: s_waitcnt vmcnt(0) 4836; GFX10-NEXT: s_setpc_b64 s[30:31] 4837; 4838; GFX11-LABEL: test_call_v8bf16: 4839; GFX11: ; %bb.0: ; %entry 4840; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4841; GFX11-NEXT: s_mov_b32 s2, s33 4842; GFX11-NEXT: s_mov_b32 s33, s32 4843; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 4844; GFX11-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill 4845; GFX11-NEXT: s_mov_b32 exec_lo, s0 4846; GFX11-NEXT: s_add_i32 s32, s32, 16 4847; GFX11-NEXT: s_getpc_b64 s[0:1] 4848; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 4849; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 4850; GFX11-NEXT: v_writelane_b32 v5, s30, 0 4851; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 4852; GFX11-NEXT: v_writelane_b32 v5, s31, 1 4853; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) 4854; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] 4855; GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc 4856; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4857; GFX11-NEXT: v_readlane_b32 s31, v5, 1 4858; GFX11-NEXT: v_readlane_b32 s30, v5, 0 4859; GFX11-NEXT: s_mov_b32 s32, s33 4860; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 4861; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload 4862; GFX11-NEXT: s_mov_b32 exec_lo, s0 4863; GFX11-NEXT: s_mov_b32 s33, s2 4864; GFX11-NEXT: s_waitcnt vmcnt(0) 4865; GFX11-NEXT: s_setpc_b64 s[30:31] 4866entry: 4867 %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in) 4868 store volatile <8 x bfloat> %result, ptr addrspace(5) %out 4869 ret void 4870} 4871 4872define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { 4873; GCN-LABEL: test_call_v16bf16: 4874; GCN: ; %bb.0: ; %entry 4875; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4876; GCN-NEXT: s_mov_b32 s18, s33 4877; GCN-NEXT: s_mov_b32 s33, s32 4878; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 4879; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 ; 4-byte Folded Spill 4880; GCN-NEXT: s_mov_b64 exec, s[16:17] 4881; GCN-NEXT: s_addk_i32 s32, 0x400 4882; GCN-NEXT: s_waitcnt expcnt(0) 4883; GCN-NEXT: v_writelane_b32 v20, s30, 0 4884; GCN-NEXT: v_writelane_b32 v20, s31, 1 4885; GCN-NEXT: s_getpc_b64 s[16:17] 4886; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4887; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4888; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4889; GCN-NEXT: s_waitcnt lgkmcnt(0) 4890; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] 4891; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 4892; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 4893; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 4894; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 4895; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 4896; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 4897; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 4898; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 4899; 
GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 4900; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 4901; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 4902; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 4903; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 4904; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 4905; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 4906; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 4907; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16 4908; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16 4909; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16 4910; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 4911; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen 4912; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4913; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16 4914; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16 4915; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 4916; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen 4917; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4918; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16 4919; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16 4920; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 4921; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen 4922; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4923; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16 4924; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16 4925; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 4926; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen 4927; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4928; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16 4929; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16 4930; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 4931; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen 4932; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4933; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16 4934; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16 4935; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 4936; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen 4937; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4938; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16 4939; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16 4940; GCN-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 4941; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 4942; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 4943; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 4944; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 4945; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 4946; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 4947; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 4948; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 4949; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 4950; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 offen 4951; GCN-NEXT: s_waitcnt vmcnt(0) 4952; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen 4953; GCN-NEXT: s_waitcnt vmcnt(0) 4954; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen 4955; GCN-NEXT: s_waitcnt vmcnt(0) 4956; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen 4957; GCN-NEXT: s_waitcnt vmcnt(0) 4958; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen 4959; GCN-NEXT: s_waitcnt vmcnt(0) 4960; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen 4961; GCN-NEXT: s_waitcnt vmcnt(0) 4962; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen 4963; GCN-NEXT: s_waitcnt vmcnt(0) 4964; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen 4965; GCN-NEXT: s_waitcnt vmcnt(0) 4966; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen 4967; GCN-NEXT: s_waitcnt vmcnt(0) 4968; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen 4969; GCN-NEXT: s_waitcnt vmcnt(0) 4970; GCN-NEXT: v_readlane_b32 s31, v20, 1 4971; GCN-NEXT: v_readlane_b32 s30, v20, 0 4972; GCN-NEXT: s_mov_b32 s32, s33 4973; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 4974; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload 4975; GCN-NEXT: s_mov_b64 exec, s[4:5] 4976; GCN-NEXT: s_mov_b32 s33, s18 4977; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 4978; GCN-NEXT: s_setpc_b64 s[30:31] 4979; 4980; GFX7-LABEL: test_call_v16bf16: 4981; GFX7: ; %bb.0: ; %entry 4982; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4983; GFX7-NEXT: s_mov_b32 s18, s33 4984; GFX7-NEXT: s_mov_b32 s33, s32 4985; 
GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 4986; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill 4987; GFX7-NEXT: s_mov_b64 exec, s[16:17] 4988; GFX7-NEXT: s_addk_i32 s32, 0x400 4989; GFX7-NEXT: s_getpc_b64 s[16:17] 4990; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 4991; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 4992; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 4993; GFX7-NEXT: v_writelane_b32 v18, s30, 0 4994; GFX7-NEXT: v_writelane_b32 v18, s31, 1 4995; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4996; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] 4997; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 4998; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 4999; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 5000; GFX7-NEXT: v_add_i32_e32 v17, vcc, 30, v16 5001; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 5002; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 5003; GFX7-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen 5004; GFX7-NEXT: s_waitcnt vmcnt(0) 5005; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16 5006; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 5007; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 5008; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen 5009; GFX7-NEXT: s_waitcnt vmcnt(0) 5010; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16 5011; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 5012; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 5013; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen 5014; GFX7-NEXT: s_waitcnt vmcnt(0) 5015; GFX7-NEXT: v_add_i32_e32 v13, vcc, 24, v16 5016; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 5017; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 5018; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen 5019; GFX7-NEXT: s_waitcnt vmcnt(0) 5020; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16 5021; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 5022; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 5023; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen 5024; GFX7-NEXT: s_waitcnt vmcnt(0) 5025; GFX7-NEXT: v_add_i32_e32 v11, 
vcc, 20, v16 5026; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 5027; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 5028; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen 5029; GFX7-NEXT: s_waitcnt vmcnt(0) 5030; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16 5031; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 5032; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 5033; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen 5034; GFX7-NEXT: s_waitcnt vmcnt(0) 5035; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16 5036; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 5037; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 5038; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen 5039; GFX7-NEXT: s_waitcnt vmcnt(0) 5040; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16 5041; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 5042; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 5043; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen 5044; GFX7-NEXT: s_waitcnt vmcnt(0) 5045; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16 5046; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 5047; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 5048; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen 5049; GFX7-NEXT: s_waitcnt vmcnt(0) 5050; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16 5051; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 5052; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 5053; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen 5054; GFX7-NEXT: s_waitcnt vmcnt(0) 5055; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16 5056; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 5057; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 5058; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen 5059; GFX7-NEXT: s_waitcnt vmcnt(0) 5060; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16 5061; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 5062; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 5063; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen 5064; GFX7-NEXT: s_waitcnt vmcnt(0) 5065; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16 5066; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 5067; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 5068; GFX7-NEXT: 
buffer_store_short v2, v3, s[0:3], 0 offen 5069; GFX7-NEXT: s_waitcnt vmcnt(0) 5070; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16 5071; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 5072; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen 5073; GFX7-NEXT: s_waitcnt vmcnt(0) 5074; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen 5075; GFX7-NEXT: s_waitcnt vmcnt(0) 5076; GFX7-NEXT: v_readlane_b32 s31, v18, 1 5077; GFX7-NEXT: v_readlane_b32 s30, v18, 0 5078; GFX7-NEXT: s_mov_b32 s32, s33 5079; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 5080; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload 5081; GFX7-NEXT: s_mov_b64 exec, s[4:5] 5082; GFX7-NEXT: s_mov_b32 s33, s18 5083; GFX7-NEXT: s_waitcnt vmcnt(0) 5084; GFX7-NEXT: s_setpc_b64 s[30:31] 5085; 5086; GFX8-LABEL: test_call_v16bf16: 5087; GFX8: ; %bb.0: ; %entry 5088; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5089; GFX8-NEXT: s_mov_b32 s18, s33 5090; GFX8-NEXT: s_mov_b32 s33, s32 5091; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 5092; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill 5093; GFX8-NEXT: s_mov_b64 exec, s[16:17] 5094; GFX8-NEXT: s_addk_i32 s32, 0x400 5095; GFX8-NEXT: s_getpc_b64 s[16:17] 5096; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 5097; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 5098; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 5099; GFX8-NEXT: v_writelane_b32 v10, s30, 0 5100; GFX8-NEXT: v_writelane_b32 v10, s31, 1 5101; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5102; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] 5103; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8 5104; GFX8-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen 5105; GFX8-NEXT: s_waitcnt vmcnt(0) 5106; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8 5107; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen 5108; GFX8-NEXT: s_waitcnt vmcnt(0) 5109; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v8 5110; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen 5111; 
GFX8-NEXT: s_waitcnt vmcnt(0) 5112; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v8 5113; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen 5114; GFX8-NEXT: s_waitcnt vmcnt(0) 5115; GFX8-NEXT: v_add_u32_e32 v4, vcc, 12, v8 5116; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen 5117; GFX8-NEXT: s_waitcnt vmcnt(0) 5118; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v8 5119; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen 5120; GFX8-NEXT: s_waitcnt vmcnt(0) 5121; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8 5122; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 5123; GFX8-NEXT: s_waitcnt vmcnt(0) 5124; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen 5125; GFX8-NEXT: s_waitcnt vmcnt(0) 5126; GFX8-NEXT: v_readlane_b32 s31, v10, 1 5127; GFX8-NEXT: v_readlane_b32 s30, v10, 0 5128; GFX8-NEXT: s_mov_b32 s32, s33 5129; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 5130; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload 5131; GFX8-NEXT: s_mov_b64 exec, s[4:5] 5132; GFX8-NEXT: s_mov_b32 s33, s18 5133; GFX8-NEXT: s_waitcnt vmcnt(0) 5134; GFX8-NEXT: s_setpc_b64 s[30:31] 5135; 5136; GFX9-LABEL: test_call_v16bf16: 5137; GFX9: ; %bb.0: ; %entry 5138; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5139; GFX9-NEXT: s_mov_b32 s18, s33 5140; GFX9-NEXT: s_mov_b32 s33, s32 5141; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 5142; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill 5143; GFX9-NEXT: s_mov_b64 exec, s[16:17] 5144; GFX9-NEXT: s_addk_i32 s32, 0x400 5145; GFX9-NEXT: s_getpc_b64 s[16:17] 5146; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 5147; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 5148; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 5149; GFX9-NEXT: v_writelane_b32 v9, s30, 0 5150; GFX9-NEXT: v_writelane_b32 v9, s31, 1 5151; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5152; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] 5153; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 
5154; GFX9-NEXT: s_waitcnt vmcnt(0) 5155; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 5156; GFX9-NEXT: s_waitcnt vmcnt(0) 5157; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 5158; GFX9-NEXT: s_waitcnt vmcnt(0) 5159; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16 5160; GFX9-NEXT: s_waitcnt vmcnt(0) 5161; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 5162; GFX9-NEXT: s_waitcnt vmcnt(0) 5163; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 5164; GFX9-NEXT: s_waitcnt vmcnt(0) 5165; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4 5166; GFX9-NEXT: s_waitcnt vmcnt(0) 5167; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen 5168; GFX9-NEXT: s_waitcnt vmcnt(0) 5169; GFX9-NEXT: v_readlane_b32 s31, v9, 1 5170; GFX9-NEXT: v_readlane_b32 s30, v9, 0 5171; GFX9-NEXT: s_mov_b32 s32, s33 5172; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 5173; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload 5174; GFX9-NEXT: s_mov_b64 exec, s[4:5] 5175; GFX9-NEXT: s_mov_b32 s33, s18 5176; GFX9-NEXT: s_waitcnt vmcnt(0) 5177; GFX9-NEXT: s_setpc_b64 s[30:31] 5178; 5179; GFX10-LABEL: test_call_v16bf16: 5180; GFX10: ; %bb.0: ; %entry 5181; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5182; GFX10-NEXT: s_mov_b32 s18, s33 5183; GFX10-NEXT: s_mov_b32 s33, s32 5184; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 5185; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill 5186; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5187; GFX10-NEXT: s_mov_b32 exec_lo, s16 5188; GFX10-NEXT: s_addk_i32 s32, 0x200 5189; GFX10-NEXT: s_getpc_b64 s[16:17] 5190; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 5191; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 5192; GFX10-NEXT: v_writelane_b32 v9, s30, 0 5193; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 5194; GFX10-NEXT: v_writelane_b32 v9, s31, 1 5195; GFX10-NEXT: s_waitcnt lgkmcnt(0) 
5196; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] 5197; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 5198; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5199; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 5200; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5201; GFX10-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 5202; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5203; GFX10-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16 5204; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5205; GFX10-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 5206; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5207; GFX10-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 5208; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5209; GFX10-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4 5210; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5211; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen 5212; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5213; GFX10-NEXT: v_readlane_b32 s31, v9, 1 5214; GFX10-NEXT: v_readlane_b32 s30, v9, 0 5215; GFX10-NEXT: s_mov_b32 s32, s33 5216; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 5217; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload 5218; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5219; GFX10-NEXT: s_mov_b32 exec_lo, s4 5220; GFX10-NEXT: s_mov_b32 s33, s18 5221; GFX10-NEXT: s_waitcnt vmcnt(0) 5222; GFX10-NEXT: s_setpc_b64 s[30:31] 5223; 5224; GFX11-LABEL: test_call_v16bf16: 5225; GFX11: ; %bb.0: ; %entry 5226; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5227; GFX11-NEXT: s_mov_b32 s2, s33 5228; GFX11-NEXT: s_mov_b32 s33, s32 5229; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 5230; GFX11-NEXT: scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill 5231; GFX11-NEXT: s_mov_b32 exec_lo, s0 5232; GFX11-NEXT: s_add_i32 s32, s32, 16 5233; GFX11-NEXT: s_getpc_b64 s[0:1] 5234; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 5235; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 5236; 
GFX11-NEXT: v_writelane_b32 v9, s30, 0 5237; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 5238; GFX11-NEXT: v_writelane_b32 v9, s31, 1 5239; GFX11-NEXT: s_waitcnt lgkmcnt(0) 5240; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] 5241; GFX11-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 dlc 5242; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5243; GFX11-NEXT: scratch_store_b128 v8, v[0:3], off dlc 5244; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5245; GFX11-NEXT: v_readlane_b32 s31, v9, 1 5246; GFX11-NEXT: v_readlane_b32 s30, v9, 0 5247; GFX11-NEXT: s_mov_b32 s32, s33 5248; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 5249; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload 5250; GFX11-NEXT: s_mov_b32 exec_lo, s0 5251; GFX11-NEXT: s_mov_b32 s33, s2 5252; GFX11-NEXT: s_waitcnt vmcnt(0) 5253; GFX11-NEXT: s_setpc_b64 s[30:31] 5254entry: 5255 %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in) 5256 store volatile <16 x bfloat> %result, ptr addrspace(5) %out 5257 ret void 5258} 5259 5260define bfloat @test_alloca_load_store_ret(bfloat %in) { 5261; GCN-LABEL: test_alloca_load_store_ret: 5262; GCN: ; %bb.0: ; %entry 5263; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5264; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 5265; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 5266; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32 5267; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5268; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc 5269; GCN-NEXT: s_waitcnt vmcnt(0) 5270; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5271; GCN-NEXT: s_setpc_b64 s[30:31] 5272; 5273; GFX7-LABEL: test_alloca_load_store_ret: 5274; GFX7: ; %bb.0: ; %entry 5275; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5276; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 5277; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 5278; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32 5279; GFX7-NEXT: s_waitcnt vmcnt(0) 5280; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc 5281; GFX7-NEXT: s_waitcnt vmcnt(0) 5282; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 5283; GFX7-NEXT: s_setpc_b64 s[30:31] 5284; 5285; GFX8-LABEL: test_alloca_load_store_ret: 5286; GFX8: ; %bb.0: ; %entry 5287; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5288; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32 5289; GFX8-NEXT: s_waitcnt vmcnt(0) 5290; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc 5291; GFX8-NEXT: s_waitcnt vmcnt(0) 5292; GFX8-NEXT: s_setpc_b64 s[30:31] 5293; 5294; GFX9-LABEL: test_alloca_load_store_ret: 5295; GFX9: ; %bb.0: ; %entry 5296; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5297; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 5298; GFX9-NEXT: s_waitcnt vmcnt(0) 5299; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc 5300; GFX9-NEXT: s_waitcnt vmcnt(0) 5301; GFX9-NEXT: s_setpc_b64 s[30:31] 5302; 5303; GFX10-LABEL: test_alloca_load_store_ret: 5304; GFX10: ; %bb.0: ; %entry 5305; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5306; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32 5307; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5308; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc dlc 5309; GFX10-NEXT: s_waitcnt vmcnt(0) 5310; GFX10-NEXT: s_setpc_b64 s[30:31] 5311; 5312; GFX11-LABEL: test_alloca_load_store_ret: 5313; GFX11: ; %bb.0: ; %entry 5314; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5315; GFX11-NEXT: scratch_store_b16 off, v0, s32 dlc 5316; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5317; GFX11-NEXT: scratch_load_u16 v0, off, s32 glc dlc 5318; GFX11-NEXT: s_waitcnt vmcnt(0) 5319; GFX11-NEXT: s_setpc_b64 s[30:31] 5320entry: 5321 %in.addr = alloca bfloat, align 2, addrspace(5) 5322 store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 5323 %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2 5324 ret bfloat %loaded 5325} 5326 5327define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { 5328; GCN-LABEL: test_overflow_stack: 5329; GCN: ; %bb.0: 5330; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5331; 
GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen 5332; GCN-NEXT: s_waitcnt expcnt(0) 5333; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 5334; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 5335; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 5336; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 5337; GCN-NEXT: s_waitcnt vmcnt(2) 5338; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen 5339; GCN-NEXT: s_waitcnt expcnt(0) 5340; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 5341; GCN-NEXT: s_waitcnt vmcnt(2) 5342; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen 5343; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 5344; GCN-NEXT: s_waitcnt vmcnt(2) 5345; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen 5346; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 5347; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 5348; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen 5349; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 5350; GCN-NEXT: s_waitcnt expcnt(0) 5351; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0 5352; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen 5353; GCN-NEXT: s_waitcnt expcnt(0) 5354; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0 5355; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x5c, v0 5356; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen 5357; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 5358; GCN-NEXT: s_waitcnt expcnt(0) 5359; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 5360; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen 5361; GCN-NEXT: s_waitcnt expcnt(0) 5362; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 5363; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 5364; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen 5365; GCN-NEXT: s_waitcnt expcnt(0) 5366; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 5367; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 5368; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0 5369; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen 5370; GCN-NEXT: s_waitcnt expcnt(0) 5371; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 5372; 
GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 5373; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen 5374; GCN-NEXT: v_add_i32_e32 v2, vcc, 56, v0 5375; GCN-NEXT: s_waitcnt expcnt(0) 5376; GCN-NEXT: v_add_i32_e32 v24, vcc, 52, v0 5377; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen 5378; GCN-NEXT: s_waitcnt expcnt(0) 5379; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0 5380; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 5381; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen 5382; GCN-NEXT: s_waitcnt expcnt(0) 5383; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 5384; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 5385; GCN-NEXT: buffer_store_dword v21, v30, s[0:3], 0 offen 5386; GCN-NEXT: s_waitcnt expcnt(0) 5387; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0 5388; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 5389; GCN-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen 5390; GCN-NEXT: s_waitcnt expcnt(0) 5391; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v0 5392; GCN-NEXT: v_add_i32_e32 v26, vcc, 20, v0 5393; GCN-NEXT: buffer_store_dword v19, v29, s[0:3], 0 offen 5394; GCN-NEXT: s_waitcnt expcnt(0) 5395; GCN-NEXT: v_add_i32_e32 v19, vcc, 16, v0 5396; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0 5397; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen 5398; GCN-NEXT: s_waitcnt expcnt(0) 5399; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0 5400; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0 5401; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 5402; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 5403; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen 5404; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen 5405; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen 5406; GCN-NEXT: buffer_store_dword v14, v23, s[0:3], 0 offen 5407; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen 5408; GCN-NEXT: buffer_store_dword v12, v22, s[0:3], 0 offen 5409; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen 5410; GCN-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen 5411; GCN-NEXT: buffer_store_dword v9, v30, 
s[0:3], 0 offen 5412; GCN-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen 5413; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen 5414; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen 5415; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen 5416; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen 5417; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen 5418; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen 5419; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5420; GCN-NEXT: s_setpc_b64 s[30:31] 5421; 5422; GFX7-LABEL: test_overflow_stack: 5423; GFX7: ; %bb.0: 5424; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5425; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen 5426; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 5427; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 5428; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 5429; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 5430; GFX7-NEXT: s_waitcnt vmcnt(0) 5431; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen 5432; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 5433; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 5434; GFX7-NEXT: s_waitcnt vmcnt(0) 5435; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen 5436; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 5437; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0 5438; GFX7-NEXT: s_waitcnt vmcnt(0) 5439; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen 5440; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 5441; GFX7-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen 5442; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 5443; GFX7-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen 5444; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 5445; GFX7-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen 5446; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 5447; GFX7-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen 5448; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 5449; GFX7-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen 5450; GFX7-NEXT: v_add_i32_e32 
v2, vcc, 0x5c, v0 5451; GFX7-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen 5452; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 5453; GFX7-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen 5454; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 5455; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen 5456; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 5457; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen 5458; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 5459; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen 5460; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 5461; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen 5462; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 5463; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen 5464; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0 5465; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen 5466; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0 5467; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen 5468; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 5469; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen 5470; GFX7-NEXT: v_add_i32_e32 v2, vcc, 52, v0 5471; GFX7-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen 5472; GFX7-NEXT: v_add_i32_e32 v2, vcc, 48, v0 5473; GFX7-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen 5474; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 5475; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen 5476; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0 5477; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen 5478; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0 5479; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen 5480; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0 5481; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen 5482; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0 5483; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen 5484; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0 5485; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen 5486; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0 5487; GFX7-NEXT: 
buffer_store_dword v7, v2, s[0:3], 0 offen 5488; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 5489; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen 5490; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0 5491; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen 5492; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 5493; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen 5494; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 5495; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 5496; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen 5497; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen 5498; GFX7-NEXT: s_waitcnt vmcnt(0) 5499; GFX7-NEXT: s_setpc_b64 s[30:31] 5500; 5501; GFX8-LABEL: test_overflow_stack: 5502; GFX8: ; %bb.0: 5503; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5504; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen 5505; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 5506; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0 5507; GFX8-NEXT: s_waitcnt vmcnt(0) 5508; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen 5509; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 5510; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x78, v0 5511; GFX8-NEXT: s_waitcnt vmcnt(0) 5512; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen 5513; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 5514; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x74, v0 5515; GFX8-NEXT: s_waitcnt vmcnt(0) 5516; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen 5517; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 5518; GFX8-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen 5519; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 5520; GFX8-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen 5521; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 5522; GFX8-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen 5523; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 5524; GFX8-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen 5525; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 5526; GFX8-NEXT: buffer_store_dword v26, v2, 
s[0:3], 0 offen 5527; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 5528; GFX8-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen 5529; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 5530; GFX8-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen 5531; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 5532; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen 5533; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 5534; GFX8-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen 5535; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 5536; GFX8-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen 5537; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 5538; GFX8-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen 5539; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 5540; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen 5541; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 5542; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen 5543; GFX8-NEXT: v_add_u32_e32 v2, vcc, 60, v0 5544; GFX8-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen 5545; GFX8-NEXT: v_add_u32_e32 v2, vcc, 56, v0 5546; GFX8-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen 5547; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0 5548; GFX8-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen 5549; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v0 5550; GFX8-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen 5551; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0 5552; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen 5553; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0 5554; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen 5555; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0 5556; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen 5557; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 5558; GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen 5559; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0 5560; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen 5561; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0 5562; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen 5563; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, 20, v0 5564; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen 5565; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 5566; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen 5567; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0 5568; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen 5569; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 5570; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen 5571; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 5572; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 5573; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen 5574; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen 5575; GFX8-NEXT: s_waitcnt vmcnt(0) 5576; GFX8-NEXT: s_setpc_b64 s[30:31] 5577; 5578; GFX9-LABEL: test_overflow_stack: 5579; GFX9: ; %bb.0: 5580; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5581; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 5582; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 5583; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 5584; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 5585; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 5586; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 5587; GFX9-NEXT: s_nop 0 5588; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8 5589; GFX9-NEXT: s_nop 0 5590; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 5591; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 5592; GFX9-NEXT: s_nop 0 5593; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 5594; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 5595; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 5596; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 5597; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 5598; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 5599; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 
offen offset:64 5600; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 5601; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 5602; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 5603; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 5604; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 5605; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 5606; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 5607; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 5608; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 5609; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 5610; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 5611; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 5612; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 5613; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 5614; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 5615; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen 5616; GFX9-NEXT: s_waitcnt vmcnt(25) 5617; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124 5618; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 5619; GFX9-NEXT: s_waitcnt vmcnt(25) 5620; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116 5621; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 5622; GFX9-NEXT: s_waitcnt vmcnt(0) 5623; GFX9-NEXT: s_setpc_b64 s[30:31] 5624; 5625; GFX10-LABEL: test_overflow_stack: 5626; GFX10: ; %bb.0: 5627; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5628; GFX10-NEXT: s_clause 0x2 5629; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 5630; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 5631; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 5632; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 5633; 
GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 5634; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 5635; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 5636; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 5637; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 5638; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 5639; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 5640; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 5641; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 5642; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 5643; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 5644; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 5645; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 5646; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 5647; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 5648; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 5649; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 5650; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 5651; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 5652; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 5653; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 5654; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 5655; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 5656; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 5657; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 5658; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 5659; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 5660; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen 5661; 
GFX10-NEXT: s_waitcnt vmcnt(2) 5662; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:124 5663; GFX10-NEXT: s_waitcnt vmcnt(1) 5664; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120 5665; GFX10-NEXT: s_waitcnt vmcnt(0) 5666; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116 5667; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 5668; GFX10-NEXT: s_setpc_b64 s[30:31] 5669; 5670; GFX11-LABEL: test_overflow_stack: 5671; GFX11: ; %bb.0: 5672; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5673; GFX11-NEXT: s_clause 0x2 5674; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 5675; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 5676; GFX11-NEXT: scratch_load_b32 v31, off, s32 5677; GFX11-NEXT: s_clause 0x5 5678; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 5679; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64 5680; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 5681; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 5682; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 5683; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off 5684; GFX11-NEXT: s_waitcnt vmcnt(0) 5685; GFX11-NEXT: s_clause 0x2 5686; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 5687; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 5688; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128 5689; GFX11-NEXT: s_setpc_b64 s[30:31] 5690 %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 5691 %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 5692 ret { <32 x i32>, bfloat } %ins.1 5693} 5694 5695define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) { 5696; GCN-LABEL: global_extload_v2bf16_to_v2f32: 5697; GCN: ; %bb.0: 5698; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5699; GCN-NEXT: s_mov_b32 s6, 0 5700; GCN-NEXT: s_mov_b32 s7, 0xf000 5701; GCN-NEXT: s_mov_b32 s4, s6 5702; GCN-NEXT: s_mov_b32 s5, s6 
5703; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 5704; GCN-NEXT: s_waitcnt vmcnt(0) 5705; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5706; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5707; GCN-NEXT: s_setpc_b64 s[30:31] 5708; 5709; GFX7-LABEL: global_extload_v2bf16_to_v2f32: 5710; GFX7: ; %bb.0: 5711; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5712; GFX7-NEXT: s_mov_b32 s6, 0 5713; GFX7-NEXT: s_mov_b32 s7, 0xf000 5714; GFX7-NEXT: s_mov_b32 s4, s6 5715; GFX7-NEXT: s_mov_b32 s5, s6 5716; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 5717; GFX7-NEXT: s_waitcnt vmcnt(0) 5718; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5719; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5720; GFX7-NEXT: s_setpc_b64 s[30:31] 5721; 5722; GFX8-LABEL: global_extload_v2bf16_to_v2f32: 5723; GFX8: ; %bb.0: 5724; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5725; GFX8-NEXT: flat_load_dword v1, v[0:1] 5726; GFX8-NEXT: s_waitcnt vmcnt(0) 5727; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5728; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5729; GFX8-NEXT: s_setpc_b64 s[30:31] 5730; 5731; GFX9-LABEL: global_extload_v2bf16_to_v2f32: 5732; GFX9: ; %bb.0: 5733; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5734; GFX9-NEXT: global_load_dword v1, v[0:1], off 5735; GFX9-NEXT: s_waitcnt vmcnt(0) 5736; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5737; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5738; GFX9-NEXT: s_setpc_b64 s[30:31] 5739; 5740; GFX10-LABEL: global_extload_v2bf16_to_v2f32: 5741; GFX10: ; %bb.0: 5742; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5743; GFX10-NEXT: global_load_dword v1, v[0:1], off 5744; GFX10-NEXT: s_waitcnt vmcnt(0) 5745; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5746; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5747; GFX10-NEXT: s_setpc_b64 s[30:31] 5748; 5749; GFX11-LABEL: global_extload_v2bf16_to_v2f32: 5750; GFX11: ; %bb.0: 5751; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5752; GFX11-NEXT: global_load_b32 v1, v[0:1], off 5753; 
GFX11-NEXT: s_waitcnt vmcnt(0) 5754; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5755; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5756; GFX11-NEXT: s_setpc_b64 s[30:31] 5757 %load = load <2 x bfloat>, ptr addrspace(1) %ptr 5758 %fpext = fpext <2 x bfloat> %load to <2 x float> 5759 ret <2 x float> %fpext 5760} 5761 5762define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { 5763; GCN-LABEL: global_extload_v3bf16_to_v3f32: 5764; GCN: ; %bb.0: 5765; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5766; GCN-NEXT: s_mov_b32 s6, 0 5767; GCN-NEXT: s_mov_b32 s7, 0xf000 5768; GCN-NEXT: s_mov_b32 s4, s6 5769; GCN-NEXT: s_mov_b32 s5, s6 5770; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 5771; GCN-NEXT: s_waitcnt vmcnt(0) 5772; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5773; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5774; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 5775; GCN-NEXT: s_setpc_b64 s[30:31] 5776; 5777; GFX7-LABEL: global_extload_v3bf16_to_v3f32: 5778; GFX7: ; %bb.0: 5779; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5780; GFX7-NEXT: s_mov_b32 s6, 0 5781; GFX7-NEXT: s_mov_b32 s7, 0xf000 5782; GFX7-NEXT: s_mov_b32 s4, s6 5783; GFX7-NEXT: s_mov_b32 s5, s6 5784; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 5785; GFX7-NEXT: s_waitcnt vmcnt(0) 5786; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5787; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5788; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 5789; GFX7-NEXT: s_setpc_b64 s[30:31] 5790; 5791; GFX8-LABEL: global_extload_v3bf16_to_v3f32: 5792; GFX8: ; %bb.0: 5793; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5794; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1] 5795; GFX8-NEXT: s_waitcnt vmcnt(0) 5796; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5797; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5798; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 5799; GFX8-NEXT: s_setpc_b64 s[30:31] 5800; 5801; GFX9-LABEL: global_extload_v3bf16_to_v3f32: 5802; GFX9: ; %bb.0: 5803; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 5804; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 5805; GFX9-NEXT: s_waitcnt vmcnt(0) 5806; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5807; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5808; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 5809; GFX9-NEXT: s_setpc_b64 s[30:31] 5810; 5811; GFX10-LABEL: global_extload_v3bf16_to_v3f32: 5812; GFX10: ; %bb.0: 5813; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5814; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 5815; GFX10-NEXT: s_waitcnt vmcnt(0) 5816; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5817; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5818; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 5819; GFX10-NEXT: s_setpc_b64 s[30:31] 5820; 5821; GFX11-LABEL: global_extload_v3bf16_to_v3f32: 5822; GFX11: ; %bb.0: 5823; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5824; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off 5825; GFX11-NEXT: s_waitcnt vmcnt(0) 5826; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 5827; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5828; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 5829; GFX11-NEXT: s_setpc_b64 s[30:31] 5830 %load = load <3 x bfloat>, ptr addrspace(1) %ptr 5831 %fpext = fpext <3 x bfloat> %load to <3 x float> 5832 ret <3 x float> %fpext 5833} 5834 5835define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) { 5836; GCN-LABEL: global_extload_v4bf16_to_v4f32: 5837; GCN: ; %bb.0: 5838; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5839; GCN-NEXT: s_mov_b32 s6, 0 5840; GCN-NEXT: s_mov_b32 s7, 0xf000 5841; GCN-NEXT: s_mov_b32 s4, s6 5842; GCN-NEXT: s_mov_b32 s5, s6 5843; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 5844; GCN-NEXT: s_waitcnt vmcnt(0) 5845; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5846; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5847; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5848; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5849; GCN-NEXT: s_setpc_b64 s[30:31] 5850; 5851; GFX7-LABEL: 
global_extload_v4bf16_to_v4f32: 5852; GFX7: ; %bb.0: 5853; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5854; GFX7-NEXT: s_mov_b32 s6, 0 5855; GFX7-NEXT: s_mov_b32 s7, 0xf000 5856; GFX7-NEXT: s_mov_b32 s4, s6 5857; GFX7-NEXT: s_mov_b32 s5, s6 5858; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 5859; GFX7-NEXT: s_waitcnt vmcnt(0) 5860; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5861; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5862; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5863; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5864; GFX7-NEXT: s_setpc_b64 s[30:31] 5865; 5866; GFX8-LABEL: global_extload_v4bf16_to_v4f32: 5867; GFX8: ; %bb.0: 5868; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5869; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 5870; GFX8-NEXT: s_waitcnt vmcnt(0) 5871; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5872; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5873; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5874; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5875; GFX8-NEXT: s_setpc_b64 s[30:31] 5876; 5877; GFX9-LABEL: global_extload_v4bf16_to_v4f32: 5878; GFX9: ; %bb.0: 5879; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5880; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off 5881; GFX9-NEXT: s_waitcnt vmcnt(0) 5882; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5883; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5884; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5885; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5886; GFX9-NEXT: s_setpc_b64 s[30:31] 5887; 5888; GFX10-LABEL: global_extload_v4bf16_to_v4f32: 5889; GFX10: ; %bb.0: 5890; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5891; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off 5892; GFX10-NEXT: s_waitcnt vmcnt(0) 5893; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5894; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5895; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5896; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5897; GFX10-NEXT: s_setpc_b64 s[30:31] 5898; 5899; GFX11-LABEL: 
global_extload_v4bf16_to_v4f32: 5900; GFX11: ; %bb.0: 5901; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5902; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off 5903; GFX11-NEXT: s_waitcnt vmcnt(0) 5904; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5905; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5906; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5907; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5908; GFX11-NEXT: s_setpc_b64 s[30:31] 5909 %load = load <4 x bfloat>, ptr addrspace(1) %ptr 5910 %fpext = fpext <4 x bfloat> %load to <4 x float> 5911 ret <4 x float> %fpext 5912} 5913 5914define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) { 5915; GCN-LABEL: global_extload_v5bf16_to_v5f32: 5916; GCN: ; %bb.0: 5917; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5918; GCN-NEXT: s_mov_b32 s6, 0 5919; GCN-NEXT: s_mov_b32 s7, 0xf000 5920; GCN-NEXT: s_mov_b32 s4, s6 5921; GCN-NEXT: s_mov_b32 s5, s6 5922; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 5923; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 5924; GCN-NEXT: s_waitcnt vmcnt(1) 5925; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5926; GCN-NEXT: s_waitcnt vmcnt(0) 5927; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5928; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5929; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5930; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5931; GCN-NEXT: s_setpc_b64 s[30:31] 5932; 5933; GFX7-LABEL: global_extload_v5bf16_to_v5f32: 5934; GFX7: ; %bb.0: 5935; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5936; GFX7-NEXT: s_mov_b32 s6, 0 5937; GFX7-NEXT: s_mov_b32 s7, 0xf000 5938; GFX7-NEXT: s_mov_b32 s4, s6 5939; GFX7-NEXT: s_mov_b32 s5, s6 5940; GFX7-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 5941; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 5942; GFX7-NEXT: s_waitcnt vmcnt(1) 5943; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5944; GFX7-NEXT: s_waitcnt vmcnt(0) 5945; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5946; 
GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5947; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5948; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5949; GFX7-NEXT: s_setpc_b64 s[30:31] 5950; 5951; GFX8-LABEL: global_extload_v5bf16_to_v5f32: 5952; GFX8: ; %bb.0: 5953; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5954; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] 5955; GFX8-NEXT: s_waitcnt vmcnt(0) 5956; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5957; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5958; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5959; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5960; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5961; GFX8-NEXT: s_setpc_b64 s[30:31] 5962; 5963; GFX9-LABEL: global_extload_v5bf16_to_v5f32: 5964; GFX9: ; %bb.0: 5965; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5966; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off 5967; GFX9-NEXT: s_waitcnt vmcnt(0) 5968; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5969; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5970; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5971; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5972; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5973; GFX9-NEXT: s_setpc_b64 s[30:31] 5974; 5975; GFX10-LABEL: global_extload_v5bf16_to_v5f32: 5976; GFX10: ; %bb.0: 5977; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5978; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off 5979; GFX10-NEXT: s_waitcnt vmcnt(0) 5980; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5981; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 5982; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5983; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5984; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5985; GFX10-NEXT: s_setpc_b64 s[30:31] 5986; 5987; GFX11-LABEL: global_extload_v5bf16_to_v5f32: 5988; GFX11: ; %bb.0: 5989; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5990; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off 5991; GFX11-NEXT: s_waitcnt vmcnt(0) 5992; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 5993; GFX11-NEXT: v_and_b32_e32 
v1, 0xffff0000, v2 5994; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 5995; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5996; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5997; GFX11-NEXT: s_setpc_b64 s[30:31] 5998 %load = load <5 x bfloat>, ptr addrspace(1) %ptr 5999 %fpext = fpext <5 x bfloat> %load to <5 x float> 6000 ret <5 x float> %fpext 6001} 6002 6003define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { 6004; GCN-LABEL: global_extload_v6bf16_to_v6f32: 6005; GCN: ; %bb.0: 6006; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6007; GCN-NEXT: s_mov_b32 s6, 0 6008; GCN-NEXT: s_mov_b32 s7, 0xf000 6009; GCN-NEXT: s_mov_b32 s4, s6 6010; GCN-NEXT: s_mov_b32 s5, s6 6011; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 6012; GCN-NEXT: s_waitcnt vmcnt(0) 6013; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 6014; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 6015; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 6016; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 6017; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 6018; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 6019; GCN-NEXT: s_setpc_b64 s[30:31] 6020; 6021; GFX7-LABEL: global_extload_v6bf16_to_v6f32: 6022; GFX7: ; %bb.0: 6023; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6024; GFX7-NEXT: s_mov_b32 s6, 0 6025; GFX7-NEXT: s_mov_b32 s7, 0xf000 6026; GFX7-NEXT: s_mov_b32 s4, s6 6027; GFX7-NEXT: s_mov_b32 s5, s6 6028; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64 6029; GFX7-NEXT: s_waitcnt vmcnt(0) 6030; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 6031; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 6032; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 6033; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 6034; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 6035; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 6036; GFX7-NEXT: s_setpc_b64 s[30:31] 6037; 6038; GFX8-LABEL: global_extload_v6bf16_to_v6f32: 6039; GFX8: ; %bb.0: 6040; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6041; GFX8-NEXT: flat_load_dwordx3 v[3:5], 
v[0:1] 6042; GFX8-NEXT: s_waitcnt vmcnt(0) 6043; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3 6044; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 6045; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 6046; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 6047; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 6048; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 6049; GFX8-NEXT: s_setpc_b64 s[30:31] 6050; 6051; GFX9-LABEL: global_extload_v6bf16_to_v6f32: 6052; GFX9: ; %bb.0: 6053; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6054; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off 6055; GFX9-NEXT: s_waitcnt vmcnt(0) 6056; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 6057; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 6058; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 6059; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 6060; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 6061; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 6062; GFX9-NEXT: s_setpc_b64 s[30:31] 6063; 6064; GFX10-LABEL: global_extload_v6bf16_to_v6f32: 6065; GFX10: ; %bb.0: 6066; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6067; GFX10-NEXT: global_load_dwordx3 v[3:5], v[0:1], off 6068; GFX10-NEXT: s_waitcnt vmcnt(0) 6069; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v3 6070; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 6071; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 6072; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 6073; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 6074; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 6075; GFX10-NEXT: s_setpc_b64 s[30:31] 6076; 6077; GFX11-LABEL: global_extload_v6bf16_to_v6f32: 6078; GFX11: ; %bb.0: 6079; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6080; GFX11-NEXT: global_load_b96 v[3:5], v[0:1], off 6081; GFX11-NEXT: s_waitcnt vmcnt(0) 6082; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v3 6083; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 6084; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v4 6085; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 6086; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 6087; GFX11-NEXT: v_and_b32_e32 v5, 
0xffff0000, v5 6088; GFX11-NEXT: s_setpc_b64 s[30:31] 6089 %load = load <6 x bfloat>, ptr addrspace(1) %ptr 6090 %fpext = fpext <6 x bfloat> %load to <6 x float> 6091 ret <6 x float> %fpext 6092} 6093 6094define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) { 6095; GCN-LABEL: global_extload_v8bf16_to_v8f32: 6096; GCN: ; %bb.0: 6097; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6098; GCN-NEXT: s_mov_b32 s6, 0 6099; GCN-NEXT: s_mov_b32 s7, 0xf000 6100; GCN-NEXT: s_mov_b32 s4, s6 6101; GCN-NEXT: s_mov_b32 s5, s6 6102; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 6103; GCN-NEXT: s_waitcnt vmcnt(0) 6104; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6105; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6106; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6107; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6108; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6109; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6110; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6111; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6112; GCN-NEXT: s_setpc_b64 s[30:31] 6113; 6114; GFX7-LABEL: global_extload_v8bf16_to_v8f32: 6115; GFX7: ; %bb.0: 6116; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6117; GFX7-NEXT: s_mov_b32 s6, 0 6118; GFX7-NEXT: s_mov_b32 s7, 0xf000 6119; GFX7-NEXT: s_mov_b32 s4, s6 6120; GFX7-NEXT: s_mov_b32 s5, s6 6121; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 6122; GFX7-NEXT: s_waitcnt vmcnt(0) 6123; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6124; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6125; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6126; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6127; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6128; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6129; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6130; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6131; GFX7-NEXT: s_setpc_b64 s[30:31] 6132; 6133; GFX8-LABEL: global_extload_v8bf16_to_v8f32: 6134; GFX8: ; %bb.0: 6135; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6136; 
GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 6137; GFX8-NEXT: s_waitcnt vmcnt(0) 6138; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6139; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6140; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6141; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6142; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6143; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6144; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6145; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6146; GFX8-NEXT: s_setpc_b64 s[30:31] 6147; 6148; GFX9-LABEL: global_extload_v8bf16_to_v8f32: 6149; GFX9: ; %bb.0: 6150; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6151; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 6152; GFX9-NEXT: s_waitcnt vmcnt(0) 6153; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6154; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6155; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6156; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6157; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6158; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6159; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6160; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6161; GFX9-NEXT: s_setpc_b64 s[30:31] 6162; 6163; GFX10-LABEL: global_extload_v8bf16_to_v8f32: 6164; GFX10: ; %bb.0: 6165; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6166; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 6167; GFX10-NEXT: s_waitcnt vmcnt(0) 6168; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6169; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6170; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6171; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6172; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6173; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6174; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6175; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6176; GFX10-NEXT: s_setpc_b64 s[30:31] 6177; 6178; GFX11-LABEL: global_extload_v8bf16_to_v8f32: 6179; GFX11: ; %bb.0: 6180; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6181; GFX11-NEXT: global_load_b128 v[4:7], 
v[0:1], off 6182; GFX11-NEXT: s_waitcnt vmcnt(0) 6183; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6184; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6185; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6186; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6187; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6188; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6189; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6190; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6191; GFX11-NEXT: s_setpc_b64 s[30:31] 6192 %load = load <8 x bfloat>, ptr addrspace(1) %ptr 6193 %fpext = fpext <8 x bfloat> %load to <8 x float> 6194 ret <8 x float> %fpext 6195} 6196 6197define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { 6198; GCN-LABEL: global_extload_v16bf16_to_v16f32: 6199; GCN: ; %bb.0: 6200; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6201; GCN-NEXT: s_mov_b32 s6, 0 6202; GCN-NEXT: s_mov_b32 s7, 0xf000 6203; GCN-NEXT: s_mov_b32 s4, s6 6204; GCN-NEXT: s_mov_b32 s5, s6 6205; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 6206; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 6207; GCN-NEXT: s_waitcnt vmcnt(1) 6208; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6209; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6210; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6211; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6212; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6213; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6214; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6215; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6216; GCN-NEXT: s_waitcnt vmcnt(0) 6217; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6218; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6219; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6220; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6221; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6222; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6223; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6224; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6225; GCN-NEXT: s_setpc_b64 s[30:31] 
6226; 6227; GFX7-LABEL: global_extload_v16bf16_to_v16f32: 6228; GFX7: ; %bb.0: 6229; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6230; GFX7-NEXT: s_mov_b32 s6, 0 6231; GFX7-NEXT: s_mov_b32 s7, 0xf000 6232; GFX7-NEXT: s_mov_b32 s4, s6 6233; GFX7-NEXT: s_mov_b32 s5, s6 6234; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 6235; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 6236; GFX7-NEXT: s_waitcnt vmcnt(1) 6237; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6238; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6239; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6240; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6241; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6242; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6243; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6244; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6245; GFX7-NEXT: s_waitcnt vmcnt(0) 6246; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6247; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6248; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6249; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6250; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6251; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6252; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6253; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6254; GFX7-NEXT: s_setpc_b64 s[30:31] 6255; 6256; GFX8-LABEL: global_extload_v16bf16_to_v16f32: 6257; GFX8: ; %bb.0: 6258; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6259; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 6260; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 6261; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6262; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] 6263; GFX8-NEXT: s_waitcnt vmcnt(1) 6264; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6265; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6266; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6267; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6268; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6269; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6270; GFX8-NEXT: 
v_lshlrev_b32_e32 v6, 16, v7 6271; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6272; GFX8-NEXT: s_waitcnt vmcnt(0) 6273; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6274; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6275; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6276; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6277; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6278; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6279; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6280; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6281; GFX8-NEXT: s_setpc_b64 s[30:31] 6282; 6283; GFX9-LABEL: global_extload_v16bf16_to_v16f32: 6284; GFX9: ; %bb.0: 6285; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6286; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 6287; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 6288; GFX9-NEXT: s_waitcnt vmcnt(1) 6289; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6290; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6291; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6292; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6293; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6294; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6295; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6296; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6297; GFX9-NEXT: s_waitcnt vmcnt(0) 6298; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6299; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6300; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6301; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6302; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6303; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6304; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6305; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6306; GFX9-NEXT: s_setpc_b64 s[30:31] 6307; 6308; GFX10-LABEL: global_extload_v16bf16_to_v16f32: 6309; GFX10: ; %bb.0: 6310; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6311; GFX10-NEXT: s_clause 0x1 6312; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 6313; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off 
offset:16 6314; GFX10-NEXT: s_waitcnt vmcnt(1) 6315; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6316; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6317; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6318; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6319; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6320; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6321; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6322; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6323; GFX10-NEXT: s_waitcnt vmcnt(0) 6324; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6325; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6326; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6327; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6328; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6329; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6330; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6331; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6332; GFX10-NEXT: s_setpc_b64 s[30:31] 6333; 6334; GFX11-LABEL: global_extload_v16bf16_to_v16f32: 6335; GFX11: ; %bb.0: 6336; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6337; GFX11-NEXT: s_clause 0x1 6338; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off 6339; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16 6340; GFX11-NEXT: s_waitcnt vmcnt(1) 6341; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6342; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6343; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6344; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6345; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6346; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6347; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6348; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6349; GFX11-NEXT: s_waitcnt vmcnt(0) 6350; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6351; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6352; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6353; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6354; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6355; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6356; 
GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6357; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6358; GFX11-NEXT: s_setpc_b64 s[30:31] 6359 %load = load <16 x bfloat>, ptr addrspace(1) %ptr 6360 %fpext = fpext <16 x bfloat> %load to <16 x float> 6361 ret <16 x float> %fpext 6362} 6363 6364define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { 6365; GCN-LABEL: global_extload_v32bf16_to_v32f32: 6366; GCN: ; %bb.0: 6367; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6368; GCN-NEXT: s_mov_b32 s6, 0 6369; GCN-NEXT: s_mov_b32 s7, 0xf000 6370; GCN-NEXT: s_mov_b32 s4, s6 6371; GCN-NEXT: s_mov_b32 s5, s6 6372; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 6373; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 6374; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 6375; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 6376; GCN-NEXT: s_waitcnt vmcnt(3) 6377; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6378; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6379; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6380; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6381; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6382; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6383; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6384; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6385; GCN-NEXT: s_waitcnt vmcnt(2) 6386; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6387; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6388; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6389; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6390; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6391; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6392; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6393; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6394; GCN-NEXT: s_waitcnt vmcnt(1) 6395; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20 6396; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 6397; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21 6398; GCN-NEXT: v_and_b32_e32 v19, 
0xffff0000, v21 6399; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 6400; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 6401; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 6402; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 6403; GCN-NEXT: s_waitcnt vmcnt(0) 6404; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28 6405; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 6406; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29 6407; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 6408; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 6409; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 6410; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 6411; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 6412; GCN-NEXT: s_setpc_b64 s[30:31] 6413; 6414; GFX7-LABEL: global_extload_v32bf16_to_v32f32: 6415; GFX7: ; %bb.0: 6416; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6417; GFX7-NEXT: s_mov_b32 s6, 0 6418; GFX7-NEXT: s_mov_b32 s7, 0xf000 6419; GFX7-NEXT: s_mov_b32 s4, s6 6420; GFX7-NEXT: s_mov_b32 s5, s6 6421; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 6422; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 6423; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 6424; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 6425; GFX7-NEXT: s_waitcnt vmcnt(3) 6426; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6427; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6428; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6429; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6430; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6431; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6432; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6433; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6434; GFX7-NEXT: s_waitcnt vmcnt(2) 6435; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6436; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6437; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6438; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6439; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6440; GFX7-NEXT: v_and_b32_e32 
v13, 0xffff0000, v14 6441; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6442; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6443; GFX7-NEXT: s_waitcnt vmcnt(1) 6444; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20 6445; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 6446; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21 6447; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 6448; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22 6449; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 6450; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 6451; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 6452; GFX7-NEXT: s_waitcnt vmcnt(0) 6453; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28 6454; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 6455; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29 6456; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 6457; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30 6458; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 6459; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 6460; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 6461; GFX7-NEXT: s_setpc_b64 s[30:31] 6462; 6463; GFX8-LABEL: global_extload_v32bf16_to_v32f32: 6464; GFX8: ; %bb.0: 6465; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6466; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 6467; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 6468; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 6469; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[2:3] 6470; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 6471; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 6472; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 6473; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6474; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[2:3] 6475; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[0:1] 6476; GFX8-NEXT: s_waitcnt vmcnt(3) 6477; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6478; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6479; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6480; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6481; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6482; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6483; 
GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6484; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6485; GFX8-NEXT: s_waitcnt vmcnt(2) 6486; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6487; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6488; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6489; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6490; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6491; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6492; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6493; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6494; GFX8-NEXT: s_waitcnt vmcnt(1) 6495; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 6496; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 6497; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v21 6498; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 6499; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v22 6500; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 6501; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v23 6502; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 6503; GFX8-NEXT: s_waitcnt vmcnt(0) 6504; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v28 6505; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 6506; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v29 6507; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 6508; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v30 6509; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 6510; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v31 6511; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 6512; GFX8-NEXT: s_setpc_b64 s[30:31] 6513; 6514; GFX9-LABEL: global_extload_v32bf16_to_v32f32: 6515; GFX9: ; %bb.0: 6516; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6517; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 6518; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 6519; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32 6520; GFX9-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 6521; GFX9-NEXT: s_waitcnt vmcnt(3) 6522; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6523; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6524; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 16, v5 6525; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6526; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6527; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6528; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6529; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6530; GFX9-NEXT: s_waitcnt vmcnt(2) 6531; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6532; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6533; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6534; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6535; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6536; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6537; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6538; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6539; GFX9-NEXT: s_waitcnt vmcnt(1) 6540; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v20 6541; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 6542; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v21 6543; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 6544; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 6545; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 6546; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v23 6547; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 6548; GFX9-NEXT: s_waitcnt vmcnt(0) 6549; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v28 6550; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 6551; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v29 6552; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 6553; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v30 6554; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 6555; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v31 6556; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 6557; GFX9-NEXT: s_setpc_b64 s[30:31] 6558; 6559; GFX10-LABEL: global_extload_v32bf16_to_v32f32: 6560; GFX10: ; %bb.0: 6561; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6562; GFX10-NEXT: s_clause 0x3 6563; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off 6564; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:16 6565; GFX10-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32 6566; 
GFX10-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 6567; GFX10-NEXT: s_waitcnt vmcnt(3) 6568; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6569; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6570; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6571; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6572; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6573; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6574; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6575; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6576; GFX10-NEXT: s_waitcnt vmcnt(2) 6577; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6578; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6579; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6580; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6581; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6582; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6583; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6584; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6585; GFX10-NEXT: s_waitcnt vmcnt(1) 6586; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v20 6587; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 6588; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v21 6589; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 6590; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v22 6591; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 6592; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23 6593; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 6594; GFX10-NEXT: s_waitcnt vmcnt(0) 6595; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v28 6596; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 6597; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v29 6598; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 6599; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v30 6600; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 6601; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v31 6602; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 6603; GFX10-NEXT: s_setpc_b64 s[30:31] 6604; 6605; GFX11-LABEL: global_extload_v32bf16_to_v32f32: 6606; GFX11: ; %bb.0: 6607; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 6608; GFX11-NEXT: s_clause 0x3 6609; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off 6610; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16 6611; GFX11-NEXT: global_load_b128 v[20:23], v[0:1], off offset:32 6612; GFX11-NEXT: global_load_b128 v[28:31], v[0:1], off offset:48 6613; GFX11-NEXT: s_waitcnt vmcnt(3) 6614; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6615; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6616; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 6617; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 6618; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 6619; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 6620; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 6621; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 6622; GFX11-NEXT: s_waitcnt vmcnt(2) 6623; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v12 6624; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 6625; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v13 6626; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 6627; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v14 6628; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 6629; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 6630; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 6631; GFX11-NEXT: s_waitcnt vmcnt(1) 6632; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v20 6633; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 6634; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v21 6635; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 6636; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v22 6637; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 6638; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v23 6639; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 6640; GFX11-NEXT: s_waitcnt vmcnt(0) 6641; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v28 6642; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 6643; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v29 6644; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 6645; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v30 6646; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 6647; GFX11-NEXT: v_lshlrev_b32_e32 v30, 
16, v31 6648; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 6649; GFX11-NEXT: s_setpc_b64 s[30:31] 6650 %load = load <32 x bfloat>, ptr addrspace(1) %ptr 6651 %fpext = fpext <32 x bfloat> %load to <32 x float> 6652 ret <32 x float> %fpext 6653} 6654 6655define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) { 6656; GCN-LABEL: global_extload_v2bf16_to_v2f64: 6657; GCN: ; %bb.0: 6658; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6659; GCN-NEXT: s_mov_b32 s6, 0 6660; GCN-NEXT: s_mov_b32 s7, 0xf000 6661; GCN-NEXT: s_mov_b32 s4, s6 6662; GCN-NEXT: s_mov_b32 s5, s6 6663; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 6664; GCN-NEXT: s_waitcnt vmcnt(0) 6665; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v0 6666; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 6667; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 6668; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 6669; GCN-NEXT: s_setpc_b64 s[30:31] 6670; 6671; GFX7-LABEL: global_extload_v2bf16_to_v2f64: 6672; GFX7: ; %bb.0: 6673; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6674; GFX7-NEXT: s_mov_b32 s6, 0 6675; GFX7-NEXT: s_mov_b32 s7, 0xf000 6676; GFX7-NEXT: s_mov_b32 s4, s6 6677; GFX7-NEXT: s_mov_b32 s5, s6 6678; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 6679; GFX7-NEXT: s_waitcnt vmcnt(0) 6680; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 6681; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 6682; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 6683; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 6684; GFX7-NEXT: s_setpc_b64 s[30:31] 6685; 6686; GFX8-LABEL: global_extload_v2bf16_to_v2f64: 6687; GFX8: ; %bb.0: 6688; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6689; GFX8-NEXT: flat_load_dword v2, v[0:1] 6690; GFX8-NEXT: s_waitcnt vmcnt(0) 6691; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 6692; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 6693; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 6694; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 6695; GFX8-NEXT: s_setpc_b64 s[30:31] 6696; 6697; GFX9-LABEL: 
global_extload_v2bf16_to_v2f64: 6698; GFX9: ; %bb.0: 6699; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6700; GFX9-NEXT: global_load_dword v2, v[0:1], off 6701; GFX9-NEXT: s_waitcnt vmcnt(0) 6702; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 6703; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 6704; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 6705; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 6706; GFX9-NEXT: s_setpc_b64 s[30:31] 6707; 6708; GFX10-LABEL: global_extload_v2bf16_to_v2f64: 6709; GFX10: ; %bb.0: 6710; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6711; GFX10-NEXT: global_load_dword v0, v[0:1], off 6712; GFX10-NEXT: s_waitcnt vmcnt(0) 6713; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 6714; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 6715; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 6716; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 6717; GFX10-NEXT: s_setpc_b64 s[30:31] 6718; 6719; GFX11-LABEL: global_extload_v2bf16_to_v2f64: 6720; GFX11: ; %bb.0: 6721; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6722; GFX11-NEXT: global_load_b32 v0, v[0:1], off 6723; GFX11-NEXT: s_waitcnt vmcnt(0) 6724; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 6725; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 6726; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 6727; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 6728; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 6729; GFX11-NEXT: s_setpc_b64 s[30:31] 6730 %load = load <2 x bfloat>, ptr addrspace(1) %ptr 6731 %fpext = fpext <2 x bfloat> %load to <2 x double> 6732 ret <2 x double> %fpext 6733} 6734 6735define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) { 6736; GCN-LABEL: global_extload_v3bf16_to_v3f64: 6737; GCN: ; %bb.0: 6738; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6739; GCN-NEXT: s_mov_b32 s6, 0 6740; GCN-NEXT: s_mov_b32 s7, 0xf000 6741; GCN-NEXT: s_mov_b32 s4, s6 6742; GCN-NEXT: s_mov_b32 s5, s6 6743; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 6744; GCN-NEXT: 
s_waitcnt vmcnt(0) 6745; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 6746; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 6747; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 6748; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 6749; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6750; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6751; GCN-NEXT: s_setpc_b64 s[30:31] 6752; 6753; GFX7-LABEL: global_extload_v3bf16_to_v3f64: 6754; GFX7: ; %bb.0: 6755; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6756; GFX7-NEXT: s_mov_b32 s6, 0 6757; GFX7-NEXT: s_mov_b32 s7, 0xf000 6758; GFX7-NEXT: s_mov_b32 s4, s6 6759; GFX7-NEXT: s_mov_b32 s5, s6 6760; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 6761; GFX7-NEXT: s_waitcnt vmcnt(0) 6762; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 6763; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6764; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 6765; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 6766; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6767; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6768; GFX7-NEXT: s_setpc_b64 s[30:31] 6769; 6770; GFX8-LABEL: global_extload_v3bf16_to_v3f64: 6771; GFX8: ; %bb.0: 6772; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6773; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1] 6774; GFX8-NEXT: s_waitcnt vmcnt(0) 6775; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 6776; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6777; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 6778; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 6779; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6780; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6781; GFX8-NEXT: s_setpc_b64 s[30:31] 6782; 6783; GFX9-LABEL: global_extload_v3bf16_to_v3f64: 6784; GFX9: ; %bb.0: 6785; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6786; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 6787; GFX9-NEXT: s_waitcnt vmcnt(0) 6788; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 6789; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6790; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 6791; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 6792; GFX9-NEXT: 
v_cvt_f64_f32_e32 v[2:3], v3 6793; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6794; GFX9-NEXT: s_setpc_b64 s[30:31] 6795; 6796; GFX10-LABEL: global_extload_v3bf16_to_v3f64: 6797; GFX10: ; %bb.0: 6798; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6799; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 6800; GFX10-NEXT: s_waitcnt vmcnt(0) 6801; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 6802; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 6803; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 6804; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 6805; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6806; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6807; GFX10-NEXT: s_setpc_b64 s[30:31] 6808; 6809; GFX11-LABEL: global_extload_v3bf16_to_v3f64: 6810; GFX11: ; %bb.0: 6811; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6812; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 6813; GFX11-NEXT: s_waitcnt vmcnt(0) 6814; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 6815; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 6816; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 6817; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 6818; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 6819; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6820; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 6821; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6822; GFX11-NEXT: s_setpc_b64 s[30:31] 6823 %load = load <3 x bfloat>, ptr addrspace(1) %ptr 6824 %fpext = fpext <3 x bfloat> %load to <3 x double> 6825 ret <3 x double> %fpext 6826} 6827 6828define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) { 6829; GCN-LABEL: global_extload_v4bf16_to_v4f64: 6830; GCN: ; %bb.0: 6831; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6832; GCN-NEXT: s_mov_b32 s6, 0 6833; GCN-NEXT: s_mov_b32 s7, 0xf000 6834; GCN-NEXT: s_mov_b32 s4, s6 6835; GCN-NEXT: s_mov_b32 s5, s6 6836; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 6837; GCN-NEXT: s_waitcnt vmcnt(0) 6838; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, 
v0 6839; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 6840; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 6841; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 6842; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 6843; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6844; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6845; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 6846; GCN-NEXT: s_setpc_b64 s[30:31] 6847; 6848; GFX7-LABEL: global_extload_v4bf16_to_v4f64: 6849; GFX7: ; %bb.0: 6850; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6851; GFX7-NEXT: s_mov_b32 s6, 0 6852; GFX7-NEXT: s_mov_b32 s7, 0xf000 6853; GFX7-NEXT: s_mov_b32 s4, s6 6854; GFX7-NEXT: s_mov_b32 s5, s6 6855; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 6856; GFX7-NEXT: s_waitcnt vmcnt(0) 6857; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 6858; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 6859; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1 6860; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 6861; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 6862; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6863; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6864; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 6865; GFX7-NEXT: s_setpc_b64 s[30:31] 6866; 6867; GFX8-LABEL: global_extload_v4bf16_to_v4f64: 6868; GFX8: ; %bb.0: 6869; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6870; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 6871; GFX8-NEXT: s_waitcnt vmcnt(0) 6872; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 6873; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 6874; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 6875; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 6876; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 6877; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6878; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6879; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 6880; GFX8-NEXT: s_setpc_b64 s[30:31] 6881; 6882; GFX9-LABEL: global_extload_v4bf16_to_v4f64: 6883; GFX9: ; %bb.0: 6884; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6885; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 6886; GFX9-NEXT: 
s_waitcnt vmcnt(0) 6887; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 6888; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 6889; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 6890; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 6891; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 6892; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6893; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6894; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 6895; GFX9-NEXT: s_setpc_b64 s[30:31] 6896; 6897; GFX10-LABEL: global_extload_v4bf16_to_v4f64: 6898; GFX10: ; %bb.0: 6899; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6900; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 6901; GFX10-NEXT: s_waitcnt vmcnt(0) 6902; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 6903; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6904; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 6905; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 6906; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 6907; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6908; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6909; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 6910; GFX10-NEXT: s_setpc_b64 s[30:31] 6911; 6912; GFX11-LABEL: global_extload_v4bf16_to_v4f64: 6913; GFX11: ; %bb.0: 6914; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6915; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off 6916; GFX11-NEXT: s_waitcnt vmcnt(0) 6917; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 6918; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6919; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 6920; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 6921; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 6922; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 6923; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 6924; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 6925; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 6926; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 6927; GFX11-NEXT: s_setpc_b64 s[30:31] 6928 %load = load <4 x bfloat>, ptr addrspace(1) %ptr 6929 %fpext = fpext <4 x 
bfloat> %load to <4 x double> 6930 ret <4 x double> %fpext 6931} 6932 6933define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) { 6934; GCN-LABEL: global_extload_v5bf16_to_v5f64: 6935; GCN: ; %bb.0: 6936; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6937; GCN-NEXT: s_mov_b32 s6, 0 6938; GCN-NEXT: s_mov_b32 s7, 0xf000 6939; GCN-NEXT: s_mov_b32 s4, s6 6940; GCN-NEXT: s_mov_b32 s5, s6 6941; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 6942; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 6943; GCN-NEXT: s_waitcnt vmcnt(1) 6944; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 6945; GCN-NEXT: s_waitcnt vmcnt(0) 6946; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 6947; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 6948; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 6949; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 6950; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 6951; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 6952; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 6953; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 6954; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 6955; GCN-NEXT: s_setpc_b64 s[30:31] 6956; 6957; GFX7-LABEL: global_extload_v5bf16_to_v5f64: 6958; GFX7: ; %bb.0: 6959; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6960; GFX7-NEXT: s_mov_b32 s6, 0 6961; GFX7-NEXT: s_mov_b32 s7, 0xf000 6962; GFX7-NEXT: s_mov_b32 s4, s6 6963; GFX7-NEXT: s_mov_b32 s5, s6 6964; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 6965; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 6966; GFX7-NEXT: s_waitcnt vmcnt(1) 6967; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 6968; GFX7-NEXT: s_waitcnt vmcnt(0) 6969; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 6970; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 6971; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 6972; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 6973; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 6974; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 6975; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 6976; GFX7-NEXT: 
v_cvt_f64_f32_e32 v[4:5], v5 6977; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 6978; GFX7-NEXT: s_setpc_b64 s[30:31] 6979; 6980; GFX8-LABEL: global_extload_v5bf16_to_v5f64: 6981; GFX8: ; %bb.0: 6982; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6983; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 6984; GFX8-NEXT: s_waitcnt vmcnt(0) 6985; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 6986; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 6987; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 6988; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 6989; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2 6990; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 6991; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 6992; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 6993; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 6994; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 6995; GFX8-NEXT: s_setpc_b64 s[30:31] 6996; 6997; GFX9-LABEL: global_extload_v5bf16_to_v5f64: 6998; GFX9: ; %bb.0: 6999; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7000; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 7001; GFX9-NEXT: s_waitcnt vmcnt(0) 7002; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 7003; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 7004; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 7005; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 7006; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 7007; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 7008; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 7009; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 7010; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7011; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7012; GFX9-NEXT: s_setpc_b64 s[30:31] 7013; 7014; GFX10-LABEL: global_extload_v5bf16_to_v5f64: 7015; GFX10: ; %bb.0: 7016; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7017; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off 7018; GFX10-NEXT: s_waitcnt vmcnt(0) 7019; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 7020; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 7021; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 7022; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 
7023; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 7024; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7025; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7026; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 7027; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7028; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7029; GFX10-NEXT: s_setpc_b64 s[30:31] 7030; 7031; GFX11-LABEL: global_extload_v5bf16_to_v5f64: 7032; GFX11: ; %bb.0: 7033; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7034; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off 7035; GFX11-NEXT: s_waitcnt vmcnt(0) 7036; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 7037; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 7038; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v3 7039; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 7040; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v4 7041; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7042; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7043; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 7044; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7045; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7046; GFX11-NEXT: s_setpc_b64 s[30:31] 7047 %load = load <5 x bfloat>, ptr addrspace(1) %ptr 7048 %fpext = fpext <5 x bfloat> %load to <5 x double> 7049 ret <5 x double> %fpext 7050} 7051 7052define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) { 7053; GCN-LABEL: global_extload_v6bf16_to_v6f64: 7054; GCN: ; %bb.0: 7055; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7056; GCN-NEXT: s_mov_b32 s6, 0 7057; GCN-NEXT: s_mov_b32 s7, 0xf000 7058; GCN-NEXT: s_mov_b32 s4, s6 7059; GCN-NEXT: s_mov_b32 s5, s6 7060; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 7061; GCN-NEXT: s_waitcnt vmcnt(0) 7062; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 7063; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 7064; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 7065; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 7066; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2 7067; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 7068; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 7069; 
GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 7070; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 7071; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7072; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7073; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 7074; GCN-NEXT: s_setpc_b64 s[30:31] 7075; 7076; GFX7-LABEL: global_extload_v6bf16_to_v6f64: 7077; GFX7: ; %bb.0: 7078; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7079; GFX7-NEXT: s_mov_b32 s6, 0 7080; GFX7-NEXT: s_mov_b32 s7, 0xf000 7081; GFX7-NEXT: s_mov_b32 s4, s6 7082; GFX7-NEXT: s_mov_b32 s5, s6 7083; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 7084; GFX7-NEXT: s_waitcnt vmcnt(0) 7085; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 7086; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 7087; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 7088; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 7089; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2 7090; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 7091; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 7092; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 7093; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 7094; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7095; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7096; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 7097; GFX7-NEXT: s_setpc_b64 s[30:31] 7098; 7099; GFX8-LABEL: global_extload_v6bf16_to_v6f64: 7100; GFX8: ; %bb.0: 7101; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7102; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1] 7103; GFX8-NEXT: s_waitcnt vmcnt(0) 7104; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 7105; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 7106; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 7107; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 7108; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2 7109; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 7110; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 7111; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 7112; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 7113; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7114; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7115; GFX8-NEXT: 
v_cvt_f64_f32_e32 v[10:11], v10 7116; GFX8-NEXT: s_setpc_b64 s[30:31] 7117; 7118; GFX9-LABEL: global_extload_v6bf16_to_v6f64: 7119; GFX9: ; %bb.0: 7120; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7121; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off 7122; GFX9-NEXT: s_waitcnt vmcnt(0) 7123; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 7124; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 7125; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 7126; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 7127; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 7128; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 7129; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 7130; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 7131; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 7132; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7133; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7134; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 7135; GFX9-NEXT: s_setpc_b64 s[30:31] 7136; 7137; GFX10-LABEL: global_extload_v6bf16_to_v6f64: 7138; GFX10: ; %bb.0: 7139; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7140; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off 7141; GFX10-NEXT: s_waitcnt vmcnt(0) 7142; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 7143; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 7144; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 7145; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 7146; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v6 7147; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 7148; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7149; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7150; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 7151; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 7152; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7153; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 7154; GFX10-NEXT: s_setpc_b64 s[30:31] 7155; 7156; GFX11-LABEL: global_extload_v6bf16_to_v6f64: 7157; GFX11: ; %bb.0: 7158; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7159; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off 7160; GFX11-NEXT: s_waitcnt vmcnt(0) 7161; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 16, v4 7162; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 7163; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 7164; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 7165; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v6 7166; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 7167; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7168; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7169; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 7170; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 7171; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7172; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 7173; GFX11-NEXT: s_setpc_b64 s[30:31] 7174 %load = load <6 x bfloat>, ptr addrspace(1) %ptr 7175 %fpext = fpext <6 x bfloat> %load to <6 x double> 7176 ret <6 x double> %fpext 7177} 7178 7179define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) { 7180; GCN-LABEL: global_extload_v8bf16_to_v8f64: 7181; GCN: ; %bb.0: 7182; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7183; GCN-NEXT: s_mov_b32 s6, 0 7184; GCN-NEXT: s_mov_b32 s7, 0xf000 7185; GCN-NEXT: s_mov_b32 s4, s6 7186; GCN-NEXT: s_mov_b32 s5, s6 7187; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 7188; GCN-NEXT: s_waitcnt vmcnt(0) 7189; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0 7190; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 7191; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 7192; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 7193; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2 7194; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 7195; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 7196; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 7197; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 7198; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 7199; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 7200; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 7201; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7202; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 7203; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 7204; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 7205; GCN-NEXT: s_setpc_b64 s[30:31] 7206; 7207; 
GFX7-LABEL: global_extload_v8bf16_to_v8f64: 7208; GFX7: ; %bb.0: 7209; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7210; GFX7-NEXT: s_mov_b32 s6, 0 7211; GFX7-NEXT: s_mov_b32 s7, 0xf000 7212; GFX7-NEXT: s_mov_b32 s4, s6 7213; GFX7-NEXT: s_mov_b32 s5, s6 7214; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 7215; GFX7-NEXT: s_waitcnt vmcnt(0) 7216; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 7217; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 7218; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 7219; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 7220; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2 7221; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 7222; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v3 7223; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 7224; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 7225; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 7226; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 7227; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 7228; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7229; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 7230; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 7231; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 7232; GFX7-NEXT: s_setpc_b64 s[30:31] 7233; 7234; GFX8-LABEL: global_extload_v8bf16_to_v8f64: 7235; GFX8: ; %bb.0: 7236; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7237; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 7238; GFX8-NEXT: s_waitcnt vmcnt(0) 7239; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 7240; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 7241; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 7242; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 7243; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2 7244; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 7245; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3 7246; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 7247; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 7248; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 7249; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 7250; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 7251; GFX8-NEXT: v_cvt_f64_f32_e32 
v[8:9], v8 7252; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 7253; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 7254; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 7255; GFX8-NEXT: s_setpc_b64 s[30:31] 7256; 7257; GFX9-LABEL: global_extload_v8bf16_to_v8f64: 7258; GFX9: ; %bb.0: 7259; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7260; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 7261; GFX9-NEXT: s_waitcnt vmcnt(0) 7262; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 7263; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 7264; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v1 7265; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 7266; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 7267; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 7268; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v3 7269; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 7270; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 7271; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 7272; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 7273; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 7274; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7275; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 7276; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 7277; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 7278; GFX9-NEXT: s_setpc_b64 s[30:31] 7279; 7280; GFX10-LABEL: global_extload_v8bf16_to_v8f64: 7281; GFX10: ; %bb.0: 7282; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7283; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off 7284; GFX10-NEXT: s_waitcnt vmcnt(0) 7285; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7 7286; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 7287; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v8 7288; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 7289; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v9 7290; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v9 7291; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v10 7292; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 7293; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7294; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7295; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 
7296; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7297; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7298; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 7299; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 7300; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 7301; GFX10-NEXT: s_setpc_b64 s[30:31] 7302; 7303; GFX11-LABEL: global_extload_v8bf16_to_v8f64: 7304; GFX11: ; %bb.0: 7305; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7306; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off 7307; GFX11-NEXT: s_waitcnt vmcnt(0) 7308; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7 7309; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 7310; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8 7311; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 7312; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9 7313; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9 7314; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10 7315; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 7316; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7317; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7318; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 7319; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7320; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7321; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 7322; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 7323; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 7324; GFX11-NEXT: s_setpc_b64 s[30:31] 7325 %load = load <8 x bfloat>, ptr addrspace(1) %ptr 7326 %fpext = fpext <8 x bfloat> %load to <8 x double> 7327 ret <8 x double> %fpext 7328} 7329 7330define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { 7331; GCN-LABEL: global_extload_v16bf16_to_v16f64: 7332; GCN: ; %bb.0: 7333; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7334; GCN-NEXT: s_mov_b32 s6, 0 7335; GCN-NEXT: s_mov_b32 s7, 0xf000 7336; GCN-NEXT: s_mov_b32 s4, s6 7337; GCN-NEXT: s_mov_b32 s5, s6 7338; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 7339; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 7340; GCN-NEXT: 
s_waitcnt vmcnt(1) 7341; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 7342; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 7343; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3 7344; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 7345; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4 7346; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 7347; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5 7348; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 7349; GCN-NEXT: s_waitcnt vmcnt(0) 7350; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6 7351; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 7352; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7 7353; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 7354; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v8 7355; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 7356; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v9 7357; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 7358; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7359; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7360; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 7361; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 7362; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 7363; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 7364; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 7365; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 7366; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 7367; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 7368; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 7369; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 7370; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 7371; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 7372; GCN-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 7373; GCN-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 7374; GCN-NEXT: s_setpc_b64 s[30:31] 7375; 7376; GFX7-LABEL: global_extload_v16bf16_to_v16f64: 7377; GFX7: ; %bb.0: 7378; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7379; GFX7-NEXT: s_mov_b32 s6, 0 7380; GFX7-NEXT: s_mov_b32 s7, 0xf000 7381; GFX7-NEXT: s_mov_b32 s4, s6 7382; GFX7-NEXT: s_mov_b32 s5, s6 7383; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 7384; GFX7-NEXT: buffer_load_dwordx4 
v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 7385; GFX7-NEXT: s_waitcnt vmcnt(1) 7386; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 7387; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 7388; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v3 7389; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 7390; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v4 7391; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 7392; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v5 7393; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 7394; GFX7-NEXT: s_waitcnt vmcnt(0) 7395; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v6 7396; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 7397; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v7 7398; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 7399; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v8 7400; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 7401; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v9 7402; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 7403; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7404; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7405; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 7406; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 7407; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 7408; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 7409; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 7410; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 7411; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 7412; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 7413; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 7414; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 7415; GFX7-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 7416; GFX7-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 7417; GFX7-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 7418; GFX7-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 7419; GFX7-NEXT: s_setpc_b64 s[30:31] 7420; 7421; GFX8-LABEL: global_extload_v16bf16_to_v16f64: 7422; GFX8: ; %bb.0: 7423; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7424; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] 7425; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 7426; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7427; 
GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] 7428; GFX8-NEXT: s_waitcnt vmcnt(1) 7429; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 7430; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 7431; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v3 7432; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 7433; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v4 7434; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 7435; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5 7436; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 7437; GFX8-NEXT: s_waitcnt vmcnt(0) 7438; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v6 7439; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 7440; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v7 7441; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 7442; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v8 7443; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 7444; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v9 7445; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 7446; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7447; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7448; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 7449; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 7450; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 7451; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 7452; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 7453; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 7454; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 7455; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 7456; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 7457; GFX8-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 7458; GFX8-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 7459; GFX8-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 7460; GFX8-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 7461; GFX8-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 7462; GFX8-NEXT: s_setpc_b64 s[30:31] 7463; 7464; GFX9-LABEL: global_extload_v16bf16_to_v16f64: 7465; GFX9: ; %bb.0: 7466; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7467; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off 7468; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 7469; GFX9-NEXT: s_waitcnt 
vmcnt(1) 7470; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 7471; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 7472; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v3 7473; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 7474; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4 7475; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 7476; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v5 7477; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 7478; GFX9-NEXT: s_waitcnt vmcnt(0) 7479; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6 7480; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 7481; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v7 7482; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 7483; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8 7484; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 7485; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v9 7486; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 7487; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7488; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7489; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 7490; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 7491; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 7492; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 7493; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 7494; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 7495; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 7496; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 7497; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 7498; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 7499; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 7500; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 7501; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 7502; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 7503; GFX9-NEXT: s_setpc_b64 s[30:31] 7504; 7505; GFX10-LABEL: global_extload_v16bf16_to_v16f64: 7506; GFX10: ; %bb.0: 7507; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7508; GFX10-NEXT: s_clause 0x1 7509; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off 7510; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off offset:16 7511; GFX10-NEXT: s_waitcnt vmcnt(1) 7512; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 16, v2 7513; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 7514; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 7515; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 7516; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v4 7517; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 7518; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5 7519; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 7520; GFX10-NEXT: s_waitcnt vmcnt(0) 7521; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v9 7522; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v9 7523; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v10 7524; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v10 7525; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v11 7526; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v11 7527; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v12 7528; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v12 7529; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7530; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7531; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 7532; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 7533; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7534; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 7535; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 7536; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 7537; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 7538; GFX10-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 7539; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 7540; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 7541; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 7542; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 7543; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 7544; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 7545; GFX10-NEXT: s_setpc_b64 s[30:31] 7546; 7547; GFX11-LABEL: global_extload_v16bf16_to_v16f64: 7548; GFX11: ; %bb.0: 7549; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7550; GFX11-NEXT: s_clause 0x1 7551; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off 7552; GFX11-NEXT: global_load_b128 v[23:26], v[0:1], off offset:16 7553; GFX11-NEXT: s_waitcnt vmcnt(1) 7554; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 16, v7 7555; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 7556; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8 7557; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 7558; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9 7559; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9 7560; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10 7561; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 7562; GFX11-NEXT: s_waitcnt vmcnt(0) 7563; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v23 7564; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 7565; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v24 7566; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v24 7567; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 7568; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v25 7569; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v26 7570; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v26 7571; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 7572; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 7573; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 7574; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 7575; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 7576; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 7577; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 7578; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 7579; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 7580; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 7581; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 7582; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 7583; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 7584; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v27 7585; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 7586; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 7587; GFX11-NEXT: s_setpc_b64 s[30:31] 7588 %load = load <16 x bfloat>, ptr addrspace(1) %ptr 7589 %fpext = fpext <16 x bfloat> %load to <16 x double> 7590 ret <16 x double> %fpext 7591} 7592 7593define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { 7594; GCN-LABEL: global_extload_v32bf16_to_v32f64: 7595; GCN: ; %bb.0: 7596; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 7597; GCN-NEXT: s_mov_b32 s6, 0 7598; GCN-NEXT: s_mov_b32 s7, 0xf000 7599; GCN-NEXT: s_mov_b32 s4, s6 7600; GCN-NEXT: s_mov_b32 s5, s6 7601; GCN-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 7602; GCN-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2 7603; GCN-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4 7604; GCN-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6 7605; GCN-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8 7606; GCN-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10 7607; GCN-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12 7608; GCN-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14 7609; GCN-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16 7610; GCN-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:18 7611; GCN-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:20 7612; GCN-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:22 7613; GCN-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:24 7614; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26 7615; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28 7616; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30 7617; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48 7618; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50 7619; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52 7620; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54 7621; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56 7622; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58 7623; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60 7624; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62 7625; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 
offset:32 7626; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34 7627; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36 7628; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38 7629; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 7630; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42 7631; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 7632; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 7633; GCN-NEXT: s_waitcnt vmcnt(8) 7634; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 7635; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0 7636; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7637; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen 7638; GCN-NEXT: s_waitcnt expcnt(0) 7639; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 7640; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7641; GCN-NEXT: s_waitcnt expcnt(0) 7642; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 7643; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xf4, v0 7644; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7645; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen 7646; GCN-NEXT: s_waitcnt expcnt(0) 7647; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 7648; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7649; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xec, v0 7650; GCN-NEXT: s_waitcnt expcnt(0) 7651; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 7652; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7653; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen 7654; GCN-NEXT: s_waitcnt expcnt(0) 7655; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 7656; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7657; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xe4, v0 7658; GCN-NEXT: s_waitcnt expcnt(0) 7659; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 7660; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7661; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen 7662; GCN-NEXT: s_waitcnt expcnt(0) 7663; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 7664; 
GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7665; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0 7666; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0 7667; GCN-NEXT: s_waitcnt expcnt(0) 7668; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 7669; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7670; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen 7671; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0 7672; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen 7673; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0 7674; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0 7675; GCN-NEXT: s_waitcnt expcnt(0) 7676; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 7677; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7678; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen 7679; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0 7680; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen 7681; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0 7682; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0 7683; GCN-NEXT: s_waitcnt expcnt(0) 7684; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 7685; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7686; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen 7687; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0 7688; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen 7689; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0 7690; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0 7691; GCN-NEXT: s_waitcnt expcnt(0) 7692; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 7693; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7694; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen 7695; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0 7696; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen 7697; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0 7698; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0 7699; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) 7700; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 7701; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7702; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen 7703; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0 7704; GCN-NEXT: 
buffer_store_dword v1, v25, s[0:3], 0 offen 7705; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0 7706; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0 7707; GCN-NEXT: s_waitcnt expcnt(0) 7708; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 7709; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7710; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen 7711; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0 7712; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen 7713; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0 7714; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0 7715; GCN-NEXT: s_waitcnt expcnt(0) 7716; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 7717; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7718; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen 7719; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0 7720; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen 7721; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0 7722; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0 7723; GCN-NEXT: s_waitcnt expcnt(0) 7724; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 7725; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7726; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen 7727; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0 7728; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen 7729; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0 7730; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 7731; GCN-NEXT: s_waitcnt expcnt(0) 7732; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 7733; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7734; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen 7735; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0 7736; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen 7737; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0 7738; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0 7739; GCN-NEXT: s_waitcnt expcnt(0) 7740; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 7741; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7742; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen 7743; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0 7744; GCN-NEXT: buffer_store_dword v1, v30, 
s[0:3], 0 offen 7745; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0 7746; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0 7747; GCN-NEXT: s_waitcnt expcnt(0) 7748; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 7749; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7750; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen 7751; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0 7752; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen 7753; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0 7754; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 7755; GCN-NEXT: s_waitcnt expcnt(0) 7756; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 7757; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7758; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen 7759; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 7760; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen 7761; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0 7762; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 7763; GCN-NEXT: s_waitcnt expcnt(0) 7764; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 7765; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7766; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen 7767; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0 7768; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen 7769; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 7770; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 7771; GCN-NEXT: s_waitcnt expcnt(0) 7772; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 7773; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7774; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen 7775; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 7776; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen 7777; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 7778; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0 7779; GCN-NEXT: s_waitcnt expcnt(0) 7780; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 7781; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7782; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen 7783; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0 7784; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen 7785; GCN-NEXT: 
v_add_i32_e32 v21, vcc, 40, v0 7786; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0 7787; GCN-NEXT: s_waitcnt expcnt(0) 7788; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 7789; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7790; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen 7791; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 7792; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen 7793; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 7794; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 7795; GCN-NEXT: s_waitcnt expcnt(0) 7796; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 7797; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7798; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen 7799; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0 7800; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen 7801; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0 7802; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 7803; GCN-NEXT: s_waitcnt expcnt(0) 7804; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 7805; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7806; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen 7807; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0 7808; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen 7809; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0 7810; GCN-NEXT: s_waitcnt expcnt(0) 7811; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 7812; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 7813; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 7814; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 7815; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 7816; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v4 7817; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5 7818; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v6 7819; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 7820; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8 7821; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7822; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 7823; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen 7824; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 7825; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen 7826; GCN-NEXT: s_waitcnt expcnt(0) 7827; GCN-NEXT: 
v_cvt_f64_f32_e32 v[1:2], v9 7828; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12 7829; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36 7830; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen 7831; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13 7832; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen 7833; GCN-NEXT: s_waitcnt expcnt(0) 7834; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14 7835; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15 7836; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 7837; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen 7838; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen 7839; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen 7840; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen 7841; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen 7842; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen 7843; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen 7844; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen 7845; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen 7846; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen 7847; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen 7848; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen 7849; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen 7850; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen 7851; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen 7852; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen 7853; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 7854; GCN-NEXT: s_setpc_b64 s[30:31] 7855; 7856; GFX7-LABEL: global_extload_v32bf16_to_v32f64: 7857; GFX7: ; %bb.0: 7858; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7859; GFX7-NEXT: s_mov_b32 s6, 0 7860; GFX7-NEXT: s_mov_b32 s7, 0xf000 7861; GFX7-NEXT: s_mov_b32 s4, s6 7862; GFX7-NEXT: s_mov_b32 s5, s6 7863; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62 7864; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60 7865; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 
0 addr64 offset:58 7866; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56 7867; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54 7868; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52 7869; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50 7870; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48 7871; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32 7872; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34 7873; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36 7874; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38 7875; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40 7876; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42 7877; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44 7878; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46 7879; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 7880; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2 7881; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4 7882; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6 7883; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8 7884; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10 7885; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12 7886; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14 7887; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16 7888; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18 7889; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20 7890; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22 7891; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24 7892; GFX7-NEXT: 
buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26 7893; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28 7894; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30 7895; GFX7-NEXT: s_waitcnt vmcnt(14) 7896; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17 7897; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7898; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0 7899; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen 7900; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 7901; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7902; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18 7903; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7904; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0 7905; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0 7906; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen 7907; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 7908; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7909; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19 7910; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7911; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0 7912; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0 7913; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen 7914; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 7915; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7916; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20 7917; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7918; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0 7919; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0 7920; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen 7921; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 7922; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21 7923; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7924; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17 7925; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0 7926; GFX7-NEXT: s_waitcnt vmcnt(14) 7927; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 7928; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen 7929; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22 7930; 
GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen 7931; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 7932; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23 7933; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 7934; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen 7935; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen 7936; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0 7937; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen 7938; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24 7939; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7940; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0 7941; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen 7942; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0 7943; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen 7944; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31 7945; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 7946; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 7947; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7948; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0 7949; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen 7950; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30 7951; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7952; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0 7953; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen 7954; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0 7955; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen 7956; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29 7957; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 7958; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0 7959; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7960; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0 7961; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen 7962; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28 7963; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7964; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0 7965; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen 7966; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0 7967; GFX7-NEXT: buffer_store_dword v2, v17, 
s[0:3], 0 offen 7968; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27 7969; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 7970; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0 7971; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7972; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0 7973; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen 7974; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26 7975; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7976; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0 7977; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen 7978; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x94, v0 7979; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen 7980; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25 7981; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 7982; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0 7983; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7984; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0 7985; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen 7986; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0 7987; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen 7988; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16 7989; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 7990; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0 7991; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32 7992; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen 7993; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0 7994; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 7995; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16 7996; GFX7-NEXT: s_waitcnt vmcnt(14) 7997; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34 7998; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 7999; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0 8000; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0 8001; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen 8002; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 8003; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen 8004; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33 8005; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 8006; 
GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 8007; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 8008; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 8009; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen 8010; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0 8011; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen 8012; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13 8013; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14 8014; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 8015; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0 8016; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 8017; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen 8018; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0 8019; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen 8020; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11 8021; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12 8022; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 8023; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0 8024; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 8025; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen 8026; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0 8027; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen 8028; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9 8029; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10 8030; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 8031; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0 8032; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 8033; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen 8034; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0 8035; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7 8036; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen 8037; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4 8038; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 8039; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 8040; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 8041; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen 8042; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 8043; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen 8044; GFX7-NEXT: v_add_i32_e32 v19, vcc, 
0x4c, v0 8045; GFX7-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen 8046; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 8047; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen 8048; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 8049; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v10 8050; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 8051; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 8052; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen 8053; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0 8054; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen 8055; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0 8056; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 8057; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen 8058; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0 8059; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen 8060; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0 8061; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen 8062; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0 8063; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen 8064; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0 8065; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen 8066; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0 8067; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen 8068; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0 8069; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen 8070; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 8071; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 8072; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen 8073; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0 8074; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 8075; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen 8076; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0 8077; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen 8078; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0 8079; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen 8080; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0 8081; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen 8082; GFX7-NEXT: 
v_add_i32_e32 v3, vcc, 12, v0 8083; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen 8084; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0 8085; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen 8086; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 8087; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen 8088; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 8089; GFX7-NEXT: s_waitcnt vmcnt(0) 8090; GFX7-NEXT: s_setpc_b64 s[30:31] 8091; 8092; GFX8-LABEL: global_extload_v32bf16_to_v32f64: 8093; GFX8: ; %bb.0: 8094; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8095; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1 8096; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc 8097; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v1 8098; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc 8099; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1 8100; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc 8101; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1 8102; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc 8103; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1 8104; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc 8105; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1 8106; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc 8107; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1 8108; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc 8109; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1 8110; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc 8111; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1 8112; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc 8113; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1 8114; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc 8115; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1 8116; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc 8117; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1 8118; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc 8119; GFX8-NEXT: v_add_u32_e32 v27, vcc, 26, v1 8120; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc 8121; GFX8-NEXT: v_add_u32_e32 v29, vcc, 28, v1 8122; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc 8123; GFX8-NEXT: 
v_add_u32_e32 v31, vcc, 30, v1 8124; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc 8125; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1 8126; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc 8127; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 8128; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc 8129; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill 8130; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill 8131; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill 8132; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill 8133; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill 8134; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill 8135; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill 8136; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill 8137; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill 8138; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill 8139; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill 8140; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 8141; GFX8-NEXT: flat_load_ushort v44, v[1:2] 8142; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc 8143; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 8144; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc 8145; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1 8146; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc 8147; GFX8-NEXT: flat_load_ushort v45, v[50:51] 8148; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1 8149; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc 8150; GFX8-NEXT: flat_load_ushort v46, v[50:51] 8151; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1 8152; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc 8153; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1 8154; GFX8-NEXT: v_addc_u32_e32 v53, 
vcc, 0, v2, vcc 8155; GFX8-NEXT: flat_load_ushort v47, v[52:53] 8156; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1 8157; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc 8158; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1 8159; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc 8160; GFX8-NEXT: flat_load_ushort v56, v[54:55] 8161; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1 8162; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc 8163; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1 8164; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc 8165; GFX8-NEXT: flat_load_ushort v57, v[39:40] 8166; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1 8167; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc 8168; GFX8-NEXT: flat_load_ushort v58, v[39:40] 8169; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1 8170; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc 8171; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1 8172; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc 8173; GFX8-NEXT: flat_load_ushort v42, v[42:43] 8174; GFX8-NEXT: flat_load_ushort v34, v[33:34] 8175; GFX8-NEXT: flat_load_ushort v36, v[35:36] 8176; GFX8-NEXT: flat_load_ushort v38, v[37:38] 8177; GFX8-NEXT: flat_load_ushort v39, v[48:49] 8178; GFX8-NEXT: flat_load_ushort v48, v[50:51] 8179; GFX8-NEXT: flat_load_ushort v51, v[52:53] 8180; GFX8-NEXT: flat_load_ushort v52, v[54:55] 8181; GFX8-NEXT: flat_load_ushort v53, v[40:41] 8182; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1 8183; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc 8184; GFX8-NEXT: flat_load_ushort v37, v[3:4] 8185; GFX8-NEXT: flat_load_ushort v35, v[5:6] 8186; GFX8-NEXT: flat_load_ushort v33, v[7:8] 8187; GFX8-NEXT: flat_load_ushort v8, v[9:10] 8188; GFX8-NEXT: flat_load_ushort v6, v[11:12] 8189; GFX8-NEXT: flat_load_ushort v4, v[13:14] 8190; GFX8-NEXT: flat_load_ushort v2, v[15:16] 8191; GFX8-NEXT: flat_load_ushort v1, v[19:20] 8192; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0 8193; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0 8194; GFX8-NEXT: s_waitcnt vmcnt(14) 8195; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, 
v44 8196; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 8197; GFX8-NEXT: flat_load_ushort v3, v[17:18] 8198; GFX8-NEXT: flat_load_ushort v5, v[21:22] 8199; GFX8-NEXT: flat_load_ushort v7, v[23:24] 8200; GFX8-NEXT: flat_load_ushort v9, v[25:26] 8201; GFX8-NEXT: flat_load_ushort v10, v[27:28] 8202; GFX8-NEXT: flat_load_ushort v11, v[29:30] 8203; GFX8-NEXT: flat_load_ushort v12, v[31:32] 8204; GFX8-NEXT: flat_load_ushort v13, v[49:50] 8205; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0 8206; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen 8207; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen 8208; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0 8209; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45 8210; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 8211; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen 8212; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46 8213; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 8214; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0 8215; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen 8216; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0 8217; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen 8218; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47 8219; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 8220; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0 8221; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen 8222; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0 8223; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen 8224; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0 8225; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56 8226; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 8227; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen 8228; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0 8229; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen 8230; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0 8231; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57 8232; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 8233; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen 8234; 
GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0 8235; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen 8236; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58 8237; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 8238; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0 8239; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen 8240; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0 8241; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen 8242; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42 8243; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 8244; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0 8245; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen 8246; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0 8247; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen 8248; GFX8-NEXT: s_waitcnt vmcnt(14) 8249; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53 8250; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 8251; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0 8252; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen 8253; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0 8254; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen 8255; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52 8256; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 8257; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0 8258; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen 8259; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0 8260; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen 8261; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51 8262; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 8263; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0 8264; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen 8265; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0 8266; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen 8267; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48 8268; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 8269; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0 8270; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen 8271; GFX8-NEXT: v_add_u32_e32 v14, vcc, 
0xac, v0 8272; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen 8273; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39 8274; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 8275; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0 8276; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen 8277; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0 8278; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen 8279; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38 8280; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 8281; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0 8282; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen 8283; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0 8284; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen 8285; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36 8286; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 8287; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0 8288; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen 8289; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0 8290; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen 8291; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0 8292; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen 8293; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34 8294; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 8295; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0 8296; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37 8297; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen 8298; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0 8299; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 8300; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen 8301; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16 8302; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13 8303; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35 8304; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 8305; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 8306; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen 8307; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0 8308; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen 8309; GFX8-NEXT: 
v_cvt_f64_f32_e32 v[16:17], v13 8310; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 8311; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33 8312; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 8313; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 8314; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen 8315; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0 8316; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen 8317; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 8318; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11 8319; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0 8320; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 8321; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen 8322; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0 8323; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen 8324; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8 8325; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 8326; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8 8327; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0 8328; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 8329; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen 8330; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0 8331; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen 8332; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6 8333; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9 8334; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 8335; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0 8336; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 8337; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen 8338; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0 8339; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen 8340; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 8341; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 8342; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 8343; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0 8344; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen 8345; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0 8346; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen 8347; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 8348; GFX8-NEXT: 
v_lshlrev_b32_e32 v1, 16, v5 8349; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 8350; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 8351; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0 8352; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen 8353; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 8354; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 8355; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3 8356; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4 8357; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 8358; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen 8359; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 8360; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 8361; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 8362; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen 8363; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0 8364; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen 8365; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0 8366; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen 8367; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 8368; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen 8369; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0 8370; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen 8371; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0 8372; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen 8373; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0 8374; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen 8375; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0 8376; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen 8377; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0 8378; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen 8379; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 8380; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen 8381; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 8382; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen 8383; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0 8384; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen 8385; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0 8386; 
GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen 8387; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 8388; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen 8389; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 8390; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 8391; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen 8392; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen 8393; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload 8394; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload 8395; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload 8396; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload 8397; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload 8398; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload 8399; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload 8400; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload 8401; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload 8402; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload 8403; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload 8404; GFX8-NEXT: s_waitcnt vmcnt(0) 8405; GFX8-NEXT: s_setpc_b64 s[30:31] 8406; 8407; GFX9-LABEL: global_extload_v32bf16_to_v32f64: 8408; GFX9: ; %bb.0: 8409; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8410; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:62 8411; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:60 8412; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:58 8413; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:56 8414; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:54 8415; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:52 8416; GFX9-NEXT: global_load_ushort v15, v[1:2], off 
offset:50 8417; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:48 8418; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:46 8419; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:44 8420; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:42 8421; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:40 8422; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:38 8423; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:36 8424; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:34 8425; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:32 8426; GFX9-NEXT: global_load_ushort v25, v[1:2], off 8427; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:2 8428; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:30 8429; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16 8430; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18 8431; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20 8432; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22 8433; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:24 8434; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26 8435; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28 8436; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4 8437; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6 8438; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8 8439; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10 8440; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12 8441; GFX9-NEXT: s_nop 0 8442; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14 8443; GFX9-NEXT: s_waitcnt vmcnt(31) 8444; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8 8445; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 8446; GFX9-NEXT: s_waitcnt vmcnt(30) 8447; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10 8448; GFX9-NEXT: s_waitcnt vmcnt(28) 8449; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12 8450; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:252 8451; GFX9-NEXT: buffer_store_dword v8, 
v0, s[0:3], 0 offen offset:248 8452; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 8453; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11 8454; GFX9-NEXT: s_waitcnt vmcnt(29) 8455; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13 8456; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:244 8457; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:240 8458; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 8459; GFX9-NEXT: s_waitcnt vmcnt(30) 8460; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 8461; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:236 8462; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:232 8463; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 8464; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 8465; GFX9-NEXT: s_waitcnt vmcnt(31) 8466; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15 8467; GFX9-NEXT: s_waitcnt vmcnt(30) 8468; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16 8469; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:228 8470; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:224 8471; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 8472; GFX9-NEXT: s_waitcnt vmcnt(31) 8473; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 8474; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v13 8475; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:220 8476; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:216 8477; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v14 8478; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 8479; GFX9-NEXT: s_waitcnt vmcnt(32) 8480; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v18 8481; GFX9-NEXT: s_waitcnt vmcnt(30) 8482; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v20 8483; GFX9-NEXT: s_waitcnt vmcnt(28) 8484; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 8485; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:212 8486; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:208 8487; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 8488; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19 8489; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21 8490; 
GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:204 8491; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:200 8492; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:196 8493; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:192 8494; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v20 8495; GFX9-NEXT: s_waitcnt vmcnt(33) 8496; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23 8497; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 8498; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 8499; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v19 8500; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:188 8501; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:184 8502; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:180 8503; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:176 8504; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:172 8505; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:168 8506; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164 8507; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160 8508; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:156 8509; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:152 8510; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:148 8511; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 8512; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:144 8513; GFX9-NEXT: s_waitcnt vmcnt(44) 8514; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v24 8515; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:140 8516; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:136 8517; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 8518; GFX9-NEXT: s_waitcnt vmcnt(43) 8519; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27 8520; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:132 8521; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:128 8522; GFX9-NEXT: 
v_cvt_f64_f32_e32 v[8:9], v12 8523; GFX9-NEXT: s_waitcnt vmcnt(38) 8524; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v30 8525; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:124 8526; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:120 8527; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v14 8528; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29 8529; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:116 8530; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:112 8531; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v16 8532; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v25 8533; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 8534; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26 8535; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v2 8536; GFX9-NEXT: s_waitcnt vmcnt(41) 8537; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31 8538; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v28 8539; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2 8540; GFX9-NEXT: s_waitcnt vmcnt(40) 8541; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32 8542; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:108 8543; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:104 8544; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v18 8545; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 8546; GFX9-NEXT: s_waitcnt vmcnt(41) 8547; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33 8548; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 8549; GFX9-NEXT: s_waitcnt vmcnt(40) 8550; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34 8551; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 8552; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 8553; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 8554; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 8555; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 8556; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 8557; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 8558; GFX9-NEXT: s_waitcnt vmcnt(41) 8559; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v7 8560; GFX9-NEXT: s_waitcnt vmcnt(40) 8561; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 8562; 
GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 8563; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 8564; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 8565; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 8566; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 8567; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 8568; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 8569; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 8570; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 8571; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 8572; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 8573; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v22 8574; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 8575; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 8576; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 8577; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 8578; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 8579; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 8580; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44 8581; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:40 8582; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:36 8583; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:32 8584; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:28 8585; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 8586; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 8587; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:16 8588; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 8589; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:8 8590; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:4 8591; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen 8592; GFX9-NEXT: s_waitcnt vmcnt(0) 8593; 
GFX9-NEXT: s_setpc_b64 s[30:31] 8594; 8595; GFX10-LABEL: global_extload_v32bf16_to_v32f64: 8596; GFX10: ; %bb.0: 8597; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8598; GFX10-NEXT: s_clause 0x1f 8599; GFX10-NEXT: global_load_ushort v3, v[1:2], off 8600; GFX10-NEXT: global_load_ushort v4, v[1:2], off offset:2 8601; GFX10-NEXT: global_load_ushort v5, v[1:2], off offset:4 8602; GFX10-NEXT: global_load_ushort v6, v[1:2], off offset:6 8603; GFX10-NEXT: global_load_ushort v7, v[1:2], off offset:8 8604; GFX10-NEXT: global_load_ushort v8, v[1:2], off offset:10 8605; GFX10-NEXT: global_load_ushort v9, v[1:2], off offset:12 8606; GFX10-NEXT: global_load_ushort v10, v[1:2], off offset:14 8607; GFX10-NEXT: global_load_ushort v11, v[1:2], off offset:16 8608; GFX10-NEXT: global_load_ushort v12, v[1:2], off offset:18 8609; GFX10-NEXT: global_load_ushort v13, v[1:2], off offset:20 8610; GFX10-NEXT: global_load_ushort v14, v[1:2], off offset:22 8611; GFX10-NEXT: global_load_ushort v15, v[1:2], off offset:24 8612; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26 8613; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28 8614; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30 8615; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:62 8616; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:32 8617; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:34 8618; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:36 8619; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:60 8620; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:38 8621; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:40 8622; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:58 8623; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:42 8624; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:44 8625; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:56 8626; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:46 8627; GFX10-NEXT: 
global_load_ushort v31, v[1:2], off offset:48 8628; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:54 8629; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:50 8630; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:52 8631; GFX10-NEXT: s_waitcnt vmcnt(31) 8632; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3 8633; GFX10-NEXT: s_waitcnt vmcnt(30) 8634; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4 8635; GFX10-NEXT: s_waitcnt vmcnt(29) 8636; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v5 8637; GFX10-NEXT: s_waitcnt vmcnt(28) 8638; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v6 8639; GFX10-NEXT: s_waitcnt vmcnt(27) 8640; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v7 8641; GFX10-NEXT: s_waitcnt vmcnt(26) 8642; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v8 8643; GFX10-NEXT: s_waitcnt vmcnt(25) 8644; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9 8645; GFX10-NEXT: s_waitcnt vmcnt(24) 8646; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 8647; GFX10-NEXT: s_waitcnt vmcnt(23) 8648; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v11 8649; GFX10-NEXT: s_waitcnt vmcnt(22) 8650; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v12 8651; GFX10-NEXT: s_waitcnt vmcnt(21) 8652; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13 8653; GFX10-NEXT: s_waitcnt vmcnt(20) 8654; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v14 8655; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v35 8656; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v36 8657; GFX10-NEXT: s_waitcnt vmcnt(17) 8658; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v17 8659; GFX10-NEXT: s_waitcnt vmcnt(16) 8660; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v18 8661; GFX10-NEXT: s_waitcnt vmcnt(15) 8662; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v19 8663; GFX10-NEXT: s_waitcnt vmcnt(14) 8664; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20 8665; GFX10-NEXT: s_waitcnt vmcnt(13) 8666; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v21 8667; GFX10-NEXT: s_waitcnt vmcnt(12) 8668; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22 8669; GFX10-NEXT: s_waitcnt vmcnt(11) 8670; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23 8671; GFX10-NEXT: 
v_cvt_f64_f32_e32 v[1:2], v1 8672; GFX10-NEXT: s_waitcnt vmcnt(9) 8673; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25 8674; GFX10-NEXT: s_waitcnt vmcnt(8) 8675; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v26 8676; GFX10-NEXT: s_waitcnt vmcnt(7) 8677; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27 8678; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 8679; GFX10-NEXT: s_waitcnt vmcnt(5) 8680; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29 8681; GFX10-NEXT: s_waitcnt vmcnt(4) 8682; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30 8683; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 8684; GFX10-NEXT: s_waitcnt vmcnt(2) 8685; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32 8686; GFX10-NEXT: s_waitcnt vmcnt(1) 8687; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33 8688; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 8689; GFX10-NEXT: s_waitcnt vmcnt(0) 8690; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34 8691; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31 8692; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 8693; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28 8694; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24 8695; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 8696; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71 8697; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68 8698; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16 8699; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70 8700; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15 8701; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 8702; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248 8703; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23 8704; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37 8705; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38 8706; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 8707; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240 8708; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25 8709; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66 8710; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 8711; GFX10-NEXT: buffer_store_dword v5, 
v0, s[0:3], 0 offen offset:232 8712; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27 8713; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48 8714; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 8715; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224 8716; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81 8717; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49 8718; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 8719; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 8720; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80 8721; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212 8722; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208 8723; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69 8724; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64 8725; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50 8726; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51 8727; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54 8728; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204 8729; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200 8730; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67 8731; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39 8732; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196 8733; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192 8734; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65 8735; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188 8736; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184 8737; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55 8738; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180 8739; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176 8740; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53 8741; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172 8742; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168 8743; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52 8744; GFX10-NEXT: 
buffer_store_dword v32, v0, s[0:3], 0 offen offset:164 8745; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160 8746; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156 8747; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152 8748; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148 8749; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144 8750; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140 8751; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136 8752; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132 8753; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128 8754; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124 8755; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120 8756; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116 8757; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 8758; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108 8759; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104 8760; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 8761; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 8762; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92 8763; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88 8764; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84 8765; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80 8766; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:76 8767; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72 8768; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68 8769; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64 8770; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60 8771; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen 
offset:56 8772; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52 8773; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48 8774; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 8775; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 8776; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 8777; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 8778; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 8779; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 8780; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 8781; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 8782; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 8783; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 8784; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:4 8785; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen 8786; GFX10-NEXT: s_setpc_b64 s[30:31] 8787; 8788; GFX11-LABEL: global_extload_v32bf16_to_v32f64: 8789; GFX11: ; %bb.0: 8790; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8791; GFX11-NEXT: s_clause 0x1f 8792; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:12 8793; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:8 8794; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:4 8795; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:2 8796; GFX11-NEXT: global_load_u16 v7, v[1:2], off 8797; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6 8798; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10 8799; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14 8800; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:28 8801; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:24 8802; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:20 8803; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:18 8804; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16 8805; 
GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22 8806; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26 8807; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30 8808; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:44 8809; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:40 8810; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:36 8811; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:34 8812; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32 8813; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38 8814; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42 8815; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46 8816; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:60 8817; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:56 8818; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:52 8819; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:50 8820; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48 8821; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54 8822; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58 8823; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62 8824; GFX11-NEXT: s_waitcnt vmcnt(31) 8825; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3 8826; GFX11-NEXT: s_waitcnt vmcnt(30) 8827; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4 8828; GFX11-NEXT: s_waitcnt vmcnt(29) 8829; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 8830; GFX11-NEXT: s_waitcnt vmcnt(28) 8831; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6 8832; GFX11-NEXT: s_waitcnt vmcnt(27) 8833; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7 8834; GFX11-NEXT: s_waitcnt vmcnt(26) 8835; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 8836; GFX11-NEXT: s_waitcnt vmcnt(25) 8837; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 8838; GFX11-NEXT: s_waitcnt vmcnt(24) 8839; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 8840; GFX11-NEXT: s_waitcnt vmcnt(23) 8841; GFX11-NEXT: v_lshlrev_b32_e32 v102, 16, v11 8842; GFX11-NEXT: s_waitcnt vmcnt(22) 8843; GFX11-NEXT: v_lshlrev_b32_e32 
v101, 16, v12 8844; GFX11-NEXT: s_waitcnt vmcnt(21) 8845; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 8846; GFX11-NEXT: s_waitcnt vmcnt(20) 8847; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 8848; GFX11-NEXT: s_waitcnt vmcnt(19) 8849; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v15 8850; GFX11-NEXT: s_waitcnt vmcnt(18) 8851; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v16 8852; GFX11-NEXT: s_waitcnt vmcnt(17) 8853; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 8854; GFX11-NEXT: s_waitcnt vmcnt(16) 8855; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 8856; GFX11-NEXT: s_waitcnt vmcnt(15) 8857; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v19 8858; GFX11-NEXT: s_waitcnt vmcnt(14) 8859; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20 8860; GFX11-NEXT: s_waitcnt vmcnt(13) 8861; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 8862; GFX11-NEXT: s_waitcnt vmcnt(12) 8863; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 8864; GFX11-NEXT: s_waitcnt vmcnt(11) 8865; GFX11-NEXT: v_lshlrev_b32_e32 v103, 16, v23 8866; GFX11-NEXT: s_waitcnt vmcnt(10) 8867; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24 8868; GFX11-NEXT: s_waitcnt vmcnt(9) 8869; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 8870; GFX11-NEXT: s_waitcnt vmcnt(8) 8871; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 8872; GFX11-NEXT: s_waitcnt vmcnt(7) 8873; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v27 8874; GFX11-NEXT: s_waitcnt vmcnt(6) 8875; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28 8876; GFX11-NEXT: s_waitcnt vmcnt(5) 8877; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 8878; GFX11-NEXT: s_waitcnt vmcnt(4) 8879; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 8880; GFX11-NEXT: s_waitcnt vmcnt(3) 8881; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31 8882; GFX11-NEXT: s_waitcnt vmcnt(2) 8883; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32 8884; GFX11-NEXT: s_waitcnt vmcnt(1) 8885; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 8886; GFX11-NEXT: s_waitcnt vmcnt(0) 8887; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 8888; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v68 8889; GFX11-NEXT: 
v_cvt_f64_f32_e32 v[84:85], v65 8890; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64 8891; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33 8892; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1 8893; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v29 8894; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v30 8895; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v53 8896; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v26 8897; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v52 8898; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v25 8899; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v49 8900; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v48 8901; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v21 8902; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v34 8903; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v22 8904; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v103 8905; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v18 8906; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v102 8907; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v17 8908; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v101 8909; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v13 8910; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v14 8911; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v100 8912; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v10 8913; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v39 8914; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v9 8915; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v38 8916; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v6 8917; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 8918; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2 8919; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37 8920; GFX11-NEXT: s_clause 0xf 8921; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240 8922; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224 8923; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208 8924; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192 8925; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176 8926; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160 8927; GFX11-NEXT: scratch_store_b128 v0, v[48:51], 
off offset:144 8928; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128 8929; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 8930; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 8931; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 8932; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 8933; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 8934; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 8935; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 8936; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off 8937; GFX11-NEXT: s_setpc_b64 s[30:31] 8938 %load = load <32 x bfloat>, ptr addrspace(1) %ptr 8939 %fpext = fpext <32 x bfloat> %load to <32 x double> 8940 ret <32 x double> %fpext 8941} 8942 ; v_fadd_bf16 -- the IR under test is a single `fadd bfloat %a, %b` whose result is returned. The per-prefix check lines that follow are autogenerated (see the NOTE at the top of this file), so regenerate them with utils/update_llc_test_checks.py instead of editing by hand. As recorded in those checks, GCN and GFX7 multiply each operand by 1.0, mask it with 0xffff0000, add as f32 and mask the sum again, while GFX8 and newer shift the operands left by 16, add as f32, then select via v_cmp_u / v_cndmask between (sum + bit 16 of sum + 0x7fff) and (sum | 0x400000) before shifting right by 16 (presumably round-to-nearest-even with NaN quieting -- TODO confirm against the bf16 lowering code). 8943define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { 8944; GCN-LABEL: v_fadd_bf16: 8945; GCN: ; %bb.0: 8946; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8947; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 8948; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 8949; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 8950; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 8951; GCN-NEXT: v_add_f32_e32 v0, v0, v1 8952; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 8953; GCN-NEXT: s_setpc_b64 s[30:31] 8954; 8955; GFX7-LABEL: v_fadd_bf16: 8956; GFX7: ; %bb.0: 8957; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8958; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 8959; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 8960; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 8961; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 8962; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 8963; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 8964; GFX7-NEXT: s_setpc_b64 s[30:31] 8965; 8966; GFX8-LABEL: v_fadd_bf16: 8967; GFX8: ; %bb.0: 8968; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8969; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 8970; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 8971; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 8972; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 8973; GFX8-NEXT: 
v_add_u32_e32 v1, vcc, v1, v0 8974; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 8975; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 8976; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 8977; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 8978; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 8979; GFX8-NEXT: s_setpc_b64 s[30:31] 8980; 8981; GFX9-LABEL: v_fadd_bf16: 8982; GFX9: ; %bb.0: 8983; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8984; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 8985; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 8986; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 8987; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 8988; GFX9-NEXT: s_movk_i32 s4, 0x7fff 8989; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 8990; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 8991; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 8992; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 8993; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 8994; GFX9-NEXT: s_setpc_b64 s[30:31] 8995; 8996; GFX10-LABEL: v_fadd_bf16: 8997; GFX10: ; %bb.0: 8998; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8999; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 9000; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 9001; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 9002; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 9003; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 9004; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 9005; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 9006; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 9007; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 9008; GFX10-NEXT: s_setpc_b64 s[30:31] 9009; 9010; GFX11-LABEL: v_fadd_bf16: 9011; GFX11: ; %bb.0: 9012; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9013; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 9014; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 9015; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9016; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 9017; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 9018; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 9019; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 9020; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 9021; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 9022; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 9023; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9024; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 9025; GFX11-NEXT: s_setpc_b64 s[30:31] 9026 %op = fadd bfloat %a, %b 9027 ret bfloat %op 9028} 9029 9030define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { 9031; GCN-LABEL: v_fadd_v2bf16: 9032; GCN: ; %bb.0: 9033; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9034; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 9035; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 9036; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 9037; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 9038; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9039; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9040; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9041; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9042; GCN-NEXT: v_add_f32_e32 v1, v1, v3 9043; GCN-NEXT: v_add_f32_e32 v0, v0, v2 9044; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9045; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9046; GCN-NEXT: s_setpc_b64 s[30:31] 9047; 9048; GFX7-LABEL: v_fadd_v2bf16: 9049; GFX7: ; %bb.0: 9050; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9051; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 9052; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9053; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 9054; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 9055; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9056; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9057; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9058; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9059; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 9060; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 9061; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9062; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9063; GFX7-NEXT: s_setpc_b64 s[30:31] 9064; 9065; GFX8-LABEL: v_fadd_v2bf16: 9066; GFX8: ; %bb.0: 9067; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9068; GFX8-NEXT: v_lshlrev_b32_e32 
v2, 16, v1 9069; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 9070; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 9071; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 9072; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 9073; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9074; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9075; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 9076; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 9077; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 9078; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 9079; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 9080; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 9081; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 9082; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 9083; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 9084; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 9085; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 9086; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 9087; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 9088; GFX8-NEXT: s_setpc_b64 s[30:31] 9089; 9090; GFX9-LABEL: v_fadd_v2bf16: 9091; GFX9: ; %bb.0: 9092; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9093; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 9094; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 9095; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 9096; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9097; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9098; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 9099; GFX9-NEXT: s_movk_i32 s4, 0x7fff 9100; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 9101; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 9102; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 9103; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 9104; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 9105; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 9106; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 9107; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 9108; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 9109; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 9110; GFX9-NEXT: s_mov_b32 s4, 0x7060302 9111; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 9112; GFX9-NEXT: s_setpc_b64 s[30:31] 9113; 9114; GFX10-LABEL: v_fadd_v2bf16: 9115; GFX10: ; 
%bb.0: 9116; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9117; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 9118; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 9119; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9120; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9121; GFX10-NEXT: v_add_f32_e32 v2, v3, v2 9122; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 9123; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 9124; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 9125; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 9126; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 9127; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 9128; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 9129; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 9130; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 9131; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 9132; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 9133; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 9134; GFX10-NEXT: s_setpc_b64 s[30:31] 9135; 9136; GFX11-LABEL: v_fadd_v2bf16: 9137; GFX11: ; %bb.0: 9138; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9139; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 9140; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9141; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 9142; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9143; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 9144; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 9145; GFX11-NEXT: v_add_f32_e32 v2, v3, v2 9146; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 9147; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 9148; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 9149; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 9150; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 9151; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 9152; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 9153; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 9154; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) 9155; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 9156; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v0, v0 9157; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 9158; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9159; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 9160; GFX11-NEXT: s_setpc_b64 s[30:31] 9161 %op = fadd <2 x bfloat> %a, %b 9162 ret <2 x bfloat> %op 9163} 9164 9165define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { 9166; GCN-LABEL: v_fadd_v3bf16: 9167; GCN: ; %bb.0: 9168; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9169; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 9170; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 9171; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 9172; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 9173; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 9174; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 9175; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9176; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9177; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9178; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9179; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9180; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9181; GCN-NEXT: v_add_f32_e32 v2, v2, v5 9182; GCN-NEXT: v_add_f32_e32 v1, v1, v4 9183; GCN-NEXT: v_add_f32_e32 v0, v0, v3 9184; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9185; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9186; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9187; GCN-NEXT: s_setpc_b64 s[30:31] 9188; 9189; GFX7-LABEL: v_fadd_v3bf16: 9190; GFX7: ; %bb.0: 9191; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9192; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 9193; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 9194; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 9195; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 9196; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9197; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 9198; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9199; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9200; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9201; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9202; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9203; GFX7-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 9204; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 9205; GFX7-NEXT: v_add_f32_e32 v1, v1, v4 9206; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 9207; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9208; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9209; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9210; GFX7-NEXT: s_setpc_b64 s[30:31] 9211; 9212; GFX8-LABEL: v_fadd_v3bf16: 9213; GFX8: ; %bb.0: 9214; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9215; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 9216; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 9217; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 9218; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 9219; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 9220; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 9221; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 9222; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 9223; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 9224; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 9225; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 9226; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 9227; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 9228; GFX8-NEXT: s_movk_i32 s4, 0x7fff 9229; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 9230; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9231; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9232; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 9233; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 9234; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 9235; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 9236; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 9237; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 9238; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 9239; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 9240; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 9241; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 9242; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 9243; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 9244; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 9245; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 9246; GFX8-NEXT: s_setpc_b64 s[30:31] 9247; 9248; GFX9-LABEL: v_fadd_v3bf16: 9249; GFX9: ; %bb.0: 9250; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9251; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 9252; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 9253; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 9254; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 9255; GFX9-NEXT: s_movk_i32 s4, 0x7fff 9256; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 9257; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 9258; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 9259; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 9260; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 9261; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 9262; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 9263; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9264; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9265; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 9266; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 9267; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 9268; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 9269; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 9270; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 9271; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 9272; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 9273; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 9274; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 9275; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 9276; GFX9-NEXT: s_mov_b32 s4, 0x7060302 9277; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 9278; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 9279; GFX9-NEXT: s_setpc_b64 s[30:31] 9280; 9281; GFX10-LABEL: v_fadd_v3bf16: 9282; GFX10: ; %bb.0: 9283; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9284; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 9285; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 9286; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9287; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9288; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 9289; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 9290; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 9291; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 9292; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 9293; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 9294; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 9295; GFX10-NEXT: v_bfe_u32 v5, 
v0, 16, 1 9296; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 9297; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 9298; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 9299; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 9300; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 9301; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 9302; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 9303; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 9304; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 9305; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 9306; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 9307; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 9308; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 9309; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 9310; GFX10-NEXT: s_setpc_b64 s[30:31] 9311; 9312; GFX11TRUE16-LABEL: v_fadd_v3bf16: 9313; GFX11TRUE16: ; %bb.0: 9314; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9315; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 9316; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 9317; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 9318; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9319; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9320; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 9321; GFX11TRUE16-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 9322; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 9323; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 9324; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 9325; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 9326; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 9327; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 9328; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 9329; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 9330; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 9331; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 9332; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 9333; GFX11TRUE16-NEXT: 
v_add3_u32 v3, v3, v1, 0x7fff 9334; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 9335; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 9336; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 9337; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 9338; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 9339; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 9340; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 9341; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 9342; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 9343; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 9344; 9345; GFX11FAKE16-LABEL: v_fadd_v3bf16: 9346; GFX11FAKE16: ; %bb.0: 9347; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9348; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 9349; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 9350; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 9351; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9352; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9353; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 9354; GFX11FAKE16-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 9355; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 9356; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 9357; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 9358; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 9359; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 9360; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 9361; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 9362; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 9363; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 9364; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 9365; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 9366; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 9367; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 9368; 
GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 9369; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 9370; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 9371; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 9372; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 9373; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 9374; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 9375; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 9376; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 9377 %op = fadd <3 x bfloat> %a, %b 9378 ret <3 x bfloat> %op 9379} 9380 9381define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { 9382; GCN-LABEL: v_fadd_v4bf16: 9383; GCN: ; %bb.0: 9384; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9385; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 9386; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 9387; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 9388; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 9389; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 9390; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 9391; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 9392; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 9393; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 9394; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9395; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9396; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9397; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9398; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9399; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9400; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9401; GCN-NEXT: v_add_f32_e32 v3, v3, v7 9402; GCN-NEXT: v_add_f32_e32 v2, v2, v6 9403; GCN-NEXT: v_add_f32_e32 v1, v1, v5 9404; GCN-NEXT: v_add_f32_e32 v0, v0, v4 9405; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9406; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9407; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9408; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9409; GCN-NEXT: s_setpc_b64 s[30:31] 9410; 9411; GFX7-LABEL: v_fadd_v4bf16: 9412; GFX7: ; %bb.0: 9413; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9414; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 9415; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 9416; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 9417; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 9418; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9419; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 9420; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 9421; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 9422; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 9423; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9424; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9425; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9426; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9427; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9428; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9429; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9430; GFX7-NEXT: v_add_f32_e32 v3, v3, v7 9431; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 9432; GFX7-NEXT: v_add_f32_e32 v1, v1, v5 9433; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 9434; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9435; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9436; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9437; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9438; GFX7-NEXT: s_setpc_b64 s[30:31] 9439; 9440; GFX8-LABEL: v_fadd_v4bf16: 9441; GFX8: ; %bb.0: 9442; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9443; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 9444; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 9445; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 9446; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 9447; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 9448; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9449; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9450; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 9451; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 9452; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 9453; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 9454; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 9455; GFX8-NEXT: s_movk_i32 s4, 0x7fff 9456; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 9457; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 9458; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 9459; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 9460; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 9461; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 9462; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 9463; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 9464; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 9465; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 9466; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 9467; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9468; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9469; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 9470; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 9471; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 9472; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 9473; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 9474; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 9475; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 9476; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 9477; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 9478; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 9479; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 9480; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 9481; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 9482; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 9483; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 9484; GFX8-NEXT: s_setpc_b64 s[30:31] 9485; 9486; GFX9-LABEL: v_fadd_v4bf16: 9487; GFX9: ; %bb.0: 9488; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9489; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 9490; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 9491; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 9492; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9493; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9494; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 9495; GFX9-NEXT: s_movk_i32 s4, 0x7fff 9496; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 9497; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 9498; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 9499; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 9500; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 9501; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 9502; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 9503; 
GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 9504; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 9505; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 9506; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 9507; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 9508; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 9509; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9510; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9511; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 9512; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 9513; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 9514; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 9515; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 9516; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 9517; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 9518; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 9519; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 9520; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 9521; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 9522; GFX9-NEXT: s_mov_b32 s4, 0x7060302 9523; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 9524; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 9525; GFX9-NEXT: s_setpc_b64 s[30:31] 9526; 9527; GFX10-LABEL: v_fadd_v4bf16: 9528; GFX10: ; %bb.0: 9529; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9530; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 9531; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 9532; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9533; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9534; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 9535; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 9536; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 9537; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9538; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9539; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 9540; GFX10-NEXT: v_add_f32_e32 v3, v7, v6 9541; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 9542; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 9543; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 9544; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 9545; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 9546; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 9547; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 9548; 
GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 9549; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 9550; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 9551; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo 9552; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 9553; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 9554; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 9555; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 9556; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 9557; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 9558; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 9559; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 9560; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 9561; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 9562; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo 9563; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 9564; GFX10-NEXT: s_setpc_b64 s[30:31] 9565; 9566; GFX11-LABEL: v_fadd_v4bf16: 9567; GFX11: ; %bb.0: 9568; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9569; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 9570; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 9571; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9572; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9573; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 9574; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 9575; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 9576; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 9577; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9578; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 9579; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 9580; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 9581; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4 9582; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 9583; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 9584; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 9585; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 9586; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 9587; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v4, v4 9588; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 9589; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 9590; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 9591; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 9592; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) 9593; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo 9594; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 9595; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 9596; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 9597; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 9598; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 9599; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 9600; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 9601; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 9602; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 9603; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 9604; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo 9605; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9606; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 9607; GFX11-NEXT: s_setpc_b64 s[30:31] 9608 %op = fadd <4 x bfloat> %a, %b 9609 ret <4 x bfloat> %op 9610} 9611 9612define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { 9613; GCN-LABEL: v_fadd_v8bf16: 9614; GCN: ; %bb.0: 9615; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9616; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 9617; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 9618; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 9619; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 9620; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 9621; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 9622; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 9623; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 9624; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 9625; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 9626; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 9627; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 9628; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 9629; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 9630; GCN-NEXT: v_mul_f32_e32 v7, 1.0, 
v7 9631; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 9632; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 9633; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 9634; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 9635; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9636; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 9637; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9638; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 9639; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9640; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 9641; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9642; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 9643; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9644; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 9645; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9646; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 9647; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9648; GCN-NEXT: v_add_f32_e32 v7, v7, v15 9649; GCN-NEXT: v_add_f32_e32 v6, v6, v14 9650; GCN-NEXT: v_add_f32_e32 v5, v5, v13 9651; GCN-NEXT: v_add_f32_e32 v4, v4, v12 9652; GCN-NEXT: v_add_f32_e32 v3, v3, v11 9653; GCN-NEXT: v_add_f32_e32 v2, v2, v10 9654; GCN-NEXT: v_add_f32_e32 v1, v1, v9 9655; GCN-NEXT: v_add_f32_e32 v0, v0, v8 9656; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9657; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9658; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9659; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9660; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9661; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9662; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9663; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 9664; GCN-NEXT: s_setpc_b64 s[30:31] 9665; 9666; GFX7-LABEL: v_fadd_v8bf16: 9667; GFX7: ; %bb.0: 9668; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9669; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 9670; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 9671; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 9672; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 9673; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9674; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 9675; GFX7-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 9676; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 9677; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 9678; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 9679; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 9680; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 9681; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 9682; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 9683; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 9684; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 9685; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 9686; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 9687; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 9688; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9689; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 9690; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9691; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 9692; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9693; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 9694; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9695; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 9696; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9697; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 9698; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9699; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 9700; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9701; GFX7-NEXT: v_add_f32_e32 v7, v7, v15 9702; GFX7-NEXT: v_add_f32_e32 v6, v6, v14 9703; GFX7-NEXT: v_add_f32_e32 v5, v5, v13 9704; GFX7-NEXT: v_add_f32_e32 v4, v4, v12 9705; GFX7-NEXT: v_add_f32_e32 v3, v3, v11 9706; GFX7-NEXT: v_add_f32_e32 v2, v2, v10 9707; GFX7-NEXT: v_add_f32_e32 v1, v1, v9 9708; GFX7-NEXT: v_add_f32_e32 v0, v0, v8 9709; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9710; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9711; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9712; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9713; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9714; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9715; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9716; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 9717; GFX7-NEXT: 
s_setpc_b64 s[30:31] 9718; 9719; GFX8-LABEL: v_fadd_v8bf16: 9720; GFX8: ; %bb.0: 9721; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9722; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 9723; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 9724; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 9725; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1 9726; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8 9727; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 9728; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9729; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 9730; GFX8-NEXT: v_add_f32_e32 v3, v3, v7 9731; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8 9732; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 9733; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 9734; GFX8-NEXT: s_movk_i32 s4, 0x7fff 9735; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc 9736; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 9737; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 9738; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 9739; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 9740; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc 9741; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 9742; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 9743; GFX8-NEXT: v_add_f32_e32 v7, v9, v7 9744; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1 9745; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7 9746; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9747; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9748; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 9749; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 9750; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7 9751; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 9752; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 9753; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc 9754; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 9755; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 9756; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 9757; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 9758; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc 9759; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 9760; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 9761; GFX8-NEXT: v_add_f32_e32 v6, v9, v6 9762; GFX8-NEXT: 
v_bfe_u32 v9, v6, 16, 1 9763; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 9764; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9765; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9766; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 9767; GFX8-NEXT: v_add_f32_e32 v1, v1, v5 9768; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 9769; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 9770; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 9771; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 9772; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 9773; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 9774; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 9775; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 9776; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc 9777; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 9778; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 9779; GFX8-NEXT: v_add_f32_e32 v5, v9, v5 9780; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 9781; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 9782; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9783; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9784; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 9785; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 9786; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 9787; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9788; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 9789; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 9790; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 9791; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 9792; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 9793; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 9794; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc 9795; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 9796; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 9797; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 9798; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 9799; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 9800; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 9801; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 9802; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 9803; GFX8-NEXT: s_setpc_b64 s[30:31] 9804; 9805; GFX9-LABEL: v_fadd_v8bf16: 9806; GFX9: ; %bb.0: 9807; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9808; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 9809; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 9810; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 9811; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 9812; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9813; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 9814; GFX9-NEXT: s_movk_i32 s4, 0x7fff 9815; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 9816; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 9817; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 9818; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 9819; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 9820; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc 9821; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 9822; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 9823; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 9824; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc 9825; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 9826; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 9827; GFX9-NEXT: v_add_f32_e32 v7, v9, v7 9828; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9829; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9830; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 9831; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 9832; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 9833; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 9834; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 9835; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 9836; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc 9837; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 9838; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 9839; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 9840; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc 9841; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 9842; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 9843; GFX9-NEXT: v_add_f32_e32 v6, v9, v6 9844; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9845; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9846; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 9847; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 9848; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 9849; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 9850; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v6, v6 9851; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 9852; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 9853; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 9854; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 9855; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 9856; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc 9857; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 9858; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 9859; GFX9-NEXT: v_add_f32_e32 v5, v9, v5 9860; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9861; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9862; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 9863; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 9864; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 9865; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 9866; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9867; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 9868; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 9869; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 9870; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 9871; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 9872; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc 9873; GFX9-NEXT: s_mov_b32 s4, 0x7060302 9874; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 9875; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 9876; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 9877; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 9878; GFX9-NEXT: s_setpc_b64 s[30:31] 9879; 9880; GFX10-LABEL: v_fadd_v8bf16: 9881; GFX10: ; %bb.0: 9882; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9883; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 9884; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 9885; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 9886; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9887; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 9888; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9889; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 9890; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 9891; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9892; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 9893; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 9894; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1 9895; GFX10-NEXT: v_add_f32_e32 v7, v10, 
v9 9896; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8 9897; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 9898; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 9899; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff 9900; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 9901; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1 9902; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 9903; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 9904; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo 9905; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1 9906; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff 9907; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff 9908; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 9909; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 9910; GFX10-NEXT: v_add_f32_e32 v6, v10, v6 9911; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff 9912; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9913; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 9914; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 9915; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo 9916; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2 9917; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 9918; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 9919; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9920; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 9921; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 9922; GFX10-NEXT: v_add_f32_e32 v5, v15, v13 9923; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3 9924; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 9925; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo 9926; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff 9927; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 9928; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 9929; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 9930; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 9931; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 9932; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1 9933; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff 9934; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5 9935; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo 9936; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff 9937; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 9938; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff 9939; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0 9940; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 9941; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo 9942; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 9943; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo 9944; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 9945; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 9946; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo 9947; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 9948; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 9949; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo 9950; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 9951; GFX10-NEXT: s_setpc_b64 s[30:31] 9952; 9953; GFX11-LABEL: v_fadd_v8bf16: 9954; GFX11: ; %bb.0: 9955; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9956; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 9957; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 9958; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9959; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 9960; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 9961; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 9962; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 9963; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 9964; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 9965; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 9966; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1 9967; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 9968; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 9969; GFX11-NEXT: v_add_f32_e32 v7, v10, v9 9970; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8 9971; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff 9972; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 9973; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 9974; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 9975; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 9976; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) 9977; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo 9978; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 9979; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff 9980; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff 9981; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 9982; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 9983; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1 9984; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 9985; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 9986; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_add_f32 v2, v2, v6 9987; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 9988; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 9989; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 9990; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) 9991; GFX11-NEXT: v_add_f32_e32 v6, v10, v6 9992; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2 9993; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 9994; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff 9995; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 9996; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 9997; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo 9998; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 9999; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 10000; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 10001; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 10002; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10003; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 10004; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) 10005; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 10006; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff 10007; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10 10008; GFX11-NEXT: v_add_f32_e32 v5, v15, v13 10009; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 10010; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 10011; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 
10012; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 10013; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 10014; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) 10015; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff 10016; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 10017; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 10018; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff 10019; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff 10020; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 10021; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 10022; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo 10023; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 10024; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo 10025; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 10026; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 10027; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 10028; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo 10029; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 10030; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 10031; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo 10032; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10033; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 10034; GFX11-NEXT: s_setpc_b64 s[30:31] 10035 %op = fadd <8 x bfloat> %a, %b 10036 ret <8 x bfloat> %op 10037} 10038 10039define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { 10040; GCN-LABEL: v_fadd_v16bf16: 10041; GCN: ; %bb.0: 10042; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10043; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 10044; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 10045; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 10046; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 10047; GCN-NEXT: v_add_f32_e32 v14, v14, v30 10048; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 10049; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 10050; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 10051; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, 
v13 10052; GCN-NEXT: v_add_f32_e32 v13, v13, v29 10053; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 10054; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 10055; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 10056; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 10057; GCN-NEXT: v_add_f32_e32 v12, v12, v28 10058; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 10059; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 10060; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 10061; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 10062; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 10063; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 10064; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 10065; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 10066; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 10067; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 10068; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 10069; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 10070; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 10071; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 10072; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 10073; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 10074; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 10075; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 10076; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 10077; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 10078; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 10079; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 10080; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 10081; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 10082; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 10083; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 10084; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 10085; GCN-NEXT: v_add_f32_e32 v11, v11, v27 10086; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 10087; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 10088; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 10089; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 10090; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 10091; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 10092; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 10093; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 10094; GCN-NEXT: v_and_b32_e32 v7, 
0xffff0000, v7 10095; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 10096; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 10097; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 10098; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 10099; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 10100; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 10101; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 10102; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 10103; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 10104; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 10105; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 10106; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10107; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 10108; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10109; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 10110; GCN-NEXT: v_add_f32_e32 v10, v10, v26 10111; GCN-NEXT: v_add_f32_e32 v9, v9, v25 10112; GCN-NEXT: v_add_f32_e32 v8, v8, v24 10113; GCN-NEXT: v_add_f32_e32 v7, v7, v23 10114; GCN-NEXT: v_add_f32_e32 v6, v6, v22 10115; GCN-NEXT: v_add_f32_e32 v5, v5, v21 10116; GCN-NEXT: v_add_f32_e32 v4, v4, v20 10117; GCN-NEXT: v_add_f32_e32 v3, v3, v19 10118; GCN-NEXT: v_add_f32_e32 v2, v2, v18 10119; GCN-NEXT: v_add_f32_e32 v1, v1, v17 10120; GCN-NEXT: v_add_f32_e32 v0, v0, v16 10121; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10122; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10123; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 10124; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 10125; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 10126; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 10127; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 10128; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 10129; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 10130; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 10131; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 10132; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 10133; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 10134; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 10135; GCN-NEXT: s_waitcnt vmcnt(0) 
10136; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 10137; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 10138; GCN-NEXT: v_add_f32_e32 v15, v15, v16 10139; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 10140; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 10141; GCN-NEXT: s_setpc_b64 s[30:31] 10142; 10143; GFX7-LABEL: v_fadd_v16bf16: 10144; GFX7: ; %bb.0: 10145; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10146; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 10147; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 10148; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 10149; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 10150; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 10151; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 10152; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 10153; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 10154; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 10155; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 10156; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 10157; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 10158; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 10159; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 10160; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 10161; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 10162; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 10163; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 10164; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 10165; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 10166; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 10167; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 10168; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 10169; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 10170; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 10171; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 10172; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 10173; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 10174; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 10175; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 10176; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 10177; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 10178; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 10179; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 10180; 
GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 10181; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 10182; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 10183; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 10184; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 10185; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 10186; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 10187; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 10188; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 10189; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 10190; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 10191; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 10192; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 10193; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 10194; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 10195; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 10196; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 10197; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 10198; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 10199; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 10200; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 10201; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 10202; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 10203; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 10204; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 10205; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 10206; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 10207; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 10208; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10209; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 10210; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10211; GFX7-NEXT: v_add_f32_e32 v14, v14, v30 10212; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 10213; GFX7-NEXT: v_add_f32_e32 v12, v12, v28 10214; GFX7-NEXT: v_add_f32_e32 v10, v10, v26 10215; GFX7-NEXT: v_add_f32_e32 v9, v9, v25 10216; GFX7-NEXT: v_add_f32_e32 v8, v8, v24 10217; GFX7-NEXT: v_add_f32_e32 v7, v7, v23 10218; GFX7-NEXT: v_add_f32_e32 v5, v5, v21 10219; GFX7-NEXT: v_add_f32_e32 v4, v4, v20 10220; 
GFX7-NEXT: v_add_f32_e32 v3, v3, v19 10221; GFX7-NEXT: v_add_f32_e32 v2, v2, v18 10222; GFX7-NEXT: v_add_f32_e32 v1, v1, v17 10223; GFX7-NEXT: v_add_f32_e32 v0, v0, v16 10224; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10225; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10226; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 10227; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 10228; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 10229; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 10230; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 10231; GFX7-NEXT: s_waitcnt vmcnt(0) 10232; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 10233; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 10234; GFX7-NEXT: v_add_f32_e32 v15, v15, v22 10235; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 10236; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 10237; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 10238; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 10239; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 10240; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 10241; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 10242; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 10243; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 10244; GFX7-NEXT: s_setpc_b64 s[30:31] 10245; 10246; GFX8-LABEL: v_fadd_v16bf16: 10247; GFX8: ; %bb.0: 10248; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10249; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 10250; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 10251; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 10252; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1 10253; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16 10254; GFX8-NEXT: s_movk_i32 s4, 0x7fff 10255; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 10256; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 10257; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 10258; GFX8-NEXT: v_add_f32_e32 v7, v7, v15 10259; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16 10260; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 10261; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 10262; GFX8-NEXT: v_cndmask_b32_e32 v16, 
v17, v18, vcc 10263; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 10264; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 10265; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7 10266; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 10267; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc 10268; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 10269; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 10270; GFX8-NEXT: v_add_f32_e32 v15, v17, v15 10271; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 10272; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15 10273; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 10274; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 10275; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 10276; GFX8-NEXT: v_add_f32_e32 v6, v6, v14 10277; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15 10278; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 10279; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 10280; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc 10281; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 10282; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 10283; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6 10284; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 10285; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc 10286; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 10287; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 10288; GFX8-NEXT: v_add_f32_e32 v14, v17, v14 10289; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1 10290; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14 10291; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 10292; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 10293; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 10294; GFX8-NEXT: v_add_f32_e32 v5, v5, v13 10295; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14 10296; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 10297; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 10298; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc 10299; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 10300; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 10301; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5 10302; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 10303; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v13, v17, vcc 10304; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 10305; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 10306; GFX8-NEXT: v_add_f32_e32 v13, v17, v13 10307; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1 10308; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13 10309; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 10310; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 10311; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 10312; GFX8-NEXT: v_add_f32_e32 v4, v4, v12 10313; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13 10314; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 10315; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 10316; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc 10317; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 10318; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 10319; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4 10320; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 10321; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc 10322; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 10323; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 10324; GFX8-NEXT: v_add_f32_e32 v12, v17, v12 10325; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1 10326; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12 10327; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 10328; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 10329; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 10330; GFX8-NEXT: v_add_f32_e32 v3, v3, v11 10331; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12 10332; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 10333; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 10334; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc 10335; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 10336; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 10337; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3 10338; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 10339; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc 10340; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 10341; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 10342; GFX8-NEXT: v_add_f32_e32 v11, v17, v11 10343; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1 10344; 
GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11 10345; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 10346; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 10347; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 10348; GFX8-NEXT: v_add_f32_e32 v2, v2, v10 10349; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11 10350; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 10351; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 10352; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc 10353; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 10354; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 10355; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2 10356; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10357; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc 10358; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 10359; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 10360; GFX8-NEXT: v_add_f32_e32 v10, v17, v10 10361; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1 10362; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10 10363; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 10364; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10365; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 10366; GFX8-NEXT: v_add_f32_e32 v1, v1, v9 10367; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10 10368; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 10369; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 10370; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc 10371; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 10372; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 10373; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1 10374; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 10375; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc 10376; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 10377; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 10378; GFX8-NEXT: v_add_f32_e32 v9, v17, v9 10379; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1 10380; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9 10381; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 10382; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10383; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 10384; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 10385; 
GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9 10386; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 10387; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 10388; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc 10389; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 10390; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 10391; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 10392; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 10393; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc 10394; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 10395; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 10396; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 10397; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 10398; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 10399; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 10400; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 10401; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 10402; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 10403; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 10404; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 10405; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 10406; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 10407; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 10408; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 10409; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 10410; GFX8-NEXT: s_setpc_b64 s[30:31] 10411; 10412; GFX9-LABEL: v_fadd_v16bf16: 10413; GFX9: ; %bb.0: 10414; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10415; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 10416; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 10417; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 10418; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 10419; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 10420; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 10421; GFX9-NEXT: s_movk_i32 s4, 0x7fff 10422; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 10423; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 10424; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 10425; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 10426; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 10427; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc 10428; GFX9-NEXT: 
v_add3_u32 v15, v15, v7, s4 10429; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 10430; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 10431; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc 10432; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 10433; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 10434; GFX9-NEXT: v_add_f32_e32 v15, v17, v15 10435; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 10436; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 10437; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 10438; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 10439; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 10440; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 10441; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 10442; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 10443; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc 10444; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 10445; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 10446; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 10447; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc 10448; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 10449; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 10450; GFX9-NEXT: v_add_f32_e32 v14, v17, v14 10451; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 10452; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 10453; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 10454; GFX9-NEXT: v_add_f32_e32 v5, v5, v13 10455; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 10456; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 10457; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 10458; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 10459; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc 10460; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 10461; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 10462; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 10463; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc 10464; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 10465; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 10466; GFX9-NEXT: v_add_f32_e32 v13, v17, v13 10467; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 10468; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 10469; GFX9-NEXT: v_bfe_u32 
v17, v13, 16, 1 10470; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 10471; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 10472; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 10473; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 10474; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 10475; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc 10476; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 10477; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 10478; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 10479; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc 10480; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 10481; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 10482; GFX9-NEXT: v_add_f32_e32 v12, v17, v12 10483; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 10484; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 10485; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 10486; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 10487; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 10488; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 10489; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 10490; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 10491; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc 10492; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 10493; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 10494; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 10495; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc 10496; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 10497; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 10498; GFX9-NEXT: v_add_f32_e32 v11, v17, v11 10499; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 10500; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 10501; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 10502; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 10503; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 10504; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 10505; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 10506; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 10507; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc 10508; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 10509; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 10510; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10511; 
GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc 10512; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 10513; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 10514; GFX9-NEXT: v_add_f32_e32 v10, v17, v10 10515; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 10516; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10517; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 10518; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 10519; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 10520; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 10521; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 10522; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 10523; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc 10524; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 10525; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 10526; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 10527; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc 10528; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 10529; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 10530; GFX9-NEXT: v_add_f32_e32 v9, v17, v9 10531; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 10532; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10533; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 10534; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 10535; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 10536; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 10537; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 10538; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 10539; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc 10540; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 10541; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 10542; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 10543; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc 10544; GFX9-NEXT: s_mov_b32 s4, 0x7060302 10545; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 10546; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 10547; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 10548; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 10549; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 10550; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 10551; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 10552; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 10553; GFX9-NEXT: s_setpc_b64 
s[30:31] 10554; 10555; GFX10-LABEL: v_fadd_v16bf16: 10556; GFX10: ; %bb.0: 10557; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10558; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 10559; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 10560; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 10561; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 10562; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 10563; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 10564; GFX10-NEXT: v_add_f32_e32 v16, v17, v16 10565; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 10566; GFX10-NEXT: v_add_f32_e32 v7, v7, v15 10567; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 10568; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 10569; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16 10570; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 10571; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 10572; GFX10-NEXT: v_add_f32_e32 v17, v18, v17 10573; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff 10574; GFX10-NEXT: v_add_f32_e32 v6, v6, v14 10575; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff 10576; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7 10577; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 10578; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo 10579; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 10580; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 10581; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 10582; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff 10583; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 10584; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo 10585; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13 10586; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1 10587; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 10588; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 10589; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 10590; GFX10-NEXT: v_add_f32_e32 v17, v20, v19 10591; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4 10592; GFX10-NEXT: v_add_f32_e32 v5, v5, v13 10593; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo 10594; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff 10595; 
GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 10596; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 10597; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 10598; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 10599; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1 10600; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 10601; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 10602; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo 10603; GFX10-NEXT: v_add_f32_e32 v13, v19, v18 10604; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff 10605; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17 10606; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 10607; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff 10608; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5 10609; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 10610; GFX10-NEXT: v_add_f32_e32 v4, v4, v12 10611; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo 10612; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 10613; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 10614; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 10615; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff 10616; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 10617; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo 10618; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13 10619; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 10620; GFX10-NEXT: v_add_f32_e32 v12, v18, v12 10621; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 10622; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 10623; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 10624; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 10625; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12 10626; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo 10627; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 10628; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 10629; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff 10630; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1 10631; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 10632; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff 10633; GFX10-NEXT: v_add_f32_e32 v18, v19, v18 10634; GFX10-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 10635; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 10636; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff 10637; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 10638; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 10639; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 10640; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo 10641; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 10642; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18 10643; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff 10644; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 10645; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 10646; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo 10647; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1 10648; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 10649; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 10650; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 10651; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2 10652; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4 10653; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 10654; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo 10655; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff 10656; GFX10-NEXT: v_add_f32_e32 v19, v22, v20 10657; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8 10658; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 10659; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 10660; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10661; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 10662; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 10663; GFX10-NEXT: v_add_f32_e32 v9, v22, v20 10664; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19 10665; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 10666; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff 10667; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 10668; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 10669; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 10670; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9 10671; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0 10672; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff 10673; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo 10674; GFX10-NEXT: v_or_b32_e32 
v22, 0x400000, v1 10675; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 10676; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 10677; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff 10678; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 10679; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 10680; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo 10681; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 10682; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff 10683; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 10684; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo 10685; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 10686; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo 10687; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10688; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 10689; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo 10690; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 10691; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 10692; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo 10693; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 10694; GFX10-NEXT: s_setpc_b64 s[30:31] 10695; 10696; GFX11-LABEL: v_fadd_v16bf16: 10697; GFX11: ; %bb.0: 10698; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10699; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 10700; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 10701; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 10702; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 10703; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10704; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 10705; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 10706; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 10707; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16 10708; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 10709; GFX11-NEXT: v_add_f32_e32 v17, v18, v17 10710; GFX11-NEXT: v_add_f32_e32 v6, v6, v14 10711; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 10712; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 10713; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 10714; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff 10715; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 10716; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 10717; GFX11-NEXT: v_add_f32_e32 v7, v7, v15 10718; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 10719; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff 10720; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17 10721; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 10722; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 10723; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 10724; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 10725; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff 10726; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7 10727; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 10728; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo 10729; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 10730; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13 10731; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 10732; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 10733; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 10734; GFX11-NEXT: v_dual_add_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16 10735; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff 10736; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12 10737; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4 10738; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 10739; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 10740; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 10741; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 10742; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 10743; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) 10744; GFX11-NEXT: v_add_f32_e32 v4, v4, v12 10745; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 10746; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 
10747; GFX11-NEXT: v_add_f32_e32 v5, v5, v13 10748; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6 10749; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 10750; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18 10751; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff 10752; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 10753; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 10754; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 10755; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) 10756; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo 10757; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 10758; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 10759; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 10760; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 10761; GFX11-NEXT: v_add_f32_e32 v12, v18, v12 10762; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) 10763; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff 10764; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 10765; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 10766; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 10767; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 10768; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo 10769; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff 10770; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13 10771; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 10772; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 10773; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4 10774; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 10775; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo 10776; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 10777; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 10778; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2 10779; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 10780; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 10781; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff 10782; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 10783; GFX11-NEXT: v_add_f32_e32 v18, v19, v18 10784; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo 10785; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1 10786; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 10787; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 10788; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 10789; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18 10790; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 10791; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 10792; GFX11-NEXT: v_add_f32_e32 v3, v3, v11 10793; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff 10794; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff 10795; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 10796; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 10797; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 10798; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff 10799; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 10800; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10801; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo 10802; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 10803; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 10804; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 10805; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2 10806; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 10807; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo 10808; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff 10809; GFX11-NEXT: v_add_f32_e32 v19, v22, v20 10810; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8 10811; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 10812; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 10813; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) 10814; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 10815; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 10816; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 10817; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1 10818; 
GFX11-NEXT: v_dual_add_f32 v0, v0, v8 :: v_dual_add_f32 v1, v1, v9 10819; GFX11-NEXT: v_add_f32_e32 v9, v22, v20 10820; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) 10821; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff 10822; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 10823; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0 10824; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 10825; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 10826; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9 10827; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo 10828; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1 10829; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff 10830; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 10831; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 10832; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff 10833; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 10834; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo 10835; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 10836; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff 10837; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) 10838; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 10839; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo 10840; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 10841; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo 10842; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10843; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 10844; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 10845; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo 10846; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 10847; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 10848; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo 10849; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10850; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 10851; GFX11-NEXT: s_setpc_b64 s[30:31] 10852 %op = fadd <16 x bfloat> %a, %b 10853 ret <16 x bfloat> %op 10854} 10855 
10856define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { 10857; GCN-LABEL: v_fadd_v32bf16: 10858; GCN: ; %bb.0: 10859; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10860; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 10861; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 10862; GCN-NEXT: s_waitcnt vmcnt(1) 10863; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 10864; GCN-NEXT: s_waitcnt vmcnt(0) 10865; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 10866; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10867; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 10868; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 10869; GCN-NEXT: v_add_f32_e32 v31, v31, v32 10870; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 10871; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 10872; GCN-NEXT: s_waitcnt vmcnt(0) 10873; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10874; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10875; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 10876; GCN-NEXT: v_add_f32_e32 v30, v30, v32 10877; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 10878; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 10879; GCN-NEXT: s_waitcnt vmcnt(0) 10880; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10881; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10882; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 10883; GCN-NEXT: v_add_f32_e32 v29, v29, v32 10884; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 10885; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 10886; GCN-NEXT: s_waitcnt vmcnt(0) 10887; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10888; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10889; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 10890; GCN-NEXT: v_add_f32_e32 v28, v28, v32 10891; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 10892; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 10893; GCN-NEXT: s_waitcnt vmcnt(0) 10894; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10895; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10896; GCN-NEXT: buffer_load_dword v33, off, 
s[0:3], s32 offset:108 10897; GCN-NEXT: v_add_f32_e32 v27, v27, v32 10898; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 10899; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 10900; GCN-NEXT: s_waitcnt vmcnt(0) 10901; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10902; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10903; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 10904; GCN-NEXT: v_add_f32_e32 v26, v26, v32 10905; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 10906; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 10907; GCN-NEXT: s_waitcnt vmcnt(0) 10908; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10909; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10910; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 10911; GCN-NEXT: v_add_f32_e32 v25, v25, v32 10912; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 10913; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 10914; GCN-NEXT: s_waitcnt vmcnt(0) 10915; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10916; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10917; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 10918; GCN-NEXT: v_add_f32_e32 v24, v24, v32 10919; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 10920; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 10921; GCN-NEXT: s_waitcnt vmcnt(0) 10922; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10923; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10924; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 10925; GCN-NEXT: v_add_f32_e32 v23, v23, v32 10926; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 10927; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 10928; GCN-NEXT: s_waitcnt vmcnt(0) 10929; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10930; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10931; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 10932; GCN-NEXT: v_add_f32_e32 v22, v22, v32 10933; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 10934; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 10935; GCN-NEXT: s_waitcnt vmcnt(0) 10936; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10937; GCN-NEXT: v_and_b32_e32 v32, 
0xffff0000, v32 10938; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 10939; GCN-NEXT: v_add_f32_e32 v21, v21, v32 10940; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 10941; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 10942; GCN-NEXT: s_waitcnt vmcnt(0) 10943; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10944; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10945; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 10946; GCN-NEXT: v_add_f32_e32 v20, v20, v32 10947; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 10948; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 10949; GCN-NEXT: s_waitcnt vmcnt(0) 10950; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10951; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10952; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 10953; GCN-NEXT: v_add_f32_e32 v19, v19, v32 10954; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 10955; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 10956; GCN-NEXT: s_waitcnt vmcnt(0) 10957; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10958; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10959; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 10960; GCN-NEXT: v_add_f32_e32 v18, v18, v32 10961; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 10962; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 10963; GCN-NEXT: s_waitcnt vmcnt(0) 10964; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10965; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10966; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 10967; GCN-NEXT: v_add_f32_e32 v17, v17, v32 10968; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 10969; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 10970; GCN-NEXT: s_waitcnt vmcnt(0) 10971; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10972; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10973; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 10974; GCN-NEXT: v_add_f32_e32 v16, v16, v32 10975; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 10976; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 10977; GCN-NEXT: s_waitcnt vmcnt(0) 10978; GCN-NEXT: 
v_mul_f32_e32 v32, 1.0, v33 10979; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10980; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 10981; GCN-NEXT: v_add_f32_e32 v15, v15, v32 10982; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 10983; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 10984; GCN-NEXT: s_waitcnt vmcnt(0) 10985; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10986; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10987; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 10988; GCN-NEXT: v_add_f32_e32 v14, v14, v32 10989; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 10990; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 10991; GCN-NEXT: s_waitcnt vmcnt(0) 10992; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 10993; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 10994; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 10995; GCN-NEXT: v_add_f32_e32 v13, v13, v32 10996; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 10997; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 10998; GCN-NEXT: s_waitcnt vmcnt(0) 10999; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11000; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11001; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 11002; GCN-NEXT: v_add_f32_e32 v12, v12, v32 11003; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 11004; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 11005; GCN-NEXT: s_waitcnt vmcnt(0) 11006; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11007; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11008; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 11009; GCN-NEXT: v_add_f32_e32 v11, v11, v32 11010; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 11011; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 11012; GCN-NEXT: s_waitcnt vmcnt(0) 11013; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11014; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11015; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 11016; GCN-NEXT: v_add_f32_e32 v10, v10, v32 11017; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 11018; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 
11019; GCN-NEXT: s_waitcnt vmcnt(0) 11020; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11021; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11022; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 11023; GCN-NEXT: v_add_f32_e32 v9, v9, v32 11024; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 11025; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 11026; GCN-NEXT: s_waitcnt vmcnt(0) 11027; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11028; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11029; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 11030; GCN-NEXT: v_add_f32_e32 v8, v8, v32 11031; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 11032; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 11033; GCN-NEXT: s_waitcnt vmcnt(0) 11034; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11035; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11036; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 11037; GCN-NEXT: v_add_f32_e32 v7, v7, v32 11038; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 11039; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 11040; GCN-NEXT: s_waitcnt vmcnt(0) 11041; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11042; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11043; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 11044; GCN-NEXT: v_add_f32_e32 v6, v6, v32 11045; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 11046; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 11047; GCN-NEXT: s_waitcnt vmcnt(0) 11048; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11049; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11050; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 11051; GCN-NEXT: v_add_f32_e32 v5, v5, v32 11052; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 11053; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 11054; GCN-NEXT: s_waitcnt vmcnt(0) 11055; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11056; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11057; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 11058; GCN-NEXT: v_add_f32_e32 v4, v4, v32 11059; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 11060; GCN-NEXT: v_and_b32_e32 
v3, 0xffff0000, v3 11061; GCN-NEXT: s_waitcnt vmcnt(0) 11062; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11063; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11064; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 11065; GCN-NEXT: v_add_f32_e32 v3, v3, v32 11066; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 11067; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11068; GCN-NEXT: s_waitcnt vmcnt(0) 11069; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11070; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11071; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 11072; GCN-NEXT: v_add_f32_e32 v2, v2, v32 11073; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 11074; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 11075; GCN-NEXT: s_waitcnt vmcnt(0) 11076; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11077; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11078; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 11079; GCN-NEXT: v_add_f32_e32 v1, v1, v32 11080; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 11081; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 11082; GCN-NEXT: s_waitcnt vmcnt(0) 11083; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 11084; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11085; GCN-NEXT: v_add_f32_e32 v0, v0, v32 11086; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 11087; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 11088; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11089; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 11090; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 11091; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 11092; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 11093; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 11094; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 11095; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 11096; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 11097; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 11098; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 11099; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 11100; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 11101; GCN-NEXT: v_and_b32_e32 v15, 
0xffff0000, v15 11102; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 11103; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 11104; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 11105; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 11106; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 11107; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 11108; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 11109; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 11110; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 11111; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 11112; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 11113; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 11114; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 11115; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 11116; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 11117; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 11118; GCN-NEXT: s_setpc_b64 s[30:31] 11119; 11120; GFX7-LABEL: v_fadd_v32bf16: 11121; GFX7: ; %bb.0: 11122; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11123; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 11124; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 11125; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 11126; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 11127; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 11128; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 11129; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 11130; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 11131; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 11132; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 11133; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 11134; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 11135; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 11136; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 11137; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 11138; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 11139; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 11140; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 11141; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 11142; GFX7-NEXT: 
v_and_b32_e32 v22, 0xffff0000, v22 11143; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 11144; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 11145; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 11146; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 11147; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 11148; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 11149; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 11150; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 11151; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 11152; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 11153; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 11154; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 11155; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 11156; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 11157; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 11158; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 11159; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 11160; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 11161; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 11162; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 11163; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 11164; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 11165; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 11166; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 11167; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 11168; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 11169; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 11170; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 11171; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 11172; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 11173; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 11174; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 11175; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 11176; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 11177; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 11178; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 11179; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 11180; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 11181; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11182; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11183; 
GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 11184; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 11185; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 11186; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 11187; GFX7-NEXT: s_waitcnt vmcnt(1) 11188; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 11189; GFX7-NEXT: s_waitcnt vmcnt(0) 11190; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11191; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11192; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 11193; GFX7-NEXT: v_add_f32_e32 v31, v31, v32 11194; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 11195; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 11196; GFX7-NEXT: s_waitcnt vmcnt(0) 11197; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11198; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11199; GFX7-NEXT: v_add_f32_e32 v30, v30, v32 11200; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 11201; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 11202; GFX7-NEXT: s_waitcnt vmcnt(0) 11203; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11204; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11205; GFX7-NEXT: v_add_f32_e32 v29, v29, v32 11206; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 11207; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 11208; GFX7-NEXT: s_waitcnt vmcnt(0) 11209; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11210; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11211; GFX7-NEXT: v_add_f32_e32 v28, v28, v32 11212; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 11213; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 11214; GFX7-NEXT: s_waitcnt vmcnt(0) 11215; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11216; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11217; GFX7-NEXT: v_add_f32_e32 v27, v27, v32 11218; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 11219; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 11220; GFX7-NEXT: s_waitcnt vmcnt(0) 11221; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11222; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11223; GFX7-NEXT: 
v_add_f32_e32 v26, v26, v32 11224; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 11225; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 11226; GFX7-NEXT: s_waitcnt vmcnt(0) 11227; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11228; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11229; GFX7-NEXT: v_add_f32_e32 v25, v25, v32 11230; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 11231; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 11232; GFX7-NEXT: s_waitcnt vmcnt(0) 11233; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11234; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11235; GFX7-NEXT: v_add_f32_e32 v24, v24, v32 11236; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 11237; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 11238; GFX7-NEXT: s_waitcnt vmcnt(0) 11239; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11240; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11241; GFX7-NEXT: v_add_f32_e32 v23, v23, v32 11242; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 11243; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 11244; GFX7-NEXT: s_waitcnt vmcnt(0) 11245; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11246; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11247; GFX7-NEXT: v_add_f32_e32 v22, v22, v32 11248; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 11249; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 11250; GFX7-NEXT: s_waitcnt vmcnt(0) 11251; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11252; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11253; GFX7-NEXT: v_add_f32_e32 v21, v21, v32 11254; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 11255; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 11256; GFX7-NEXT: s_waitcnt vmcnt(0) 11257; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11258; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11259; GFX7-NEXT: v_add_f32_e32 v20, v20, v32 11260; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 11261; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 11262; GFX7-NEXT: s_waitcnt 
vmcnt(0) 11263; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11264; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11265; GFX7-NEXT: v_add_f32_e32 v19, v19, v32 11266; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 11267; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 11268; GFX7-NEXT: s_waitcnt vmcnt(0) 11269; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11270; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11271; GFX7-NEXT: v_add_f32_e32 v18, v18, v32 11272; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 11273; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 11274; GFX7-NEXT: s_waitcnt vmcnt(0) 11275; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11276; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11277; GFX7-NEXT: v_add_f32_e32 v17, v17, v32 11278; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 11279; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 11280; GFX7-NEXT: s_waitcnt vmcnt(0) 11281; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11282; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11283; GFX7-NEXT: v_add_f32_e32 v16, v16, v32 11284; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 11285; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 11286; GFX7-NEXT: s_waitcnt vmcnt(0) 11287; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11288; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11289; GFX7-NEXT: v_add_f32_e32 v15, v15, v32 11290; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 11291; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 11292; GFX7-NEXT: s_waitcnt vmcnt(0) 11293; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11294; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11295; GFX7-NEXT: v_add_f32_e32 v14, v14, v32 11296; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 11297; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 11298; GFX7-NEXT: s_waitcnt vmcnt(0) 11299; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11300; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11301; GFX7-NEXT: v_add_f32_e32 v13, v13, v32 11302; GFX7-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:52 11303; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 11304; GFX7-NEXT: s_waitcnt vmcnt(0) 11305; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11306; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11307; GFX7-NEXT: v_add_f32_e32 v12, v12, v32 11308; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 11309; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 11310; GFX7-NEXT: s_waitcnt vmcnt(0) 11311; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11312; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11313; GFX7-NEXT: v_add_f32_e32 v11, v11, v32 11314; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 11315; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 11316; GFX7-NEXT: s_waitcnt vmcnt(0) 11317; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11318; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11319; GFX7-NEXT: v_add_f32_e32 v10, v10, v32 11320; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 11321; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 11322; GFX7-NEXT: s_waitcnt vmcnt(0) 11323; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11324; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11325; GFX7-NEXT: v_add_f32_e32 v9, v9, v32 11326; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 11327; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 11328; GFX7-NEXT: s_waitcnt vmcnt(0) 11329; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11330; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11331; GFX7-NEXT: v_add_f32_e32 v8, v8, v32 11332; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 11333; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 11334; GFX7-NEXT: s_waitcnt vmcnt(0) 11335; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11336; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11337; GFX7-NEXT: v_add_f32_e32 v7, v7, v32 11338; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 11339; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 11340; GFX7-NEXT: s_waitcnt vmcnt(0) 11341; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11342; 
GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11343; GFX7-NEXT: v_add_f32_e32 v6, v6, v32 11344; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 11345; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 11346; GFX7-NEXT: s_waitcnt vmcnt(0) 11347; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11348; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11349; GFX7-NEXT: v_add_f32_e32 v5, v5, v32 11350; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 11351; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 11352; GFX7-NEXT: s_waitcnt vmcnt(0) 11353; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11354; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11355; GFX7-NEXT: v_add_f32_e32 v4, v4, v32 11356; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 11357; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 11358; GFX7-NEXT: s_waitcnt vmcnt(0) 11359; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11360; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11361; GFX7-NEXT: v_add_f32_e32 v3, v3, v32 11362; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 11363; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 11364; GFX7-NEXT: s_waitcnt vmcnt(0) 11365; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11366; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11367; GFX7-NEXT: v_add_f32_e32 v2, v2, v32 11368; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 11369; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11370; GFX7-NEXT: s_waitcnt vmcnt(0) 11371; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11372; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11373; GFX7-NEXT: v_add_f32_e32 v1, v1, v32 11374; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 11375; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 11376; GFX7-NEXT: s_waitcnt vmcnt(0) 11377; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 11378; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 11379; GFX7-NEXT: v_add_f32_e32 v0, v0, v32 11380; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 11381; GFX7-NEXT: s_setpc_b64 s[30:31] 11382; 11383; GFX8-LABEL: 
v_fadd_v32bf16: 11384; GFX8: ; %bb.0: 11385; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11386; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 11387; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 11388; GFX8-NEXT: v_add_f32_e32 v31, v32, v31 11389; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 11390; GFX8-NEXT: s_movk_i32 s4, 0x7fff 11391; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 11392; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 11393; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 11394; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 11395; GFX8-NEXT: v_add_f32_e32 v14, v14, v30 11396; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 11397; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 11398; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 11399; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc 11400; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 11401; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 11402; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 11403; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 11404; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc 11405; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 11406; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 11407; GFX8-NEXT: v_add_f32_e32 v32, v32, v30 11408; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 11409; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 11410; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 11411; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 11412; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 11413; GFX8-NEXT: v_add_f32_e32 v13, v13, v29 11414; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 11415; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 11416; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 11417; GFX8-NEXT: s_waitcnt vmcnt(0) 11418; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 11419; GFX8-NEXT: v_add_f32_e32 v33, v33, v34 11420; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 11421; GFX8-NEXT: v_add_f32_e32 v30, v15, v30 11422; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 11423; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 11424; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, 
v15 11425; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 11426; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 11427; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 11428; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc 11429; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 11430; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11431; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 11432; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 11433; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc 11434; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 11435; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 11436; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11437; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 11438; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 11439; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc 11440; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 11441; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 11442; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 11443; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 11444; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc 11445; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 11446; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 11447; GFX8-NEXT: v_add_f32_e32 v29, v33, v29 11448; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 11449; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 11450; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 11451; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 11452; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11453; GFX8-NEXT: v_add_f32_e32 v12, v12, v28 11454; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 11455; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 11456; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 11457; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc 11458; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 11459; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 11460; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 11461; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 11462; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc 11463; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 11464; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 
11465; GFX8-NEXT: v_add_f32_e32 v28, v33, v28 11466; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 11467; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 11468; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 11469; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 11470; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11471; GFX8-NEXT: v_add_f32_e32 v11, v11, v27 11472; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 11473; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 11474; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 11475; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc 11476; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 11477; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 11478; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 11479; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 11480; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc 11481; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 11482; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 11483; GFX8-NEXT: v_add_f32_e32 v27, v33, v27 11484; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 11485; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 11486; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 11487; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 11488; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11489; GFX8-NEXT: v_add_f32_e32 v10, v10, v26 11490; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 11491; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 11492; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 11493; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc 11494; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 11495; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 11496; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 11497; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 11498; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc 11499; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 11500; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 11501; GFX8-NEXT: v_add_f32_e32 v26, v33, v26 11502; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 11503; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 11504; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 11505; GFX8-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v9 11506; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11507; GFX8-NEXT: v_add_f32_e32 v9, v9, v25 11508; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 11509; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 11510; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 11511; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc 11512; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 11513; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 11514; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 11515; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 11516; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc 11517; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 11518; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 11519; GFX8-NEXT: v_add_f32_e32 v25, v33, v25 11520; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 11521; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 11522; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 11523; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 11524; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11525; GFX8-NEXT: v_add_f32_e32 v8, v8, v24 11526; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 11527; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 11528; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 11529; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc 11530; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 11531; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 11532; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 11533; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 11534; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc 11535; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 11536; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 11537; GFX8-NEXT: v_add_f32_e32 v24, v33, v24 11538; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 11539; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 11540; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 11541; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 11542; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11543; GFX8-NEXT: v_add_f32_e32 v7, v7, v23 11544; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 11545; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 11546; 
GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 11547; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc 11548; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 11549; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 11550; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 11551; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 11552; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc 11553; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 11554; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 11555; GFX8-NEXT: v_add_f32_e32 v23, v33, v23 11556; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 11557; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 11558; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 11559; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 11560; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11561; GFX8-NEXT: v_add_f32_e32 v6, v6, v22 11562; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 11563; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 11564; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 11565; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc 11566; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 11567; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 11568; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 11569; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 11570; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc 11571; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 11572; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 11573; GFX8-NEXT: v_add_f32_e32 v22, v33, v22 11574; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 11575; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 11576; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 11577; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 11578; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11579; GFX8-NEXT: v_add_f32_e32 v5, v5, v21 11580; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 11581; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 11582; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 11583; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc 11584; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 11585; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 11586; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, 
v5 11587; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11588; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc 11589; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 11590; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 11591; GFX8-NEXT: v_add_f32_e32 v21, v33, v21 11592; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 11593; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 11594; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 11595; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 11596; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11597; GFX8-NEXT: v_add_f32_e32 v4, v4, v20 11598; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 11599; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 11600; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 11601; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc 11602; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 11603; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 11604; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 11605; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 11606; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc 11607; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 11608; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 11609; GFX8-NEXT: v_add_f32_e32 v20, v33, v20 11610; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 11611; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 11612; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 11613; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 11614; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11615; GFX8-NEXT: v_add_f32_e32 v3, v3, v19 11616; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 11617; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 11618; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 11619; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc 11620; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 11621; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 11622; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 11623; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 11624; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc 11625; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 11626; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 11627; GFX8-NEXT: v_add_f32_e32 v19, 
v33, v19 11628; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 11629; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 11630; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 11631; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11632; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11633; GFX8-NEXT: v_add_f32_e32 v2, v2, v18 11634; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 11635; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 11636; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 11637; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc 11638; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 11639; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 11640; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 11641; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11642; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc 11643; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 11644; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 11645; GFX8-NEXT: v_add_f32_e32 v18, v33, v18 11646; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 11647; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 11648; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 11649; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 11650; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 11651; GFX8-NEXT: v_add_f32_e32 v1, v1, v17 11652; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 11653; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 11654; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 11655; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc 11656; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 11657; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 11658; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 11659; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 11660; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc 11661; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 11662; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 11663; GFX8-NEXT: v_add_f32_e32 v17, v33, v17 11664; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 11665; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 11666; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 11667; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 11668; GFX8-NEXT: 
v_add_u32_e32 v33, vcc, s4, v33 11669; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 11670; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 11671; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 11672; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 11673; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc 11674; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 11675; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 11676; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 11677; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 11678; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc 11679; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 11680; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 11681; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11682; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 11683; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 11684; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11685; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 11686; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 11687; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 11688; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 11689; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 11690; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 11691; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 11692; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 11693; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 11694; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 11695; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 11696; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 11697; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 11698; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 11699; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 11700; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 11701; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 11702; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 11703; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 11704; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 11705; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 11706; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 11707; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 11708; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 11709; GFX8-NEXT: 
s_setpc_b64 s[30:31] 11710; 11711; GFX9-LABEL: v_fadd_v32bf16: 11712; GFX9: ; %bb.0: 11713; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11714; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 11715; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 11716; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 11717; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 11718; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 11719; GFX9-NEXT: s_movk_i32 s4, 0x7fff 11720; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 11721; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 11722; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 11723; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 11724; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 11725; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 11726; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc 11727; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 11728; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 11729; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 11730; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc 11731; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 11732; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 11733; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 11734; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 11735; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 11736; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 11737; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 11738; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 11739; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 11740; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 11741; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 11742; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc 11743; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 11744; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 11745; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 11746; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc 11747; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 11748; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 11749; GFX9-NEXT: v_add_f32_e32 v32, v32, v29 11750; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 11751; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, 
v15 11752; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 11753; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 11754; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 11755; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 11756; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 11757; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 11758; GFX9-NEXT: s_waitcnt vmcnt(0) 11759; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 11760; GFX9-NEXT: v_add_f32_e32 v33, v33, v34 11761; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 11762; GFX9-NEXT: v_add_f32_e32 v29, v15, v29 11763; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 11764; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 11765; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 11766; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 11767; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 11768; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc 11769; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 11770; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 11771; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 11772; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc 11773; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 11774; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 11775; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 11776; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 11777; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc 11778; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 11779; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 11780; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc 11781; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 11782; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 11783; GFX9-NEXT: v_add_f32_e32 v28, v33, v28 11784; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 11785; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 11786; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 11787; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 11788; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 11789; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 11790; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 11791; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 11792; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc 
11793; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 11794; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 11795; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 11796; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc 11797; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 11798; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 11799; GFX9-NEXT: v_add_f32_e32 v27, v33, v27 11800; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 11801; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 11802; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 11803; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 11804; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 11805; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 11806; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 11807; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 11808; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc 11809; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 11810; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 11811; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 11812; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc 11813; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 11814; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 11815; GFX9-NEXT: v_add_f32_e32 v26, v33, v26 11816; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 11817; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 11818; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 11819; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 11820; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 11821; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 11822; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 11823; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 11824; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc 11825; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 11826; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 11827; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 11828; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc 11829; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 11830; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 11831; GFX9-NEXT: v_add_f32_e32 v25, v33, v25 11832; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 11833; GFX9-NEXT: v_and_b32_e32 v8, 
0xffff0000, v8 11834; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 11835; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 11836; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 11837; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 11838; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 11839; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 11840; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc 11841; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 11842; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 11843; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 11844; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc 11845; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 11846; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 11847; GFX9-NEXT: v_add_f32_e32 v24, v33, v24 11848; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 11849; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 11850; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 11851; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 11852; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 11853; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 11854; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 11855; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 11856; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc 11857; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 11858; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 11859; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 11860; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc 11861; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 11862; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 11863; GFX9-NEXT: v_add_f32_e32 v23, v33, v23 11864; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 11865; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 11866; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 11867; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 11868; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 11869; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 11870; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 11871; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 11872; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc 11873; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 11874; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 11875; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 11876; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc 11877; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 11878; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 11879; GFX9-NEXT: v_add_f32_e32 v22, v33, v22 11880; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 11881; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 11882; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 11883; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 11884; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 11885; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 11886; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 11887; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 11888; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc 11889; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 11890; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 11891; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11892; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc 11893; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 11894; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 11895; GFX9-NEXT: v_add_f32_e32 v21, v33, v21 11896; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 11897; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 11898; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 11899; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 11900; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 11901; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 11902; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 11903; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 11904; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc 11905; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 11906; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 11907; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 11908; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc 11909; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 11910; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 11911; GFX9-NEXT: v_add_f32_e32 v20, v33, v20 11912; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 11913; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 11914; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 11915; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 11916; GFX9-NEXT: 
v_add3_u32 v33, v33, v20, s4 11917; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 11918; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 11919; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 11920; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc 11921; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 11922; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 11923; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 11924; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc 11925; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 11926; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 11927; GFX9-NEXT: v_add_f32_e32 v19, v33, v19 11928; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 11929; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11930; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 11931; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 11932; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 11933; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 11934; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 11935; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 11936; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc 11937; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 11938; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 11939; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11940; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc 11941; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 11942; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 11943; GFX9-NEXT: v_add_f32_e32 v18, v33, v18 11944; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 11945; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 11946; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 11947; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 11948; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 11949; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 11950; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 11951; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 11952; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc 11953; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 11954; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 11955; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 11956; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc 11957; GFX9-NEXT: 
v_lshlrev_b32_e32 v17, 16, v16 11958; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 11959; GFX9-NEXT: v_add_f32_e32 v17, v33, v17 11960; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 11961; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 11962; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 11963; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 11964; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 11965; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 11966; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 11967; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 11968; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc 11969; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 11970; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 11971; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 11972; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc 11973; GFX9-NEXT: s_mov_b32 s4, 0x7060302 11974; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 11975; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 11976; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 11977; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 11978; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 11979; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 11980; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 11981; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 11982; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 11983; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 11984; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 11985; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 11986; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 11987; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 11988; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 11989; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 11990; GFX9-NEXT: s_setpc_b64 s[30:31] 11991; 11992; GFX10-LABEL: v_fadd_v32bf16: 11993; GFX10: ; %bb.0: 11994; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11995; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 11996; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 11997; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 11998; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 11999; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 12000; GFX10-NEXT: v_lshlrev_b32_e32 
v39, 16, v27 12001; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 12002; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 12003; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 12004; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 12005; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 12006; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 12007; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 12008; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 12009; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 12010; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 12011; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 12012; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 12013; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 12014; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 12015; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 12016; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 12017; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 12018; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 12019; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 12020; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 12021; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 12022; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 12023; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 12024; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 12025; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 12026; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 12027; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 12028; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 12029; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 12030; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 12031; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 12032; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 12033; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 12034; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 12035; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 12036; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 12037; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 12038; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 12039; GFX10-NEXT: v_add_f32_e32 v27, v50, v27 12040; GFX10-NEXT: 
v_lshlrev_b32_e32 v50, 16, v0 12041; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 12042; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12043; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 12044; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 12045; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 12046; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 12047; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 12048; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 12049; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 12050; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 12051; GFX10-NEXT: v_add_f32_e32 v29, v38, v29 12052; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 12053; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 12054; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 12055; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 12056; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 12057; GFX10-NEXT: v_add_f32_e32 v28, v48, v28 12058; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 12059; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 12060; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12061; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 12062; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 12063; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 12064; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 12065; GFX10-NEXT: v_add_f32_e32 v34, v34, v51 12066; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 12067; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 12068; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 12069; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 12070; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 12071; GFX10-NEXT: v_add_f32_e32 v30, v36, v30 12072; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 12073; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 12074; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 12075; GFX10-NEXT: v_add_f32_e32 v2, v2, v18 12076; GFX10-NEXT: v_add_f32_e32 v18, v48, v23 12077; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 12078; GFX10-NEXT: v_add_f32_e32 v17, v50, v22 12079; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 12080; GFX10-NEXT: v_bfe_u32 v23, 
v14, 16, 1 12081; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff 12082; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 12083; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 12084; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 12085; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 12086; GFX10-NEXT: v_add_f32_e32 v20, v36, v25 12087; GFX10-NEXT: v_add_f32_e32 v3, v3, v19 12088; GFX10-NEXT: v_add_f32_e32 v19, v38, v24 12089; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 12090; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 12091; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff 12092; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo 12093; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 12094; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 12095; GFX10-NEXT: v_add_f32_e32 v21, v51, v26 12096; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 12097; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 12098; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff 12099; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo 12100; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 12101; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 12102; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 12103; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff 12104; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 12105; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo 12106; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 12107; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 12108; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff 12109; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 12110; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 12111; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo 12112; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 12113; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff 12114; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 12115; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 12116; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff 12117; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo 12118; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 12119; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 12120; 
GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 12121; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff 12122; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 12123; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo 12124; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 12125; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 12126; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff 12127; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 12128; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 12129; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo 12130; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 12131; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff 12132; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 12133; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 12134; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff 12135; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo 12136; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 12137; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 12138; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 12139; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff 12140; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 12141; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo 12142; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 12143; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 12144; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff 12145; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 12146; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 12147; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo 12148; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 12149; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff 12150; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 12151; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 12152; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff 12153; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo 12154; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 12155; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 12156; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 12157; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff 12158; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 12159; GFX10-NEXT: 
v_cndmask_b32_e32 v39, v51, v39, vcc_lo 12160; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 12161; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 12162; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff 12163; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 12164; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 12165; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo 12166; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 12167; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 12168; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff 12169; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 12170; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 12171; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo 12172; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 12173; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 12174; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff 12175; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 12176; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo 12177; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 12178; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 12179; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff 12180; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 12181; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo 12182; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 12183; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 12184; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff 12185; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 12186; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo 12187; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 12188; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 12189; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff 12190; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 12191; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo 12192; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 12193; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff 12194; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 12195; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 12196; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo 12197; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 12198; 
GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 12199; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 12200; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff 12201; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo 12202; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 12203; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff 12204; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 12205; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 12206; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo 12207; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 12208; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 12209; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 12210; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff 12211; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo 12212; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 12213; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 12214; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff 12215; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 12216; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo 12217; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 12218; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 12219; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff 12220; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 12221; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo 12222; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 12223; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 12224; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff 12225; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 12226; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo 12227; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 12228; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff 12229; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo 12230; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 12231; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 12232; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo 12233; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12234; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 12235; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 12236; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, 
v18, vcc_lo 12237; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12238; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 12239; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo 12240; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 12241; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 12242; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo 12243; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 12244; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 12245; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 12246; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 12247; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 12248; GFX10-NEXT: s_waitcnt vmcnt(0) 12249; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 12250; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 12251; GFX10-NEXT: v_add_f32_e32 v17, v31, v17 12252; GFX10-NEXT: v_add_f32_e32 v15, v15, v18 12253; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 12254; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 12255; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 12256; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 12257; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 12258; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff 12259; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff 12260; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 12261; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 12262; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo 12263; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 12264; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 12265; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo 12266; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 12267; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 12268; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 12269; GFX10-NEXT: s_setpc_b64 s[30:31] 12270; 12271; GFX11-LABEL: v_fadd_v32bf16: 12272; GFX11: ; %bb.0: 12273; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12274; GFX11-NEXT: scratch_load_b32 v32, off, s32 12275; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 12276; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 
12277; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 12278; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 12279; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 12280; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 12281; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 12282; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12283; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26 12284; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 12285; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 12286; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 12287; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 12288; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 12289; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 12290; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 12291; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 12292; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 12293; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5 12294; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1 12295; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff 12296; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 12297; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 12298; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff 12299; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 12300; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 12301; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 12302; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 12303; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 12304; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 12305; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 12306; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 12307; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 12308; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 12309; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 12310; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19 12311; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 12312; GFX11-NEXT: 
v_bfe_u32 v119, v3, 16, 1 12313; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 12314; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff 12315; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 12316; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 12317; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12318; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 12319; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 12320; GFX11-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 12321; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 12322; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3 12323; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff 12324; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 12325; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 12326; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17 12327; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 12328; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 12329; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 12330; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff 12331; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 12332; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 12333; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 12334; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10 12335; GFX11-NEXT: v_add_f32_e32 v2, v2, v18 12336; GFX11-NEXT: v_add_f32_e32 v0, v0, v16 12337; GFX11-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 12338; GFX11-NEXT: v_add_f32_e32 v7, v7, v23 12339; GFX11-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83 12340; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 12341; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 12342; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 12343; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 12344; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24 12345; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23 12346; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 12347; GFX11-NEXT: 
v_add3_u32 v85, v85, v24, 0x7fff 12348; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 12349; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 12350; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff 12351; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 12352; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 12353; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7 12354; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff 12355; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 12356; GFX11-NEXT: v_add_f32_e32 v4, v4, v20 12357; GFX11-NEXT: v_add_f32_e32 v20, v80, v71 12358; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 12359; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9 12360; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 12361; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 12362; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 12363; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff 12364; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 12365; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 12366; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 12367; GFX11-NEXT: v_add_f32_e32 v26, v52, v51 12368; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 12369; GFX11-NEXT: v_add_f32_e32 v6, v6, v22 12370; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 12371; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 12372; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 12373; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 12374; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 12375; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 12376; GFX11-NEXT: v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 12377; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 12378; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 12379; GFX11-NEXT: v_add_f32_e32 v29, v38, v37 12380; GFX11-NEXT: 
v_lshlrev_b32_e32 v31, 16, v15 12381; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 12382; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) 12383; GFX11-NEXT: v_add_f32_e32 v14, v14, v30 12384; GFX11-NEXT: v_add_f32_e32 v28, v48, v39 12385; GFX11-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33 12386; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 12387; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 12388; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14 12389; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 12390; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 12391; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 12392; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 12393; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 12394; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff 12395; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 12396; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff 12397; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff 12398; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13 12399; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 12400; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff 12401; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo 12402; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 12403; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29 12404; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 12405; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff 12406; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12 12407; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo 12408; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 12409; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 12410; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff 12411; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28 12412; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 12413; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo 12414; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 12415; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff 12416; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11 12417; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 
12418; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff 12419; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo 12420; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 12421; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27 12422; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 12423; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff 12424; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10 12425; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo 12426; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 12427; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 12428; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff 12429; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26 12430; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 12431; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo 12432; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 12433; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff 12434; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25 12435; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 12436; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff 12437; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo 12438; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 12439; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8 12440; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff 12441; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 12442; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6 12443; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo 12444; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 12445; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 12446; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff 12447; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22 12448; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 12449; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo 12450; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 12451; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff 12452; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21 12453; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 12454; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff 12455; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo 12456; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v26, v26 12457; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4 12458; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 12459; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff 12460; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20 12461; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo 12462; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 12463; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff 12464; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 12465; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18 12466; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 12467; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo 12468; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 12469; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff 12470; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0 12471; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff 12472; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 12473; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo 12474; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 12475; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2 12476; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 12477; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff 12478; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 12479; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo 12480; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 12481; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 12482; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 12483; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 12484; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 12485; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo 12486; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 12487; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 12488; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo 12489; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 12490; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 12491; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 12492; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo 12493; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v6, v6 12494; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo 12495; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 12496; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 12497; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo 12498; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 12499; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo 12500; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 12501; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 12502; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 12503; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo 12504; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 12505; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo 12506; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 12507; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 12508; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo 12509; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 12510; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo 12511; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 12512; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo 12513; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 12514; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo 12515; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 12516; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 12517; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 12518; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo 12519; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12520; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo 12521; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12522; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 12523; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo 12524; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 12525; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 12526; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 12527; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, 
vcc_lo 12528; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 12529; GFX11-NEXT: s_waitcnt vmcnt(0) 12530; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32 12531; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12532; GFX11-NEXT: v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 12533; GFX11-NEXT: v_add_f32_e32 v15, v15, v18 12534; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 12535; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 12536; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 12537; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 12538; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 12539; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 12540; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff 12541; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff 12542; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 12543; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo 12544; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 12545; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo 12546; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12547; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 12548; GFX11-NEXT: s_setpc_b64 s[30:31] 12549 %op = fadd <32 x bfloat> %a, %b 12550 ret <32 x bfloat> %op 12551} 12552 12553define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { 12554; GCN-LABEL: v_fadd_bf16_fpimm_0: 12555; GCN: ; %bb.0: 12556; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12557; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 12558; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12559; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 12560; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12561; GCN-NEXT: s_setpc_b64 s[30:31] 12562; 12563; GFX7-LABEL: v_fadd_bf16_fpimm_0: 12564; GFX7: ; %bb.0: 12565; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12566; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 12567; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12568; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0 12569; GFX7-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 12570; GFX7-NEXT: s_setpc_b64 s[30:31] 12571; 12572; GFX8-LABEL: v_fadd_bf16_fpimm_0: 12573; GFX8: ; %bb.0: 12574; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12575; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12576; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0 12577; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 12578; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 12579; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 12580; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 12581; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 12582; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 12583; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12584; GFX8-NEXT: s_setpc_b64 s[30:31] 12585; 12586; GFX9-LABEL: v_fadd_bf16_fpimm_0: 12587; GFX9: ; %bb.0: 12588; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12589; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12590; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 12591; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 12592; GFX9-NEXT: s_movk_i32 s4, 0x7fff 12593; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 12594; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 12595; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 12596; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 12597; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12598; GFX9-NEXT: s_setpc_b64 s[30:31] 12599; 12600; GFX10-LABEL: v_fadd_bf16_fpimm_0: 12601; GFX10: ; %bb.0: 12602; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12603; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12604; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0 12605; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 12606; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 12607; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12608; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 12609; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 12610; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12611; GFX10-NEXT: s_setpc_b64 s[30:31] 12612; 12613; GFX11-LABEL: v_fadd_bf16_fpimm_0: 12614; GFX11: ; %bb.0: 12615; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12616; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12617; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12618; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 12619; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 12620; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 12621; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12622; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 12623; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 12624; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 12625; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12626; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12627; GFX11-NEXT: s_setpc_b64 s[30:31] 12628 %add = fadd bfloat %arg0, 1.0 12629 ret bfloat %add 12630} 12631 12632define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { 12633; GCN-LABEL: v_fadd_bf16_fpimm_1: 12634; GCN: ; %bb.0: 12635; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12636; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 12637; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12638; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0 12639; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12640; GCN-NEXT: s_setpc_b64 s[30:31] 12641; 12642; GFX7-LABEL: v_fadd_bf16_fpimm_1: 12643; GFX7: ; %bb.0: 12644; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12645; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 12646; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12647; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0 12648; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12649; GFX7-NEXT: s_setpc_b64 s[30:31] 12650; 12651; GFX8-LABEL: v_fadd_bf16_fpimm_1: 12652; GFX8: ; %bb.0: 12653; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12654; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12655; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0 12656; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 12657; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 12658; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 12659; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 12660; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 12661; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 12662; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12663; GFX8-NEXT: 
s_setpc_b64 s[30:31] 12664; 12665; GFX9-LABEL: v_fadd_bf16_fpimm_1: 12666; GFX9: ; %bb.0: 12667; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12668; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12669; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0 12670; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 12671; GFX9-NEXT: s_movk_i32 s4, 0x7fff 12672; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 12673; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 12674; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 12675; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 12676; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12677; GFX9-NEXT: s_setpc_b64 s[30:31] 12678; 12679; GFX10-LABEL: v_fadd_bf16_fpimm_1: 12680; GFX10: ; %bb.0: 12681; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12682; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12683; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0 12684; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 12685; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 12686; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12687; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 12688; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 12689; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12690; GFX10-NEXT: s_setpc_b64 s[30:31] 12691; 12692; GFX11-LABEL: v_fadd_bf16_fpimm_1: 12693; GFX11: ; %bb.0: 12694; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12695; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12696; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12697; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0 12698; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 12699; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 12700; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12701; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 12702; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 12703; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 12704; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12705; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12706; GFX11-NEXT: s_setpc_b64 s[30:31] 12707 %add = fadd bfloat %arg0, 
42.0 12708 ret bfloat %add 12709} 12710 12711define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { 12712; GCN-LABEL: v_fsub_bf16: 12713; GCN: ; %bb.0: 12714; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12715; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 12716; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 12717; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12718; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12719; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 12720; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12721; GCN-NEXT: s_setpc_b64 s[30:31] 12722; 12723; GFX7-LABEL: v_fsub_bf16: 12724; GFX7: ; %bb.0: 12725; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12726; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 12727; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 12728; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12729; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12730; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 12731; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12732; GFX7-NEXT: s_setpc_b64 s[30:31] 12733; 12734; GFX8-LABEL: v_fsub_bf16: 12735; GFX8: ; %bb.0: 12736; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12737; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 12738; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12739; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 12740; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 12741; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 12742; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 12743; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 12744; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 12745; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 12746; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12747; GFX8-NEXT: s_setpc_b64 s[30:31] 12748; 12749; GFX9-LABEL: v_fsub_bf16: 12750; GFX9: ; %bb.0: 12751; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12752; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 12753; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12754; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 12755; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 12756; GFX9-NEXT: s_movk_i32 s4, 0x7fff 12757; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 12758; GFX9-NEXT: 
v_or_b32_e32 v2, 0x400000, v0 12759; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 12760; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 12761; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12762; GFX9-NEXT: s_setpc_b64 s[30:31] 12763; 12764; GFX10-LABEL: v_fsub_bf16: 12765; GFX10: ; %bb.0: 12766; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12767; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 12768; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12769; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 12770; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 12771; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 12772; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12773; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 12774; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 12775; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12776; GFX10-NEXT: s_setpc_b64 s[30:31] 12777; 12778; GFX11-LABEL: v_fsub_bf16: 12779; GFX11: ; %bb.0: 12780; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12781; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 12782; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 12783; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12784; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 12785; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 12786; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 12787; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12788; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 12789; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 12790; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 12791; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12792; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12793; GFX11-NEXT: s_setpc_b64 s[30:31] 12794 %op = fsub bfloat %a, %b 12795 ret bfloat %op 12796} 12797 12798define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { 12799; GCN-LABEL: v_fsub_v2bf16: 12800; GCN: ; %bb.0: 12801; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12802; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 12803; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 12804; GCN-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 12805; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 12806; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 12807; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12808; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 12809; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12810; GCN-NEXT: v_sub_f32_e32 v1, v1, v3 12811; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 12812; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12813; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12814; GCN-NEXT: s_setpc_b64 s[30:31] 12815; 12816; GFX7-LABEL: v_fsub_v2bf16: 12817; GFX7: ; %bb.0: 12818; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12819; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 12820; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 12821; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 12822; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 12823; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 12824; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12825; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 12826; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12827; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 12828; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 12829; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12830; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12831; GFX7-NEXT: s_setpc_b64 s[30:31] 12832; 12833; GFX8-LABEL: v_fsub_v2bf16: 12834; GFX8: ; %bb.0: 12835; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12836; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 12837; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 12838; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2 12839; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 12840; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 12841; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12842; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12843; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 12844; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 12845; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 12846; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12847; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 12848; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 12849; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 
12850; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 12851; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 12852; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 12853; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 12854; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 12855; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 12856; GFX8-NEXT: s_setpc_b64 s[30:31] 12857; 12858; GFX9-LABEL: v_fsub_v2bf16: 12859; GFX9: ; %bb.0: 12860; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12861; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 12862; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 12863; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 12864; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12865; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12866; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 12867; GFX9-NEXT: s_movk_i32 s4, 0x7fff 12868; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 12869; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 12870; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 12871; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12872; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 12873; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 12874; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 12875; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 12876; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 12877; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 12878; GFX9-NEXT: s_mov_b32 s4, 0x7060302 12879; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 12880; GFX9-NEXT: s_setpc_b64 s[30:31] 12881; 12882; GFX10-LABEL: v_fsub_v2bf16: 12883; GFX10: ; %bb.0: 12884; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12885; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 12886; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 12887; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12888; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12889; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2 12890; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 12891; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 12892; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 12893; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 12894; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12895; GFX10-NEXT: v_or_b32_e32 v5, 
0x400000, v0 12896; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 12897; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 12898; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 12899; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12900; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 12901; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 12902; GFX10-NEXT: s_setpc_b64 s[30:31] 12903; 12904; GFX11-LABEL: v_fsub_v2bf16: 12905; GFX11: ; %bb.0: 12906; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12907; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 12908; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12909; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 12910; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12911; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 12912; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 12913; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2 12914; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 12915; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 12916; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 12917; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 12918; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12919; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 12920; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 12921; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 12922; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) 12923; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 12924; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 12925; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 12926; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12927; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 12928; GFX11-NEXT: s_setpc_b64 s[30:31] 12929 %op = fsub <2 x bfloat> %a, %b 12930 ret <2 x bfloat> %op 12931} 12932 12933define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { 12934; GCN-LABEL: v_fsub_v3bf16: 12935; GCN: ; %bb.0: 12936; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12937; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 12938; 
GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 12939; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 12940; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 12941; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 12942; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 12943; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 12944; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 12945; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 12946; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12947; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 12948; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12949; GCN-NEXT: v_sub_f32_e32 v2, v2, v5 12950; GCN-NEXT: v_sub_f32_e32 v1, v1, v4 12951; GCN-NEXT: v_sub_f32_e32 v0, v0, v3 12952; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12953; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12954; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 12955; GCN-NEXT: s_setpc_b64 s[30:31] 12956; 12957; GFX7-LABEL: v_fsub_v3bf16: 12958; GFX7: ; %bb.0: 12959; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12960; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 12961; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 12962; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 12963; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 12964; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 12965; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 12966; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 12967; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 12968; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 12969; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12970; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 12971; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12972; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 12973; GFX7-NEXT: v_sub_f32_e32 v1, v1, v4 12974; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3 12975; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 12976; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 12977; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 12978; GFX7-NEXT: s_setpc_b64 s[30:31] 12979; 12980; GFX8-LABEL: v_fsub_v3bf16: 12981; GFX8: ; %bb.0: 12982; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12983; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, 
v3 12984; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 12985; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 12986; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 12987; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 12988; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 12989; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 12990; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 12991; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 12992; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 12993; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 12994; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 12995; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 12996; GFX8-NEXT: s_movk_i32 s4, 0x7fff 12997; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 12998; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 12999; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13000; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 13001; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 13002; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 13003; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 13004; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 13005; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 13006; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 13007; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 13008; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 13009; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13010; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 13011; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 13012; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 13013; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 13014; GFX8-NEXT: s_setpc_b64 s[30:31] 13015; 13016; GFX9-LABEL: v_fsub_v3bf16: 13017; GFX9: ; %bb.0: 13018; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13019; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13020; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 13021; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 13022; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 13023; GFX9-NEXT: s_movk_i32 s4, 0x7fff 13024; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 13025; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 13026; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 13027; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 13028; 
GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 13029; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 13030; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3 13031; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13032; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13033; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 13034; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 13035; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 13036; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 13037; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 13038; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 13039; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 13040; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 13041; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 13042; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13043; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 13044; GFX9-NEXT: s_mov_b32 s4, 0x7060302 13045; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 13046; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 13047; GFX9-NEXT: s_setpc_b64 s[30:31] 13048; 13049; GFX10-LABEL: v_fsub_v3bf16: 13050; GFX10: ; %bb.0: 13051; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13052; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13053; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 13054; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13055; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13056; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13057; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 13058; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 13059; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 13060; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 13061; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 13062; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 13063; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 13064; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 13065; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 13066; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 13067; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 13068; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 13069; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 13070; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 13071; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 13072; 
GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13073; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 13074; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 13075; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 13076; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 13077; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 13078; GFX10-NEXT: s_setpc_b64 s[30:31] 13079; 13080; GFX11TRUE16-LABEL: v_fsub_v3bf16: 13081; GFX11TRUE16: ; %bb.0: 13082; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13083; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13084; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 13085; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13086; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13087; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13088; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 13089; GFX11TRUE16-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 13090; GFX11TRUE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 13091; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 13092; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 13093; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 13094; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 13095; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 13096; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 13097; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 13098; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 13099; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 13100; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 13101; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 13102; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 13103; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13104; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 13105; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 13106; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 13107; 
GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 13108; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 13109; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 13110; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 13111; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 13112; 13113; GFX11FAKE16-LABEL: v_fsub_v3bf16: 13114; GFX11FAKE16: ; %bb.0: 13115; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13116; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13117; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 13118; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13119; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13120; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13121; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 13122; GFX11FAKE16-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 13123; GFX11FAKE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 13124; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 13125; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 13126; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 13127; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 13128; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 13129; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 13130; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 13131; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 13132; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 13133; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 13134; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 13135; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 13136; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13137; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 13138; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 13139; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 13140; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 13141; GFX11FAKE16-NEXT: 
v_cndmask_b32_e32 v1, v3, v6, vcc_lo 13142; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 13143; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 13144; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 13145 %op = fsub <3 x bfloat> %a, %b 13146 ret <3 x bfloat> %op 13147} 13148 13149define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { 13150; GCN-LABEL: v_fsub_v4bf16: 13151; GCN: ; %bb.0: 13152; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13153; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 13154; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 13155; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 13156; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 13157; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 13158; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 13159; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 13160; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 13161; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 13162; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13163; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 13164; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13165; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 13166; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13167; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 13168; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13169; GCN-NEXT: v_sub_f32_e32 v3, v3, v7 13170; GCN-NEXT: v_sub_f32_e32 v2, v2, v6 13171; GCN-NEXT: v_sub_f32_e32 v1, v1, v5 13172; GCN-NEXT: v_sub_f32_e32 v0, v0, v4 13173; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13174; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13175; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13176; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13177; GCN-NEXT: s_setpc_b64 s[30:31] 13178; 13179; GFX7-LABEL: v_fsub_v4bf16: 13180; GFX7: ; %bb.0: 13181; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13182; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 13183; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 13184; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 13185; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 13186; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 13187; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 
13188; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 13189; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 13190; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 13191; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13192; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 13193; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13194; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 13195; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13196; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 13197; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13198; GFX7-NEXT: v_sub_f32_e32 v3, v3, v7 13199; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 13200; GFX7-NEXT: v_sub_f32_e32 v1, v1, v5 13201; GFX7-NEXT: v_sub_f32_e32 v0, v0, v4 13202; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13203; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13204; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13205; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13206; GFX7-NEXT: s_setpc_b64 s[30:31] 13207; 13208; GFX8-LABEL: v_fsub_v4bf16: 13209; GFX8: ; %bb.0: 13210; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13211; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 13212; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 13213; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 13214; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 13215; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 13216; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13217; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13218; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 13219; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 13220; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 13221; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 13222; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 13223; GFX8-NEXT: s_movk_i32 s4, 0x7fff 13224; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 13225; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 13226; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 13227; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 13228; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 13229; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 13230; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 13231; GFX8-NEXT: 
v_lshlrev_b32_e32 v5, 16, v0 13232; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 13233; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 13234; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 13235; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13236; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13237; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 13238; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 13239; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 13240; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 13241; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 13242; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 13243; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 13244; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 13245; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 13246; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13247; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 13248; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 13249; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 13250; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 13251; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 13252; GFX8-NEXT: s_setpc_b64 s[30:31] 13253; 13254; GFX9-LABEL: v_fsub_v4bf16: 13255; GFX9: ; %bb.0: 13256; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13257; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 13258; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 13259; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 13260; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13261; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13262; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 13263; GFX9-NEXT: s_movk_i32 s4, 0x7fff 13264; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 13265; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 13266; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 13267; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 13268; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 13269; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 13270; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 13271; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 13272; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 13273; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 13274; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 13275; GFX9-NEXT: 
v_lshlrev_b32_e32 v5, 16, v0 13276; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3 13277; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13278; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13279; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 13280; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 13281; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 13282; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 13283; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 13284; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 13285; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 13286; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 13287; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 13288; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13289; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 13290; GFX9-NEXT: s_mov_b32 s4, 0x7060302 13291; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 13292; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 13293; GFX9-NEXT: s_setpc_b64 s[30:31] 13294; 13295; GFX10-LABEL: v_fsub_v4bf16: 13296; GFX10: ; %bb.0: 13297; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13298; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 13299; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 13300; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13301; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13302; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 13303; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 13304; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 13305; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13306; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13307; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 13308; GFX10-NEXT: v_sub_f32_e32 v3, v7, v6 13309; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 13310; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 13311; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 13312; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 13313; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 13314; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 13315; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 13316; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 13317; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 13318; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 13319; GFX10-NEXT: 
v_cndmask_b32_e32 v4, v5, v7, vcc_lo 13320; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 13321; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 13322; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 13323; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 13324; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 13325; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 13326; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13327; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 13328; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 13329; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 13330; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo 13331; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 13332; GFX10-NEXT: s_setpc_b64 s[30:31] 13333; 13334; GFX11-LABEL: v_fsub_v4bf16: 13335; GFX11: ; %bb.0: 13336; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13337; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 13338; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 13339; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13340; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13341; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 13342; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 13343; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 13344; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 13345; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13346; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 13347; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 13348; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3 13349; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4 13350; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 13351; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 13352; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 13353; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 13354; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 13355; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 13356; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 13357; GFX11-NEXT: v_add3_u32 v6, v6, 
v3, 0x7fff 13358; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 13359; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 13360; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) 13361; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo 13362; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 13363; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 13364; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 13365; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 13366; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 13367; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13368; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 13369; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 13370; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 13371; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 13372; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo 13373; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 13374; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 13375; GFX11-NEXT: s_setpc_b64 s[30:31] 13376 %op = fsub <4 x bfloat> %a, %b 13377 ret <4 x bfloat> %op 13378} 13379 13380define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { 13381; GCN-LABEL: v_fmul_bf16: 13382; GCN: ; %bb.0: 13383; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13384; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 13385; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 13386; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13387; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13388; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 13389; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13390; GCN-NEXT: s_setpc_b64 s[30:31] 13391; 13392; GFX7-LABEL: v_fmul_bf16: 13393; GFX7: ; %bb.0: 13394; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13395; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 13396; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 13397; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13398; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13399; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 13400; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13401; GFX7-NEXT: s_setpc_b64 
s[30:31] 13402; 13403; GFX8-LABEL: v_fmul_bf16: 13404; GFX8: ; %bb.0: 13405; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13406; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 13407; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 13408; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 13409; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 13410; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 13411; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 13412; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 13413; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13414; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 13415; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 13416; GFX8-NEXT: s_setpc_b64 s[30:31] 13417; 13418; GFX9-LABEL: v_fmul_bf16: 13419; GFX9: ; %bb.0: 13420; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13421; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 13422; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 13423; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 13424; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 13425; GFX9-NEXT: s_movk_i32 s4, 0x7fff 13426; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 13427; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 13428; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13429; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 13430; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 13431; GFX9-NEXT: s_setpc_b64 s[30:31] 13432; 13433; GFX10-LABEL: v_fmul_bf16: 13434; GFX10: ; %bb.0: 13435; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13436; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 13437; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 13438; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 13439; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 13440; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 13441; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13442; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 13443; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 13444; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 13445; GFX10-NEXT: s_setpc_b64 s[30:31] 13446; 13447; GFX11-LABEL: v_fmul_bf16: 13448; GFX11: ; %bb.0: 13449; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13450; GFX11-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 13451; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 13452; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13453; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 13454; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 13455; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 13456; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13457; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 13458; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 13459; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 13460; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 13461; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 13462; GFX11-NEXT: s_setpc_b64 s[30:31] 13463 %op = fmul bfloat %a, %b 13464 ret bfloat %op 13465} 13466 13467define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { 13468; GCN-LABEL: v_fmul_v2bf16: 13469; GCN: ; %bb.0: 13470; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13471; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 13472; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 13473; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 13474; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 13475; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13476; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13477; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13478; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13479; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 13480; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 13481; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13482; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13483; GCN-NEXT: s_setpc_b64 s[30:31] 13484; 13485; GFX7-LABEL: v_fmul_v2bf16: 13486; GFX7: ; %bb.0: 13487; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13488; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 13489; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 13490; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 13491; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 13492; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13493; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13494; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13495; 
GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13496; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 13497; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 13498; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13499; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13500; GFX7-NEXT: s_setpc_b64 s[30:31] 13501; 13502; GFX8-LABEL: v_fmul_v2bf16: 13503; GFX8: ; %bb.0: 13504; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13505; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 13506; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 13507; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 13508; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 13509; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 13510; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13511; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13512; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 13513; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 13514; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 13515; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 13516; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 13517; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 13518; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 13519; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 13520; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 13521; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13522; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 13523; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 13524; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 13525; GFX8-NEXT: s_setpc_b64 s[30:31] 13526; 13527; GFX9-LABEL: v_fmul_v2bf16: 13528; GFX9: ; %bb.0: 13529; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13530; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 13531; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 13532; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 13533; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13534; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13535; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 13536; GFX9-NEXT: s_movk_i32 s4, 0x7fff 13537; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 13538; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 13539; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 13540; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v2, v2 13541; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 13542; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 13543; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 13544; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 13545; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13546; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 13547; GFX9-NEXT: s_mov_b32 s4, 0x7060302 13548; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 13549; GFX9-NEXT: s_setpc_b64 s[30:31] 13550; 13551; GFX10-LABEL: v_fmul_v2bf16: 13552; GFX10: ; %bb.0: 13553; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13554; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 13555; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 13556; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13557; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13558; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 13559; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 13560; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 13561; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 13562; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 13563; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 13564; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 13565; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 13566; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 13567; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 13568; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13569; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 13570; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 13571; GFX10-NEXT: s_setpc_b64 s[30:31] 13572; 13573; GFX11-LABEL: v_fmul_v2bf16: 13574; GFX11: ; %bb.0: 13575; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13576; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 13577; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13578; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 13579; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13580; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 13581; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 13582; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 13583; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) 13584; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 13585; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 13586; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 13587; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 13588; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 13589; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 13590; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 13591; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) 13592; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 13593; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13594; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 13595; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 13596; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 13597; GFX11-NEXT: s_setpc_b64 s[30:31] 13598 %op = fmul <2 x bfloat> %a, %b 13599 ret <2 x bfloat> %op 13600} 13601 13602define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { 13603; GCN-LABEL: v_fmul_v3bf16: 13604; GCN: ; %bb.0: 13605; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13606; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 13607; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 13608; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 13609; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 13610; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 13611; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 13612; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 13613; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13614; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 13615; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13616; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13617; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13618; GCN-NEXT: v_mul_f32_e32 v2, v2, v5 13619; GCN-NEXT: v_mul_f32_e32 v1, v1, v4 13620; GCN-NEXT: v_mul_f32_e32 v0, v0, v3 13621; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13622; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13623; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13624; GCN-NEXT: s_setpc_b64 s[30:31] 13625; 13626; GFX7-LABEL: v_fmul_v3bf16: 13627; GFX7: ; %bb.0: 13628; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 13629; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 13630; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 13631; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 13632; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 13633; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 13634; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 13635; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 13636; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13637; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 13638; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13639; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13640; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13641; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5 13642; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4 13643; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 13644; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13645; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13646; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13647; GFX7-NEXT: s_setpc_b64 s[30:31] 13648; 13649; GFX8-LABEL: v_fmul_v3bf16: 13650; GFX8: ; %bb.0: 13651; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13652; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13653; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 13654; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 13655; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 13656; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 13657; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 13658; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 13659; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 13660; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 13661; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 13662; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 13663; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 13664; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 13665; GFX8-NEXT: s_movk_i32 s4, 0x7fff 13666; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 13667; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13668; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13669; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 13670; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 13671; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 13672; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, 
v3 13673; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 13674; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 13675; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 13676; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 13677; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 13678; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13679; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 13680; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 13681; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 13682; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 13683; GFX8-NEXT: s_setpc_b64 s[30:31] 13684; 13685; GFX9-LABEL: v_fmul_v3bf16: 13686; GFX9: ; %bb.0: 13687; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13688; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13689; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 13690; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 13691; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 13692; GFX9-NEXT: s_movk_i32 s4, 0x7fff 13693; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 13694; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 13695; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 13696; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 13697; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 13698; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 13699; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 13700; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13701; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13702; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 13703; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 13704; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 13705; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 13706; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 13707; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 13708; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 13709; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 13710; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 13711; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13712; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 13713; GFX9-NEXT: s_mov_b32 s4, 0x7060302 13714; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 13715; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 13716; GFX9-NEXT: s_setpc_b64 s[30:31] 13717; 13718; GFX10-LABEL: 
v_fmul_v3bf16: 13719; GFX10: ; %bb.0: 13720; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13721; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13722; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 13723; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13724; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13725; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13726; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 13727; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 13728; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 13729; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 13730; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 13731; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 13732; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 13733; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 13734; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 13735; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 13736; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 13737; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 13738; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 13739; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 13740; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 13741; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13742; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 13743; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 13744; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 13745; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 13746; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 13747; GFX10-NEXT: s_setpc_b64 s[30:31] 13748; 13749; GFX11TRUE16-LABEL: v_fmul_v3bf16: 13750; GFX11TRUE16: ; %bb.0: 13751; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13752; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13753; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 13754; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13755; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13756; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13757; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 13758; GFX11TRUE16-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 
16, v1 13759; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 13760; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 13761; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 13762; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 13763; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 13764; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 13765; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 13766; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 13767; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 13768; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 13769; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 13770; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 13771; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 13772; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13773; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 13774; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 13775; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 13776; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 13777; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 13778; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 13779; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 13780; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 13781; 13782; GFX11FAKE16-LABEL: v_fmul_v3bf16: 13783; GFX11FAKE16: ; %bb.0: 13784; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13785; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 13786; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 13787; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 13788; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13789; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13790; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 13791; GFX11FAKE16-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 13792; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 
v1, v1, v3 13793; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 13794; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 13795; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 13796; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 13797; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 13798; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 13799; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 13800; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 13801; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 13802; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 13803; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 13804; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 13805; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13806; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 13807; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 13808; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 13809; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 13810; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 13811; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 13812; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 13813; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 13814 %op = fmul <3 x bfloat> %a, %b 13815 ret <3 x bfloat> %op 13816} 13817 13818define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { 13819; GCN-LABEL: v_fmul_v4bf16: 13820; GCN: ; %bb.0: 13821; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13822; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 13823; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 13824; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 13825; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 13826; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 13827; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 13828; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 13829; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 13830; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 13831; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13832; GCN-NEXT: v_and_b32_e32 v6, 
0xffff0000, v6 13833; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13834; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 13835; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13836; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 13837; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13838; GCN-NEXT: v_mul_f32_e32 v3, v3, v7 13839; GCN-NEXT: v_mul_f32_e32 v2, v2, v6 13840; GCN-NEXT: v_mul_f32_e32 v1, v1, v5 13841; GCN-NEXT: v_mul_f32_e32 v0, v0, v4 13842; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13843; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13844; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13845; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13846; GCN-NEXT: s_setpc_b64 s[30:31] 13847; 13848; GFX7-LABEL: v_fmul_v4bf16: 13849; GFX7: ; %bb.0: 13850; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13851; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 13852; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 13853; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 13854; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 13855; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 13856; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 13857; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 13858; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 13859; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 13860; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13861; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 13862; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13863; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 13864; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13865; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 13866; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13867; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7 13868; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6 13869; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5 13870; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4 13871; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13872; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13873; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13874; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13875; GFX7-NEXT: s_setpc_b64 s[30:31] 13876; 13877; GFX8-LABEL: 
v_fmul_v4bf16: 13878; GFX8: ; %bb.0: 13879; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13880; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 13881; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 13882; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 13883; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 13884; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 13885; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13886; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13887; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 13888; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 13889; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 13890; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 13891; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 13892; GFX8-NEXT: s_movk_i32 s4, 0x7fff 13893; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 13894; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 13895; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 13896; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 13897; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 13898; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 13899; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 13900; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 13901; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 13902; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 13903; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 13904; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13905; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13906; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 13907; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 13908; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 13909; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 13910; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 13911; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 13912; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 13913; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 13914; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 13915; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13916; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 13917; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 13918; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 13919; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 13920; 
GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 13921; GFX8-NEXT: s_setpc_b64 s[30:31] 13922; 13923; GFX9-LABEL: v_fmul_v4bf16: 13924; GFX9: ; %bb.0: 13925; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13926; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 13927; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 13928; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 13929; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13930; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13931; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 13932; GFX9-NEXT: s_movk_i32 s4, 0x7fff 13933; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 13934; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 13935; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 13936; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 13937; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 13938; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 13939; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 13940; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 13941; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 13942; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 13943; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 13944; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 13945; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 13946; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13947; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13948; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 13949; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 13950; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 13951; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 13952; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 13953; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 13954; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 13955; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 13956; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 13957; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 13958; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 13959; GFX9-NEXT: s_mov_b32 s4, 0x7060302 13960; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 13961; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 13962; GFX9-NEXT: s_setpc_b64 s[30:31] 13963; 13964; GFX10-LABEL: v_fmul_v4bf16: 13965; GFX10: ; %bb.0: 13966; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 13967; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 13968; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 13969; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 13970; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 13971; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 13972; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 13973; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 13974; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 13975; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 13976; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 13977; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6 13978; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 13979; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 13980; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 13981; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 13982; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 13983; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 13984; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 13985; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 13986; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 13987; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 13988; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo 13989; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 13990; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 13991; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 13992; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 13993; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 13994; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 13995; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 13996; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 13997; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 13998; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 13999; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo 14000; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 14001; GFX10-NEXT: s_setpc_b64 s[30:31] 14002; 14003; GFX11-LABEL: v_fmul_v4bf16: 14004; GFX11: ; %bb.0: 14005; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14006; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 14007; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 14008; GFX11-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 14009; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14010; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 14011; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 14012; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 14013; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 14014; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14015; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 14016; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 14017; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 14018; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4 14019; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 14020; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 14021; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 14022; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 14023; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 14024; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 14025; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 14026; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 14027; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 14028; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 14029; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) 14030; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo 14031; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 14032; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 14033; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 14034; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 14035; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 14036; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 14037; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 14038; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 14039; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 14040; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 14041; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo 14042; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 14043; GFX11-NEXT: v_perm_b32 v1, 
v1, v4, 0x7060302 14044; GFX11-NEXT: s_setpc_b64 s[30:31] 14045 %op = fmul <4 x bfloat> %a, %b 14046 ret <4 x bfloat> %op 14047} 14048 14049define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { 14050; GCN-LABEL: v_fmul_v8bf16: 14051; GCN: ; %bb.0: 14052; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14053; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 14054; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 14055; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 14056; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 14057; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 14058; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 14059; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 14060; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 14061; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 14062; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 14063; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 14064; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 14065; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 14066; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 14067; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 14068; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 14069; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 14070; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14071; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 14072; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14073; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 14074; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14075; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 14076; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14077; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 14078; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14079; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 14080; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14081; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 14082; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14083; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 14084; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14085; GCN-NEXT: v_mul_f32_e32 v7, v7, v15 14086; GCN-NEXT: v_mul_f32_e32 v6, v6, v14 14087; GCN-NEXT: v_mul_f32_e32 v5, v5, v13 14088; GCN-NEXT: v_mul_f32_e32 v4, 
v4, v12 14089; GCN-NEXT: v_mul_f32_e32 v3, v3, v11 14090; GCN-NEXT: v_mul_f32_e32 v2, v2, v10 14091; GCN-NEXT: v_mul_f32_e32 v1, v1, v9 14092; GCN-NEXT: v_mul_f32_e32 v0, v0, v8 14093; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14094; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14095; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14096; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14097; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14098; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14099; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14100; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14101; GCN-NEXT: s_setpc_b64 s[30:31] 14102; 14103; GFX7-LABEL: v_fmul_v8bf16: 14104; GFX7: ; %bb.0: 14105; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14106; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 14107; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 14108; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 14109; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 14110; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 14111; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 14112; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 14113; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 14114; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 14115; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 14116; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 14117; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 14118; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 14119; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 14120; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 14121; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 14122; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 14123; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14124; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 14125; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14126; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 14127; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14128; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 14129; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14130; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 14131; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14132; GFX7-NEXT: 
v_and_b32_e32 v10, 0xffff0000, v10 14133; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14134; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 14135; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14136; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 14137; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14138; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15 14139; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14 14140; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13 14141; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12 14142; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11 14143; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10 14144; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9 14145; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8 14146; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14147; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14148; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14149; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14150; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14151; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14152; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14153; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14154; GFX7-NEXT: s_setpc_b64 s[30:31] 14155; 14156; GFX8-LABEL: v_fmul_v8bf16: 14157; GFX8: ; %bb.0: 14158; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14159; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 14160; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 14161; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 14162; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1 14163; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8 14164; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14165; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14166; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 14167; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7 14168; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8 14169; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 14170; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 14171; GFX8-NEXT: s_movk_i32 s4, 0x7fff 14172; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc 14173; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 14174; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 14175; GFX8-NEXT: v_or_b32_e32 v9, 
0x400000, v3 14176; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 14177; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc 14178; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 14179; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 14180; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7 14181; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1 14182; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7 14183; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14184; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14185; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 14186; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6 14187; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7 14188; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 14189; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 14190; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc 14191; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 14192; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 14193; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 14194; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 14195; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc 14196; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 14197; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 14198; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6 14199; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 14200; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 14201; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14202; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14203; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 14204; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5 14205; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 14206; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 14207; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 14208; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 14209; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 14210; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 14211; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 14212; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 14213; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc 14214; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 14215; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 14216; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5 14217; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 
14218; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 14219; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14220; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14221; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 14222; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4 14223; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 14224; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14225; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 14226; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14227; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 14228; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 14229; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 14230; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 14231; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc 14232; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 14233; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 14234; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 14235; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 14236; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 14237; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 14238; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 14239; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 14240; GFX8-NEXT: s_setpc_b64 s[30:31] 14241; 14242; GFX9-LABEL: v_fmul_v8bf16: 14243; GFX9: ; %bb.0: 14244; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14245; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 14246; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 14247; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 14248; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14249; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14250; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 14251; GFX9-NEXT: s_movk_i32 s4, 0x7fff 14252; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 14253; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 14254; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 14255; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 14256; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 14257; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc 14258; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 14259; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 14260; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 14261; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, 
v9, vcc 14262; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 14263; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 14264; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7 14265; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14266; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14267; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 14268; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 14269; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 14270; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 14271; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 14272; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 14273; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc 14274; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 14275; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 14276; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 14277; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc 14278; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 14279; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 14280; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6 14281; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14282; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14283; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 14284; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 14285; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 14286; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 14287; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 14288; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 14289; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 14290; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 14291; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 14292; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 14293; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc 14294; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 14295; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 14296; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5 14297; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14298; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14299; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 14300; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 14301; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 14302; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 14303; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14304; GFX9-NEXT: v_bfe_u32 v4, 
v0, 16, 1 14305; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14306; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 14307; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 14308; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 14309; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc 14310; GFX9-NEXT: s_mov_b32 s4, 0x7060302 14311; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 14312; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 14313; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 14314; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 14315; GFX9-NEXT: s_setpc_b64 s[30:31] 14316; 14317; GFX10-LABEL: v_fmul_v8bf16: 14318; GFX10: ; %bb.0: 14319; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14320; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 14321; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 14322; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14323; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14324; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 14325; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14326; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8 14327; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 14328; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14329; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 14330; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 14331; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1 14332; GFX10-NEXT: v_mul_f32_e32 v7, v10, v9 14333; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8 14334; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 14335; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6 14336; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff 14337; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 14338; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1 14339; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 14340; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1 14341; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo 14342; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1 14343; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff 14344; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff 14345; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 14346; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 14347; GFX10-NEXT: v_mul_f32_e32 v6, v10, v6 14348; 
GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff 14349; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14350; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14351; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 14352; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo 14353; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2 14354; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 14355; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14356; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14357; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 14358; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 14359; GFX10-NEXT: v_mul_f32_e32 v5, v15, v13 14360; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3 14361; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 14362; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo 14363; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff 14364; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 14365; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 14366; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 14367; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 14368; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 14369; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1 14370; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff 14371; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5 14372; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo 14373; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff 14374; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14375; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff 14376; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0 14377; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 14378; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo 14379; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 14380; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo 14381; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 14382; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 14383; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo 14384; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 14385; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 14386; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo 14387; GFX10-NEXT: 
v_perm_b32 v3, v3, v8, 0x7060302 14388; GFX10-NEXT: s_setpc_b64 s[30:31] 14389; 14390; GFX11-LABEL: v_fmul_v8bf16: 14391; GFX11: ; %bb.0: 14392; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14393; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 14394; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 14395; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14396; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 14397; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 14398; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 14399; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 14400; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 14401; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 14402; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14403; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1 14404; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 14405; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7 14406; GFX11-NEXT: v_mul_f32_e32 v7, v10, v9 14407; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8 14408; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff 14409; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14410; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 14411; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 14412; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 14413; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 14414; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo 14415; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 14416; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff 14417; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff 14418; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 14419; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14420; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1 14421; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14422; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 14423; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_mul_f32 v2, v2, v6 14424; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, 
v5 14425; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14426; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 14427; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) 14428; GFX11-NEXT: v_mul_f32_e32 v6, v10, v6 14429; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2 14430; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 14431; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff 14432; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 14433; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 14434; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo 14435; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 14436; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 14437; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 14438; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14439; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14440; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 14441; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) 14442; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 14443; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff 14444; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10 14445; GFX11-NEXT: v_mul_f32_e32 v5, v15, v13 14446; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 14447; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 14448; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 14449; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 14450; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 14451; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) 14452; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff 14453; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 14454; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14455; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff 14456; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff 14457; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 14458; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 14459; GFX11-NEXT: v_cndmask_b32_e32 v5, 
v10, v11, vcc_lo 14460; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 14461; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo 14462; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 14463; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 14464; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 14465; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo 14466; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 14467; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 14468; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo 14469; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 14470; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 14471; GFX11-NEXT: s_setpc_b64 s[30:31] 14472 %op = fmul <8 x bfloat> %a, %b 14473 ret <8 x bfloat> %op 14474} 14475 14476define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { 14477; GCN-LABEL: v_fmul_v16bf16: 14478; GCN: ; %bb.0: 14479; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14480; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 14481; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 14482; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 14483; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 14484; GCN-NEXT: v_mul_f32_e32 v14, v14, v30 14485; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 14486; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 14487; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 14488; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 14489; GCN-NEXT: v_mul_f32_e32 v13, v13, v29 14490; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 14491; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 14492; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 14493; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 14494; GCN-NEXT: v_mul_f32_e32 v12, v12, v28 14495; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 14496; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 14497; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 14498; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 14499; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 14500; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 14501; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 14502; GCN-NEXT: 
v_mul_f32_e32 v24, 1.0, v24 14503; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 14504; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 14505; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 14506; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 14507; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 14508; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 14509; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 14510; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 14511; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 14512; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 14513; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 14514; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 14515; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 14516; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 14517; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 14518; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 14519; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 14520; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 14521; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 14522; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 14523; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 14524; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 14525; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 14526; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 14527; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 14528; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 14529; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 14530; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 14531; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14532; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 14533; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14534; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 14535; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14536; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 14537; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14538; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 14539; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14540; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 14541; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14542; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 14543; GCN-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 14544; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 14545; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14546; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 14547; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 14548; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 14549; GCN-NEXT: v_mul_f32_e32 v8, v8, v24 14550; GCN-NEXT: v_mul_f32_e32 v7, v7, v23 14551; GCN-NEXT: v_mul_f32_e32 v6, v6, v22 14552; GCN-NEXT: v_mul_f32_e32 v5, v5, v21 14553; GCN-NEXT: v_mul_f32_e32 v4, v4, v20 14554; GCN-NEXT: v_mul_f32_e32 v3, v3, v19 14555; GCN-NEXT: v_mul_f32_e32 v2, v2, v18 14556; GCN-NEXT: v_mul_f32_e32 v1, v1, v17 14557; GCN-NEXT: v_mul_f32_e32 v0, v0, v16 14558; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14559; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14560; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14561; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14562; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14563; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14564; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14565; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14566; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 14567; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 14568; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 14569; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 14570; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 14571; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 14572; GCN-NEXT: s_waitcnt vmcnt(0) 14573; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 14574; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 14575; GCN-NEXT: v_mul_f32_e32 v15, v15, v16 14576; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 14577; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 14578; GCN-NEXT: s_setpc_b64 s[30:31] 14579; 14580; GFX7-LABEL: v_fmul_v16bf16: 14581; GFX7: ; %bb.0: 14582; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14583; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 14584; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 14585; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 14586; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 14587; 
GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 14588; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 14589; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 14590; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 14591; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 14592; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14593; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 14594; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 14595; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 14596; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 14597; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 14598; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 14599; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 14600; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 14601; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 14602; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 14603; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 14604; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 14605; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 14606; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 14607; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 14608; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 14609; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 14610; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 14611; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 14612; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 14613; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 14614; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 14615; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 14616; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 14617; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 14618; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 14619; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 14620; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 14621; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 14622; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 14623; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 14624; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 14625; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 14626; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 14627; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 14628; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 14629; 
GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 14630; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 14631; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 14632; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 14633; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 14634; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14635; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 14636; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 14637; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14638; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 14639; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14640; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 14641; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14642; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 14643; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14644; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 14645; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14646; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 14647; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14648; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30 14649; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29 14650; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28 14651; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26 14652; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25 14653; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24 14654; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23 14655; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21 14656; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20 14657; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19 14658; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18 14659; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17 14660; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16 14661; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14662; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14663; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14664; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14665; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14666; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14667; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14668; GFX7-NEXT: s_waitcnt vmcnt(0) 14669; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, 
v27 14670; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 14671; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22 14672; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14673; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 14674; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 14675; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 14676; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 14677; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 14678; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 14679; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 14680; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 14681; GFX7-NEXT: s_setpc_b64 s[30:31] 14682; 14683; GFX8-LABEL: v_fmul_v16bf16: 14684; GFX8: ; %bb.0: 14685; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14686; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 14687; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 14688; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 14689; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1 14690; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16 14691; GFX8-NEXT: s_movk_i32 s4, 0x7fff 14692; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 14693; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14694; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 14695; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15 14696; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16 14697; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 14698; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 14699; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc 14700; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 14701; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 14702; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7 14703; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 14704; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc 14705; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 14706; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 14707; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15 14708; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 14709; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15 14710; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 14711; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 
14712; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 14713; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14 14714; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15 14715; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 14716; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 14717; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc 14718; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 14719; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 14720; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6 14721; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 14722; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc 14723; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 14724; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 14725; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14 14726; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1 14727; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14 14728; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 14729; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14730; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 14731; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13 14732; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14 14733; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 14734; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 14735; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc 14736; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 14737; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 14738; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5 14739; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14740; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc 14741; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 14742; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 14743; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13 14744; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1 14745; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13 14746; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 14747; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14748; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 14749; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12 14750; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13 14751; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 14752; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 
14753; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc 14754; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 14755; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 14756; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4 14757; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 14758; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc 14759; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 14760; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 14761; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12 14762; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1 14763; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12 14764; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 14765; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14766; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 14767; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11 14768; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12 14769; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 14770; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 14771; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc 14772; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 14773; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 14774; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3 14775; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 14776; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc 14777; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 14778; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 14779; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11 14780; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1 14781; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11 14782; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 14783; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14784; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 14785; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10 14786; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11 14787; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 14788; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 14789; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc 14790; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 14791; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 14792; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2 14793; GFX8-NEXT: v_cmp_u_f32_e32 
vcc, v2, v2 14794; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc 14795; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 14796; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 14797; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10 14798; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1 14799; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10 14800; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 14801; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14802; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 14803; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9 14804; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10 14805; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 14806; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 14807; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc 14808; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 14809; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 14810; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1 14811; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 14812; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc 14813; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 14814; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 14815; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9 14816; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1 14817; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9 14818; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 14819; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14820; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 14821; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8 14822; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9 14823; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 14824; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 14825; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc 14826; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 14827; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 14828; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 14829; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 14830; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc 14831; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 14832; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 14833; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 14834; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 14835; 
GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 14836; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 14837; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 14838; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 14839; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 14840; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 14841; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 14842; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 14843; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 14844; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 14845; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 14846; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 14847; GFX8-NEXT: s_setpc_b64 s[30:31] 14848; 14849; GFX9-LABEL: v_fmul_v16bf16: 14850; GFX9: ; %bb.0: 14851; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14852; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 14853; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 14854; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 14855; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 14856; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14857; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 14858; GFX9-NEXT: s_movk_i32 s4, 0x7fff 14859; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 14860; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 14861; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 14862; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 14863; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 14864; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc 14865; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 14866; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 14867; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 14868; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc 14869; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 14870; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 14871; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15 14872; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 14873; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 14874; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 14875; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 14876; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 14877; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 14878; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v15, v15 14879; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 14880; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc 14881; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 14882; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 14883; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 14884; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc 14885; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 14886; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 14887; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14 14888; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 14889; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 14890; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 14891; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 14892; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 14893; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 14894; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 14895; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 14896; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc 14897; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 14898; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 14899; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14900; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc 14901; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 14902; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 14903; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13 14904; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 14905; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 14906; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 14907; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 14908; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 14909; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 14910; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 14911; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 14912; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc 14913; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 14914; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 14915; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 14916; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc 14917; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 14918; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 14919; GFX9-NEXT: v_mul_f32_e32 
v12, v17, v12 14920; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 14921; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14922; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 14923; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 14924; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 14925; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 14926; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 14927; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 14928; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc 14929; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 14930; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 14931; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 14932; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc 14933; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 14934; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 14935; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11 14936; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 14937; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14938; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 14939; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 14940; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 14941; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 14942; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 14943; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 14944; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc 14945; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 14946; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 14947; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 14948; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc 14949; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 14950; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 14951; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10 14952; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 14953; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 14954; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 14955; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 14956; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 14957; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 14958; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 14959; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 14960; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc 14961; 
GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 14962; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 14963; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 14964; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc 14965; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 14966; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 14967; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9 14968; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 14969; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 14970; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 14971; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 14972; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 14973; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 14974; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 14975; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 14976; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc 14977; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 14978; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 14979; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 14980; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc 14981; GFX9-NEXT: s_mov_b32 s4, 0x7060302 14982; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 14983; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 14984; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 14985; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 14986; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 14987; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 14988; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 14989; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 14990; GFX9-NEXT: s_setpc_b64 s[30:31] 14991; 14992; GFX10-LABEL: v_fmul_v16bf16: 14993; GFX10: ; %bb.0: 14994; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14995; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 14996; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 14997; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 14998; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 14999; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 15000; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 15001; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16 15002; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 15003; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15 15004; GFX10-NEXT: v_and_b32_e32 v14, 
0xffff0000, v14 15005; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 15006; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16 15007; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 15008; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 15009; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17 15010; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff 15011; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14 15012; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff 15013; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7 15014; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 15015; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo 15016; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 15017; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 15018; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 15019; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff 15020; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 15021; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo 15022; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13 15023; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1 15024; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 15025; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 15026; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 15027; GFX10-NEXT: v_mul_f32_e32 v17, v20, v19 15028; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4 15029; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13 15030; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo 15031; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff 15032; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 15033; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 15034; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 15035; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15036; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1 15037; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 15038; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 15039; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo 15040; GFX10-NEXT: v_mul_f32_e32 v13, v19, v18 15041; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff 15042; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17 15043; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 15044; GFX10-NEXT: 
v_add3_u32 v19, v21, v5, 0x7fff 15045; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5 15046; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 15047; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12 15048; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo 15049; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 15050; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 15051; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 15052; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff 15053; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 15054; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo 15055; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13 15056; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 15057; GFX10-NEXT: v_mul_f32_e32 v12, v18, v12 15058; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 15059; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 15060; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 15061; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11 15062; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12 15063; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo 15064; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 15065; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 15066; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff 15067; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1 15068; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 15069; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff 15070; GFX10-NEXT: v_mul_f32_e32 v18, v19, v18 15071; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15072; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 15073; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff 15074; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 15075; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 15076; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10 15077; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo 15078; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 15079; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18 15080; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff 15081; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 15082; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 15083; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, 
vcc_lo 15084; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1 15085; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 15086; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 15087; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 15088; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2 15089; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4 15090; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 15091; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo 15092; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff 15093; GFX10-NEXT: v_mul_f32_e32 v19, v22, v20 15094; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8 15095; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 15096; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 15097; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 15098; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 15099; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9 15100; GFX10-NEXT: v_mul_f32_e32 v9, v22, v20 15101; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19 15102; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8 15103; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff 15104; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 15105; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 15106; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 15107; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9 15108; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0 15109; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff 15110; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo 15111; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1 15112; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 15113; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 15114; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff 15115; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 15116; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 15117; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo 15118; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 15119; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff 15120; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 15121; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo 15122; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 15123; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v20, v25, vcc_lo 15124; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 15125; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 15126; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo 15127; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 15128; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 15129; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo 15130; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 15131; GFX10-NEXT: s_setpc_b64 s[30:31] 15132; 15133; GFX11-LABEL: v_fmul_v16bf16: 15134; GFX11: ; %bb.0: 15135; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15136; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 15137; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 15138; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 15139; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 15140; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 15141; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 15142; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 15143; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 15144; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16 15145; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 15146; GFX11-NEXT: v_mul_f32_e32 v17, v18, v17 15147; GFX11-NEXT: v_mul_f32_e32 v6, v6, v14 15148; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 15149; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 15150; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 15151; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff 15152; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 15153; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 15154; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15 15155; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 15156; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff 15157; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17 15158; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 15159; GFX11-NEXT: v_dual_cndmask_b32 
v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 15160; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 15161; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 15162; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff 15163; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7 15164; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 15165; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo 15166; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 15167; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13 15168; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 15169; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 15170; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 15171; GFX11-NEXT: v_dual_mul_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16 15172; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff 15173; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12 15174; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4 15175; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 15176; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 15177; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 15178; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 15179; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15180; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) 15181; GFX11-NEXT: v_mul_f32_e32 v4, v4, v12 15182; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 15183; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 15184; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13 15185; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6 15186; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 15187; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18 15188; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff 15189; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 15190; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 15191; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 15192; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) 15193; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo 
15194; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 15195; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 15196; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 15197; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 15198; GFX11-NEXT: v_mul_f32_e32 v12, v18, v12 15199; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) 15200; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff 15201; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 15202; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 15203; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 15204; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15205; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo 15206; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff 15207; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13 15208; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 15209; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 15210; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4 15211; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 15212; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo 15213; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 15214; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 15215; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2 15216; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 15217; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15218; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff 15219; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 15220; GFX11-NEXT: v_mul_f32_e32 v18, v19, v18 15221; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo 15222; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1 15223; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 15224; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 15225; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 15226; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18 15227; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15228; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 15229; GFX11-NEXT: v_mul_f32_e32 v3, 
v3, v11 15230; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff 15231; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff 15232; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 15233; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 15234; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 15235; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff 15236; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 15237; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 15238; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo 15239; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 15240; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 15241; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 15242; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2 15243; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 15244; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo 15245; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff 15246; GFX11-NEXT: v_mul_f32_e32 v19, v22, v20 15247; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8 15248; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 15249; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 15250; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) 15251; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 15252; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 15253; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 15254; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1 15255; GFX11-NEXT: v_dual_mul_f32 v0, v0, v8 :: v_dual_mul_f32 v1, v1, v9 15256; GFX11-NEXT: v_mul_f32_e32 v9, v22, v20 15257; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) 15258; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff 15259; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 15260; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0 15261; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 15262; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 15263; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9 15264; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo 15265; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1 15266; GFX11-NEXT: v_add3_u32 v8, 
v8, v1, 0x7fff 15267; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 15268; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 15269; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff 15270; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 15271; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo 15272; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 15273; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff 15274; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) 15275; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 15276; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo 15277; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 15278; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo 15279; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 15280; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 15281; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 15282; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo 15283; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 15284; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 15285; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo 15286; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 15287; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 15288; GFX11-NEXT: s_setpc_b64 s[30:31] 15289 %op = fmul <16 x bfloat> %a, %b 15290 ret <16 x bfloat> %op 15291} 15292 15293define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { 15294; GCN-LABEL: v_fmul_v32bf16: 15295; GCN: ; %bb.0: 15296; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15297; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 15298; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 15299; GCN-NEXT: s_waitcnt vmcnt(1) 15300; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 15301; GCN-NEXT: s_waitcnt vmcnt(0) 15302; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 15303; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15304; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 15305; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:124 15306; GCN-NEXT: v_mul_f32_e32 v31, v31, v32 15307; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 15308; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 15309; GCN-NEXT: s_waitcnt vmcnt(0) 15310; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15311; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15312; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 15313; GCN-NEXT: v_mul_f32_e32 v30, v30, v32 15314; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 15315; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 15316; GCN-NEXT: s_waitcnt vmcnt(0) 15317; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15318; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15319; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 15320; GCN-NEXT: v_mul_f32_e32 v29, v29, v32 15321; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 15322; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 15323; GCN-NEXT: s_waitcnt vmcnt(0) 15324; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15325; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15326; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 15327; GCN-NEXT: v_mul_f32_e32 v28, v28, v32 15328; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 15329; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 15330; GCN-NEXT: s_waitcnt vmcnt(0) 15331; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15332; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15333; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 15334; GCN-NEXT: v_mul_f32_e32 v27, v27, v32 15335; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 15336; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 15337; GCN-NEXT: s_waitcnt vmcnt(0) 15338; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15339; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15340; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 15341; GCN-NEXT: v_mul_f32_e32 v26, v26, v32 15342; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 15343; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 15344; GCN-NEXT: s_waitcnt vmcnt(0) 15345; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15346; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, 
v32 15347; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 15348; GCN-NEXT: v_mul_f32_e32 v25, v25, v32 15349; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 15350; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 15351; GCN-NEXT: s_waitcnt vmcnt(0) 15352; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15353; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15354; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 15355; GCN-NEXT: v_mul_f32_e32 v24, v24, v32 15356; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 15357; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 15358; GCN-NEXT: s_waitcnt vmcnt(0) 15359; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15360; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15361; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 15362; GCN-NEXT: v_mul_f32_e32 v23, v23, v32 15363; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 15364; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 15365; GCN-NEXT: s_waitcnt vmcnt(0) 15366; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15367; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15368; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 15369; GCN-NEXT: v_mul_f32_e32 v22, v22, v32 15370; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 15371; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 15372; GCN-NEXT: s_waitcnt vmcnt(0) 15373; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15374; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15375; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 15376; GCN-NEXT: v_mul_f32_e32 v21, v21, v32 15377; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 15378; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 15379; GCN-NEXT: s_waitcnt vmcnt(0) 15380; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15381; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15382; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 15383; GCN-NEXT: v_mul_f32_e32 v20, v20, v32 15384; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 15385; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 15386; GCN-NEXT: s_waitcnt vmcnt(0) 15387; GCN-NEXT: v_mul_f32_e32 v32, 
1.0, v33 15388; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15389; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 15390; GCN-NEXT: v_mul_f32_e32 v19, v19, v32 15391; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 15392; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 15393; GCN-NEXT: s_waitcnt vmcnt(0) 15394; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15395; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15396; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 15397; GCN-NEXT: v_mul_f32_e32 v18, v18, v32 15398; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 15399; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 15400; GCN-NEXT: s_waitcnt vmcnt(0) 15401; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15402; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15403; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 15404; GCN-NEXT: v_mul_f32_e32 v17, v17, v32 15405; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 15406; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 15407; GCN-NEXT: s_waitcnt vmcnt(0) 15408; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15409; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15410; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 15411; GCN-NEXT: v_mul_f32_e32 v16, v16, v32 15412; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 15413; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 15414; GCN-NEXT: s_waitcnt vmcnt(0) 15415; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15416; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15417; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 15418; GCN-NEXT: v_mul_f32_e32 v15, v15, v32 15419; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 15420; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 15421; GCN-NEXT: s_waitcnt vmcnt(0) 15422; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15423; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15424; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 15425; GCN-NEXT: v_mul_f32_e32 v14, v14, v32 15426; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 15427; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 15428; GCN-NEXT: 
s_waitcnt vmcnt(0) 15429; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15430; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15431; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 15432; GCN-NEXT: v_mul_f32_e32 v13, v13, v32 15433; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 15434; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 15435; GCN-NEXT: s_waitcnt vmcnt(0) 15436; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15437; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15438; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 15439; GCN-NEXT: v_mul_f32_e32 v12, v12, v32 15440; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 15441; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 15442; GCN-NEXT: s_waitcnt vmcnt(0) 15443; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15444; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15445; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 15446; GCN-NEXT: v_mul_f32_e32 v11, v11, v32 15447; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 15448; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 15449; GCN-NEXT: s_waitcnt vmcnt(0) 15450; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15451; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15452; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 15453; GCN-NEXT: v_mul_f32_e32 v10, v10, v32 15454; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 15455; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 15456; GCN-NEXT: s_waitcnt vmcnt(0) 15457; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15458; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15459; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 15460; GCN-NEXT: v_mul_f32_e32 v9, v9, v32 15461; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 15462; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 15463; GCN-NEXT: s_waitcnt vmcnt(0) 15464; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15465; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15466; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 15467; GCN-NEXT: v_mul_f32_e32 v8, v8, v32 15468; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 15469; GCN-NEXT: v_and_b32_e32 
v7, 0xffff0000, v7 15470; GCN-NEXT: s_waitcnt vmcnt(0) 15471; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15472; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15473; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 15474; GCN-NEXT: v_mul_f32_e32 v7, v7, v32 15475; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 15476; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 15477; GCN-NEXT: s_waitcnt vmcnt(0) 15478; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15479; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15480; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 15481; GCN-NEXT: v_mul_f32_e32 v6, v6, v32 15482; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 15483; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 15484; GCN-NEXT: s_waitcnt vmcnt(0) 15485; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15486; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15487; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 15488; GCN-NEXT: v_mul_f32_e32 v5, v5, v32 15489; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 15490; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 15491; GCN-NEXT: s_waitcnt vmcnt(0) 15492; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15493; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15494; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 15495; GCN-NEXT: v_mul_f32_e32 v4, v4, v32 15496; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 15497; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 15498; GCN-NEXT: s_waitcnt vmcnt(0) 15499; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15500; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15501; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 15502; GCN-NEXT: v_mul_f32_e32 v3, v3, v32 15503; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 15504; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15505; GCN-NEXT: s_waitcnt vmcnt(0) 15506; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15507; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15508; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 15509; GCN-NEXT: v_mul_f32_e32 v2, v2, v32 15510; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 15511; 
GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 15512; GCN-NEXT: s_waitcnt vmcnt(0) 15513; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15514; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15515; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 15516; GCN-NEXT: v_mul_f32_e32 v1, v1, v32 15517; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 15518; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 15519; GCN-NEXT: s_waitcnt vmcnt(0) 15520; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 15521; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15522; GCN-NEXT: v_mul_f32_e32 v0, v0, v32 15523; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 15524; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 15525; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15526; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 15527; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 15528; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 15529; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 15530; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 15531; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 15532; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 15533; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 15534; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 15535; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 15536; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 15537; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 15538; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 15539; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 15540; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 15541; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 15542; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 15543; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 15544; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 15545; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 15546; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 15547; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 15548; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 15549; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 15550; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 15551; 
GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 15552; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 15553; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 15554; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 15555; GCN-NEXT: s_setpc_b64 s[30:31] 15556; 15557; GFX7-LABEL: v_fmul_v32bf16: 15558; GFX7: ; %bb.0: 15559; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15560; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 15561; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 15562; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 15563; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 15564; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 15565; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 15566; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 15567; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 15568; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 15569; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 15570; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 15571; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 15572; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 15573; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 15574; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 15575; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 15576; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 15577; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 15578; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 15579; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 15580; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 15581; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 15582; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 15583; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 15584; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 15585; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 15586; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 15587; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 15588; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 15589; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 15590; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 15591; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 15592; GFX7-NEXT: 
v_mul_f32_e32 v15, 1.0, v15 15593; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 15594; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 15595; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 15596; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 15597; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 15598; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 15599; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 15600; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 15601; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 15602; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 15603; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 15604; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 15605; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 15606; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 15607; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 15608; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 15609; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 15610; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 15611; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 15612; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 15613; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 15614; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 15615; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 15616; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 15617; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 15618; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 15619; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15620; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 15621; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 15622; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 15623; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 15624; GFX7-NEXT: s_waitcnt vmcnt(1) 15625; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 15626; GFX7-NEXT: s_waitcnt vmcnt(0) 15627; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15628; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15629; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 15630; GFX7-NEXT: v_mul_f32_e32 v31, v31, v32 15631; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 15632; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 15633; GFX7-NEXT: s_waitcnt 
vmcnt(0) 15634; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15635; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15636; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32 15637; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 15638; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 15639; GFX7-NEXT: s_waitcnt vmcnt(0) 15640; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15641; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15642; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32 15643; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 15644; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 15645; GFX7-NEXT: s_waitcnt vmcnt(0) 15646; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15647; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15648; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32 15649; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 15650; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 15651; GFX7-NEXT: s_waitcnt vmcnt(0) 15652; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15653; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15654; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32 15655; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 15656; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 15657; GFX7-NEXT: s_waitcnt vmcnt(0) 15658; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15659; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15660; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32 15661; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 15662; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 15663; GFX7-NEXT: s_waitcnt vmcnt(0) 15664; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15665; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15666; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32 15667; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 15668; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 15669; GFX7-NEXT: s_waitcnt vmcnt(0) 15670; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15671; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15672; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32 15673; GFX7-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:96 15674; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 15675; GFX7-NEXT: s_waitcnt vmcnt(0) 15676; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15677; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15678; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32 15679; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 15680; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 15681; GFX7-NEXT: s_waitcnt vmcnt(0) 15682; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15683; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15684; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32 15685; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 15686; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 15687; GFX7-NEXT: s_waitcnt vmcnt(0) 15688; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15689; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15690; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32 15691; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 15692; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 15693; GFX7-NEXT: s_waitcnt vmcnt(0) 15694; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15695; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15696; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32 15697; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 15698; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 15699; GFX7-NEXT: s_waitcnt vmcnt(0) 15700; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15701; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15702; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32 15703; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 15704; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 15705; GFX7-NEXT: s_waitcnt vmcnt(0) 15706; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15707; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15708; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32 15709; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 15710; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 15711; GFX7-NEXT: s_waitcnt vmcnt(0) 15712; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, 
v32 15713; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15714; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32 15715; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 15716; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 15717; GFX7-NEXT: s_waitcnt vmcnt(0) 15718; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15719; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15720; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32 15721; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 15722; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 15723; GFX7-NEXT: s_waitcnt vmcnt(0) 15724; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15725; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15726; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32 15727; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 15728; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 15729; GFX7-NEXT: s_waitcnt vmcnt(0) 15730; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15731; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15732; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32 15733; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 15734; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 15735; GFX7-NEXT: s_waitcnt vmcnt(0) 15736; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15737; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15738; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32 15739; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 15740; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 15741; GFX7-NEXT: s_waitcnt vmcnt(0) 15742; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15743; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15744; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32 15745; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 15746; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 15747; GFX7-NEXT: s_waitcnt vmcnt(0) 15748; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15749; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15750; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32 15751; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 15752; 
GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 15753; GFX7-NEXT: s_waitcnt vmcnt(0) 15754; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15755; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15756; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32 15757; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 15758; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 15759; GFX7-NEXT: s_waitcnt vmcnt(0) 15760; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15761; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15762; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32 15763; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 15764; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 15765; GFX7-NEXT: s_waitcnt vmcnt(0) 15766; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15767; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15768; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32 15769; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 15770; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 15771; GFX7-NEXT: s_waitcnt vmcnt(0) 15772; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15773; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15774; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32 15775; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 15776; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 15777; GFX7-NEXT: s_waitcnt vmcnt(0) 15778; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15779; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15780; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32 15781; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 15782; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 15783; GFX7-NEXT: s_waitcnt vmcnt(0) 15784; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15785; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15786; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32 15787; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 15788; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 15789; GFX7-NEXT: s_waitcnt vmcnt(0) 15790; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15791; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15792; GFX7-NEXT: 
v_mul_f32_e32 v4, v4, v32 15793; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 15794; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 15795; GFX7-NEXT: s_waitcnt vmcnt(0) 15796; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15797; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15798; GFX7-NEXT: v_mul_f32_e32 v3, v3, v32 15799; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 15800; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 15801; GFX7-NEXT: s_waitcnt vmcnt(0) 15802; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15803; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15804; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32 15805; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 15806; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15807; GFX7-NEXT: s_waitcnt vmcnt(0) 15808; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15809; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15810; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32 15811; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 15812; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 15813; GFX7-NEXT: s_waitcnt vmcnt(0) 15814; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 15815; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 15816; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32 15817; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 15818; GFX7-NEXT: s_setpc_b64 s[30:31] 15819; 15820; GFX8-LABEL: v_fmul_v32bf16: 15821; GFX8: ; %bb.0: 15822; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15823; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 15824; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 15825; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31 15826; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 15827; GFX8-NEXT: s_movk_i32 s4, 0x7fff 15828; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 15829; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 15830; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 15831; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 15832; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30 15833; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 15834; GFX8-NEXT: v_cmp_u_f32_e32 
vcc, v31, v31 15835; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 15836; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc 15837; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 15838; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 15839; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 15840; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 15841; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc 15842; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 15843; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 15844; GFX8-NEXT: v_mul_f32_e32 v32, v32, v30 15845; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 15846; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 15847; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 15848; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 15849; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 15850; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29 15851; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 15852; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 15853; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 15854; GFX8-NEXT: s_waitcnt vmcnt(0) 15855; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 15856; GFX8-NEXT: v_mul_f32_e32 v33, v33, v34 15857; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 15858; GFX8-NEXT: v_mul_f32_e32 v30, v15, v30 15859; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 15860; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 15861; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 15862; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 15863; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 15864; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 15865; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc 15866; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 15867; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 15868; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 15869; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 15870; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc 15871; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 15872; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 15873; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 15874; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 15875; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 15876; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc 15877; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 15878; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 15879; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 15880; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 15881; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc 15882; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 15883; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 15884; GFX8-NEXT: v_mul_f32_e32 v29, v33, v29 15885; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 15886; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 15887; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 15888; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 15889; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 15890; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28 15891; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 15892; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 15893; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 15894; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc 15895; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 15896; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 15897; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 15898; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 15899; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc 15900; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 15901; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 15902; GFX8-NEXT: v_mul_f32_e32 v28, v33, v28 15903; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 15904; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 15905; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 15906; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 15907; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 15908; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27 15909; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 15910; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 15911; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 15912; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc 15913; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 15914; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 15915; GFX8-NEXT: 
v_or_b32_e32 v33, 0x400000, v11 15916; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 15917; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc 15918; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 15919; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 15920; GFX8-NEXT: v_mul_f32_e32 v27, v33, v27 15921; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 15922; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 15923; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 15924; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 15925; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 15926; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26 15927; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 15928; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 15929; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 15930; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc 15931; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 15932; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 15933; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 15934; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 15935; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc 15936; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 15937; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 15938; GFX8-NEXT: v_mul_f32_e32 v26, v33, v26 15939; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 15940; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 15941; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 15942; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 15943; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 15944; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25 15945; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 15946; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 15947; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 15948; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc 15949; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 15950; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 15951; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 15952; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 15953; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc 15954; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 15955; GFX8-NEXT: v_lshlrev_b32_e32 v33, 
16, v8 15956; GFX8-NEXT: v_mul_f32_e32 v25, v33, v25 15957; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 15958; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 15959; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 15960; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 15961; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 15962; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24 15963; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 15964; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 15965; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 15966; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc 15967; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 15968; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 15969; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 15970; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 15971; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc 15972; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 15973; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 15974; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24 15975; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 15976; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 15977; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 15978; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 15979; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 15980; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23 15981; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 15982; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 15983; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 15984; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc 15985; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 15986; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 15987; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 15988; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 15989; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc 15990; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 15991; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 15992; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23 15993; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 15994; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 15995; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 15996; GFX8-NEXT: v_and_b32_e32 v6, 
0xffff0000, v6 15997; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 15998; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22 15999; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 16000; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 16001; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 16002; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc 16003; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 16004; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 16005; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 16006; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16007; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc 16008; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 16009; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 16010; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22 16011; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 16012; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 16013; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 16014; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 16015; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 16016; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21 16017; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 16018; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 16019; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 16020; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc 16021; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 16022; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 16023; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 16024; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16025; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc 16026; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 16027; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 16028; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21 16029; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 16030; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 16031; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 16032; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 16033; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 16034; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20 16035; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 16036; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 16037; GFX8-NEXT: v_bfe_u32 
v20, v4, 16, 1 16038; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc 16039; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 16040; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 16041; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 16042; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 16043; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc 16044; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 16045; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 16046; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20 16047; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 16048; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 16049; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 16050; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 16051; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 16052; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19 16053; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 16054; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 16055; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 16056; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc 16057; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 16058; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 16059; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 16060; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 16061; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc 16062; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 16063; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 16064; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19 16065; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 16066; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 16067; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 16068; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16069; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 16070; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18 16071; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 16072; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 16073; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 16074; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc 16075; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 16076; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 16077; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 16078; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 16079; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc 16080; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 16081; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 16082; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18 16083; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 16084; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 16085; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 16086; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 16087; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 16088; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17 16089; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 16090; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 16091; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 16092; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc 16093; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 16094; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 16095; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 16096; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 16097; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc 16098; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 16099; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 16100; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17 16101; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 16102; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 16103; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 16104; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 16105; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 16106; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 16107; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 16108; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 16109; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 16110; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc 16111; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 16112; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 16113; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 16114; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 16115; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc 16116; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 16117; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 16118; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 16119; 
GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 16120; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 16121; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 16122; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 16123; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 16124; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 16125; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 16126; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 16127; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 16128; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 16129; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 16130; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 16131; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 16132; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 16133; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 16134; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 16135; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 16136; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 16137; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 16138; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 16139; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 16140; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 16141; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 16142; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 16143; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 16144; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 16145; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 16146; GFX8-NEXT: s_setpc_b64 s[30:31] 16147; 16148; GFX9-LABEL: v_fmul_v32bf16: 16149; GFX9: ; %bb.0: 16150; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16151; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 16152; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 16153; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 16154; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 16155; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 16156; GFX9-NEXT: s_movk_i32 s4, 0x7fff 16157; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 16158; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 16159; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 16160; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 16161; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 
16162; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 16163; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc 16164; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 16165; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 16166; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 16167; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc 16168; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 16169; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 16170; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 16171; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 16172; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 16173; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 16174; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 16175; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 16176; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 16177; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 16178; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 16179; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc 16180; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 16181; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 16182; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 16183; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc 16184; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 16185; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 16186; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29 16187; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 16188; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 16189; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 16190; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 16191; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 16192; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 16193; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 16194; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 16195; GFX9-NEXT: s_waitcnt vmcnt(0) 16196; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 16197; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34 16198; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 16199; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29 16200; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 16201; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 16202; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, 
v33 16203; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 16204; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 16205; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc 16206; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 16207; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 16208; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 16209; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc 16210; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 16211; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 16212; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 16213; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 16214; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc 16215; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 16216; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 16217; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc 16218; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 16219; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 16220; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28 16221; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 16222; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 16223; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 16224; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 16225; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 16226; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 16227; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 16228; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 16229; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc 16230; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 16231; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 16232; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 16233; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc 16234; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 16235; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 16236; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27 16237; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 16238; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 16239; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 16240; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 16241; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 16242; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 16243; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v27, v27 16244; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 16245; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc 16246; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 16247; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 16248; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 16249; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc 16250; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 16251; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 16252; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26 16253; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 16254; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 16255; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 16256; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 16257; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 16258; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 16259; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 16260; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 16261; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc 16262; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 16263; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 16264; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 16265; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc 16266; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 16267; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 16268; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25 16269; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 16270; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 16271; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 16272; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 16273; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 16274; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 16275; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 16276; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 16277; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc 16278; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 16279; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 16280; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 16281; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc 16282; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 16283; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 16284; GFX9-NEXT: 
v_mul_f32_e32 v24, v33, v24 16285; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 16286; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 16287; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 16288; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 16289; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 16290; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 16291; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 16292; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 16293; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc 16294; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 16295; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 16296; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 16297; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc 16298; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 16299; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 16300; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23 16301; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 16302; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 16303; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 16304; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 16305; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 16306; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 16307; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 16308; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 16309; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc 16310; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 16311; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 16312; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16313; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc 16314; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 16315; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 16316; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22 16317; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 16318; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 16319; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 16320; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 16321; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 16322; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 16323; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 16324; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 16325; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, 
v34, vcc 16326; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 16327; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 16328; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16329; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc 16330; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 16331; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 16332; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21 16333; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 16334; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 16335; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 16336; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 16337; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 16338; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 16339; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 16340; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 16341; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc 16342; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 16343; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 16344; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 16345; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc 16346; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 16347; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 16348; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20 16349; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 16350; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 16351; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 16352; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 16353; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 16354; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 16355; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 16356; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 16357; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc 16358; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 16359; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 16360; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 16361; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc 16362; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 16363; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 16364; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19 16365; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 16366; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 
16367; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 16368; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 16369; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 16370; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 16371; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 16372; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 16373; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc 16374; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 16375; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 16376; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 16377; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc 16378; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 16379; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 16380; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18 16381; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 16382; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 16383; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 16384; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 16385; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 16386; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 16387; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 16388; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 16389; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc 16390; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 16391; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 16392; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 16393; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc 16394; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 16395; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 16396; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17 16397; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 16398; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 16399; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 16400; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 16401; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 16402; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 16403; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 16404; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 16405; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc 16406; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 16407; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 16408; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 16409; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc 16410; GFX9-NEXT: s_mov_b32 s4, 0x7060302 16411; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 16412; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 16413; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 16414; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 16415; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 16416; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 16417; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 16418; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 16419; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 16420; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 16421; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 16422; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 16423; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 16424; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 16425; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 16426; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 16427; GFX9-NEXT: s_setpc_b64 s[30:31] 16428; 16429; GFX10-LABEL: v_fmul_v32bf16: 16430; GFX10: ; %bb.0: 16431; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16432; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 16433; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 16434; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 16435; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 16436; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 16437; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 16438; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 16439; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 16440; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 16441; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 16442; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 16443; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 16444; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 16445; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 16446; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 16447; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 16448; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 16449; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 16450; GFX10-NEXT: v_and_b32_e32 v13, 
0xffff0000, v13 16451; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28 16452; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 16453; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 16454; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 16455; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 16456; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 16457; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 16458; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 16459; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49 16460; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 16461; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 16462; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 16463; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 16464; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 16465; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 16466; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 16467; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 16468; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 16469; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 16470; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37 16471; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 16472; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 16473; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 16474; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 16475; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 16476; GFX10-NEXT: v_mul_f32_e32 v27, v50, v27 16477; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 16478; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 16479; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 16480; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 16481; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 16482; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 16483; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 16484; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 16485; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 16486; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 16487; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 16488; GFX10-NEXT: v_mul_f32_e32 v29, v38, v29 16489; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 16490; GFX10-NEXT: v_and_b32_e32 v18, 
0xffff0000, v18 16491; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16492; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 16493; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 16494; GFX10-NEXT: v_mul_f32_e32 v28, v48, v28 16495; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 16496; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 16497; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 16498; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 16499; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 16500; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 16501; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 16502; GFX10-NEXT: v_mul_f32_e32 v34, v34, v51 16503; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 16504; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 16505; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 16506; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 16507; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 16508; GFX10-NEXT: v_mul_f32_e32 v30, v36, v30 16509; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 16510; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 16511; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 16512; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18 16513; GFX10-NEXT: v_mul_f32_e32 v18, v48, v23 16514; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 16515; GFX10-NEXT: v_mul_f32_e32 v17, v50, v22 16516; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 16517; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 16518; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff 16519; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 16520; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 16521; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 16522; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 16523; GFX10-NEXT: v_mul_f32_e32 v20, v36, v25 16524; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19 16525; GFX10-NEXT: v_mul_f32_e32 v19, v38, v24 16526; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 16527; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 16528; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff 16529; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo 16530; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 16531; 
GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 16532; GFX10-NEXT: v_mul_f32_e32 v21, v51, v26 16533; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 16534; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 16535; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff 16536; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo 16537; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 16538; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 16539; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 16540; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff 16541; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 16542; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo 16543; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 16544; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 16545; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff 16546; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 16547; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 16548; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo 16549; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 16550; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff 16551; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 16552; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 16553; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff 16554; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo 16555; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 16556; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 16557; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 16558; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff 16559; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 16560; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo 16561; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 16562; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 16563; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff 16564; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 16565; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 16566; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo 16567; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 16568; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff 16569; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 16570; 
GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 16571; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff 16572; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo 16573; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 16574; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 16575; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 16576; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff 16577; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 16578; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo 16579; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 16580; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 16581; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff 16582; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 16583; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 16584; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo 16585; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 16586; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff 16587; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 16588; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 16589; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff 16590; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo 16591; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 16592; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 16593; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 16594; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff 16595; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 16596; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo 16597; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 16598; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 16599; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff 16600; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 16601; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 16602; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo 16603; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 16604; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 16605; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff 16606; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 16607; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 16608; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo 16609; 
GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 16610; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 16611; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff 16612; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 16613; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo 16614; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 16615; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 16616; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff 16617; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 16618; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo 16619; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 16620; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 16621; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff 16622; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 16623; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo 16624; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 16625; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 16626; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff 16627; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 16628; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo 16629; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 16630; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff 16631; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 16632; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 16633; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo 16634; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16635; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 16636; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 16637; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff 16638; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo 16639; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 16640; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff 16641; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 16642; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 16643; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo 16644; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 16645; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 16646; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 16647; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff 16648; GFX10-NEXT: 
v_cndmask_b32_e32 v28, v38, v28, vcc_lo 16649; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 16650; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 16651; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff 16652; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 16653; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo 16654; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 16655; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 16656; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff 16657; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 16658; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo 16659; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 16660; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 16661; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff 16662; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 16663; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo 16664; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 16665; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff 16666; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo 16667; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 16668; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 16669; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo 16670; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 16671; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 16672; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 16673; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo 16674; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 16675; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 16676; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo 16677; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 16678; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 16679; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo 16680; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 16681; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 16682; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 16683; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 16684; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 16685; GFX10-NEXT: s_waitcnt vmcnt(0) 16686; GFX10-NEXT: 
v_lshlrev_b32_e32 v17, 16, v32 16687; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 16688; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17 16689; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18 16690; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 16691; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 16692; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 16693; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 16694; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 16695; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff 16696; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff 16697; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 16698; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 16699; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo 16700; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 16701; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 16702; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo 16703; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 16704; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 16705; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 16706; GFX10-NEXT: s_setpc_b64 s[30:31] 16707; 16708; GFX11-LABEL: v_fmul_v32bf16: 16709; GFX11: ; %bb.0: 16710; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16711; GFX11-NEXT: scratch_load_b32 v32, off, s32 16712; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 16713; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 16714; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 16715; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 16716; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 16717; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 16718; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 16719; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 16720; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26 16721; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 16722; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 16723; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 16724; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 
0xffff0000, v24 16725; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 16726; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 16727; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 16728; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 16729; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 16730; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5 16731; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1 16732; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff 16733; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 16734; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 16735; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff 16736; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 16737; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 16738; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 16739; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 16740; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 16741; GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 16742; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 16743; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 16744; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 16745; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 16746; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 16747; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19 16748; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 16749; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 16750; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 16751; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff 16752; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 16753; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 16754; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 16755; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 16756; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 16757; GFX11-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 16758; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 16759; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3 16760; 
GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff 16761; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 16762; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 16763; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17 16764; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 16765; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 16766; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 16767; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff 16768; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 16769; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 16770; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 16771; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10 16772; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18 16773; GFX11-NEXT: v_mul_f32_e32 v0, v0, v16 16774; GFX11-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 16775; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23 16776; GFX11-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83 16777; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 16778; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 16779; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 16780; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 16781; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24 16782; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23 16783; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 16784; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff 16785; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 16786; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 16787; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff 16788; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 16789; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 16790; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7 16791; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff 16792; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 16793; GFX11-NEXT: v_mul_f32_e32 v4, v4, v20 16794; GFX11-NEXT: v_mul_f32_e32 v20, v80, v71 16795; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 16796; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9 16797; GFX11-NEXT: 
v_lshlrev_b32_e32 v35, 16, v29 16798; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 16799; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 16800; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff 16801; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 16802; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 16803; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 16804; GFX11-NEXT: v_mul_f32_e32 v26, v52, v51 16805; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 16806; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22 16807; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 16808; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 16809; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 16810; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 16811; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 16812; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 16813; GFX11-NEXT: v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 16814; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 16815; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 16816; GFX11-NEXT: v_mul_f32_e32 v29, v38, v37 16817; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15 16818; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 16819; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) 16820; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30 16821; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39 16822; GFX11-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33 16823; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 16824; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 16825; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14 16826; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 16827; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 16828; GFX11-NEXT: v_bfe_u32 v16, v33, 
16, 1 16829; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 16830; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 16831; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff 16832; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 16833; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff 16834; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff 16835; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13 16836; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 16837; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff 16838; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo 16839; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 16840; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29 16841; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 16842; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff 16843; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12 16844; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo 16845; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 16846; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 16847; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff 16848; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28 16849; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 16850; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo 16851; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 16852; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff 16853; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11 16854; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 16855; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff 16856; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo 16857; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 16858; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27 16859; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 16860; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff 16861; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10 16862; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo 16863; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 16864; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 16865; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff 16866; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26 16867; GFX11-NEXT: v_bfe_u32 v81, v25, 
16, 1 16868; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo 16869; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 16870; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff 16871; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25 16872; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 16873; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff 16874; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo 16875; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 16876; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8 16877; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff 16878; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 16879; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6 16880; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo 16881; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 16882; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 16883; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff 16884; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22 16885; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 16886; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo 16887; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 16888; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff 16889; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21 16890; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 16891; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff 16892; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo 16893; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 16894; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4 16895; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 16896; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff 16897; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20 16898; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo 16899; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 16900; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff 16901; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 16902; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18 16903; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 16904; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo 16905; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 16906; GFX11-NEXT: 
v_add3_u32 v133, v133, v18, 0x7fff 16907; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0 16908; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff 16909; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 16910; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo 16911; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 16912; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2 16913; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 16914; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff 16915; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 16916; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo 16917; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 16918; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 16919; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 16920; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 16921; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 16922; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo 16923; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 16924; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 16925; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo 16926; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 16927; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 16928; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 16929; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo 16930; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 16931; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo 16932; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 16933; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 16934; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo 16935; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16936; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo 16937; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 16938; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 16939; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 16940; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo 16941; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, 
v4 16942; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo 16943; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 16944; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 16945; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo 16946; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 16947; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo 16948; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 16949; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo 16950; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 16951; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo 16952; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 16953; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 16954; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 16955; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo 16956; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 16957; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo 16958; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 16959; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 16960; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo 16961; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 16962; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 16963; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 16964; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo 16965; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 16966; GFX11-NEXT: s_waitcnt vmcnt(0) 16967; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32 16968; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16969; GFX11-NEXT: v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 16970; GFX11-NEXT: v_mul_f32_e32 v15, v15, v18 16971; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16972; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 16973; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 16974; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 16975; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 
16976; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 16977; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff 16978; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff 16979; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 16980; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo 16981; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 16982; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo 16983; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 16984; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 16985; GFX11-NEXT: s_setpc_b64 s[30:31] 16986 %op = fmul <32 x bfloat> %a, %b 16987 ret <32 x bfloat> %op 16988} 16989 16990define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { 16991; GCN-LABEL: v_fdiv_bf16: 16992; GCN: ; %bb.0: 16993; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16994; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 16995; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 16996; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 16997; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 16998; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 16999; GCN-NEXT: v_rcp_f32_e32 v3, v2 17000; GCN-NEXT: v_fma_f32 v4, -v2, v3, 1.0 17001; GCN-NEXT: v_fma_f32 v3, v4, v3, v3 17002; GCN-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 17003; GCN-NEXT: v_mul_f32_e32 v5, v4, v3 17004; GCN-NEXT: v_fma_f32 v6, -v2, v5, v4 17005; GCN-NEXT: v_fma_f32 v5, v6, v3, v5 17006; GCN-NEXT: v_fma_f32 v2, -v2, v5, v4 17007; GCN-NEXT: v_div_fmas_f32 v2, v2, v3, v5 17008; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0 17009; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17010; GCN-NEXT: s_setpc_b64 s[30:31] 17011; 17012; GFX7-LABEL: v_fdiv_bf16: 17013; GFX7: ; %bb.0: 17014; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17015; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 17016; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 17017; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17018; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17019; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 17020; GFX7-NEXT: v_rcp_f32_e32 v3, v2 17021; 
GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 17022; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 17023; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 17024; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 17025; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 17026; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 17027; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 17028; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 17029; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 17030; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17031; GFX7-NEXT: s_setpc_b64 s[30:31] 17032; 17033; GFX8-LABEL: v_fdiv_bf16: 17034; GFX8: ; %bb.0: 17035; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17036; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17037; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17038; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 17039; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 17040; GFX8-NEXT: v_rcp_f32_e32 v4, v2 17041; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 17042; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 17043; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 17044; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 17045; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 17046; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 17047; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 17048; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 17049; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 17050; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 17051; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 17052; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 17053; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 17054; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 17055; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17056; GFX8-NEXT: s_setpc_b64 s[30:31] 17057; 17058; GFX9-LABEL: v_fdiv_bf16: 17059; GFX9: ; %bb.0: 17060; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17061; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17062; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17063; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 17064; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 17065; GFX9-NEXT: s_movk_i32 s4, 0x7fff 17066; GFX9-NEXT: v_rcp_f32_e32 v4, v2 
17067; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0 17068; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4 17069; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4 17070; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3 17071; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5 17072; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3 17073; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5 17074; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 17075; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 17076; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 17077; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 17078; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 17079; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 17080; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17081; GFX9-NEXT: s_setpc_b64 s[30:31] 17082; 17083; GFX10-LABEL: v_fdiv_bf16: 17084; GFX10: ; %bb.0: 17085; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17086; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17087; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17088; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 17089; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 17090; GFX10-NEXT: v_rcp_f32_e32 v3, v2 17091; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 17092; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 17093; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3 17094; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5 17095; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3 17096; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5 17097; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4 17098; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 17099; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 17100; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 17101; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 17102; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 17103; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 17104; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17105; GFX10-NEXT: s_setpc_b64 s[30:31] 17106; 17107; GFX11-LABEL: v_fdiv_bf16: 17108; GFX11: ; %bb.0: 17109; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17110; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17111; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17112; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 17113; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 17114; GFX11-NEXT: v_rcp_f32_e32 v3, v2 17115; GFX11-NEXT: s_waitcnt_depctr 0xfff 17116; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 17117; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 17118; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 17119; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 17120; GFX11-NEXT: v_mul_f32_e32 v4, v5, v3 17121; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 17122; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v5 17123; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v3 17124; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 17125; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v5 17126; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v4 17127; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 17128; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 17129; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 17130; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 17131; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 17132; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 17133; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 17134; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 17135; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 17136; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17137; GFX11-NEXT: s_setpc_b64 s[30:31] 17138 %op = fdiv bfloat %a, %b 17139 ret bfloat %op 17140} 17141 17142declare bfloat @llvm.fabs.bf16(bfloat) 17143 17144define bfloat @v_fabs_bf16(bfloat %a) { 17145; GCN-LABEL: v_fabs_bf16: 17146; GCN: ; %bb.0: 17147; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17148; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 17149; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17150; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 17151; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17152; GCN-NEXT: s_setpc_b64 s[30:31] 17153; 17154; GFX7-LABEL: 
v_fabs_bf16: 17155; GFX7: ; %bb.0: 17156; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17157; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 17158; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17159; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 17160; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17161; GFX7-NEXT: s_setpc_b64 s[30:31] 17162; 17163; GFX8-LABEL: v_fabs_bf16: 17164; GFX8: ; %bb.0: 17165; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17166; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0 17167; GFX8-NEXT: s_setpc_b64 s[30:31] 17168; 17169; GFX9-LABEL: v_fabs_bf16: 17170; GFX9: ; %bb.0: 17171; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17172; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 17173; GFX9-NEXT: s_setpc_b64 s[30:31] 17174; 17175; GFX10-LABEL: v_fabs_bf16: 17176; GFX10: ; %bb.0: 17177; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17178; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0 17179; GFX10-NEXT: s_setpc_b64 s[30:31] 17180; 17181; GFX11TRUE16-LABEL: v_fabs_bf16: 17182; GFX11TRUE16: ; %bb.0: 17183; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17184; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l 17185; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 17186; 17187; GFX11FAKE16-LABEL: v_fabs_bf16: 17188; GFX11FAKE16: ; %bb.0: 17189; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17190; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 17191; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 17192 %op = call bfloat @llvm.fabs.bf16(bfloat %a) 17193 ret bfloat %op 17194} 17195 17196define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { 17197; GCN-LABEL: s_fabs_bf16: 17198; GCN: ; %bb.0: 17199; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 17200; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 17201; GCN-NEXT: v_readfirstlane_b32 s0, v0 17202; GCN-NEXT: ; return to shader part epilog 17203; 17204; GFX7-LABEL: s_fabs_bf16: 17205; GFX7: ; %bb.0: 17206; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 17207; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 17208; GFX7-NEXT: v_readfirstlane_b32 s0, v0 
17209; GFX7-NEXT: ; return to shader part epilog 17210; 17211; GFX8-LABEL: s_fabs_bf16: 17212; GFX8: ; %bb.0: 17213; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff 17214; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 17215; GFX8-NEXT: ; return to shader part epilog 17216; 17217; GFX9-LABEL: s_fabs_bf16: 17218; GFX9: ; %bb.0: 17219; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff 17220; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 17221; GFX9-NEXT: ; return to shader part epilog 17222; 17223; GFX10-LABEL: s_fabs_bf16: 17224; GFX10: ; %bb.0: 17225; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff 17226; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 17227; GFX10-NEXT: ; return to shader part epilog 17228; 17229; GFX11-LABEL: s_fabs_bf16: 17230; GFX11: ; %bb.0: 17231; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff 17232; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17233; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 17234; GFX11-NEXT: ; return to shader part epilog 17235 %op = call bfloat @llvm.fabs.bf16(bfloat %a) 17236 %cast = bitcast bfloat %op to i16 17237 %zext = zext i16 %cast to i32 17238 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) 17239 ret i32 %readlane 17240} 17241 17242define bfloat @v_fneg_bf16(bfloat %a) { 17243; GCN-LABEL: v_fneg_bf16: 17244; GCN: ; %bb.0: 17245; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17246; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 17247; GCN-NEXT: s_setpc_b64 s[30:31] 17248; 17249; GFX7-LABEL: v_fneg_bf16: 17250; GFX7: ; %bb.0: 17251; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17252; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 17253; GFX7-NEXT: s_setpc_b64 s[30:31] 17254; 17255; GFX8-LABEL: v_fneg_bf16: 17256; GFX8: ; %bb.0: 17257; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17258; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000, v0 17259; GFX8-NEXT: s_setpc_b64 s[30:31] 17260; 17261; GFX9-LABEL: v_fneg_bf16: 17262; GFX9: ; %bb.0: 17263; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17264; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0 17265; GFX9-NEXT: s_setpc_b64 s[30:31] 17266; 17267; 
GFX10-LABEL: v_fneg_bf16: 17268; GFX10: ; %bb.0: 17269; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17270; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 17271; GFX10-NEXT: s_setpc_b64 s[30:31] 17272; 17273; GFX11TRUE16-LABEL: v_fneg_bf16: 17274; GFX11TRUE16: ; %bb.0: 17275; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17276; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l 17277; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 17278; 17279; GFX11FAKE16-LABEL: v_fneg_bf16: 17280; GFX11FAKE16: ; %bb.0: 17281; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17282; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 17283; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 17284 %op = fneg bfloat %a 17285 ret bfloat %op 17286} 17287 17288declare i32 @llvm.amdgcn.readfirstlane(i32) 17289 17290; FIXME: readfirstlane hack for other bugs 17291define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { 17292; GCN-LABEL: s_fneg_bf16: 17293; GCN: ; %bb.0: 17294; GCN-NEXT: v_mul_f32_e64 v0, -1.0, s0 17295; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17296; GCN-NEXT: v_readfirstlane_b32 s0, v0 17297; GCN-NEXT: ; return to shader part epilog 17298; 17299; GFX7-LABEL: s_fneg_bf16: 17300; GFX7: ; %bb.0: 17301; GFX7-NEXT: v_mul_f32_e64 v0, -1.0, s0 17302; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17303; GFX7-NEXT: v_readfirstlane_b32 s0, v0 17304; GFX7-NEXT: ; return to shader part epilog 17305; 17306; GFX8-LABEL: s_fneg_bf16: 17307; GFX8: ; %bb.0: 17308; GFX8-NEXT: s_xor_b32 s0, s0, 0x8000 17309; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 17310; GFX8-NEXT: ; return to shader part epilog 17311; 17312; GFX9-LABEL: s_fneg_bf16: 17313; GFX9: ; %bb.0: 17314; GFX9-NEXT: s_xor_b32 s0, s0, 0x8000 17315; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 17316; GFX9-NEXT: ; return to shader part epilog 17317; 17318; GFX10-LABEL: s_fneg_bf16: 17319; GFX10: ; %bb.0: 17320; GFX10-NEXT: s_xor_b32 s0, s0, 0x8000 17321; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 17322; GFX10-NEXT: ; return to shader part epilog 17323; 17324; GFX11-LABEL: 
s_fneg_bf16: 17325; GFX11: ; %bb.0: 17326; GFX11-NEXT: s_xor_b32 s0, s0, 0x8000 17327; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17328; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 17329; GFX11-NEXT: ; return to shader part epilog 17330 %op = fneg bfloat %a 17331 %cast = bitcast bfloat %op to i16 17332 %zext = zext i16 %cast to i32 17333 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) 17334 ret i32 %readlane 17335} 17336 17337define bfloat @v_fneg_fabs_bf16(bfloat %a) { 17338; GCN-LABEL: v_fneg_fabs_bf16: 17339; GCN: ; %bb.0: 17340; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17341; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 17342; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17343; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 17344; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17345; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 17346; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17347; GCN-NEXT: s_setpc_b64 s[30:31] 17348; 17349; GFX7-LABEL: v_fneg_fabs_bf16: 17350; GFX7: ; %bb.0: 17351; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17352; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 17353; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17354; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 17355; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17356; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 17357; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17358; GFX7-NEXT: s_setpc_b64 s[30:31] 17359; 17360; GFX8-LABEL: v_fneg_fabs_bf16: 17361; GFX8: ; %bb.0: 17362; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17363; GFX8-NEXT: v_or_b32_e32 v0, 0x8000, v0 17364; GFX8-NEXT: s_setpc_b64 s[30:31] 17365; 17366; GFX9-LABEL: v_fneg_fabs_bf16: 17367; GFX9: ; %bb.0: 17368; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17369; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0 17370; GFX9-NEXT: s_setpc_b64 s[30:31] 17371; 17372; GFX10-LABEL: v_fneg_fabs_bf16: 17373; GFX10: ; %bb.0: 17374; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17375; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0 17376; GFX10-NEXT: 
s_setpc_b64 s[30:31] 17377; 17378; GFX11TRUE16-LABEL: v_fneg_fabs_bf16: 17379; GFX11TRUE16: ; %bb.0: 17380; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17381; GFX11TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l 17382; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 17383; 17384; GFX11FAKE16-LABEL: v_fneg_fabs_bf16: 17385; GFX11FAKE16: ; %bb.0: 17386; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17387; GFX11FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 17388; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 17389 %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) 17390 %op = fneg bfloat %fabs 17391 ret bfloat %op 17392} 17393 17394; FIXME: readfirstlane hack for other bugs 17395define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { 17396; GCN-LABEL: s_fneg_fabs_bf16: 17397; GCN: ; %bb.0: 17398; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 17399; GCN-NEXT: v_readfirstlane_b32 s0, v0 17400; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000 17401; GCN-NEXT: s_bitset0_b32 s0, 31 17402; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000 17403; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000 17404; GCN-NEXT: s_lshr_b32 s0, s0, 16 17405; GCN-NEXT: ; return to shader part epilog 17406; 17407; GFX7-LABEL: s_fneg_fabs_bf16: 17408; GFX7: ; %bb.0: 17409; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 17410; GFX7-NEXT: v_readfirstlane_b32 s0, v0 17411; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000 17412; GFX7-NEXT: s_bitset0_b32 s0, 31 17413; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000 17414; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000 17415; GFX7-NEXT: s_lshr_b32 s0, s0, 16 17416; GFX7-NEXT: ; return to shader part epilog 17417; 17418; GFX8-LABEL: s_fneg_fabs_bf16: 17419; GFX8: ; %bb.0: 17420; GFX8-NEXT: s_bitset1_b32 s0, 15 17421; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 17422; GFX8-NEXT: ; return to shader part epilog 17423; 17424; GFX9-LABEL: s_fneg_fabs_bf16: 17425; GFX9: ; %bb.0: 17426; GFX9-NEXT: s_bitset1_b32 s0, 15 17427; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 17428; GFX9-NEXT: ; return to shader part epilog 17429; 17430; GFX10-LABEL: 
s_fneg_fabs_bf16: 17431; GFX10: ; %bb.0: 17432; GFX10-NEXT: s_bitset1_b32 s0, 15 17433; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 17434; GFX10-NEXT: ; return to shader part epilog 17435; 17436; GFX11-LABEL: s_fneg_fabs_bf16: 17437; GFX11: ; %bb.0: 17438; GFX11-NEXT: s_bitset1_b32 s0, 15 17439; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 17440; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 17441; GFX11-NEXT: ; return to shader part epilog 17442 %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) 17443 %op = fneg bfloat %fabs 17444 %cast = bitcast bfloat %op to i16 17445 %zext = zext i16 %cast to i32 17446 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) 17447 ret i32 %readlane 17448} 17449 17450declare bfloat @llvm.minnum.bf16(bfloat, bfloat) 17451declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>) 17452declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>) 17453declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>) 17454declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>) 17455declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>) 17456declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>) 17457 17458define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { 17459; GCN-LABEL: v_minnum_bf16: 17460; GCN: ; %bb.0: 17461; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17462; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 17463; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 17464; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17465; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17466; GCN-NEXT: v_min_f32_e32 v0, v0, v1 17467; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17468; GCN-NEXT: s_setpc_b64 s[30:31] 17469; 17470; GFX7-LABEL: v_minnum_bf16: 17471; GFX7: ; %bb.0: 17472; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17473; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 17474; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 17475; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17476; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 
17477; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 17478; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17479; GFX7-NEXT: s_setpc_b64 s[30:31] 17480; 17481; GFX8-LABEL: v_minnum_bf16: 17482; GFX8: ; %bb.0: 17483; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17484; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17485; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17486; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 17487; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 17488; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 17489; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 17490; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 17491; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 17492; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 17493; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17494; GFX8-NEXT: s_setpc_b64 s[30:31] 17495; 17496; GFX9-LABEL: v_minnum_bf16: 17497; GFX9: ; %bb.0: 17498; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17499; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17500; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17501; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 17502; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 17503; GFX9-NEXT: s_movk_i32 s4, 0x7fff 17504; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 17505; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 17506; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 17507; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 17508; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17509; GFX9-NEXT: s_setpc_b64 s[30:31] 17510; 17511; GFX10-LABEL: v_minnum_bf16: 17512; GFX10: ; %bb.0: 17513; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17514; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17515; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17516; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 17517; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 17518; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 17519; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 17520; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 17521; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 17522; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17523; GFX10-NEXT: s_setpc_b64 s[30:31] 17524; 17525; 
GFX11-LABEL: v_minnum_bf16: 17526; GFX11: ; %bb.0: 17527; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17528; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17529; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 17530; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 17531; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 17532; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 17533; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 17534; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 17535; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 17536; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 17537; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 17538; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 17539; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17540; GFX11-NEXT: s_setpc_b64 s[30:31] 17541 %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) 17542 ret bfloat %op 17543} 17544 17545define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { 17546; GCN-LABEL: v_minnum_v2bf16: 17547; GCN: ; %bb.0: 17548; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17549; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 17550; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 17551; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 17552; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 17553; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17554; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17555; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17556; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17557; GCN-NEXT: v_min_f32_e32 v1, v1, v3 17558; GCN-NEXT: v_min_f32_e32 v0, v0, v2 17559; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17560; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17561; GCN-NEXT: s_setpc_b64 s[30:31] 17562; 17563; GFX7-LABEL: v_minnum_v2bf16: 17564; GFX7: ; %bb.0: 17565; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17566; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 17567; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 17568; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 17569; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 
17570; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17571; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17572; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17573; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17574; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 17575; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 17576; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17577; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17578; GFX7-NEXT: s_setpc_b64 s[30:31] 17579; 17580; GFX8-LABEL: v_minnum_v2bf16: 17581; GFX8: ; %bb.0: 17582; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17583; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 17584; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 17585; GFX8-NEXT: v_min_f32_e32 v2, v3, v2 17586; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 17587; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 17588; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17589; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17590; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 17591; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 17592; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 17593; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 17594; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 17595; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 17596; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 17597; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 17598; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 17599; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 17600; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 17601; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17602; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 17603; GFX8-NEXT: s_setpc_b64 s[30:31] 17604; 17605; GFX9-LABEL: v_minnum_v2bf16: 17606; GFX9: ; %bb.0: 17607; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17608; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 17609; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 17610; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 17611; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17612; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17613; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 17614; GFX9-NEXT: s_movk_i32 s4, 0x7fff 17615; 
GFX9-NEXT: v_min_f32_e32 v0, v0, v1 17616; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 17617; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 17618; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 17619; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 17620; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 17621; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 17622; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 17623; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 17624; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 17625; GFX9-NEXT: s_mov_b32 s4, 0x7060302 17626; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 17627; GFX9-NEXT: s_setpc_b64 s[30:31] 17628; 17629; GFX10-LABEL: v_minnum_v2bf16: 17630; GFX10: ; %bb.0: 17631; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17632; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 17633; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 17634; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17635; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17636; GFX10-NEXT: v_min_f32_e32 v2, v3, v2 17637; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 17638; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 17639; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 17640; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 17641; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 17642; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 17643; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 17644; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 17645; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 17646; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 17647; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 17648; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 17649; GFX10-NEXT: s_setpc_b64 s[30:31] 17650; 17651; GFX11-LABEL: v_minnum_v2bf16: 17652; GFX11: ; %bb.0: 17653; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17654; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 17655; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17656; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 17657; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17658; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 
17659; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 17660; GFX11-NEXT: v_min_f32_e32 v2, v3, v2 17661; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 17662; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 17663; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 17664; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 17665; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 17666; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 17667; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 17668; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 17669; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) 17670; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 17671; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 17672; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 17673; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 17674; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 17675; GFX11-NEXT: s_setpc_b64 s[30:31] 17676 %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) 17677 ret <2 x bfloat> %op 17678} 17679 17680define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { 17681; GCN-LABEL: v_minnum_v3bf16: 17682; GCN: ; %bb.0: 17683; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17684; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 17685; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 17686; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 17687; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 17688; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 17689; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 17690; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 17691; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17692; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 17693; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17694; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17695; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17696; GCN-NEXT: v_min_f32_e32 v2, v2, v5 17697; GCN-NEXT: v_min_f32_e32 v1, v1, v4 17698; GCN-NEXT: v_min_f32_e32 v0, v0, v3 17699; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17700; GCN-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 17701; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17702; GCN-NEXT: s_setpc_b64 s[30:31] 17703; 17704; GFX7-LABEL: v_minnum_v3bf16: 17705; GFX7: ; %bb.0: 17706; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17707; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 17708; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 17709; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 17710; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 17711; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 17712; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 17713; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 17714; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17715; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 17716; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17717; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17718; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17719; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 17720; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 17721; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 17722; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17723; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17724; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17725; GFX7-NEXT: s_setpc_b64 s[30:31] 17726; 17727; GFX8-LABEL: v_minnum_v3bf16: 17728; GFX8: ; %bb.0: 17729; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17730; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 17731; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17732; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 17733; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 17734; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 17735; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 17736; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 17737; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 17738; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 17739; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 17740; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 17741; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 17742; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 17743; GFX8-NEXT: s_movk_i32 s4, 0x7fff 17744; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 17745; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17746; 
GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17747; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 17748; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 17749; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 17750; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 17751; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 17752; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 17753; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 17754; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 17755; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 17756; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 17757; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 17758; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17759; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 17760; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 17761; GFX8-NEXT: s_setpc_b64 s[30:31] 17762; 17763; GFX9-LABEL: v_minnum_v3bf16: 17764; GFX9: ; %bb.0: 17765; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17766; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 17767; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17768; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 17769; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 17770; GFX9-NEXT: s_movk_i32 s4, 0x7fff 17771; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 17772; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 17773; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 17774; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 17775; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 17776; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 17777; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 17778; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17779; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17780; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 17781; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 17782; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 17783; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 17784; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 17785; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 17786; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 17787; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 17788; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 17789; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 17790; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v2, v4, vcc 17791; GFX9-NEXT: s_mov_b32 s4, 0x7060302 17792; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 17793; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 17794; GFX9-NEXT: s_setpc_b64 s[30:31] 17795; 17796; GFX10-LABEL: v_minnum_v3bf16: 17797; GFX10: ; %bb.0: 17798; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17799; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17800; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 17801; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17802; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17803; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 17804; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 17805; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 17806; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 17807; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 17808; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 17809; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 17810; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 17811; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 17812; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 17813; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 17814; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 17815; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 17816; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 17817; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 17818; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 17819; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 17820; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 17821; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 17822; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 17823; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 17824; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 17825; GFX10-NEXT: s_setpc_b64 s[30:31] 17826; 17827; GFX11TRUE16-LABEL: v_minnum_v3bf16: 17828; GFX11TRUE16: ; %bb.0: 17829; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17830; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17831; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 17832; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 17833; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 17834; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17835; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 17836; GFX11TRUE16-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 17837; GFX11TRUE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3 17838; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 17839; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 17840; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 17841; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 17842; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 17843; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 17844; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 17845; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 17846; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 17847; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 17848; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 17849; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 17850; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 17851; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 17852; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 17853; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 17854; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 17855; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 17856; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 17857; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 17858; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 17859; 17860; GFX11FAKE16-LABEL: v_minnum_v3bf16: 17861; GFX11FAKE16: ; %bb.0: 17862; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17863; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 17864; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 17865; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 17866; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17867; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 
17868; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 17869; GFX11FAKE16-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 17870; GFX11FAKE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3 17871; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 17872; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 17873; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 17874; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 17875; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 17876; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 17877; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 17878; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 17879; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 17880; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 17881; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 17882; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 17883; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 17884; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 17885; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 17886; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 17887; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 17888; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 17889; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 17890; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 17891; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 17892 %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) 17893 ret <3 x bfloat> %op 17894} 17895 17896define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { 17897; GCN-LABEL: v_minnum_v4bf16: 17898; GCN: ; %bb.0: 17899; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17900; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 17901; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 17902; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 17903; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 17904; 
GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 17905; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 17906; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 17907; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 17908; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 17909; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17910; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 17911; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17912; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 17913; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17914; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 17915; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17916; GCN-NEXT: v_min_f32_e32 v3, v3, v7 17917; GCN-NEXT: v_min_f32_e32 v2, v2, v6 17918; GCN-NEXT: v_min_f32_e32 v1, v1, v5 17919; GCN-NEXT: v_min_f32_e32 v0, v0, v4 17920; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17921; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17922; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17923; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17924; GCN-NEXT: s_setpc_b64 s[30:31] 17925; 17926; GFX7-LABEL: v_minnum_v4bf16: 17927; GFX7: ; %bb.0: 17928; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17929; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 17930; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 17931; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 17932; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 17933; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 17934; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 17935; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 17936; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 17937; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 17938; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17939; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 17940; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17941; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 17942; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17943; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 17944; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17945; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 17946; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 17947; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 17948; 
GFX7-NEXT: v_min_f32_e32 v0, v0, v4 17949; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17950; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17951; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17952; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17953; GFX7-NEXT: s_setpc_b64 s[30:31] 17954; 17955; GFX8-LABEL: v_minnum_v4bf16: 17956; GFX8: ; %bb.0: 17957; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 17958; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 17959; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 17960; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 17961; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 17962; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 17963; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 17964; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 17965; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 17966; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 17967; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 17968; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 17969; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 17970; GFX8-NEXT: s_movk_i32 s4, 0x7fff 17971; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 17972; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 17973; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 17974; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 17975; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 17976; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 17977; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 17978; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 17979; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 17980; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 17981; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 17982; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 17983; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 17984; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 17985; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 17986; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 17987; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 17988; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 17989; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 17990; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 17991; GFX8-NEXT: v_add_u32_e32 v2, vcc, 
0x7fff, v2 17992; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 17993; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 17994; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 17995; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 17996; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 17997; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 17998; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 17999; GFX8-NEXT: s_setpc_b64 s[30:31] 18000; 18001; GFX9-LABEL: v_minnum_v4bf16: 18002; GFX9: ; %bb.0: 18003; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18004; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 18005; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 18006; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 18007; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18008; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18009; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 18010; GFX9-NEXT: s_movk_i32 s4, 0x7fff 18011; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 18012; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 18013; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 18014; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 18015; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 18016; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 18017; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 18018; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 18019; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 18020; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 18021; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 18022; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 18023; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 18024; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18025; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18026; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 18027; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 18028; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 18029; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 18030; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 18031; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 18032; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 18033; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 18034; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 18035; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 18036; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 18037; GFX9-NEXT: s_mov_b32 s4, 0x7060302 18038; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 18039; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 18040; GFX9-NEXT: s_setpc_b64 s[30:31] 18041; 18042; GFX10-LABEL: v_minnum_v4bf16: 18043; GFX10: ; %bb.0: 18044; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18045; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 18046; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 18047; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18048; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18049; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 18050; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 18051; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 18052; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18053; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18054; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 18055; GFX10-NEXT: v_min_f32_e32 v3, v7, v6 18056; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 18057; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 18058; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 18059; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 18060; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 18061; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 18062; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 18063; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 18064; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 18065; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 18066; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo 18067; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 18068; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 18069; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 18070; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 18071; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 18072; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 18073; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 18074; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 18075; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 18076; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 18077; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo 18078; GFX10-NEXT: v_perm_b32 v1, v1, v4, 
0x7060302 18079; GFX10-NEXT: s_setpc_b64 s[30:31] 18080; 18081; GFX11-LABEL: v_minnum_v4bf16: 18082; GFX11: ; %bb.0: 18083; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18084; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 18085; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 18086; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18087; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18088; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 18089; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 18090; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 18091; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 18092; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18093; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 18094; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 18095; GFX11-NEXT: v_min_f32_e32 v1, v1, v3 18096; GFX11-NEXT: v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4 18097; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 18098; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 18099; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 18100; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 18101; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 18102; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 18103; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 18104; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 18105; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 18106; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 18107; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) 18108; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo 18109; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 18110; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 18111; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 18112; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 18113; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 18114; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 18115; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) 18116; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 18117; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 18118; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 18119; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo 18120; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 18121; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 18122; GFX11-NEXT: s_setpc_b64 s[30:31] 18123 %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) 18124 ret <4 x bfloat> %op 18125} 18126 18127define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { 18128; GCN-LABEL: v_minnum_v8bf16: 18129; GCN: ; %bb.0: 18130; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18131; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 18132; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 18133; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 18134; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 18135; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 18136; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 18137; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 18138; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 18139; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 18140; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 18141; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 18142; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 18143; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 18144; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 18145; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 18146; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 18147; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 18148; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18149; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 18150; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18151; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 18152; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18153; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 18154; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18155; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 18156; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18157; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 18158; GCN-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 18159; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 18160; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18161; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 18162; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18163; GCN-NEXT: v_min_f32_e32 v7, v7, v15 18164; GCN-NEXT: v_min_f32_e32 v6, v6, v14 18165; GCN-NEXT: v_min_f32_e32 v5, v5, v13 18166; GCN-NEXT: v_min_f32_e32 v4, v4, v12 18167; GCN-NEXT: v_min_f32_e32 v3, v3, v11 18168; GCN-NEXT: v_min_f32_e32 v2, v2, v10 18169; GCN-NEXT: v_min_f32_e32 v1, v1, v9 18170; GCN-NEXT: v_min_f32_e32 v0, v0, v8 18171; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18172; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18173; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18174; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18175; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18176; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18177; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18178; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18179; GCN-NEXT: s_setpc_b64 s[30:31] 18180; 18181; GFX7-LABEL: v_minnum_v8bf16: 18182; GFX7: ; %bb.0: 18183; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18184; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 18185; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 18186; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 18187; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 18188; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 18189; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 18190; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 18191; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 18192; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 18193; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 18194; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 18195; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 18196; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 18197; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 18198; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 18199; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 18200; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 18201; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18202; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 18203; 
GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18204; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 18205; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18206; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 18207; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18208; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 18209; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18210; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 18211; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18212; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 18213; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18214; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 18215; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18216; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 18217; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 18218; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 18219; GFX7-NEXT: v_min_f32_e32 v4, v4, v12 18220; GFX7-NEXT: v_min_f32_e32 v3, v3, v11 18221; GFX7-NEXT: v_min_f32_e32 v2, v2, v10 18222; GFX7-NEXT: v_min_f32_e32 v1, v1, v9 18223; GFX7-NEXT: v_min_f32_e32 v0, v0, v8 18224; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18225; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18226; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18227; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18228; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18229; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18230; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18231; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18232; GFX7-NEXT: s_setpc_b64 s[30:31] 18233; 18234; GFX8-LABEL: v_minnum_v8bf16: 18235; GFX8: ; %bb.0: 18236; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18237; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7 18238; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 18239; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 18240; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1 18241; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8 18242; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18243; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18244; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 18245; GFX8-NEXT: v_min_f32_e32 
v3, v3, v7 18246; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8 18247; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 18248; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 18249; GFX8-NEXT: s_movk_i32 s4, 0x7fff 18250; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc 18251; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 18252; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 18253; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 18254; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 18255; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc 18256; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6 18257; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2 18258; GFX8-NEXT: v_min_f32_e32 v7, v9, v7 18259; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1 18260; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7 18261; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18262; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18263; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 18264; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 18265; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7 18266; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 18267; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 18268; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc 18269; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 18270; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 18271; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 18272; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 18273; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc 18274; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 18275; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1 18276; GFX8-NEXT: v_min_f32_e32 v6, v9, v6 18277; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 18278; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 18279; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18280; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18281; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 18282; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 18283; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 18284; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18285; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 18286; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 18287; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 18288; 
GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 18289; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 18290; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 18291; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc 18292; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 18293; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0 18294; GFX8-NEXT: v_min_f32_e32 v5, v9, v5 18295; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 18296; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 18297; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18298; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18299; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 18300; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 18301; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 18302; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18303; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 18304; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 18305; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 18306; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 18307; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 18308; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 18309; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc 18310; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 18311; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 18312; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 18313; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 18314; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 18315; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 18316; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 18317; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 18318; GFX8-NEXT: s_setpc_b64 s[30:31] 18319; 18320; GFX9-LABEL: v_minnum_v8bf16: 18321; GFX9: ; %bb.0: 18322; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18323; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 18324; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 18325; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 18326; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18327; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18328; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 18329; GFX9-NEXT: s_movk_i32 s4, 0x7fff 18330; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 18331; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 18332; 
GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 18333; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 18334; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 18335; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc 18336; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 18337; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 18338; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 18339; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc 18340; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 18341; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 18342; GFX9-NEXT: v_min_f32_e32 v7, v9, v7 18343; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18344; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18345; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 18346; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 18347; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 18348; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 18349; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 18350; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 18351; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc 18352; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 18353; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 18354; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 18355; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc 18356; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 18357; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 18358; GFX9-NEXT: v_min_f32_e32 v6, v9, v6 18359; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18360; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18361; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 18362; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 18363; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 18364; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 18365; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18366; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 18367; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 18368; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 18369; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 18370; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 18371; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc 18372; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 18373; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 18374; GFX9-NEXT: v_min_f32_e32 v5, 
v9, v5 18375; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18376; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18377; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 18378; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 18379; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 18380; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 18381; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18382; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 18383; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 18384; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 18385; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 18386; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 18387; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc 18388; GFX9-NEXT: s_mov_b32 s4, 0x7060302 18389; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 18390; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 18391; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 18392; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 18393; GFX9-NEXT: s_setpc_b64 s[30:31] 18394; 18395; GFX10-LABEL: v_minnum_v8bf16: 18396; GFX10: ; %bb.0: 18397; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18398; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7 18399; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 18400; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18401; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18402; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2 18403; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18404; GFX10-NEXT: v_min_f32_e32 v8, v9, v8 18405; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 18406; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18407; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 18408; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0 18409; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1 18410; GFX10-NEXT: v_min_f32_e32 v7, v10, v9 18411; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8 18412; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 18413; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 18414; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff 18415; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1 18416; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1 18417; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 18418; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 
1 18419; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo 18420; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1 18421; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff 18422; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff 18423; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 18424; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 18425; GFX10-NEXT: v_min_f32_e32 v6, v10, v6 18426; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff 18427; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18428; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18429; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4 18430; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo 18431; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2 18432; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1 18433; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18434; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18435; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 18436; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 18437; GFX10-NEXT: v_min_f32_e32 v5, v15, v13 18438; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3 18439; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 18440; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo 18441; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff 18442; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 18443; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1 18444; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1 18445; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 18446; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1 18447; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1 18448; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff 18449; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5 18450; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo 18451; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff 18452; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 18453; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff 18454; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0 18455; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 18456; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo 18457; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 18458; GFX10-NEXT: v_cndmask_b32_e32 v0, 
v12, v13, vcc_lo 18459; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 18460; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 18461; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo 18462; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 18463; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 18464; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo 18465; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 18466; GFX10-NEXT: s_setpc_b64 s[30:31] 18467; 18468; GFX11-LABEL: v_minnum_v8bf16: 18469; GFX11: ; %bb.0: 18470; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18471; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 18472; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 18473; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18474; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3 18475; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0 18476; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 18477; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7 18478; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 18479; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 18480; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18481; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1 18482; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 18483; GFX11-NEXT: v_min_f32_e32 v3, v3, v7 18484; GFX11-NEXT: v_min_f32_e32 v7, v10, v9 18485; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8 18486; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff 18487; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 18488; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 18489; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 18490; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 18491; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 18492; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo 18493; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 18494; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff 18495; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff 18496; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 18497; 
GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18498; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1 18499; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18500; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 18501; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_min_f32 v2, v2, v6 18502; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 18503; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18504; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 18505; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) 18506; GFX11-NEXT: v_min_f32_e32 v6, v10, v6 18507; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2 18508; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 18509; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff 18510; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 18511; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1 18512; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo 18513; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 18514; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 18515; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4 18516; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18517; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18518; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302 18519; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) 18520; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 18521; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff 18522; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10 18523; GFX11-NEXT: v_min_f32_e32 v5, v15, v13 18524; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 18525; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1 18526; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 18527; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 18528; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 18529; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) 18530; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff 18531; GFX11-NEXT: 
v_or_b32_e32 v11, 0x400000, v5 18532; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 18533; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff 18534; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff 18535; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 18536; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 18537; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo 18538; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 18539; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo 18540; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 18541; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 18542; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302 18543; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo 18544; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 18545; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 18546; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo 18547; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 18548; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 18549; GFX11-NEXT: s_setpc_b64 s[30:31] 18550 %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) 18551 ret <8 x bfloat> %op 18552} 18553 18554define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { 18555; GCN-LABEL: v_minnum_v16bf16: 18556; GCN: ; %bb.0: 18557; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18558; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 18559; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 18560; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 18561; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 18562; GCN-NEXT: v_min_f32_e32 v14, v14, v30 18563; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 18564; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 18565; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 18566; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 18567; GCN-NEXT: v_min_f32_e32 v13, v13, v29 18568; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 18569; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 18570; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 18571; 
GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 18572; GCN-NEXT: v_min_f32_e32 v12, v12, v28 18573; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 18574; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 18575; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 18576; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 18577; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 18578; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 18579; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 18580; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 18581; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 18582; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 18583; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 18584; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 18585; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 18586; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 18587; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 18588; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 18589; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 18590; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 18591; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 18592; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 18593; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 18594; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 18595; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 18596; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 18597; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 18598; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 18599; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 18600; GCN-NEXT: v_min_f32_e32 v11, v11, v27 18601; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 18602; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 18603; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 18604; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 18605; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 18606; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 18607; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 18608; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 18609; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18610; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 18611; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18612; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 18613; GCN-NEXT: v_and_b32_e32 v5, 
0xffff0000, v5 18614; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 18615; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18616; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 18617; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18618; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 18619; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18620; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 18621; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18622; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 18623; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18624; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 18625; GCN-NEXT: v_min_f32_e32 v10, v10, v26 18626; GCN-NEXT: v_min_f32_e32 v9, v9, v25 18627; GCN-NEXT: v_min_f32_e32 v8, v8, v24 18628; GCN-NEXT: v_min_f32_e32 v7, v7, v23 18629; GCN-NEXT: v_min_f32_e32 v6, v6, v22 18630; GCN-NEXT: v_min_f32_e32 v5, v5, v21 18631; GCN-NEXT: v_min_f32_e32 v4, v4, v20 18632; GCN-NEXT: v_min_f32_e32 v3, v3, v19 18633; GCN-NEXT: v_min_f32_e32 v2, v2, v18 18634; GCN-NEXT: v_min_f32_e32 v1, v1, v17 18635; GCN-NEXT: v_min_f32_e32 v0, v0, v16 18636; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18637; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18638; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18639; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18640; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18641; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18642; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18643; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18644; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 18645; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 18646; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 18647; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 18648; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 18649; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 18650; GCN-NEXT: s_waitcnt vmcnt(0) 18651; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 18652; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 18653; GCN-NEXT: v_min_f32_e32 v15, v15, v16 18654; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 18655; 
GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 18656; GCN-NEXT: s_setpc_b64 s[30:31] 18657; 18658; GFX7-LABEL: v_minnum_v16bf16: 18659; GFX7: ; %bb.0: 18660; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18661; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 18662; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 18663; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 18664; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 18665; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 18666; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 18667; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 18668; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 18669; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 18670; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18671; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 18672; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 18673; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 18674; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 18675; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 18676; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 18677; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 18678; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 18679; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 18680; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 18681; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 18682; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 18683; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 18684; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 18685; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 18686; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 18687; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 18688; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 18689; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 18690; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 18691; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 18692; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 18693; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 18694; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 18695; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 18696; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 18697; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 18698; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 18699; GFX7-NEXT: 
v_and_b32_e32 v30, 0xffff0000, v30 18700; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 18701; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 18702; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 18703; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 18704; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 18705; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 18706; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 18707; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 18708; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 18709; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 18710; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 18711; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 18712; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18713; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 18714; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 18715; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18716; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 18717; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18718; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 18719; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18720; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 18721; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18722; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 18723; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18724; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 18725; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18726; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 18727; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 18728; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 18729; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 18730; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 18731; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 18732; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 18733; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 18734; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 18735; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 18736; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 18737; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 18738; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 18739; GFX7-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 18740; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18741; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18742; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18743; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18744; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18745; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18746; GFX7-NEXT: s_waitcnt vmcnt(0) 18747; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 18748; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 18749; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 18750; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18751; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 18752; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 18753; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 18754; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 18755; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 18756; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 18757; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 18758; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 18759; GFX7-NEXT: s_setpc_b64 s[30:31] 18760; 18761; GFX8-LABEL: v_minnum_v16bf16: 18762; GFX8: ; %bb.0: 18763; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18764; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 18765; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 18766; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 18767; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1 18768; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16 18769; GFX8-NEXT: s_movk_i32 s4, 0x7fff 18770; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 18771; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18772; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 18773; GFX8-NEXT: v_min_f32_e32 v7, v7, v15 18774; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16 18775; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 18776; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 18777; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc 18778; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 18779; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 18780; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7 18781; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v7, v7 18782; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc 18783; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 18784; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 18785; GFX8-NEXT: v_min_f32_e32 v15, v17, v15 18786; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 18787; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15 18788; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 18789; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18790; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 18791; GFX8-NEXT: v_min_f32_e32 v6, v6, v14 18792; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15 18793; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 18794; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 18795; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc 18796; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 18797; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 18798; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6 18799; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18800; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc 18801; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 18802; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 18803; GFX8-NEXT: v_min_f32_e32 v14, v17, v14 18804; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1 18805; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14 18806; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 18807; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18808; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 18809; GFX8-NEXT: v_min_f32_e32 v5, v5, v13 18810; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14 18811; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 18812; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 18813; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc 18814; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 18815; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 18816; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5 18817; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18818; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc 18819; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 18820; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 18821; GFX8-NEXT: v_min_f32_e32 v13, v17, v13 18822; 
GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1 18823; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13 18824; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 18825; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18826; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 18827; GFX8-NEXT: v_min_f32_e32 v4, v4, v12 18828; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13 18829; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 18830; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 18831; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc 18832; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 18833; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 18834; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4 18835; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 18836; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc 18837; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 18838; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 18839; GFX8-NEXT: v_min_f32_e32 v12, v17, v12 18840; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1 18841; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12 18842; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 18843; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 18844; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 18845; GFX8-NEXT: v_min_f32_e32 v3, v3, v11 18846; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12 18847; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 18848; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 18849; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc 18850; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 18851; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 18852; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3 18853; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 18854; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc 18855; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 18856; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 18857; GFX8-NEXT: v_min_f32_e32 v11, v17, v11 18858; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1 18859; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11 18860; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 18861; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 18862; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, 
v17 18863; GFX8-NEXT: v_min_f32_e32 v2, v2, v10 18864; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11 18865; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 18866; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 18867; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc 18868; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 18869; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 18870; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2 18871; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 18872; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc 18873; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 18874; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 18875; GFX8-NEXT: v_min_f32_e32 v10, v17, v10 18876; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1 18877; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10 18878; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 18879; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 18880; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 18881; GFX8-NEXT: v_min_f32_e32 v1, v1, v9 18882; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10 18883; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 18884; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 18885; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc 18886; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 18887; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 18888; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1 18889; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 18890; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc 18891; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 18892; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 18893; GFX8-NEXT: v_min_f32_e32 v9, v17, v9 18894; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1 18895; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v9 18896; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 18897; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 18898; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 18899; GFX8-NEXT: v_min_f32_e32 v0, v0, v8 18900; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9 18901; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 18902; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 18903; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc 18904; GFX8-NEXT: 
v_add_u32_e32 v8, vcc, v8, v0 18905; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 18906; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 18907; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 18908; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc 18909; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 18910; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 18911; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 18912; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 18913; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 18914; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 18915; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 18916; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 18917; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 18918; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 18919; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 18920; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 18921; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 18922; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 18923; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 18924; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 18925; GFX8-NEXT: s_setpc_b64 s[30:31] 18926; 18927; GFX9-LABEL: v_minnum_v16bf16: 18928; GFX9: ; %bb.0: 18929; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18930; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 18931; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 18932; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 18933; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 18934; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 18935; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 18936; GFX9-NEXT: s_movk_i32 s4, 0x7fff 18937; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 18938; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 18939; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 18940; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 18941; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 18942; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc 18943; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 18944; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 18945; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 18946; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc 18947; GFX9-NEXT: 
v_lshlrev_b32_e32 v15, 16, v14 18948; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 18949; GFX9-NEXT: v_min_f32_e32 v15, v17, v15 18950; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 18951; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 18952; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 18953; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 18954; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 18955; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 18956; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 18957; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 18958; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc 18959; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 18960; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 18961; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 18962; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc 18963; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 18964; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 18965; GFX9-NEXT: v_min_f32_e32 v14, v17, v14 18966; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 18967; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 18968; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 18969; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 18970; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 18971; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 18972; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 18973; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 18974; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc 18975; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 18976; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 18977; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 18978; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc 18979; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 18980; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 18981; GFX9-NEXT: v_min_f32_e32 v13, v17, v13 18982; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 18983; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 18984; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 18985; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 18986; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 18987; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 18988; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v13, v13 18989; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 18990; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc 18991; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 18992; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 18993; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 18994; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc 18995; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 18996; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 18997; GFX9-NEXT: v_min_f32_e32 v12, v17, v12 18998; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 18999; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 19000; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 19001; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 19002; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 19003; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 19004; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 19005; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 19006; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc 19007; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 19008; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 19009; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 19010; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc 19011; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 19012; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 19013; GFX9-NEXT: v_min_f32_e32 v11, v17, v11 19014; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 19015; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19016; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 19017; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 19018; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 19019; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 19020; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 19021; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 19022; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc 19023; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 19024; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 19025; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 19026; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc 19027; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 19028; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 19029; GFX9-NEXT: v_min_f32_e32 v10, v17, v10 19030; 
GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 19031; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 19032; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 19033; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 19034; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 19035; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 19036; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 19037; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 19038; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc 19039; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 19040; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 19041; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 19042; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc 19043; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 19044; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 19045; GFX9-NEXT: v_min_f32_e32 v9, v17, v9 19046; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 19047; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 19048; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 19049; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 19050; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 19051; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 19052; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 19053; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 19054; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc 19055; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 19056; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 19057; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 19058; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc 19059; GFX9-NEXT: s_mov_b32 s4, 0x7060302 19060; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 19061; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 19062; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 19063; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 19064; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 19065; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 19066; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 19067; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 19068; GFX9-NEXT: s_setpc_b64 s[30:31] 19069; 19070; GFX10-LABEL: v_minnum_v16bf16: 19071; GFX10: ; %bb.0: 19072; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19073; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 19074; 
GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 19075; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 19076; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 19077; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 19078; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 19079; GFX10-NEXT: v_min_f32_e32 v16, v17, v16 19080; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 19081; GFX10-NEXT: v_min_f32_e32 v7, v7, v15 19082; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 19083; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 19084; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16 19085; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 19086; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 19087; GFX10-NEXT: v_min_f32_e32 v17, v18, v17 19088; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff 19089; GFX10-NEXT: v_min_f32_e32 v6, v6, v14 19090; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff 19091; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7 19092; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 19093; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo 19094; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 19095; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 19096; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 19097; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff 19098; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 19099; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo 19100; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13 19101; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1 19102; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 19103; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 19104; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 19105; GFX10-NEXT: v_min_f32_e32 v17, v20, v19 19106; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4 19107; GFX10-NEXT: v_min_f32_e32 v5, v5, v13 19108; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo 19109; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff 19110; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 19111; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 19112; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 19113; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 
19114; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1 19115; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 19116; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 19117; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo 19118; GFX10-NEXT: v_min_f32_e32 v13, v19, v18 19119; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff 19120; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17 19121; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 19122; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff 19123; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5 19124; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 19125; GFX10-NEXT: v_min_f32_e32 v4, v4, v12 19126; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo 19127; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 19128; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 19129; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 19130; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff 19131; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 19132; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo 19133; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13 19134; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 19135; GFX10-NEXT: v_min_f32_e32 v12, v18, v12 19136; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 19137; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 19138; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 19139; GFX10-NEXT: v_min_f32_e32 v3, v3, v11 19140; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12 19141; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo 19142; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 19143; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 19144; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff 19145; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1 19146; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 19147; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff 19148; GFX10-NEXT: v_min_f32_e32 v18, v19, v18 19149; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19150; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 19151; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff 19152; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 19153; GFX10-NEXT: v_bfe_u32 
v23, v18, 16, 1 19154; GFX10-NEXT: v_min_f32_e32 v2, v2, v10 19155; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo 19156; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 19157; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18 19158; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff 19159; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 19160; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 19161; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo 19162; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1 19163; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 19164; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 19165; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 19166; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2 19167; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4 19168; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 19169; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo 19170; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff 19171; GFX10-NEXT: v_min_f32_e32 v19, v22, v20 19172; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8 19173; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 19174; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 19175; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 19176; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 19177; GFX10-NEXT: v_min_f32_e32 v1, v1, v9 19178; GFX10-NEXT: v_min_f32_e32 v9, v22, v20 19179; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19 19180; GFX10-NEXT: v_min_f32_e32 v0, v0, v8 19181; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff 19182; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 19183; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 19184; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 19185; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9 19186; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0 19187; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff 19188; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo 19189; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1 19190; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 19191; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 19192; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff 19193; GFX10-NEXT: v_perm_b32 v5, 
v5, v16, 0x7060302 19194; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 19195; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo 19196; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 19197; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff 19198; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 19199; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo 19200; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 19201; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo 19202; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 19203; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 19204; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo 19205; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 19206; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 19207; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo 19208; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 19209; GFX10-NEXT: s_setpc_b64 s[30:31] 19210; 19211; GFX11-LABEL: v_minnum_v16bf16: 19212; GFX11: ; %bb.0: 19213; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19214; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 19215; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 19216; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 19217; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 19218; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 19219; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 19220; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 19221; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 19222; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16 19223; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 19224; GFX11-NEXT: v_min_f32_e32 v17, v18, v17 19225; GFX11-NEXT: v_min_f32_e32 v6, v6, v14 19226; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 19227; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 19228; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 19229; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff 19230; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 
19231; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 19232; GFX11-NEXT: v_min_f32_e32 v7, v7, v15 19233; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 19234; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff 19235; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17 19236; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 19237; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 19238; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 19239; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 19240; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff 19241; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7 19242; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 19243; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo 19244; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 19245; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13 19246; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 19247; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 19248; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 19249; GFX11-NEXT: v_dual_min_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16 19250; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff 19251; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12 19252; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4 19253; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 19254; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 19255; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 19256; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 19257; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 19258; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) 19259; GFX11-NEXT: v_min_f32_e32 v4, v4, v12 19260; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 19261; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 19262; GFX11-NEXT: v_min_f32_e32 v5, v5, v13 19263; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6 19264; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 19265; GFX11-NEXT: 
v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18 19266; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff 19267; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 19268; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 19269; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 19270; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) 19271; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo 19272; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 19273; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 19274; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 19275; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 19276; GFX11-NEXT: v_min_f32_e32 v12, v18, v12 19277; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) 19278; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff 19279; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 19280; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 19281; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 19282; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 19283; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo 19284; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff 19285; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13 19286; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 19287; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 19288; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4 19289; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 19290; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo 19291; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 19292; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 19293; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2 19294; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 19295; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19296; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff 19297; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 19298; GFX11-NEXT: v_min_f32_e32 v18, v19, v18 19299; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo 19300; GFX11-NEXT: v_lshlrev_b32_e32 
v22, 16, v1 19301; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 19302; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 19303; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 19304; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18 19305; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 19306; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 19307; GFX11-NEXT: v_min_f32_e32 v3, v3, v11 19308; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff 19309; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff 19310; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 19311; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 19312; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 19313; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff 19314; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 19315; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 19316; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo 19317; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 19318; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 19319; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 19320; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2 19321; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 19322; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo 19323; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff 19324; GFX11-NEXT: v_min_f32_e32 v19, v22, v20 19325; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8 19326; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 19327; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 19328; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) 19329; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 19330; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 19331; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 19332; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1 19333; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v1, v1, v9 19334; GFX11-NEXT: v_min_f32_e32 v9, v22, v20 19335; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) 19336; 
GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff 19337; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 19338; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0 19339; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 19340; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 19341; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9 19342; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo 19343; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1 19344; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff 19345; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 19346; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 19347; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff 19348; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 19349; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo 19350; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 19351; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff 19352; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) 19353; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 19354; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo 19355; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 19356; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo 19357; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 19358; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 19359; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 19360; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo 19361; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 19362; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 19363; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo 19364; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 19365; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 19366; GFX11-NEXT: s_setpc_b64 s[30:31] 19367 %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) 19368 ret <16 x bfloat> %op 19369} 19370 19371define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { 19372; GCN-LABEL: v_minnum_v32bf16: 19373; GCN: ; %bb.0: 19374; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 19375; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 19376; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 19377; GCN-NEXT: s_waitcnt vmcnt(1) 19378; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 19379; GCN-NEXT: s_waitcnt vmcnt(0) 19380; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 19381; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19382; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 19383; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 19384; GCN-NEXT: v_min_f32_e32 v31, v31, v32 19385; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 19386; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 19387; GCN-NEXT: s_waitcnt vmcnt(0) 19388; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19389; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19390; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 19391; GCN-NEXT: v_min_f32_e32 v30, v30, v32 19392; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 19393; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 19394; GCN-NEXT: s_waitcnt vmcnt(0) 19395; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19396; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19397; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 19398; GCN-NEXT: v_min_f32_e32 v29, v29, v32 19399; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 19400; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 19401; GCN-NEXT: s_waitcnt vmcnt(0) 19402; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19403; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19404; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 19405; GCN-NEXT: v_min_f32_e32 v28, v28, v32 19406; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 19407; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 19408; GCN-NEXT: s_waitcnt vmcnt(0) 19409; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19410; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19411; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 19412; GCN-NEXT: v_min_f32_e32 v27, v27, v32 19413; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 19414; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, 
v26 19415; GCN-NEXT: s_waitcnt vmcnt(0) 19416; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19417; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19418; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 19419; GCN-NEXT: v_min_f32_e32 v26, v26, v32 19420; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 19421; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 19422; GCN-NEXT: s_waitcnt vmcnt(0) 19423; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19424; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19425; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 19426; GCN-NEXT: v_min_f32_e32 v25, v25, v32 19427; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 19428; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 19429; GCN-NEXT: s_waitcnt vmcnt(0) 19430; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19431; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19432; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 19433; GCN-NEXT: v_min_f32_e32 v24, v24, v32 19434; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 19435; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 19436; GCN-NEXT: s_waitcnt vmcnt(0) 19437; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19438; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19439; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 19440; GCN-NEXT: v_min_f32_e32 v23, v23, v32 19441; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 19442; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 19443; GCN-NEXT: s_waitcnt vmcnt(0) 19444; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19445; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19446; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 19447; GCN-NEXT: v_min_f32_e32 v22, v22, v32 19448; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 19449; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 19450; GCN-NEXT: s_waitcnt vmcnt(0) 19451; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19452; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19453; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 19454; GCN-NEXT: v_min_f32_e32 v21, v21, v32 19455; GCN-NEXT: v_mul_f32_e32 v20, 
1.0, v20 19456; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 19457; GCN-NEXT: s_waitcnt vmcnt(0) 19458; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19459; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19460; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 19461; GCN-NEXT: v_min_f32_e32 v20, v20, v32 19462; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 19463; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 19464; GCN-NEXT: s_waitcnt vmcnt(0) 19465; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19466; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19467; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 19468; GCN-NEXT: v_min_f32_e32 v19, v19, v32 19469; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 19470; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 19471; GCN-NEXT: s_waitcnt vmcnt(0) 19472; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19473; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19474; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 19475; GCN-NEXT: v_min_f32_e32 v18, v18, v32 19476; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 19477; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 19478; GCN-NEXT: s_waitcnt vmcnt(0) 19479; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19480; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19481; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 19482; GCN-NEXT: v_min_f32_e32 v17, v17, v32 19483; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 19484; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 19485; GCN-NEXT: s_waitcnt vmcnt(0) 19486; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19487; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19488; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 19489; GCN-NEXT: v_min_f32_e32 v16, v16, v32 19490; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 19491; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 19492; GCN-NEXT: s_waitcnt vmcnt(0) 19493; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19494; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19495; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 19496; GCN-NEXT: 
v_min_f32_e32 v15, v15, v32 19497; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 19498; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 19499; GCN-NEXT: s_waitcnt vmcnt(0) 19500; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19501; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19502; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 19503; GCN-NEXT: v_min_f32_e32 v14, v14, v32 19504; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 19505; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 19506; GCN-NEXT: s_waitcnt vmcnt(0) 19507; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19508; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19509; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 19510; GCN-NEXT: v_min_f32_e32 v13, v13, v32 19511; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 19512; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 19513; GCN-NEXT: s_waitcnt vmcnt(0) 19514; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19515; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19516; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 19517; GCN-NEXT: v_min_f32_e32 v12, v12, v32 19518; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 19519; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 19520; GCN-NEXT: s_waitcnt vmcnt(0) 19521; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19522; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19523; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 19524; GCN-NEXT: v_min_f32_e32 v11, v11, v32 19525; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 19526; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 19527; GCN-NEXT: s_waitcnt vmcnt(0) 19528; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19529; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19530; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 19531; GCN-NEXT: v_min_f32_e32 v10, v10, v32 19532; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 19533; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 19534; GCN-NEXT: s_waitcnt vmcnt(0) 19535; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19536; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19537; GCN-NEXT: buffer_load_dword 
v33, off, s[0:3], s32 offset:36 19538; GCN-NEXT: v_min_f32_e32 v9, v9, v32 19539; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 19540; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 19541; GCN-NEXT: s_waitcnt vmcnt(0) 19542; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19543; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19544; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 19545; GCN-NEXT: v_min_f32_e32 v8, v8, v32 19546; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 19547; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 19548; GCN-NEXT: s_waitcnt vmcnt(0) 19549; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19550; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19551; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 19552; GCN-NEXT: v_min_f32_e32 v7, v7, v32 19553; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 19554; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 19555; GCN-NEXT: s_waitcnt vmcnt(0) 19556; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19557; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19558; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 19559; GCN-NEXT: v_min_f32_e32 v6, v6, v32 19560; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 19561; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 19562; GCN-NEXT: s_waitcnt vmcnt(0) 19563; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19564; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19565; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 19566; GCN-NEXT: v_min_f32_e32 v5, v5, v32 19567; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 19568; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 19569; GCN-NEXT: s_waitcnt vmcnt(0) 19570; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19571; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19572; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 19573; GCN-NEXT: v_min_f32_e32 v4, v4, v32 19574; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 19575; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 19576; GCN-NEXT: s_waitcnt vmcnt(0) 19577; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19578; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19579; GCN-NEXT: 
buffer_load_dword v33, off, s[0:3], s32 offset:12 19580; GCN-NEXT: v_min_f32_e32 v3, v3, v32 19581; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 19582; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19583; GCN-NEXT: s_waitcnt vmcnt(0) 19584; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19585; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19586; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 19587; GCN-NEXT: v_min_f32_e32 v2, v2, v32 19588; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 19589; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 19590; GCN-NEXT: s_waitcnt vmcnt(0) 19591; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19592; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19593; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 19594; GCN-NEXT: v_min_f32_e32 v1, v1, v32 19595; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 19596; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 19597; GCN-NEXT: s_waitcnt vmcnt(0) 19598; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 19599; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19600; GCN-NEXT: v_min_f32_e32 v0, v0, v32 19601; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 19602; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 19603; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19604; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 19605; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 19606; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 19607; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 19608; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 19609; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 19610; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 19611; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 19612; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 19613; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 19614; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 19615; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 19616; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 19617; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 19618; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 19619; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 
19620; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 19621; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 19622; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 19623; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 19624; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 19625; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 19626; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 19627; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 19628; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 19629; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 19630; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 19631; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 19632; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 19633; GCN-NEXT: s_setpc_b64 s[30:31] 19634; 19635; GFX7-LABEL: v_minnum_v32bf16: 19636; GFX7: ; %bb.0: 19637; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19638; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 19639; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 19640; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 19641; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 19642; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 19643; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 19644; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 19645; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 19646; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 19647; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 19648; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 19649; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 19650; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 19651; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 19652; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 19653; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 19654; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 19655; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 19656; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 19657; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 19658; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 19659; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 19660; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, 
v20 19661; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 19662; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 19663; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 19664; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 19665; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 19666; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 19667; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 19668; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 19669; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 19670; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 19671; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 19672; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 19673; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 19674; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 19675; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 19676; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 19677; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 19678; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 19679; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 19680; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 19681; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 19682; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 19683; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 19684; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 19685; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 19686; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 19687; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 19688; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 19689; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 19690; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 19691; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 19692; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 19693; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 19694; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 19695; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 19696; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 19697; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19698; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 19699; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 19700; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 19701; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 
19702; GFX7-NEXT: s_waitcnt vmcnt(1) 19703; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 19704; GFX7-NEXT: s_waitcnt vmcnt(0) 19705; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19706; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19707; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 19708; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 19709; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 19710; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 19711; GFX7-NEXT: s_waitcnt vmcnt(0) 19712; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19713; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19714; GFX7-NEXT: v_min_f32_e32 v30, v30, v32 19715; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 19716; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 19717; GFX7-NEXT: s_waitcnt vmcnt(0) 19718; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19719; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19720; GFX7-NEXT: v_min_f32_e32 v29, v29, v32 19721; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 19722; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 19723; GFX7-NEXT: s_waitcnt vmcnt(0) 19724; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19725; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19726; GFX7-NEXT: v_min_f32_e32 v28, v28, v32 19727; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 19728; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 19729; GFX7-NEXT: s_waitcnt vmcnt(0) 19730; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19731; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19732; GFX7-NEXT: v_min_f32_e32 v27, v27, v32 19733; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 19734; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 19735; GFX7-NEXT: s_waitcnt vmcnt(0) 19736; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19737; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19738; GFX7-NEXT: v_min_f32_e32 v26, v26, v32 19739; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 19740; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 19741; GFX7-NEXT: s_waitcnt vmcnt(0) 
19742; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19743; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19744; GFX7-NEXT: v_min_f32_e32 v25, v25, v32 19745; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 19746; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 19747; GFX7-NEXT: s_waitcnt vmcnt(0) 19748; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19749; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19750; GFX7-NEXT: v_min_f32_e32 v24, v24, v32 19751; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 19752; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 19753; GFX7-NEXT: s_waitcnt vmcnt(0) 19754; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19755; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19756; GFX7-NEXT: v_min_f32_e32 v23, v23, v32 19757; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 19758; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 19759; GFX7-NEXT: s_waitcnt vmcnt(0) 19760; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19761; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19762; GFX7-NEXT: v_min_f32_e32 v22, v22, v32 19763; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 19764; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 19765; GFX7-NEXT: s_waitcnt vmcnt(0) 19766; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19767; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19768; GFX7-NEXT: v_min_f32_e32 v21, v21, v32 19769; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 19770; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 19771; GFX7-NEXT: s_waitcnt vmcnt(0) 19772; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19773; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19774; GFX7-NEXT: v_min_f32_e32 v20, v20, v32 19775; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 19776; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 19777; GFX7-NEXT: s_waitcnt vmcnt(0) 19778; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19779; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19780; GFX7-NEXT: v_min_f32_e32 v19, v19, v32 19781; GFX7-NEXT: buffer_load_dword 
v32, off, s[0:3], s32 offset:76 19782; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 19783; GFX7-NEXT: s_waitcnt vmcnt(0) 19784; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19785; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19786; GFX7-NEXT: v_min_f32_e32 v18, v18, v32 19787; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 19788; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 19789; GFX7-NEXT: s_waitcnt vmcnt(0) 19790; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19791; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19792; GFX7-NEXT: v_min_f32_e32 v17, v17, v32 19793; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 19794; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 19795; GFX7-NEXT: s_waitcnt vmcnt(0) 19796; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19797; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19798; GFX7-NEXT: v_min_f32_e32 v16, v16, v32 19799; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 19800; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 19801; GFX7-NEXT: s_waitcnt vmcnt(0) 19802; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19803; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19804; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 19805; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 19806; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 19807; GFX7-NEXT: s_waitcnt vmcnt(0) 19808; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19809; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19810; GFX7-NEXT: v_min_f32_e32 v14, v14, v32 19811; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 19812; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 19813; GFX7-NEXT: s_waitcnt vmcnt(0) 19814; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19815; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19816; GFX7-NEXT: v_min_f32_e32 v13, v13, v32 19817; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 19818; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 19819; GFX7-NEXT: s_waitcnt vmcnt(0) 19820; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19821; 
GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19822; GFX7-NEXT: v_min_f32_e32 v12, v12, v32 19823; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 19824; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 19825; GFX7-NEXT: s_waitcnt vmcnt(0) 19826; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19827; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19828; GFX7-NEXT: v_min_f32_e32 v11, v11, v32 19829; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 19830; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 19831; GFX7-NEXT: s_waitcnt vmcnt(0) 19832; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19833; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19834; GFX7-NEXT: v_min_f32_e32 v10, v10, v32 19835; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 19836; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 19837; GFX7-NEXT: s_waitcnt vmcnt(0) 19838; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19839; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19840; GFX7-NEXT: v_min_f32_e32 v9, v9, v32 19841; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 19842; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 19843; GFX7-NEXT: s_waitcnt vmcnt(0) 19844; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19845; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19846; GFX7-NEXT: v_min_f32_e32 v8, v8, v32 19847; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 19848; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 19849; GFX7-NEXT: s_waitcnt vmcnt(0) 19850; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19851; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19852; GFX7-NEXT: v_min_f32_e32 v7, v7, v32 19853; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 19854; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 19855; GFX7-NEXT: s_waitcnt vmcnt(0) 19856; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19857; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19858; GFX7-NEXT: v_min_f32_e32 v6, v6, v32 19859; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 19860; GFX7-NEXT: v_and_b32_e32 v6, 
0xffff0000, v6 19861; GFX7-NEXT: s_waitcnt vmcnt(0) 19862; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19863; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19864; GFX7-NEXT: v_min_f32_e32 v5, v5, v32 19865; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 19866; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 19867; GFX7-NEXT: s_waitcnt vmcnt(0) 19868; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19869; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19870; GFX7-NEXT: v_min_f32_e32 v4, v4, v32 19871; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 19872; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 19873; GFX7-NEXT: s_waitcnt vmcnt(0) 19874; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19875; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19876; GFX7-NEXT: v_min_f32_e32 v3, v3, v32 19877; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 19878; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 19879; GFX7-NEXT: s_waitcnt vmcnt(0) 19880; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19881; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19882; GFX7-NEXT: v_min_f32_e32 v2, v2, v32 19883; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 19884; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 19885; GFX7-NEXT: s_waitcnt vmcnt(0) 19886; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19887; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19888; GFX7-NEXT: v_min_f32_e32 v1, v1, v32 19889; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 19890; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 19891; GFX7-NEXT: s_waitcnt vmcnt(0) 19892; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 19893; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 19894; GFX7-NEXT: v_min_f32_e32 v0, v0, v32 19895; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 19896; GFX7-NEXT: s_setpc_b64 s[30:31] 19897; 19898; GFX8-LABEL: v_minnum_v32bf16: 19899; GFX8: ; %bb.0: 19900; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 19901; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 19902; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 
19903; GFX8-NEXT: v_min_f32_e32 v31, v32, v31 19904; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 19905; GFX8-NEXT: s_movk_i32 s4, 0x7fff 19906; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 19907; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 19908; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 19909; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 19910; GFX8-NEXT: v_min_f32_e32 v14, v14, v30 19911; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 19912; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 19913; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 19914; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc 19915; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 19916; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 19917; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 19918; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 19919; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc 19920; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 19921; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 19922; GFX8-NEXT: v_min_f32_e32 v32, v32, v30 19923; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 19924; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 19925; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 19926; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 19927; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 19928; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 19929; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 19930; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 19931; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 19932; GFX8-NEXT: s_waitcnt vmcnt(0) 19933; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 19934; GFX8-NEXT: v_min_f32_e32 v33, v33, v34 19935; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 19936; GFX8-NEXT: v_min_f32_e32 v30, v15, v30 19937; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 19938; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 19939; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 19940; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 19941; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 19942; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 19943; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, 
v34, vcc 19944; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 19945; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 19946; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 19947; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 19948; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc 19949; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 19950; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 19951; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 19952; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 19953; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 19954; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc 19955; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 19956; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 19957; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 19958; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 19959; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc 19960; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 19961; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 19962; GFX8-NEXT: v_min_f32_e32 v29, v33, v29 19963; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 19964; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 19965; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 19966; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 19967; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 19968; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 19969; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 19970; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 19971; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 19972; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc 19973; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 19974; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 19975; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 19976; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 19977; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc 19978; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 19979; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 19980; GFX8-NEXT: v_min_f32_e32 v28, v33, v28 19981; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 19982; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 19983; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 
19984; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 19985; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 19986; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 19987; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 19988; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 19989; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 19990; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc 19991; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 19992; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 19993; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 19994; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 19995; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc 19996; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 19997; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 19998; GFX8-NEXT: v_min_f32_e32 v27, v33, v27 19999; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 20000; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 20001; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 20002; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 20003; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20004; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 20005; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 20006; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 20007; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 20008; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc 20009; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 20010; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 20011; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 20012; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 20013; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc 20014; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 20015; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 20016; GFX8-NEXT: v_min_f32_e32 v26, v33, v26 20017; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 20018; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 20019; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 20020; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 20021; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20022; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 20023; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 20024; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v26, v26 20025; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 20026; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc 20027; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 20028; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 20029; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 20030; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 20031; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc 20032; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 20033; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 20034; GFX8-NEXT: v_min_f32_e32 v25, v33, v25 20035; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 20036; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 20037; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 20038; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 20039; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20040; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 20041; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 20042; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 20043; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 20044; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc 20045; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 20046; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 20047; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 20048; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 20049; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc 20050; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 20051; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 20052; GFX8-NEXT: v_min_f32_e32 v24, v33, v24 20053; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 20054; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 20055; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 20056; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 20057; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20058; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 20059; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 20060; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 20061; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 20062; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc 20063; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 20064; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 20065; 
GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 20066; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 20067; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc 20068; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 20069; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 20070; GFX8-NEXT: v_min_f32_e32 v23, v33, v23 20071; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 20072; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 20073; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 20074; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 20075; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20076; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 20077; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 20078; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 20079; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 20080; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc 20081; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 20082; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 20083; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 20084; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 20085; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc 20086; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 20087; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 20088; GFX8-NEXT: v_min_f32_e32 v22, v33, v22 20089; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 20090; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 20091; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 20092; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 20093; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20094; GFX8-NEXT: v_min_f32_e32 v5, v5, v21 20095; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 20096; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 20097; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 20098; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc 20099; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 20100; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 20101; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 20102; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 20103; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc 20104; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 20105; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, 
v4 20106; GFX8-NEXT: v_min_f32_e32 v21, v33, v21 20107; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 20108; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 20109; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 20110; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 20111; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20112; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 20113; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 20114; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 20115; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 20116; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc 20117; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 20118; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 20119; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 20120; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 20121; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc 20122; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 20123; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 20124; GFX8-NEXT: v_min_f32_e32 v20, v33, v20 20125; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 20126; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 20127; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 20128; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 20129; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20130; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 20131; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 20132; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 20133; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 20134; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc 20135; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 20136; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 20137; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3 20138; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 20139; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc 20140; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 20141; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 20142; GFX8-NEXT: v_min_f32_e32 v19, v33, v19 20143; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 20144; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 20145; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 20146; GFX8-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 20147; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20148; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 20149; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 20150; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 20151; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 20152; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc 20153; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 20154; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 20155; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 20156; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 20157; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc 20158; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 20159; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 20160; GFX8-NEXT: v_min_f32_e32 v18, v33, v18 20161; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 20162; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 20163; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 20164; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 20165; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20166; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 20167; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 20168; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 20169; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 20170; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc 20171; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 20172; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 20173; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 20174; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 20175; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc 20176; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 20177; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 20178; GFX8-NEXT: v_min_f32_e32 v17, v33, v17 20179; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 20180; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 20181; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 20182; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 20183; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 20184; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 20185; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 20186; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 20187; GFX8-NEXT: v_bfe_u32 
v16, v0, 16, 1 20188; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc 20189; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 20190; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 20191; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 20192; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 20193; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc 20194; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 20195; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 20196; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 20197; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 20198; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 20199; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 20200; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 20201; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 20202; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 20203; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 20204; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 20205; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 20206; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 20207; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 20208; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 20209; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 20210; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 20211; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 20212; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 20213; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 20214; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 20215; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 20216; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 20217; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 20218; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 20219; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 20220; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 20221; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 20222; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 20223; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 20224; GFX8-NEXT: s_setpc_b64 s[30:31] 20225; 20226; GFX9-LABEL: v_minnum_v32bf16: 20227; GFX9: ; %bb.0: 20228; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 20229; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 20230; 
GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 20231; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 20232; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 20233; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 20234; GFX9-NEXT: s_movk_i32 s4, 0x7fff 20235; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 20236; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 20237; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 20238; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 20239; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 20240; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 20241; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc 20242; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 20243; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 20244; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 20245; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc 20246; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 20247; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 20248; GFX9-NEXT: v_min_f32_e32 v30, v32, v30 20249; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 20250; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 20251; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 20252; GFX9-NEXT: v_min_f32_e32 v13, v13, v29 20253; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 20254; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 20255; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 20256; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 20257; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc 20258; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 20259; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 20260; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 20261; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc 20262; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 20263; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 20264; GFX9-NEXT: v_min_f32_e32 v32, v32, v29 20265; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 20266; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 20267; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 20268; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 20269; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 20270; GFX9-NEXT: v_min_f32_e32 v12, 
v12, v28 20271; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 20272; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 20273; GFX9-NEXT: s_waitcnt vmcnt(0) 20274; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 20275; GFX9-NEXT: v_min_f32_e32 v33, v33, v34 20276; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 20277; GFX9-NEXT: v_min_f32_e32 v29, v15, v29 20278; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 20279; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 20280; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 20281; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 20282; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 20283; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc 20284; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 20285; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 20286; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 20287; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc 20288; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 20289; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 20290; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 20291; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 20292; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc 20293; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 20294; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 20295; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc 20296; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 20297; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 20298; GFX9-NEXT: v_min_f32_e32 v28, v33, v28 20299; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 20300; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 20301; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 20302; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 20303; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 20304; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 20305; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 20306; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 20307; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc 20308; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 20309; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 20310; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 20311; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc 
20312; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 20313; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 20314; GFX9-NEXT: v_min_f32_e32 v27, v33, v27 20315; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 20316; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 20317; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 20318; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 20319; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 20320; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 20321; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 20322; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 20323; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc 20324; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 20325; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 20326; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 20327; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc 20328; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 20329; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 20330; GFX9-NEXT: v_min_f32_e32 v26, v33, v26 20331; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 20332; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 20333; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 20334; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 20335; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 20336; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 20337; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 20338; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 20339; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc 20340; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 20341; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 20342; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 20343; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc 20344; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 20345; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 20346; GFX9-NEXT: v_min_f32_e32 v25, v33, v25 20347; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 20348; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 20349; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 20350; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 20351; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 20352; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 20353; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 20354; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 20355; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc 20356; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 20357; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 20358; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 20359; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc 20360; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 20361; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 20362; GFX9-NEXT: v_min_f32_e32 v24, v33, v24 20363; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 20364; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 20365; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 20366; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 20367; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 20368; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 20369; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 20370; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 20371; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc 20372; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 20373; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 20374; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 20375; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc 20376; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 20377; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 20378; GFX9-NEXT: v_min_f32_e32 v23, v33, v23 20379; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 20380; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 20381; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 20382; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 20383; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 20384; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 20385; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 20386; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 20387; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc 20388; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 20389; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 20390; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 20391; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc 20392; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 20393; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 20394; GFX9-NEXT: 
v_min_f32_e32 v22, v33, v22 20395; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 20396; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 20397; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 20398; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 20399; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 20400; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 20401; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 20402; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 20403; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc 20404; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 20405; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 20406; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 20407; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc 20408; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 20409; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 20410; GFX9-NEXT: v_min_f32_e32 v21, v33, v21 20411; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 20412; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 20413; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 20414; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 20415; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 20416; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 20417; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 20418; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 20419; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc 20420; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 20421; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 20422; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 20423; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc 20424; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 20425; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 20426; GFX9-NEXT: v_min_f32_e32 v20, v33, v20 20427; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 20428; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 20429; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 20430; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 20431; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 20432; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 20433; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 20434; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 20435; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, 
v34, vcc 20436; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 20437; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 20438; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 20439; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc 20440; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 20441; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 20442; GFX9-NEXT: v_min_f32_e32 v19, v33, v19 20443; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 20444; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 20445; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 20446; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 20447; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 20448; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 20449; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 20450; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 20451; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc 20452; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 20453; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 20454; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 20455; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc 20456; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 20457; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 20458; GFX9-NEXT: v_min_f32_e32 v18, v33, v18 20459; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 20460; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 20461; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 20462; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 20463; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 20464; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 20465; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 20466; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 20467; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc 20468; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 20469; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 20470; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 20471; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc 20472; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 20473; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 20474; GFX9-NEXT: v_min_f32_e32 v17, v33, v17 20475; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 20476; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 
20477; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 20478; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 20479; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 20480; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 20481; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 20482; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 20483; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc 20484; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 20485; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 20486; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 20487; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc 20488; GFX9-NEXT: s_mov_b32 s4, 0x7060302 20489; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 20490; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 20491; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 20492; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 20493; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 20494; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 20495; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 20496; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 20497; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 20498; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 20499; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 20500; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 20501; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 20502; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 20503; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 20504; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 20505; GFX9-NEXT: s_setpc_b64 s[30:31] 20506; 20507; GFX10-LABEL: v_minnum_v32bf16: 20508; GFX10: ; %bb.0: 20509; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 20510; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 20511; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 20512; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 20513; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 20514; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 20515; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 20516; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 20517; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 20518; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 20519; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 20520; 
GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 20521; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 20522; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 20523; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 20524; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 20525; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 20526; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 20527; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 20528; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 20529; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 20530; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 20531; GFX10-NEXT: v_min_f32_e32 v39, v48, v39 20532; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 20533; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 20534; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 20535; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 20536; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 20537; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 20538; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 20539; GFX10-NEXT: v_min_f32_e32 v33, v34, v33 20540; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 20541; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 20542; GFX10-NEXT: v_min_f32_e32 v35, v36, v35 20543; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 20544; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 20545; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 20546; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 20547; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 20548; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 20549; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 20550; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 20551; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 20552; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 20553; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 20554; GFX10-NEXT: v_min_f32_e32 v27, v50, v27 20555; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 20556; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 20557; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 20558; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 20559; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, 
v10 20560; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 20561; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 20562; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 20563; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 20564; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 20565; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 20566; GFX10-NEXT: v_min_f32_e32 v29, v38, v29 20567; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 20568; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 20569; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 20570; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 20571; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 20572; GFX10-NEXT: v_min_f32_e32 v28, v48, v28 20573; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 20574; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 20575; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 20576; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 20577; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 20578; GFX10-NEXT: v_min_f32_e32 v10, v10, v26 20579; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 20580; GFX10-NEXT: v_min_f32_e32 v34, v34, v51 20581; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 20582; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 20583; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 20584; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 20585; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 20586; GFX10-NEXT: v_min_f32_e32 v30, v36, v30 20587; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 20588; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 20589; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 20590; GFX10-NEXT: v_min_f32_e32 v2, v2, v18 20591; GFX10-NEXT: v_min_f32_e32 v18, v48, v23 20592; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 20593; GFX10-NEXT: v_min_f32_e32 v17, v50, v22 20594; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 20595; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 20596; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff 20597; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 20598; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 20599; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 20600; GFX10-NEXT: 
v_min_f32_e32 v4, v4, v20 20601; GFX10-NEXT: v_min_f32_e32 v20, v36, v25 20602; GFX10-NEXT: v_min_f32_e32 v3, v3, v19 20603; GFX10-NEXT: v_min_f32_e32 v19, v38, v24 20604; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 20605; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 20606; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff 20607; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo 20608; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 20609; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 20610; GFX10-NEXT: v_min_f32_e32 v21, v51, v26 20611; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 20612; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 20613; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff 20614; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo 20615; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 20616; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 20617; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 20618; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff 20619; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 20620; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo 20621; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 20622; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 20623; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff 20624; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 20625; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 20626; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo 20627; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 20628; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff 20629; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 20630; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1 20631; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff 20632; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo 20633; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 20634; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 20635; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 20636; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff 20637; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 20638; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo 20639; GFX10-NEXT: v_cmp_u_f32_e32 
vcc_lo, v39, v39 20640; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 20641; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff 20642; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 20643; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 20644; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo 20645; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 20646; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff 20647; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 20648; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 20649; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff 20650; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo 20651; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 20652; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 20653; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 20654; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff 20655; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 20656; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo 20657; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 20658; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 20659; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff 20660; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 20661; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 20662; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo 20663; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 20664; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff 20665; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 20666; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 20667; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff 20668; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo 20669; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 20670; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 20671; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 20672; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff 20673; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 20674; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo 20675; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 20676; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 20677; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff 20678; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, 
v6 20679; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 20680; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo 20681; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 20682; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 20683; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff 20684; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 20685; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 20686; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo 20687; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 20688; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 20689; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff 20690; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 20691; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo 20692; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 20693; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 20694; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff 20695; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 20696; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo 20697; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 20698; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 20699; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff 20700; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 20701; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo 20702; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 20703; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 20704; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff 20705; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 20706; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo 20707; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 20708; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff 20709; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 20710; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 20711; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo 20712; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 20713; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 20714; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 20715; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff 20716; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo 20717; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v21, v21 20718; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff 20719; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 20720; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 20721; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo 20722; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 20723; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 20724; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 20725; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff 20726; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo 20727; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 20728; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 20729; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff 20730; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 20731; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo 20732; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 20733; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 20734; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff 20735; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 20736; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo 20737; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 20738; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 20739; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff 20740; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 20741; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo 20742; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 20743; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff 20744; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo 20745; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 20746; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 20747; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo 20748; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 20749; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 20750; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 20751; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo 20752; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 20753; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 20754; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo 20755; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 20756; GFX10-NEXT: 
v_perm_b32 v2, v2, v5, 0x7060302 20757; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo 20758; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 20759; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 20760; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 20761; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 20762; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 20763; GFX10-NEXT: s_waitcnt vmcnt(0) 20764; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 20765; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 20766; GFX10-NEXT: v_min_f32_e32 v17, v31, v17 20767; GFX10-NEXT: v_min_f32_e32 v15, v15, v18 20768; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 20769; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 20770; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 20771; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 20772; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 20773; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff 20774; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff 20775; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 20776; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 20777; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo 20778; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 20779; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 20780; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo 20781; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 20782; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 20783; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 20784; GFX10-NEXT: s_setpc_b64 s[30:31] 20785; 20786; GFX11-LABEL: v_minnum_v32bf16: 20787; GFX11: ; %bb.0: 20788; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 20789; GFX11-NEXT: scratch_load_b32 v32, off, s32 20790; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 20791; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 20792; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 20793; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 20794; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 20795; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 20796; GFX11-NEXT: v_and_b32_e32 v17, 
0xffff0000, v17 20797; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 20798; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26 20799; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 20800; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 20801; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 20802; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 20803; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 20804; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 20805; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 20806; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 20807; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 20808; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5 20809; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1 20810; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff 20811; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 20812; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 20813; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff 20814; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 20815; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 20816; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 20817; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 20818; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 20819; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 20820; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 20821; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 20822; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6 20823; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 20824; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 20825; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19 20826; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 20827; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 20828; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 20829; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff 20830; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 20831; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 20832; 
GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 20833; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 20834; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 20835; GFX11-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 20836; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 20837; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3 20838; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff 20839; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 20840; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 20841; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17 20842; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 20843; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 20844; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 20845; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff 20846; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 20847; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 20848; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 20849; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10 20850; GFX11-NEXT: v_min_f32_e32 v2, v2, v18 20851; GFX11-NEXT: v_min_f32_e32 v0, v0, v16 20852; GFX11-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 20853; GFX11-NEXT: v_min_f32_e32 v7, v7, v23 20854; GFX11-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83 20855; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 20856; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 20857; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1 20858; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 20859; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24 20860; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23 20861; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 20862; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff 20863; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 20864; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 20865; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff 20866; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 
20867; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 20868; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7 20869; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff 20870; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 20871; GFX11-NEXT: v_min_f32_e32 v4, v4, v20 20872; GFX11-NEXT: v_min_f32_e32 v20, v80, v71 20873; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 20874; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9 20875; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 20876; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 20877; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 20878; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff 20879; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 20880; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 20881; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 20882; GFX11-NEXT: v_min_f32_e32 v26, v52, v51 20883; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 20884; GFX11-NEXT: v_min_f32_e32 v6, v6, v22 20885; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 20886; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 20887; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 20888; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 20889; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 20890; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 20891; GFX11-NEXT: v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 20892; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 20893; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 20894; GFX11-NEXT: v_min_f32_e32 v29, v38, v37 20895; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15 20896; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 20897; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) 20898; GFX11-NEXT: v_min_f32_e32 v14, 
v14, v30 20899; GFX11-NEXT: v_min_f32_e32 v28, v48, v39 20900; GFX11-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33 20901; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 20902; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 20903; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14 20904; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 20905; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 20906; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 20907; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 20908; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 20909; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff 20910; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 20911; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff 20912; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff 20913; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13 20914; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 20915; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff 20916; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo 20917; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 20918; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29 20919; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 20920; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff 20921; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12 20922; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo 20923; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 20924; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 20925; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff 20926; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28 20927; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 20928; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo 20929; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 20930; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff 20931; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11 20932; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 20933; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff 20934; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo 20935; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 20936; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27 20937; GFX11-NEXT: v_bfe_u32 v67, 
v10, 16, 1 20938; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff 20939; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10 20940; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo 20941; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 20942; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 20943; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff 20944; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26 20945; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 20946; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo 20947; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 20948; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff 20949; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25 20950; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 20951; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff 20952; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo 20953; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 20954; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8 20955; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff 20956; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 20957; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6 20958; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo 20959; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 20960; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 20961; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff 20962; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22 20963; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 20964; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo 20965; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 20966; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff 20967; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21 20968; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 20969; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff 20970; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo 20971; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 20972; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4 20973; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 20974; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff 20975; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20 20976; GFX11-NEXT: 
v_cndmask_b32_e32 v26, v69, v70, vcc_lo 20977; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 20978; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff 20979; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 20980; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18 20981; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 20982; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo 20983; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 20984; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff 20985; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0 20986; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff 20987; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 20988; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo 20989; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 20990; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2 20991; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 20992; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff 20993; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 20994; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo 20995; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 20996; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 20997; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 20998; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 20999; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 21000; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo 21001; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 21002; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 21003; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo 21004; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 21005; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 21006; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 21007; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo 21008; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 21009; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo 21010; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 21011; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 21012; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, 
vcc_lo 21013; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 21014; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo 21015; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 21016; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 21017; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 21018; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo 21019; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 21020; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo 21021; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 21022; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 21023; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo 21024; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 21025; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo 21026; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 21027; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo 21028; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 21029; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo 21030; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 21031; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 21032; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 21033; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo 21034; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 21035; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo 21036; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 21037; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 21038; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo 21039; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 21040; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 21041; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 21042; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo 21043; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 21044; GFX11-NEXT: s_waitcnt vmcnt(0) 21045; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32 21046; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 21047; GFX11-NEXT: v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 21048; GFX11-NEXT: v_min_f32_e32 v15, v15, v18 21049; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 21050; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 21051; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 21052; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 21053; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 21054; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 21055; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff 21056; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff 21057; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 21058; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo 21059; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 21060; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo 21061; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 21062; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 21063; GFX11-NEXT: s_setpc_b64 s[30:31] 21064 %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) 21065 ret <32 x bfloat> %op 21066} 21067 21068 21069declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) 21070declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>) 21071declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>) 21072declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>) 21073declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>) 21074declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>) 21075declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>) 21076 21077define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { 21078; GCN-LABEL: v_maxnum_bf16: 21079; GCN: ; %bb.0: 21080; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21081; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 21082; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 21083; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21084; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v0 21085; GCN-NEXT: v_max_f32_e32 v0, v0, v1 21086; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21087; GCN-NEXT: s_setpc_b64 s[30:31] 21088; 21089; GFX7-LABEL: v_maxnum_bf16: 21090; GFX7: ; %bb.0: 21091; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21092; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 21093; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 21094; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21095; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21096; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 21097; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21098; GFX7-NEXT: s_setpc_b64 s[30:31] 21099; 21100; GFX8-LABEL: v_maxnum_bf16: 21101; GFX8: ; %bb.0: 21102; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21103; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 21104; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 21105; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 21106; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 21107; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 21108; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 21109; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 21110; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 21111; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 21112; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 21113; GFX8-NEXT: s_setpc_b64 s[30:31] 21114; 21115; GFX9-LABEL: v_maxnum_bf16: 21116; GFX9: ; %bb.0: 21117; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21118; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 21119; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 21120; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 21121; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 21122; GFX9-NEXT: s_movk_i32 s4, 0x7fff 21123; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 21124; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 21125; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 21126; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 21127; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 21128; GFX9-NEXT: s_setpc_b64 s[30:31] 21129; 21130; GFX10-LABEL: v_maxnum_bf16: 21131; GFX10: ; %bb.0: 21132; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21133; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
21134; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 21135; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 21136; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 21137; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 21138; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 21139; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 21140; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 21141; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 21142; GFX10-NEXT: s_setpc_b64 s[30:31] 21143; 21144; GFX11-LABEL: v_maxnum_bf16: 21145; GFX11: ; %bb.0: 21146; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21147; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 21148; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 21149; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 21150; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 21151; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 21152; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 21153; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 21154; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 21155; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 21156; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 21157; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 21158; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 21159; GFX11-NEXT: s_setpc_b64 s[30:31] 21160 %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) 21161 ret bfloat %op 21162} 21163 21164define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { 21165; GCN-LABEL: v_maxnum_v2bf16: 21166; GCN: ; %bb.0: 21167; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21168; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 21169; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 21170; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 21171; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 21172; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 21173; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21174; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21175; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21176; GCN-NEXT: v_max_f32_e32 v1, v1, v3 21177; GCN-NEXT: v_max_f32_e32 v0, v0, 
v2 21178; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21179; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21180; GCN-NEXT: s_setpc_b64 s[30:31] 21181; 21182; GFX7-LABEL: v_maxnum_v2bf16: 21183; GFX7: ; %bb.0: 21184; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21185; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 21186; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 21187; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 21188; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 21189; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 21190; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21191; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21192; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21193; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 21194; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 21195; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21196; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21197; GFX7-NEXT: s_setpc_b64 s[30:31] 21198; 21199; GFX8-LABEL: v_maxnum_v2bf16: 21200; GFX8: ; %bb.0: 21201; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21202; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 21203; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 21204; GFX8-NEXT: v_max_f32_e32 v2, v3, v2 21205; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 21206; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 21207; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21208; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21209; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 21210; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 21211; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 21212; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 21213; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 21214; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 21215; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 21216; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 21217; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 21218; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 21219; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 21220; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 21221; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 21222; GFX8-NEXT: s_setpc_b64 s[30:31] 21223; 21224; 
GFX9-LABEL: v_maxnum_v2bf16: 21225; GFX9: ; %bb.0: 21226; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21227; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 21228; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 21229; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 21230; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21231; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21232; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 21233; GFX9-NEXT: s_movk_i32 s4, 0x7fff 21234; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 21235; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 21236; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 21237; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 21238; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 21239; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 21240; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 21241; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 21242; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 21243; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 21244; GFX9-NEXT: s_mov_b32 s4, 0x7060302 21245; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 21246; GFX9-NEXT: s_setpc_b64 s[30:31] 21247; 21248; GFX10-LABEL: v_maxnum_v2bf16: 21249; GFX10: ; %bb.0: 21250; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21251; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 21252; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 21253; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21254; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21255; GFX10-NEXT: v_max_f32_e32 v2, v3, v2 21256; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 21257; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 21258; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 21259; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 21260; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 21261; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 21262; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 21263; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 21264; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 21265; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 21266; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 21267; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 21268; GFX10-NEXT: 
s_setpc_b64 s[30:31] 21269; 21270; GFX11-LABEL: v_maxnum_v2bf16: 21271; GFX11: ; %bb.0: 21272; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21273; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 21274; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21275; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 21276; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21277; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 21278; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 21279; GFX11-NEXT: v_max_f32_e32 v2, v3, v2 21280; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 21281; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 21282; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 21283; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 21284; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 21285; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 21286; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 21287; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 21288; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) 21289; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo 21290; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 21291; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 21292; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 21293; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 21294; GFX11-NEXT: s_setpc_b64 s[30:31] 21295 %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) 21296 ret <2 x bfloat> %op 21297} 21298 21299define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { 21300; GCN-LABEL: v_maxnum_v3bf16: 21301; GCN: ; %bb.0: 21302; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21303; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 21304; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 21305; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 21306; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 21307; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 21308; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 21309; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 21310; GCN-NEXT: v_and_b32_e32 
v2, 0xffff0000, v2 21311; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 21312; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21313; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 21314; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21315; GCN-NEXT: v_max_f32_e32 v2, v2, v5 21316; GCN-NEXT: v_max_f32_e32 v1, v1, v4 21317; GCN-NEXT: v_max_f32_e32 v0, v0, v3 21318; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21319; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21320; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21321; GCN-NEXT: s_setpc_b64 s[30:31] 21322; 21323; GFX7-LABEL: v_maxnum_v3bf16: 21324; GFX7: ; %bb.0: 21325; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21326; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 21327; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 21328; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 21329; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 21330; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 21331; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 21332; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 21333; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21334; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 21335; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21336; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 21337; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21338; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 21339; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 21340; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 21341; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21342; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21343; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 21344; GFX7-NEXT: s_setpc_b64 s[30:31] 21345; 21346; GFX8-LABEL: v_maxnum_v3bf16: 21347; GFX8: ; %bb.0: 21348; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21349; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 21350; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 21351; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 21352; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 21353; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 21354; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 21355; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 
21356; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 21357; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 21358; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 21359; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 21360; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 21361; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 21362; GFX8-NEXT: s_movk_i32 s4, 0x7fff 21363; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 21364; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21365; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21366; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 21367; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 21368; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 21369; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 21370; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 21371; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 21372; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 21373; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 21374; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 21375; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 21376; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 21377; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 21378; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 21379; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 21380; GFX8-NEXT: s_setpc_b64 s[30:31] 21381; 21382; GFX9-LABEL: v_maxnum_v3bf16: 21383; GFX9: ; %bb.0: 21384; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21385; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 21386; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 21387; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 21388; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 21389; GFX9-NEXT: s_movk_i32 s4, 0x7fff 21390; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 21391; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 21392; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 21393; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 21394; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 21395; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 21396; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 21397; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21398; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21399; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 21400; 
GFX9-NEXT: v_max_f32_e32 v0, v0, v2 21401; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 21402; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 21403; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 21404; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 21405; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 21406; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 21407; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 21408; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 21409; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 21410; GFX9-NEXT: s_mov_b32 s4, 0x7060302 21411; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 21412; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 21413; GFX9-NEXT: s_setpc_b64 s[30:31] 21414; 21415; GFX10-LABEL: v_maxnum_v3bf16: 21416; GFX10: ; %bb.0: 21417; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21418; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 21419; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 21420; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21421; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21422; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 21423; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 21424; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 21425; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 21426; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 21427; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 21428; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 21429; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 21430; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 21431; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 21432; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 21433; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 21434; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 21435; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 21436; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 21437; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 21438; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 21439; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 21440; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 21441; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 21442; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 21443; GFX10-NEXT: 
v_alignbit_b32 v1, s4, v1, 16 21444; GFX10-NEXT: s_setpc_b64 s[30:31] 21445; 21446; GFX11TRUE16-LABEL: v_maxnum_v3bf16: 21447; GFX11TRUE16: ; %bb.0: 21448; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21449; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 21450; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 21451; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 21452; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21453; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21454; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 21455; GFX11TRUE16-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 21456; GFX11TRUE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3 21457; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 21458; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 21459; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 21460; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 21461; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 21462; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 21463; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 21464; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 21465; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 21466; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 21467; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 21468; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 21469; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 21470; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 21471; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 21472; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 21473; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 21474; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 21475; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 21476; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 21477; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 
21478; 21479; GFX11FAKE16-LABEL: v_maxnum_v3bf16: 21480; GFX11FAKE16: ; %bb.0: 21481; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21482; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 21483; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 21484; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 21485; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 21486; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 21487; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 21488; GFX11FAKE16-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 21489; GFX11FAKE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3 21490; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 21491; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 21492; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 21493; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 21494; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 21495; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 21496; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 21497; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 21498; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 21499; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 21500; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 21501; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 21502; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 21503; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 21504; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 21505; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 21506; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 21507; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo 21508; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 21509; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 21510; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 21511 %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x 
bfloat> %b)
  ret <3 x bfloat> %op
}

; v_maxnum_v4bf16: forwards both <4 x bfloat> arguments to @llvm.maxnum.v4bf16
; and returns the intrinsic's result.
; The assertion lines inside the function are autogenerated (see the NOTE at
; the top of this file, utils/update_llc_test_checks.py) for the GCN, GFX7,
; GFX8, GFX9, GFX10 and GFX11 check prefixes; regenerate them with that script
; instead of editing them by hand.
; As recorded below, GCN and GFX7 issue four v_max_f32 on separate VGPRs, while
; GFX8 and newer split each packed register with v_lshlrev_b32 / v_and_b32
; 0xffff0000 and repack with v_alignbit_b32 (GFX8) or v_perm_b32 (GFX9, GFX10,
; GFX11).
; NOTE(review): the bfe / add 0x7fff / or 0x400000 / v_cmp_u / v_cndmask run
; after each max looks like an f32-to-bf16 rounding step with NaN quieting -
; confirm against the lowering code.
define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_max_f32_e32 v3, v3, v7
; GCN-NEXT: v_max_f32_e32 v2, v2, v6
; GCN-NEXT: v_max_f32_e32 v1, v1, v5
; GCN-NEXT: v_max_f32_e32 v0, v0, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX8-NEXT: v_max_f32_e32 v4, v5, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
; GFX10-NEXT: v_max_f32_e32 v3, v7, v6
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
; GFX11-NEXT: v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
  ret <4 x bfloat> %op
}

; v_maxnum_v8bf16: forwards both <8 x bfloat> arguments to @llvm.maxnum.v8bf16
; and returns the intrinsic's result.
; The assertion lines inside the function are autogenerated (see the NOTE at
; the top of this file, utils/update_llc_test_checks.py) for the GCN, GFX7,
; GFX8, GFX9, GFX10 and GFX11 check prefixes; regenerate them with that script
; instead of editing them by hand.
; As recorded below, GCN and GFX7 issue eight v_max_f32 on v0-v7 against
; v8-v15, while GFX8 and newer split each packed register with v_lshlrev_b32 /
; v_and_b32 0xffff0000 and repack with v_alignbit_b32 (GFX8) or v_perm_b32
; (GFX9, GFX10, GFX11).
define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_max_f32_e32 v7, v7, v15
; GCN-NEXT: v_max_f32_e32 v6, v6, v14
; GCN-NEXT: v_max_f32_e32 v5, v5, v13
; GCN-NEXT: v_max_f32_e32 v4, v4, v12
; GCN-NEXT: v_max_f32_e32 v3, v3, v11
; GCN-NEXT: v_max_f32_e32 v2, v2, v10
; GCN-NEXT: v_max_f32_e32 v1, v1, v9
; GCN-NEXT: v_max_f32_e32 v0, v0, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_v8bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
; GFX7-NEXT: v_max_f32_e32 v4, v4, v12
; GFX7-NEXT: v_max_f32_e32 v3, v3, v11
; GFX7-NEXT: v_max_f32_e32 v2, v2, v10
; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_v8bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX8-NEXT: v_max_f32_e32 v8, v9, v8
; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX8-NEXT: v_max_f32_e32 v7, v9, v7
; GFX8-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX8-NEXT: v_max_f32_e32 v6, v9, v6
; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX8-NEXT: v_max_f32_e32 v5, v9, v5
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
; GFX8-NEXT: v_max_f32_e32 v0, v0, v4
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maxnum_v8bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX9-NEXT: v_max_f32_e32 v7, v9, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_max_f32_e32 v6, v9, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_max_f32_e32 v5, v9, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_max_f32_e32 v8, v9, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX10-NEXT: v_max_f32_e32 v7, v10, v9
; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_max_f32_e32 v6, v10, v6
; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v5
; GFX10-NEXT: v_max_f32_e32 v5, v15, v13
; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_max_f32_e32 v3, v3, v7
; GFX11-NEXT: v_max_f32_e32 v7, v10, v9
; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_max_f32 v2, v2, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_max_f32_e32 v6, v10, v6
; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v4
; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
; GFX11-NEXT: v_max_f32_e32 v5, v15, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_perm_b32 v0, v0, v5, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
  ret <8 x bfloat> %op
}

define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-LABEL: v_maxnum_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
;
GCN-NEXT: v_max_f32_e32 v14, v14, v30 22182; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 22183; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 22184; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 22185; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 22186; GCN-NEXT: v_max_f32_e32 v13, v13, v29 22187; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 22188; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 22189; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 22190; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 22191; GCN-NEXT: v_max_f32_e32 v12, v12, v28 22192; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 22193; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 22194; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 22195; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 22196; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 22197; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 22198; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 22199; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 22200; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 22201; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 22202; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 22203; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 22204; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 22205; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 22206; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 22207; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 22208; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 22209; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 22210; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 22211; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 22212; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 22213; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 22214; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 22215; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 22216; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 22217; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 22218; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 22219; GCN-NEXT: v_max_f32_e32 v11, v11, v27 22220; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 22221; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 22222; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 22223; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 22224; GCN-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v9 22225; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 22226; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 22227; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 22228; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 22229; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 22230; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 22231; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 22232; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 22233; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 22234; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 22235; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 22236; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 22237; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 22238; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 22239; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 22240; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 22241; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 22242; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 22243; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 22244; GCN-NEXT: v_max_f32_e32 v10, v10, v26 22245; GCN-NEXT: v_max_f32_e32 v9, v9, v25 22246; GCN-NEXT: v_max_f32_e32 v8, v8, v24 22247; GCN-NEXT: v_max_f32_e32 v7, v7, v23 22248; GCN-NEXT: v_max_f32_e32 v6, v6, v22 22249; GCN-NEXT: v_max_f32_e32 v5, v5, v21 22250; GCN-NEXT: v_max_f32_e32 v4, v4, v20 22251; GCN-NEXT: v_max_f32_e32 v3, v3, v19 22252; GCN-NEXT: v_max_f32_e32 v2, v2, v18 22253; GCN-NEXT: v_max_f32_e32 v1, v1, v17 22254; GCN-NEXT: v_max_f32_e32 v0, v0, v16 22255; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 22256; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 22257; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 22258; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 22259; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 22260; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 22261; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 22262; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 22263; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 22264; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 22265; GCN-NEXT: 
v_and_b32_e32 v10, 0xffff0000, v10 22266; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 22267; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 22268; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 22269; GCN-NEXT: s_waitcnt vmcnt(0) 22270; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 22271; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 22272; GCN-NEXT: v_max_f32_e32 v15, v15, v16 22273; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 22274; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 22275; GCN-NEXT: s_setpc_b64 s[30:31] 22276; 22277; GFX7-LABEL: v_maxnum_v16bf16: 22278; GFX7: ; %bb.0: 22279; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 22280; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 22281; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 22282; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 22283; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 22284; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 22285; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 22286; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 22287; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 22288; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 22289; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 22290; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 22291; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 22292; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 22293; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 22294; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 22295; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 22296; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 22297; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 22298; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 22299; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 22300; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 22301; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 22302; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 22303; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 22304; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 22305; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 22306; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 22307; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 22308; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 22309; 
GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 22310; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 22311; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 22312; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 22313; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 22314; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 22315; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 22316; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 22317; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 22318; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 22319; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 22320; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 22321; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 22322; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 22323; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 22324; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 22325; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 22326; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 22327; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 22328; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 22329; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 22330; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 22331; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 22332; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 22333; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 22334; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 22335; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 22336; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 22337; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 22338; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 22339; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 22340; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 22341; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 22342; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 22343; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 22344; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 22345; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 22346; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 22347; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 22348; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 
22349; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 22350; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 22351; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 22352; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 22353; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 22354; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 22355; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 22356; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 22357; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 22358; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 22359; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 22360; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 22361; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 22362; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 22363; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 22364; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 22365; GFX7-NEXT: s_waitcnt vmcnt(0) 22366; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27 22367; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 22368; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 22369; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 22370; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 22371; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 22372; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 22373; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 22374; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 22375; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 22376; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 22377; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 22378; GFX7-NEXT: s_setpc_b64 s[30:31] 22379; 22380; GFX8-LABEL: v_maxnum_v16bf16: 22381; GFX8: ; %bb.0: 22382; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 22383; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v15 22384; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v7 22385; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 22386; GFX8-NEXT: v_bfe_u32 v17, v16, 16, 1 22387; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v16 22388; GFX8-NEXT: s_movk_i32 s4, 0x7fff 22389; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 22390; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 22391; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, 
v17 22392; GFX8-NEXT: v_max_f32_e32 v7, v7, v15 22393; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16 22394; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 22395; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1 22396; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc 22397; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7 22398; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 22399; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7 22400; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 22401; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc 22402; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14 22403; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v6 22404; GFX8-NEXT: v_max_f32_e32 v15, v17, v15 22405; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 22406; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15 22407; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 22408; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 22409; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 22410; GFX8-NEXT: v_max_f32_e32 v6, v6, v14 22411; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15 22412; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 22413; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1 22414; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc 22415; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6 22416; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 22417; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6 22418; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 22419; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc 22420; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13 22421; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v5 22422; GFX8-NEXT: v_max_f32_e32 v14, v17, v14 22423; GFX8-NEXT: v_bfe_u32 v17, v14, 16, 1 22424; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v14 22425; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 22426; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 22427; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 22428; GFX8-NEXT: v_max_f32_e32 v5, v5, v13 22429; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14 22430; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 22431; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1 22432; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, 
v18, vcc 22433; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5 22434; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 22435; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5 22436; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 22437; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc 22438; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12 22439; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v4 22440; GFX8-NEXT: v_max_f32_e32 v13, v17, v13 22441; GFX8-NEXT: v_bfe_u32 v17, v13, 16, 1 22442; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v13 22443; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 22444; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 22445; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 22446; GFX8-NEXT: v_max_f32_e32 v4, v4, v12 22447; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13 22448; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 22449; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1 22450; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc 22451; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4 22452; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 22453; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4 22454; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 22455; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc 22456; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 22457; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v3 22458; GFX8-NEXT: v_max_f32_e32 v12, v17, v12 22459; GFX8-NEXT: v_bfe_u32 v17, v12, 16, 1 22460; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v12 22461; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 22462; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 22463; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 22464; GFX8-NEXT: v_max_f32_e32 v3, v3, v11 22465; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12 22466; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 22467; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1 22468; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc 22469; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3 22470; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 22471; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3 22472; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 22473; GFX8-NEXT: 
v_cndmask_b32_e32 v3, v11, v17, vcc 22474; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10 22475; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v2 22476; GFX8-NEXT: v_max_f32_e32 v11, v17, v11 22477; GFX8-NEXT: v_bfe_u32 v17, v11, 16, 1 22478; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v11 22479; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 22480; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 22481; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 22482; GFX8-NEXT: v_max_f32_e32 v2, v2, v10 22483; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11 22484; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 22485; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1 22486; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc 22487; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2 22488; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 22489; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2 22490; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 22491; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc 22492; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 22493; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v1 22494; GFX8-NEXT: v_max_f32_e32 v10, v17, v10 22495; GFX8-NEXT: v_bfe_u32 v17, v10, 16, 1 22496; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v10 22497; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 22498; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 22499; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 22500; GFX8-NEXT: v_max_f32_e32 v1, v1, v9 22501; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10 22502; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 22503; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1 22504; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc 22505; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1 22506; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 22507; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1 22508; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 22509; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc 22510; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8 22511; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0 22512; GFX8-NEXT: v_max_f32_e32 v9, v17, v9 22513; GFX8-NEXT: v_bfe_u32 v17, v9, 16, 1 22514; GFX8-NEXT: 
v_add_u32_e32 v17, vcc, v17, v9 22515; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 22516; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 22517; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 22518; GFX8-NEXT: v_max_f32_e32 v0, v0, v8 22519; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9 22520; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 22521; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 22522; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc 22523; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 22524; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 22525; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 22526; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 22527; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc 22528; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 22529; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 22530; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 22531; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 22532; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 22533; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 22534; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 22535; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 22536; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 22537; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 22538; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 22539; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 22540; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 22541; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 22542; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 22543; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 22544; GFX8-NEXT: s_setpc_b64 s[30:31] 22545; 22546; GFX9-LABEL: v_maxnum_v16bf16: 22547; GFX9: ; %bb.0: 22548; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 22549; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 22550; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 22551; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 22552; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 22553; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 22554; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 22555; GFX9-NEXT: s_movk_i32 s4, 0x7fff 22556; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 22557; GFX9-NEXT: v_add3_u32 
v17, v17, v16, s4 22558; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 22559; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 22560; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 22561; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc 22562; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 22563; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 22564; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 22565; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc 22566; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 22567; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 22568; GFX9-NEXT: v_max_f32_e32 v15, v17, v15 22569; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 22570; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 22571; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 22572; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 22573; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 22574; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 22575; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 22576; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 22577; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc 22578; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 22579; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 22580; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 22581; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc 22582; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 22583; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 22584; GFX9-NEXT: v_max_f32_e32 v14, v17, v14 22585; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 22586; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 22587; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 22588; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 22589; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 22590; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 22591; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 22592; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 22593; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc 22594; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 22595; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 22596; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 22597; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc 22598; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, 
v12 22599; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 22600; GFX9-NEXT: v_max_f32_e32 v13, v17, v13 22601; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 22602; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 22603; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 22604; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 22605; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 22606; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 22607; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 22608; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 22609; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc 22610; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 22611; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 22612; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 22613; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc 22614; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 22615; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 22616; GFX9-NEXT: v_max_f32_e32 v12, v17, v12 22617; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 22618; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 22619; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 22620; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 22621; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 22622; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 22623; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 22624; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 22625; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc 22626; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 22627; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 22628; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 22629; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc 22630; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 22631; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 22632; GFX9-NEXT: v_max_f32_e32 v11, v17, v11 22633; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 22634; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 22635; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 22636; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 22637; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 22638; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 22639; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 22640; GFX9-NEXT: 
v_bfe_u32 v10, v2, 16, 1 22641; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc 22642; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 22643; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 22644; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 22645; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc 22646; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 22647; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 22648; GFX9-NEXT: v_max_f32_e32 v10, v17, v10 22649; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 22650; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 22651; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 22652; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 22653; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 22654; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 22655; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 22656; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 22657; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc 22658; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 22659; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 22660; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 22661; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc 22662; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 22663; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 22664; GFX9-NEXT: v_max_f32_e32 v9, v17, v9 22665; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 22666; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 22667; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 22668; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 22669; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 22670; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 22671; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 22672; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 22673; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc 22674; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 22675; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 22676; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 22677; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc 22678; GFX9-NEXT: s_mov_b32 s4, 0x7060302 22679; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 22680; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 22681; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 22682; GFX9-NEXT: 
v_perm_b32 v3, v3, v12, s4 22683; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 22684; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 22685; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 22686; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 22687; GFX9-NEXT: s_setpc_b64 s[30:31] 22688; 22689; GFX10-LABEL: v_maxnum_v16bf16: 22690; GFX10: ; %bb.0: 22691; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 22692; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v15 22693; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7 22694; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 22695; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 22696; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6 22697; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 22698; GFX10-NEXT: v_max_f32_e32 v16, v17, v16 22699; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14 22700; GFX10-NEXT: v_max_f32_e32 v7, v7, v15 22701; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 22702; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1 22703; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16 22704; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1 22705; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 22706; GFX10-NEXT: v_max_f32_e32 v17, v18, v17 22707; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff 22708; GFX10-NEXT: v_max_f32_e32 v6, v6, v14 22709; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff 22710; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7 22711; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1 22712; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo 22713; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 22714; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5 22715; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17 22716; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff 22717; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 22718; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo 22719; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13 22720; GFX10-NEXT: v_bfe_u32 v18, v6, 16, 1 22721; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 22722; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 22723; GFX10-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 22724; GFX10-NEXT: 
v_max_f32_e32 v17, v20, v19 22725; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v4 22726; GFX10-NEXT: v_max_f32_e32 v5, v5, v13 22727; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo 22728; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff 22729; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 22730; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12 22731; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1 22732; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 22733; GFX10-NEXT: v_bfe_u32 v21, v5, 16, 1 22734; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 22735; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 22736; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo 22737; GFX10-NEXT: v_max_f32_e32 v13, v19, v18 22738; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff 22739; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17 22740; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 22741; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff 22742; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5 22743; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1 22744; GFX10-NEXT: v_max_f32_e32 v4, v4, v12 22745; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo 22746; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 22747; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 22748; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v3 22749; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff 22750; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 22751; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo 22752; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13 22753; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 22754; GFX10-NEXT: v_max_f32_e32 v12, v18, v12 22755; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 22756; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 22757; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10 22758; GFX10-NEXT: v_max_f32_e32 v3, v3, v11 22759; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12 22760; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo 22761; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1 22762; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2 22763; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff 
22764; GFX10-NEXT: v_bfe_u32 v20, v3, 16, 1 22765; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 22766; GFX10-NEXT: v_add3_u32 v17, v17, v12, 0x7fff 22767; GFX10-NEXT: v_max_f32_e32 v18, v19, v18 22768; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 22769; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 22770; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff 22771; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 22772; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1 22773; GFX10-NEXT: v_max_f32_e32 v2, v2, v10 22774; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo 22775; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 22776; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18 22777; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff 22778; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1 22779; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 22780; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo 22781; GFX10-NEXT: v_bfe_u32 v19, v2, 16, 1 22782; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9 22783; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 22784; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 22785; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2 22786; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4 22787; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 22788; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo 22789; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff 22790; GFX10-NEXT: v_max_f32_e32 v19, v22, v20 22791; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v8 22792; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 22793; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 22794; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 22795; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1 22796; GFX10-NEXT: v_max_f32_e32 v1, v1, v9 22797; GFX10-NEXT: v_max_f32_e32 v9, v22, v20 22798; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19 22799; GFX10-NEXT: v_max_f32_e32 v0, v0, v8 22800; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff 22801; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 22802; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 22803; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1 22804; 
GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9 22805; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0 22806; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff 22807; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo 22808; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1 22809; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 22810; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1 22811; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff 22812; GFX10-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 22813; GFX10-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 22814; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo 22815; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 22816; GFX10-NEXT: v_add3_u32 v20, v20, v0, 0x7fff 22817; GFX10-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 22818; GFX10-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo 22819; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 22820; GFX10-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo 22821; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 22822; GFX10-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 22823; GFX10-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo 22824; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 22825; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 22826; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo 22827; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 22828; GFX10-NEXT: s_setpc_b64 s[30:31] 22829; 22830; GFX11-LABEL: v_maxnum_v16bf16: 22831; GFX11: ; %bb.0: 22832; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 22833; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 22834; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 22835; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 22836; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7 22837; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 22838; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15 22839; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 22840; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 22841; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16 22842; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 22843; GFX11-NEXT: v_max_f32_e32 v17, v18, v17 22844; GFX11-NEXT: v_max_f32_e32 v6, v6, v14 22845; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 22846; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 22847; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 22848; GFX11-NEXT: v_add3_u32 v14, v21, v17, 0x7fff 22849; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 22850; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 22851; GFX11-NEXT: v_max_f32_e32 v7, v7, v15 22852; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1 22853; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff 22854; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17 22855; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 22856; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5 22857; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 22858; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 22859; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff 22860; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7 22861; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 22862; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo 22863; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1 22864; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v13 22865; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 22866; GFX11-NEXT: v_perm_b32 v7, v7, v15, 0x7060302 22867; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 22868; GFX11-NEXT: v_dual_max_f32 v17, v20, v19 :: v_dual_cndmask_b32 v14, v14, v16 22869; GFX11-NEXT: v_add3_u32 v16, v18, v6, 0x7fff 22870; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12 22871; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v4 22872; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 22873; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 22874; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 22875; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 22876; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 22877; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) 22878; GFX11-NEXT: v_max_f32_e32 v4, v4, v12 22879; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11 22880; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 22881; GFX11-NEXT: v_max_f32_e32 v5, v5, v13 22882; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6 22883; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 22884; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18 22885; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff 22886; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 22887; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 22888; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302 22889; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) 22890; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo 22891; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3 22892; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 22893; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 22894; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 22895; GFX11-NEXT: v_max_f32_e32 v12, v18, v12 22896; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) 22897; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff 22898; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1 22899; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 22900; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 22901; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 22902; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo 22903; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff 22904; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13 22905; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 22906; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1 22907; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4 22908; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302 22909; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo 22910; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 22911; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 
22912; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v2 22913; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 22914; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 22915; GFX11-NEXT: v_add3_u32 v17, v17, v12, 0x7fff 22916; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 22917; GFX11-NEXT: v_max_f32_e32 v18, v19, v18 22918; GFX11-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo 22919; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1 22920; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 22921; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 22922; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 22923; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18 22924; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 22925; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 22926; GFX11-NEXT: v_max_f32_e32 v3, v3, v11 22927; GFX11-NEXT: v_add3_u32 v11, v20, v4, 0x7fff 22928; GFX11-NEXT: v_add3_u32 v10, v23, v18, 0x7fff 22929; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 22930; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1 22931; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 22932; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff 22933; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 22934; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 22935; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo 22936; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1 22937; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9 22938; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 22939; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2 22940; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302 22941; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo 22942; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff 22943; GFX11-NEXT: v_max_f32_e32 v19, v22, v20 22944; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v8 22945; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v0 22946; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 22947; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) 22948; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 22949; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 22950; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 22951; GFX11-NEXT: v_bfe_u32 v23, v19, 16, 1 22952; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v1, v1, v9 22953; GFX11-NEXT: v_max_f32_e32 v9, v22, v20 22954; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) 22955; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff 22956; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 22957; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0 22958; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 22959; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1 22960; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9 22961; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo 22962; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1 22963; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff 22964; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 22965; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 22966; GFX11-NEXT: v_add3_u32 v23, v23, v9, 0x7fff 22967; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 22968; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v22, vcc_lo 22969; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 22970; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff 22971; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) 22972; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 22973; GFX11-NEXT: v_cndmask_b32_e32 v8, v23, v24, vcc_lo 22974; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 22975; GFX11-NEXT: v_cndmask_b32_e32 v0, v20, v25, vcc_lo 22976; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 22977; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 22978; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 22979; GFX11-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc_lo 22980; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 22981; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 22982; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v21, vcc_lo 
22983; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 22984; GFX11-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 22985; GFX11-NEXT: s_setpc_b64 s[30:31] 22986 %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) 22987 ret <16 x bfloat> %op 22988} 22989 22990define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { 22991; GCN-LABEL: v_maxnum_v32bf16: 22992; GCN: ; %bb.0: 22993; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 22994; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 22995; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 22996; GCN-NEXT: s_waitcnt vmcnt(1) 22997; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 22998; GCN-NEXT: s_waitcnt vmcnt(0) 22999; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 23000; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23001; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 23002; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 23003; GCN-NEXT: v_max_f32_e32 v31, v31, v32 23004; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 23005; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 23006; GCN-NEXT: s_waitcnt vmcnt(0) 23007; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23008; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23009; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 23010; GCN-NEXT: v_max_f32_e32 v30, v30, v32 23011; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 23012; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 23013; GCN-NEXT: s_waitcnt vmcnt(0) 23014; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23015; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23016; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 23017; GCN-NEXT: v_max_f32_e32 v29, v29, v32 23018; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 23019; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 23020; GCN-NEXT: s_waitcnt vmcnt(0) 23021; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23022; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23023; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 23024; GCN-NEXT: v_max_f32_e32 v28, v28, 
v32 23025; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 23026; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 23027; GCN-NEXT: s_waitcnt vmcnt(0) 23028; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23029; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23030; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 23031; GCN-NEXT: v_max_f32_e32 v27, v27, v32 23032; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 23033; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 23034; GCN-NEXT: s_waitcnt vmcnt(0) 23035; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23036; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23037; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 23038; GCN-NEXT: v_max_f32_e32 v26, v26, v32 23039; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 23040; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 23041; GCN-NEXT: s_waitcnt vmcnt(0) 23042; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23043; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23044; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 23045; GCN-NEXT: v_max_f32_e32 v25, v25, v32 23046; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 23047; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 23048; GCN-NEXT: s_waitcnt vmcnt(0) 23049; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23050; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23051; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 23052; GCN-NEXT: v_max_f32_e32 v24, v24, v32 23053; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 23054; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 23055; GCN-NEXT: s_waitcnt vmcnt(0) 23056; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23057; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23058; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 23059; GCN-NEXT: v_max_f32_e32 v23, v23, v32 23060; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 23061; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 23062; GCN-NEXT: s_waitcnt vmcnt(0) 23063; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23064; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23065; GCN-NEXT: buffer_load_dword v33, off, s[0:3], 
s32 offset:88 23066; GCN-NEXT: v_max_f32_e32 v22, v22, v32 23067; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 23068; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 23069; GCN-NEXT: s_waitcnt vmcnt(0) 23070; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23071; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23072; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 23073; GCN-NEXT: v_max_f32_e32 v21, v21, v32 23074; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 23075; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 23076; GCN-NEXT: s_waitcnt vmcnt(0) 23077; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23078; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23079; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 23080; GCN-NEXT: v_max_f32_e32 v20, v20, v32 23081; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 23082; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 23083; GCN-NEXT: s_waitcnt vmcnt(0) 23084; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23085; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23086; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 23087; GCN-NEXT: v_max_f32_e32 v19, v19, v32 23088; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 23089; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 23090; GCN-NEXT: s_waitcnt vmcnt(0) 23091; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23092; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23093; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 23094; GCN-NEXT: v_max_f32_e32 v18, v18, v32 23095; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 23096; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 23097; GCN-NEXT: s_waitcnt vmcnt(0) 23098; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23099; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23100; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 23101; GCN-NEXT: v_max_f32_e32 v17, v17, v32 23102; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 23103; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 23104; GCN-NEXT: s_waitcnt vmcnt(0) 23105; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23106; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 
23107; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 23108; GCN-NEXT: v_max_f32_e32 v16, v16, v32 23109; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 23110; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 23111; GCN-NEXT: s_waitcnt vmcnt(0) 23112; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23113; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23114; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 23115; GCN-NEXT: v_max_f32_e32 v15, v15, v32 23116; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 23117; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 23118; GCN-NEXT: s_waitcnt vmcnt(0) 23119; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23120; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23121; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 23122; GCN-NEXT: v_max_f32_e32 v14, v14, v32 23123; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 23124; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 23125; GCN-NEXT: s_waitcnt vmcnt(0) 23126; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23127; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23128; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 23129; GCN-NEXT: v_max_f32_e32 v13, v13, v32 23130; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 23131; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 23132; GCN-NEXT: s_waitcnt vmcnt(0) 23133; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23134; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23135; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 23136; GCN-NEXT: v_max_f32_e32 v12, v12, v32 23137; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 23138; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 23139; GCN-NEXT: s_waitcnt vmcnt(0) 23140; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23141; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23142; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 23143; GCN-NEXT: v_max_f32_e32 v11, v11, v32 23144; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 23145; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 23146; GCN-NEXT: s_waitcnt vmcnt(0) 23147; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 
23148; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23149; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 23150; GCN-NEXT: v_max_f32_e32 v10, v10, v32 23151; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 23152; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 23153; GCN-NEXT: s_waitcnt vmcnt(0) 23154; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23155; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23156; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 23157; GCN-NEXT: v_max_f32_e32 v9, v9, v32 23158; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 23159; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 23160; GCN-NEXT: s_waitcnt vmcnt(0) 23161; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23162; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23163; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 23164; GCN-NEXT: v_max_f32_e32 v8, v8, v32 23165; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 23166; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 23167; GCN-NEXT: s_waitcnt vmcnt(0) 23168; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23169; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23170; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 23171; GCN-NEXT: v_max_f32_e32 v7, v7, v32 23172; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 23173; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 23174; GCN-NEXT: s_waitcnt vmcnt(0) 23175; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23176; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23177; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 23178; GCN-NEXT: v_max_f32_e32 v6, v6, v32 23179; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 23180; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 23181; GCN-NEXT: s_waitcnt vmcnt(0) 23182; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23183; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23184; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 23185; GCN-NEXT: v_max_f32_e32 v5, v5, v32 23186; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 23187; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 23188; GCN-NEXT: s_waitcnt vmcnt(0) 23189; GCN-NEXT: 
v_mul_f32_e32 v32, 1.0, v33 23190; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23191; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 23192; GCN-NEXT: v_max_f32_e32 v4, v4, v32 23193; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 23194; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 23195; GCN-NEXT: s_waitcnt vmcnt(0) 23196; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23197; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23198; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 23199; GCN-NEXT: v_max_f32_e32 v3, v3, v32 23200; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 23201; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 23202; GCN-NEXT: s_waitcnt vmcnt(0) 23203; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23204; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23205; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 23206; GCN-NEXT: v_max_f32_e32 v2, v2, v32 23207; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 23208; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 23209; GCN-NEXT: s_waitcnt vmcnt(0) 23210; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23211; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23212; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 23213; GCN-NEXT: v_max_f32_e32 v1, v1, v32 23214; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 23215; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 23216; GCN-NEXT: s_waitcnt vmcnt(0) 23217; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 23218; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23219; GCN-NEXT: v_max_f32_e32 v0, v0, v32 23220; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 23221; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 23222; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 23223; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 23224; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 23225; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 23226; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 23227; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 23228; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 23229; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 23230; GCN-NEXT: v_and_b32_e32 
v10, 0xffff0000, v10 23231; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 23232; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 23233; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 23234; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 23235; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 23236; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 23237; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 23238; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 23239; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 23240; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 23241; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 23242; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 23243; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 23244; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 23245; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 23246; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 23247; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 23248; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 23249; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 23250; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 23251; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 23252; GCN-NEXT: s_setpc_b64 s[30:31] 23253; 23254; GFX7-LABEL: v_maxnum_v32bf16: 23255; GFX7: ; %bb.0: 23256; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 23257; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 23258; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 23259; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 23260; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 23261; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 23262; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 23263; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 23264; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 23265; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 23266; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 23267; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 23268; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 23269; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 23270; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 
23271; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 23272; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 23273; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 23274; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 23275; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 23276; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 23277; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 23278; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 23279; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 23280; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 23281; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 23282; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 23283; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 23284; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 23285; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 23286; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 23287; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 23288; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 23289; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 23290; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 23291; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 23292; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 23293; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 23294; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 23295; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 23296; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 23297; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 23298; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 23299; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 23300; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 23301; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 23302; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 23303; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 23304; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 23305; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 23306; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 23307; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 23308; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 23309; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 23310; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 23311; GFX7-NEXT: v_mul_f32_e32 v4, 
1.0, v4 23312; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 23313; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 23314; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 23315; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 23316; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 23317; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 23318; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 23319; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 23320; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 23321; GFX7-NEXT: s_waitcnt vmcnt(1) 23322; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 23323; GFX7-NEXT: s_waitcnt vmcnt(0) 23324; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23325; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23326; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 23327; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 23328; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 23329; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 23330; GFX7-NEXT: s_waitcnt vmcnt(0) 23331; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23332; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23333; GFX7-NEXT: v_max_f32_e32 v30, v30, v32 23334; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 23335; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 23336; GFX7-NEXT: s_waitcnt vmcnt(0) 23337; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23338; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23339; GFX7-NEXT: v_max_f32_e32 v29, v29, v32 23340; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 23341; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 23342; GFX7-NEXT: s_waitcnt vmcnt(0) 23343; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23344; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23345; GFX7-NEXT: v_max_f32_e32 v28, v28, v32 23346; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 23347; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 23348; GFX7-NEXT: s_waitcnt vmcnt(0) 23349; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23350; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23351; GFX7-NEXT: v_max_f32_e32 v27, v27, v32 23352; GFX7-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:108 23353; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 23354; GFX7-NEXT: s_waitcnt vmcnt(0) 23355; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23356; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23357; GFX7-NEXT: v_max_f32_e32 v26, v26, v32 23358; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 23359; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 23360; GFX7-NEXT: s_waitcnt vmcnt(0) 23361; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23362; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23363; GFX7-NEXT: v_max_f32_e32 v25, v25, v32 23364; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 23365; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 23366; GFX7-NEXT: s_waitcnt vmcnt(0) 23367; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23368; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23369; GFX7-NEXT: v_max_f32_e32 v24, v24, v32 23370; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 23371; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 23372; GFX7-NEXT: s_waitcnt vmcnt(0) 23373; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23374; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23375; GFX7-NEXT: v_max_f32_e32 v23, v23, v32 23376; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 23377; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 23378; GFX7-NEXT: s_waitcnt vmcnt(0) 23379; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23380; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23381; GFX7-NEXT: v_max_f32_e32 v22, v22, v32 23382; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 23383; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 23384; GFX7-NEXT: s_waitcnt vmcnt(0) 23385; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23386; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23387; GFX7-NEXT: v_max_f32_e32 v21, v21, v32 23388; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 23389; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 23390; GFX7-NEXT: s_waitcnt vmcnt(0) 23391; GFX7-NEXT: v_mul_f32_e32 v32, 
1.0, v32 23392; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23393; GFX7-NEXT: v_max_f32_e32 v20, v20, v32 23394; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 23395; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 23396; GFX7-NEXT: s_waitcnt vmcnt(0) 23397; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23398; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23399; GFX7-NEXT: v_max_f32_e32 v19, v19, v32 23400; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 23401; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 23402; GFX7-NEXT: s_waitcnt vmcnt(0) 23403; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23404; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23405; GFX7-NEXT: v_max_f32_e32 v18, v18, v32 23406; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 23407; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 23408; GFX7-NEXT: s_waitcnt vmcnt(0) 23409; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23410; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23411; GFX7-NEXT: v_max_f32_e32 v17, v17, v32 23412; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 23413; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 23414; GFX7-NEXT: s_waitcnt vmcnt(0) 23415; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23416; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23417; GFX7-NEXT: v_max_f32_e32 v16, v16, v32 23418; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 23419; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 23420; GFX7-NEXT: s_waitcnt vmcnt(0) 23421; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23422; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23423; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 23424; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 23425; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 23426; GFX7-NEXT: s_waitcnt vmcnt(0) 23427; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23428; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23429; GFX7-NEXT: v_max_f32_e32 v14, v14, v32 23430; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 
23431; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 23432; GFX7-NEXT: s_waitcnt vmcnt(0) 23433; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23434; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23435; GFX7-NEXT: v_max_f32_e32 v13, v13, v32 23436; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 23437; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 23438; GFX7-NEXT: s_waitcnt vmcnt(0) 23439; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23440; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23441; GFX7-NEXT: v_max_f32_e32 v12, v12, v32 23442; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 23443; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 23444; GFX7-NEXT: s_waitcnt vmcnt(0) 23445; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23446; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23447; GFX7-NEXT: v_max_f32_e32 v11, v11, v32 23448; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 23449; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 23450; GFX7-NEXT: s_waitcnt vmcnt(0) 23451; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23452; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23453; GFX7-NEXT: v_max_f32_e32 v10, v10, v32 23454; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 23455; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 23456; GFX7-NEXT: s_waitcnt vmcnt(0) 23457; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23458; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23459; GFX7-NEXT: v_max_f32_e32 v9, v9, v32 23460; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 23461; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 23462; GFX7-NEXT: s_waitcnt vmcnt(0) 23463; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23464; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23465; GFX7-NEXT: v_max_f32_e32 v8, v8, v32 23466; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 23467; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 23468; GFX7-NEXT: s_waitcnt vmcnt(0) 23469; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23470; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 
23471; GFX7-NEXT: v_max_f32_e32 v7, v7, v32 23472; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 23473; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 23474; GFX7-NEXT: s_waitcnt vmcnt(0) 23475; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23476; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23477; GFX7-NEXT: v_max_f32_e32 v6, v6, v32 23478; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 23479; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 23480; GFX7-NEXT: s_waitcnt vmcnt(0) 23481; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23482; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23483; GFX7-NEXT: v_max_f32_e32 v5, v5, v32 23484; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 23485; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 23486; GFX7-NEXT: s_waitcnt vmcnt(0) 23487; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23488; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23489; GFX7-NEXT: v_max_f32_e32 v4, v4, v32 23490; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 23491; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 23492; GFX7-NEXT: s_waitcnt vmcnt(0) 23493; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23494; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23495; GFX7-NEXT: v_max_f32_e32 v3, v3, v32 23496; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 23497; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 23498; GFX7-NEXT: s_waitcnt vmcnt(0) 23499; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23500; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23501; GFX7-NEXT: v_max_f32_e32 v2, v2, v32 23502; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 23503; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 23504; GFX7-NEXT: s_waitcnt vmcnt(0) 23505; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23506; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23507; GFX7-NEXT: v_max_f32_e32 v1, v1, v32 23508; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 23509; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 23510; GFX7-NEXT: s_waitcnt vmcnt(0) 23511; 
GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 23512; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 23513; GFX7-NEXT: v_max_f32_e32 v0, v0, v32 23514; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 23515; GFX7-NEXT: s_setpc_b64 s[30:31] 23516; 23517; GFX8-LABEL: v_maxnum_v32bf16: 23518; GFX8: ; %bb.0: 23519; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 23520; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v30 23521; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v14 23522; GFX8-NEXT: v_max_f32_e32 v31, v32, v31 23523; GFX8-NEXT: v_bfe_u32 v32, v31, 16, 1 23524; GFX8-NEXT: s_movk_i32 s4, 0x7fff 23525; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 23526; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 23527; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 23528; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 23529; GFX8-NEXT: v_max_f32_e32 v14, v14, v30 23530; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31 23531; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 23532; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1 23533; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc 23534; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14 23535; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 23536; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14 23537; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 23538; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc 23539; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 23540; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v13 23541; GFX8-NEXT: v_max_f32_e32 v32, v32, v30 23542; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 23543; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v15 23544; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 23545; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 23546; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 23547; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 23548; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 23549; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 23550; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 23551; GFX8-NEXT: s_waitcnt vmcnt(0) 23552; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 23553; GFX8-NEXT: v_max_f32_e32 
v33, v33, v34 23554; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 23555; GFX8-NEXT: v_max_f32_e32 v30, v15, v30 23556; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1 23557; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33 23558; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 23559; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33 23560; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 23561; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1 23562; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc 23563; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30 23564; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23565; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30 23566; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 23567; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc 23568; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1 23569; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32 23570; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23571; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32 23572; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 23573; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc 23574; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13 23575; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 23576; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13 23577; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 23578; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc 23579; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 23580; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v12 23581; GFX8-NEXT: v_max_f32_e32 v29, v33, v29 23582; GFX8-NEXT: v_bfe_u32 v33, v29, 16, 1 23583; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v29 23584; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 23585; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 23586; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23587; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 23588; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29 23589; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 23590; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1 23591; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc 23592; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12 23593; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 
23594; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12 23595; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 23596; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc 23597; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 23598; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v11 23599; GFX8-NEXT: v_max_f32_e32 v28, v33, v28 23600; GFX8-NEXT: v_bfe_u32 v33, v28, 16, 1 23601; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v28 23602; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 23603; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 23604; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23605; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 23606; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28 23607; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 23608; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1 23609; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc 23610; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11 23611; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 23612; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11 23613; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 23614; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc 23615; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 23616; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v10 23617; GFX8-NEXT: v_max_f32_e32 v27, v33, v27 23618; GFX8-NEXT: v_bfe_u32 v33, v27, 16, 1 23619; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v27 23620; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 23621; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 23622; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23623; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 23624; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27 23625; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 23626; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1 23627; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc 23628; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10 23629; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 23630; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10 23631; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 23632; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc 23633; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25 23634; 
GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v9 23635; GFX8-NEXT: v_max_f32_e32 v26, v33, v26 23636; GFX8-NEXT: v_bfe_u32 v33, v26, 16, 1 23637; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v26 23638; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 23639; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 23640; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23641; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 23642; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26 23643; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 23644; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1 23645; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc 23646; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9 23647; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25 23648; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9 23649; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 23650; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc 23651; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24 23652; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v8 23653; GFX8-NEXT: v_max_f32_e32 v25, v33, v25 23654; GFX8-NEXT: v_bfe_u32 v33, v25, 16, 1 23655; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v25 23656; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 23657; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 23658; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23659; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 23660; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25 23661; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 23662; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1 23663; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc 23664; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8 23665; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24 23666; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8 23667; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 23668; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc 23669; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 23670; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v7 23671; GFX8-NEXT: v_max_f32_e32 v24, v33, v24 23672; GFX8-NEXT: v_bfe_u32 v33, v24, 16, 1 23673; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v24 23674; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 
23675; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 23676; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23677; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 23678; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24 23679; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 23680; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1 23681; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc 23682; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7 23683; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23 23684; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7 23685; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 23686; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc 23687; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22 23688; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v6 23689; GFX8-NEXT: v_max_f32_e32 v23, v33, v23 23690; GFX8-NEXT: v_bfe_u32 v33, v23, 16, 1 23691; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v23 23692; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 23693; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 23694; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23695; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 23696; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23 23697; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 23698; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1 23699; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc 23700; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6 23701; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22 23702; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6 23703; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 23704; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc 23705; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21 23706; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v5 23707; GFX8-NEXT: v_max_f32_e32 v22, v33, v22 23708; GFX8-NEXT: v_bfe_u32 v33, v22, 16, 1 23709; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v22 23710; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 23711; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 23712; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23713; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 23714; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22 23715; GFX8-NEXT: v_cmp_u_f32_e32 vcc, 
v22, v22 23716; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1 23717; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc 23718; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5 23719; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21 23720; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5 23721; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 23722; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc 23723; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20 23724; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v4 23725; GFX8-NEXT: v_max_f32_e32 v21, v33, v21 23726; GFX8-NEXT: v_bfe_u32 v33, v21, 16, 1 23727; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v21 23728; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 23729; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 23730; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23731; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 23732; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21 23733; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 23734; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1 23735; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc 23736; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4 23737; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20 23738; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4 23739; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 23740; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc 23741; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19 23742; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v3 23743; GFX8-NEXT: v_max_f32_e32 v20, v33, v20 23744; GFX8-NEXT: v_bfe_u32 v33, v20, 16, 1 23745; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v20 23746; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 23747; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 23748; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23749; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 23750; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20 23751; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 23752; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1 23753; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc 23754; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3 23755; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19 23756; GFX8-NEXT: v_or_b32_e32 
v33, 0x400000, v3 23757; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 23758; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc 23759; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18 23760; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v2 23761; GFX8-NEXT: v_max_f32_e32 v19, v33, v19 23762; GFX8-NEXT: v_bfe_u32 v33, v19, 16, 1 23763; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v19 23764; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 23765; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 23766; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23767; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 23768; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19 23769; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 23770; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1 23771; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc 23772; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2 23773; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 23774; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2 23775; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 23776; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc 23777; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 23778; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v1 23779; GFX8-NEXT: v_max_f32_e32 v18, v33, v18 23780; GFX8-NEXT: v_bfe_u32 v33, v18, 16, 1 23781; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v18 23782; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 23783; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 23784; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23785; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 23786; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18 23787; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 23788; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1 23789; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc 23790; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1 23791; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 23792; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1 23793; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 23794; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc 23795; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16 23796; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v0 23797; GFX8-NEXT: 
v_max_f32_e32 v17, v33, v17 23798; GFX8-NEXT: v_bfe_u32 v33, v17, 16, 1 23799; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v17 23800; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 23801; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 23802; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33 23803; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 23804; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17 23805; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 23806; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 23807; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc 23808; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 23809; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 23810; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 23811; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 23812; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc 23813; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 23814; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 23815; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 23816; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 23817; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 23818; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 23819; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 23820; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 23821; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 23822; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 23823; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 23824; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 23825; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 23826; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 23827; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 23828; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 23829; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 23830; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 23831; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 23832; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 23833; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 23834; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 23835; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 23836; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 23837; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 23838; GFX8-NEXT: 
v_alignbit_b32 v10, v10, v27, 16 23839; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 23840; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 23841; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 23842; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 23843; GFX8-NEXT: s_setpc_b64 s[30:31] 23844; 23845; GFX9-LABEL: v_maxnum_v32bf16: 23846; GFX9: ; %bb.0: 23847; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 23848; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 23849; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 23850; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 23851; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 23852; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 23853; GFX9-NEXT: s_movk_i32 s4, 0x7fff 23854; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 23855; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 23856; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 23857; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 23858; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 23859; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 23860; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc 23861; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 23862; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 23863; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 23864; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc 23865; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 23866; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 23867; GFX9-NEXT: v_max_f32_e32 v30, v32, v30 23868; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 23869; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 23870; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 23871; GFX9-NEXT: v_max_f32_e32 v13, v13, v29 23872; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 23873; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 23874; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 23875; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 23876; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc 23877; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 23878; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 23879; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 23880; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, 
v32, vcc 23881; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 23882; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 23883; GFX9-NEXT: v_max_f32_e32 v32, v32, v29 23884; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 23885; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 23886; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 23887; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 23888; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 23889; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 23890; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 23891; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 23892; GFX9-NEXT: s_waitcnt vmcnt(0) 23893; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 23894; GFX9-NEXT: v_max_f32_e32 v33, v33, v34 23895; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 23896; GFX9-NEXT: v_max_f32_e32 v29, v15, v29 23897; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 23898; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 23899; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 23900; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 23901; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 23902; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc 23903; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 23904; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 23905; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 23906; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc 23907; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 23908; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 23909; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 23910; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 23911; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc 23912; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 23913; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 23914; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc 23915; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 23916; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 23917; GFX9-NEXT: v_max_f32_e32 v28, v33, v28 23918; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 23919; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 23920; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 23921; GFX9-NEXT: v_max_f32_e32 
v11, v11, v27 23922; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 23923; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 23924; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 23925; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 23926; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc 23927; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 23928; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 23929; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 23930; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc 23931; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 23932; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 23933; GFX9-NEXT: v_max_f32_e32 v27, v33, v27 23934; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 23935; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 23936; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 23937; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 23938; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 23939; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 23940; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 23941; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 23942; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc 23943; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 23944; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 23945; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 23946; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc 23947; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 23948; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 23949; GFX9-NEXT: v_max_f32_e32 v26, v33, v26 23950; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 23951; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 23952; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 23953; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 23954; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 23955; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 23956; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 23957; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 23958; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc 23959; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 23960; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 23961; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 23962; GFX9-NEXT: v_cndmask_b32_e32 
v9, v25, v33, vcc 23963; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 23964; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 23965; GFX9-NEXT: v_max_f32_e32 v25, v33, v25 23966; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 23967; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 23968; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 23969; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 23970; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 23971; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 23972; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 23973; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 23974; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc 23975; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 23976; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 23977; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 23978; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc 23979; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 23980; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 23981; GFX9-NEXT: v_max_f32_e32 v24, v33, v24 23982; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 23983; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 23984; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 23985; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 23986; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 23987; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 23988; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 23989; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 23990; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc 23991; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 23992; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 23993; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 23994; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc 23995; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 23996; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 23997; GFX9-NEXT: v_max_f32_e32 v23, v33, v23 23998; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 23999; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 24000; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 24001; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 24002; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 24003; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 
24004; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 24005; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 24006; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc 24007; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 24008; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 24009; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 24010; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc 24011; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 24012; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 24013; GFX9-NEXT: v_max_f32_e32 v22, v33, v22 24014; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 24015; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 24016; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 24017; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 24018; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 24019; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 24020; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 24021; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 24022; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc 24023; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 24024; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 24025; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 24026; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc 24027; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 24028; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 24029; GFX9-NEXT: v_max_f32_e32 v21, v33, v21 24030; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 24031; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 24032; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 24033; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 24034; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 24035; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 24036; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 24037; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 24038; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc 24039; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 24040; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 24041; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 24042; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc 24043; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 24044; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 24045; 
GFX9-NEXT: v_max_f32_e32 v20, v33, v20 24046; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 24047; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 24048; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 24049; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 24050; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 24051; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 24052; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 24053; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 24054; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc 24055; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 24056; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 24057; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 24058; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc 24059; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 24060; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 24061; GFX9-NEXT: v_max_f32_e32 v19, v33, v19 24062; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 24063; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 24064; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 24065; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 24066; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 24067; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 24068; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 24069; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 24070; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc 24071; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 24072; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 24073; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 24074; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc 24075; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 24076; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 24077; GFX9-NEXT: v_max_f32_e32 v18, v33, v18 24078; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 24079; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 24080; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 24081; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 24082; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 24083; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 24084; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 24085; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 24086; GFX9-NEXT: 
v_cndmask_b32_e32 v18, v33, v34, vcc 24087; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 24088; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 24089; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 24090; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc 24091; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 24092; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 24093; GFX9-NEXT: v_max_f32_e32 v17, v33, v17 24094; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 24095; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24096; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 24097; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 24098; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 24099; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 24100; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 24101; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 24102; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc 24103; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 24104; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 24105; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 24106; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc 24107; GFX9-NEXT: s_mov_b32 s4, 0x7060302 24108; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 24109; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 24110; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 24111; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 24112; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 24113; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 24114; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 24115; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 24116; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 24117; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 24118; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 24119; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 24120; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 24121; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 24122; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 24123; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 24124; GFX9-NEXT: s_setpc_b64 s[30:31] 24125; 24126; GFX10-LABEL: v_maxnum_v32bf16: 24127; GFX10: ; %bb.0: 24128; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24129; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], 
s32 24130; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28 24131; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12 24132; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 24133; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 24134; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27 24135; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11 24136; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 24137; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 24138; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 24139; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 24140; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30 24141; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14 24142; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 24143; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 24144; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29 24145; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13 24146; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 24147; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 24148; GFX10-NEXT: v_max_f32_e32 v12, v12, v28 24149; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22 24150; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 24151; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6 24152; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 24153; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 24154; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 24155; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21 24156; GFX10-NEXT: v_max_f32_e32 v49, v50, v49 24157; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5 24158; GFX10-NEXT: v_max_f32_e32 v33, v34, v33 24159; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 24160; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24 24161; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 24162; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8 24163; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 24164; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 24165; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 24166; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23 24167; GFX10-NEXT: v_max_f32_e32 v37, v38, v37 24168; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 24169; GFX10-NEXT: v_and_b32_e32 
v23, 0xffff0000, v23 24170; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 24171; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 24172; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16 24173; GFX10-NEXT: v_max_f32_e32 v27, v50, v27 24174; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0 24175; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 24176; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24177; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 24178; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 24179; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25 24180; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9 24181; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 24182; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 24183; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 24184; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18 24185; GFX10-NEXT: v_max_f32_e32 v29, v38, v29 24186; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2 24187; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 24188; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 24189; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 24190; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17 24191; GFX10-NEXT: v_max_f32_e32 v28, v48, v28 24192; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1 24193; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 24194; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 24195; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 24196; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 24197; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 24198; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20 24199; GFX10-NEXT: v_max_f32_e32 v34, v34, v51 24200; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4 24201; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 24202; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 24203; GFX10-NEXT: v_max_f32_e32 v9, v9, v25 24204; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19 24205; GFX10-NEXT: v_max_f32_e32 v30, v36, v30 24206; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3 24207; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 24208; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 24209; GFX10-NEXT: v_max_f32_e32 v2, 
v2, v18 24210; GFX10-NEXT: v_max_f32_e32 v18, v48, v23 24211; GFX10-NEXT: v_max_f32_e32 v1, v1, v17 24212; GFX10-NEXT: v_max_f32_e32 v17, v50, v22 24213; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33 24214; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1 24215; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff 24216; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 24217; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 24218; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 24219; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 24220; GFX10-NEXT: v_max_f32_e32 v20, v36, v25 24221; GFX10-NEXT: v_max_f32_e32 v3, v3, v19 24222; GFX10-NEXT: v_max_f32_e32 v19, v38, v24 24223; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14 24224; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1 24225; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff 24226; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo 24227; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 24228; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 24229; GFX10-NEXT: v_max_f32_e32 v21, v51, v26 24230; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35 24231; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1 24232; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff 24233; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo 24234; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 24235; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13 24236; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1 24237; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff 24238; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37 24239; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo 24240; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 24241; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1 24242; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff 24243; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12 24244; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1 24245; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo 24246; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 24247; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff 24248; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39 24249; GFX10-NEXT: v_bfe_u32 v24, 
v11, 16, 1 24250; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff 24251; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo 24252; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 24253; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11 24254; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1 24255; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff 24256; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49 24257; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo 24258; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 24259; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1 24260; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff 24261; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10 24262; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1 24263; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo 24264; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 24265; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff 24266; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34 24267; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1 24268; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff 24269; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo 24270; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 24271; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9 24272; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1 24273; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff 24274; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30 24275; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo 24276; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 24277; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1 24278; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff 24279; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8 24280; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1 24281; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo 24282; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 24283; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff 24284; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29 24285; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1 24286; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff 24287; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo 24288; GFX10-NEXT: v_cmp_u_f32_e32 
vcc_lo, v9, v9 24289; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7 24290; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1 24291; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff 24292; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28 24293; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo 24294; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 24295; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1 24296; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff 24297; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6 24298; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15 24299; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo 24300; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 24301; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1 24302; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff 24303; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27 24304; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 24305; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo 24306; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 24307; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1 24308; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff 24309; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5 24310; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo 24311; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 24312; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1 24313; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff 24314; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21 24315; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo 24316; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 24317; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1 24318; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff 24319; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4 24320; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo 24321; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 24322; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1 24323; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff 24324; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20 24325; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo 24326; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 24327; GFX10-NEXT: v_add3_u32 v50, v50, 
v20, 0x7fff 24328; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1 24329; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3 24330; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo 24331; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 24332; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 24333; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19 24334; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff 24335; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo 24336; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 24337; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff 24338; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1 24339; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2 24340; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo 24341; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 24342; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1 24343; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18 24344; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff 24345; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo 24346; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 24347; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1 24348; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff 24349; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1 24350; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo 24351; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 24352; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1 24353; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff 24354; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17 24355; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo 24356; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 24357; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1 24358; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff 24359; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0 24360; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo 24361; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 24362; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff 24363; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo 24364; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 24365; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 24366; GFX10-NEXT: v_cndmask_b32_e32 v17, 
v50, v19, vcc_lo 24367; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 24368; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302 24369; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302 24370; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo 24371; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 24372; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 24373; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo 24374; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 24375; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 24376; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo 24377; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302 24378; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302 24379; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302 24380; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302 24381; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302 24382; GFX10-NEXT: s_waitcnt vmcnt(0) 24383; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32 24384; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 24385; GFX10-NEXT: v_max_f32_e32 v17, v31, v17 24386; GFX10-NEXT: v_max_f32_e32 v15, v15, v18 24387; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1 24388; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1 24389; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17 24390; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 24391; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15 24392; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff 24393; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff 24394; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302 24395; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302 24396; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo 24397; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 24398; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302 24399; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo 24400; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302 24401; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302 24402; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 24403; GFX10-NEXT: s_setpc_b64 s[30:31] 24404; 24405; GFX11-LABEL: v_maxnum_v32bf16: 24406; 
GFX11: ; %bb.0: 24407; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24408; GFX11-NEXT: scratch_load_b32 v32, off, s32 24409; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21 24410; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5 24411; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 24412; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 24413; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17 24414; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1 24415; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 24416; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 24417; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26 24418; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26 24419; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24 24420; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 24421; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24 24422; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19 24423; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1 24424; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 24425; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18 24426; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1 24427; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5 24428; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1 24429; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff 24430; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3 24431; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 24432; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff 24433; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2 24434; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9 24435; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 24436; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8 24437; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16 24438; GFX11-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7 24439; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 24440; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22 24441; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, 
v6 24442; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 24443; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1 24444; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19 24445; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11 24446; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1 24447; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25 24448; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff 24449; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0 24450; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 24451; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24452; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 24453; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 24454; GFX11-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2 24455; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27 24456; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3 24457; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff 24458; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 24459; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1 24460; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17 24461; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 24462; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 24463; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4 24464; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff 24465; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 24466; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23 24467; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 24468; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10 24469; GFX11-NEXT: v_max_f32_e32 v2, v2, v18 24470; GFX11-NEXT: v_max_f32_e32 v0, v0, v16 24471; GFX11-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28 24472; GFX11-NEXT: v_max_f32_e32 v7, v7, v23 24473; GFX11-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83 24474; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28 24475; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 24476; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 
1 24477; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1 24478; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24 24479; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23 24480; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1 24481; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff 24482; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20 24483; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 24484; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff 24485; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 24486; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 24487; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7 24488; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff 24489; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 24490; GFX11-NEXT: v_max_f32_e32 v4, v4, v20 24491; GFX11-NEXT: v_max_f32_e32 v20, v80, v71 24492; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1 24493; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9 24494; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29 24495; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10 24496; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 24497; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff 24498; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 24499; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29 24500; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 24501; GFX11-NEXT: v_max_f32_e32 v26, v52, v51 24502; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 24503; GFX11-NEXT: v_max_f32_e32 v6, v6, v22 24504; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13 24505; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 24506; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14 24507; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30 24508; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12 24509; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 24510; GFX11-NEXT: v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12 24511; GFX11-NEXT: v_dual_max_f32 v13, 
v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30 24512; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 24513; GFX11-NEXT: v_max_f32_e32 v29, v38, v37 24514; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15 24515; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15 24516; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) 24517; GFX11-NEXT: v_max_f32_e32 v14, v14, v30 24518; GFX11-NEXT: v_max_f32_e32 v28, v48, v39 24519; GFX11-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33 24520; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1 24521; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1 24522; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14 24523; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 24524; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1 24525; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1 24526; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 24527; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 24528; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff 24529; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 24530; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff 24531; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff 24532; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13 24533; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1 24534; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff 24535; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo 24536; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 24537; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29 24538; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1 24539; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff 24540; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12 24541; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo 24542; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 24543; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1 24544; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff 24545; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28 24546; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1 24547; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, 
vcc_lo 24548; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 24549; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff 24550; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11 24551; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1 24552; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff 24553; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo 24554; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 24555; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27 24556; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1 24557; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff 24558; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10 24559; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo 24560; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 24561; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1 24562; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff 24563; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26 24564; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1 24565; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo 24566; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 24567; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff 24568; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25 24569; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1 24570; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff 24571; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo 24572; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 24573; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8 24574; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff 24575; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1 24576; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6 24577; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo 24578; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 24579; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1 24580; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff 24581; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22 24582; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1 24583; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo 24584; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 24585; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff 24586; GFX11-NEXT: v_or_b32_e32 v114, 
0x400000, v21 24587; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1 24588; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff 24589; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo 24590; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 24591; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4 24592; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1 24593; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff 24594; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20 24595; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo 24596; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 24597; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff 24598; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1 24599; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18 24600; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1 24601; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo 24602; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 24603; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff 24604; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0 24605; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff 24606; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1 24607; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo 24608; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 24609; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2 24610; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 24611; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff 24612; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 24613; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo 24614; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 24615; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 24616; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 24617; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 24618; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302 24619; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo 24620; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 24621; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302 24622; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo 24623; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 24624; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 24625; GFX11-NEXT: v_perm_b32 v7, v7, v24, 0x7060302 24626; GFX11-NEXT: v_cndmask_b32_e32 v23, v97, v98, vcc_lo 24627; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 24628; GFX11-NEXT: v_cndmask_b32_e32 v6, v99, v100, vcc_lo 24629; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 24630; GFX11-NEXT: v_perm_b32 v6, v6, v23, 0x7060302 24631; GFX11-NEXT: v_cndmask_b32_e32 v22, v101, v102, vcc_lo 24632; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 24633; GFX11-NEXT: v_cndmask_b32_e32 v5, v103, v112, vcc_lo 24634; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 24635; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 24636; GFX11-NEXT: v_perm_b32 v5, v5, v22, 0x7060302 24637; GFX11-NEXT: v_cndmask_b32_e32 v21, v113, v114, vcc_lo 24638; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 24639; GFX11-NEXT: v_cndmask_b32_e32 v4, v115, v116, vcc_lo 24640; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 24641; GFX11-NEXT: v_perm_b32 v4, v4, v21, 0x7060302 24642; GFX11-NEXT: v_cndmask_b32_e32 v20, v117, v118, vcc_lo 24643; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 24644; GFX11-NEXT: v_cndmask_b32_e32 v19, v129, v130, vcc_lo 24645; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 24646; GFX11-NEXT: v_cndmask_b32_e32 v18, v133, v134, vcc_lo 24647; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 24648; GFX11-NEXT: v_cndmask_b32_e32 v1, v135, v144, vcc_lo 24649; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 24650; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) 24651; GFX11-NEXT: v_perm_b32 v1, v1, v18, 0x7060302 24652; GFX11-NEXT: v_cndmask_b32_e32 v17, v145, v146, vcc_lo 24653; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 24654; GFX11-NEXT: v_cndmask_b32_e32 v0, v147, v33, vcc_lo 24655; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 24656; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302 24657; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo 24658; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v3, v3 24659; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 24660; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302 24661; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo 24662; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302 24663; GFX11-NEXT: s_waitcnt vmcnt(0) 24664; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32 24665; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 24666; GFX11-NEXT: v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32 24667; GFX11-NEXT: v_max_f32_e32 v15, v15, v18 24668; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 24669; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1 24670; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 24671; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 24672; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 24673; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 24674; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff 24675; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff 24676; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 24677; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v20, vcc_lo 24678; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 24679; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo 24680; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 24681; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 24682; GFX11-NEXT: s_setpc_b64 s[30:31] 24683 %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) 24684 ret <32 x bfloat> %op 24685} 24686 24687declare bfloat @llvm.sqrt.bf16(bfloat) 24688 24689define bfloat @v_sqrt_bf16(bfloat %a) { 24690; GCN-LABEL: v_sqrt_bf16: 24691; GCN: ; %bb.0: 24692; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24693; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 24694; GCN-NEXT: s_mov_b32 s4, 0xf800000 24695; GCN-NEXT: v_mov_b32_e32 v1, 0x260 24696; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24697; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 24698; 
GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 24699; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 24700; GCN-NEXT: v_sqrt_f32_e32 v2, v0 24701; GCN-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2 24702; GCN-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2 24703; GCN-NEXT: v_fma_f32 v5, -v3, v2, v0 24704; GCN-NEXT: v_fma_f32 v6, -v4, v2, v0 24705; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5 24706; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] 24707; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6 24708; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] 24709; GCN-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 24710; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 24711; GCN-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 24712; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 24713; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24714; GCN-NEXT: s_setpc_b64 s[30:31] 24715; 24716; GFX7-LABEL: v_sqrt_bf16: 24717; GFX7: ; %bb.0: 24718; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24719; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 24720; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24721; GFX7-NEXT: s_mov_b32 s4, 0xf800000 24722; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 24723; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 24724; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 24725; GFX7-NEXT: v_sqrt_f32_e32 v1, v0 24726; GFX7-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 24727; GFX7-NEXT: v_fma_f32 v3, -v2, v1, v0 24728; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 24729; GFX7-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] 24730; GFX7-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 24731; GFX7-NEXT: v_fma_f32 v1, -v3, v1, v0 24732; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 24733; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] 24734; GFX7-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 24735; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 24736; GFX7-NEXT: v_mov_b32_e32 v2, 0x260 24737; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 24738; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 24739; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24740; GFX7-NEXT: s_setpc_b64 s[30:31] 24741; 
24742; GFX8-LABEL: v_sqrt_bf16: 24743; GFX8: ; %bb.0: 24744; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24745; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 24746; GFX8-NEXT: s_mov_b32 s4, 0xf800000 24747; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 24748; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 24749; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 24750; GFX8-NEXT: v_sqrt_f32_e32 v1, v0 24751; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], -1, v1 24752; GFX8-NEXT: v_fma_f32 v3, -v2, v1, v0 24753; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 24754; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] 24755; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], 1, v1 24756; GFX8-NEXT: v_fma_f32 v1, -v3, v1, v0 24757; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 24758; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] 24759; GFX8-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 24760; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 24761; GFX8-NEXT: v_mov_b32_e32 v2, 0x260 24762; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 24763; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 24764; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 24765; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 24766; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 24767; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 24768; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 24769; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 24770; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 24771; GFX8-NEXT: s_setpc_b64 s[30:31] 24772; 24773; GFX9-LABEL: v_sqrt_bf16: 24774; GFX9: ; %bb.0: 24775; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24776; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 24777; GFX9-NEXT: s_mov_b32 s4, 0xf800000 24778; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 24779; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 24780; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 24781; GFX9-NEXT: v_sqrt_f32_e32 v1, v0 24782; GFX9-NEXT: v_add_u32_e32 v2, -1, v1 24783; GFX9-NEXT: v_fma_f32 v3, -v2, v1, v0 24784; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 24785; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 24786; 
GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] 24787; GFX9-NEXT: v_fma_f32 v1, -v3, v1, v0 24788; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 24789; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] 24790; GFX9-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 24791; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 24792; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 24793; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 24794; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 24795; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 24796; GFX9-NEXT: s_movk_i32 s4, 0x7fff 24797; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 24798; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 24799; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 24800; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 24801; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 24802; GFX9-NEXT: s_setpc_b64 s[30:31] 24803; 24804; GFX10-LABEL: v_sqrt_bf16: 24805; GFX10: ; %bb.0: 24806; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24807; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 24808; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 24809; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 24810; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 24811; GFX10-NEXT: v_sqrt_f32_e32 v1, v0 24812; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v1 24813; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v1 24814; GFX10-NEXT: v_fma_f32 v4, -v2, v1, v0 24815; GFX10-NEXT: v_fma_f32 v5, -v3, v1, v0 24816; GFX10-NEXT: v_cmp_ge_f32_e64 s4, 0, v4 24817; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4 24818; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5 24819; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 24820; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 24821; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo 24822; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 24823; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 24824; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 24825; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 24826; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 24827; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 24828; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v1, v2, vcc_lo 24829; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 24830; GFX10-NEXT: s_setpc_b64 s[30:31] 24831; 24832; GFX11-LABEL: v_sqrt_bf16: 24833; GFX11: ; %bb.0: 24834; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24835; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 24836; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 24837; GFX11-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 24838; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 24839; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 24840; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 24841; GFX11-NEXT: v_sqrt_f32_e32 v1, v0 24842; GFX11-NEXT: s_waitcnt_depctr 0xfff 24843; GFX11-NEXT: v_add_nc_u32_e32 v2, -1, v1 24844; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v1 24845; GFX11-NEXT: v_fma_f32 v4, -v2, v1, v0 24846; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 24847; GFX11-NEXT: v_fma_f32 v5, -v3, v1, v0 24848; GFX11-NEXT: v_cmp_ge_f32_e64 s0, 0, v4 24849; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 24850; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0 24851; GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0, v5 24852; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 24853; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0 24854; GFX11-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 24855; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 24856; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo 24857; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260 24858; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 24859; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 24860; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 24861; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 24862; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 24863; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 24864; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 24865; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 24866; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 24867; GFX11-NEXT: s_setpc_b64 s[30:31] 24868 %op = call bfloat @llvm.sqrt.bf16(bfloat %a) 24869 ret bfloat %op 24870} 24871 24872declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32) 24873 24874define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { 24875; GCN-LABEL: v_ldexp_bf16_i32: 24876; GCN: ; %bb.0: 24877; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24878; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 24879; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24880; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 24881; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24882; GCN-NEXT: s_setpc_b64 s[30:31] 24883; 24884; GFX7-LABEL: v_ldexp_bf16_i32: 24885; GFX7: ; %bb.0: 24886; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24887; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 24888; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24889; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 24890; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24891; GFX7-NEXT: s_setpc_b64 s[30:31] 24892; 24893; GFX8-LABEL: v_ldexp_bf16_i32: 24894; GFX8: ; %bb.0: 24895; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24896; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 24897; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 24898; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 24899; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 24900; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 24901; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 24902; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 24903; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 24904; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 24905; GFX8-NEXT: s_setpc_b64 s[30:31] 24906; 24907; GFX9-LABEL: v_ldexp_bf16_i32: 24908; GFX9: ; %bb.0: 24909; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24910; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 24911; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 24912; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 24913; GFX9-NEXT: s_movk_i32 s4, 0x7fff 24914; 
GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 24915; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 24916; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 24917; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 24918; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 24919; GFX9-NEXT: s_setpc_b64 s[30:31] 24920; 24921; GFX10-LABEL: v_ldexp_bf16_i32: 24922; GFX10: ; %bb.0: 24923; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24924; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 24925; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 24926; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 24927; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 24928; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 24929; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 24930; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 24931; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 24932; GFX10-NEXT: s_setpc_b64 s[30:31] 24933; 24934; GFX11-LABEL: v_ldexp_bf16_i32: 24935; GFX11: ; %bb.0: 24936; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24937; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 24938; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 24939; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 24940; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 24941; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 24942; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 24943; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 24944; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 24945; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 24946; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 24947; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 24948; GFX11-NEXT: s_setpc_b64 s[30:31] 24949 %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b) 24950 ret bfloat %op 24951} 24952 24953declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat) 24954 24955define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { 24956; GCN-LABEL: v_frexp_bf16_i16: 24957; GCN: ; %bb.0: 24958; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24959; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 24960; GCN-NEXT: 
s_mov_b32 s4, 0x7f800000 24961; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24962; GCN-NEXT: v_frexp_mant_f32_e32 v1, v0 24963; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 24964; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 24965; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 24966; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc 24967; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24968; GCN-NEXT: s_setpc_b64 s[30:31] 24969; 24970; GFX7-LABEL: v_frexp_bf16_i16: 24971; GFX7: ; %bb.0: 24972; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24973; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 24974; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24975; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 24976; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 24977; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 24978; GFX7-NEXT: s_setpc_b64 s[30:31] 24979; 24980; GFX8-LABEL: v_frexp_bf16_i16: 24981; GFX8: ; %bb.0: 24982; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24983; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 24984; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v1 24985; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 24986; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 24987; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 24988; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 24989; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 24990; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 24991; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 24992; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 24993; GFX8-NEXT: s_setpc_b64 s[30:31] 24994; 24995; GFX9-LABEL: v_frexp_bf16_i16: 24996; GFX9: ; %bb.0: 24997; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 24998; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 24999; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1 25000; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 25001; GFX9-NEXT: s_movk_i32 s4, 0x7fff 25002; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 25003; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 25004; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25005; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 25006; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25007; 
GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 25008; GFX9-NEXT: s_setpc_b64 s[30:31] 25009; 25010; GFX10-LABEL: v_frexp_bf16_i16: 25011; GFX10: ; %bb.0: 25012; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25013; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 25014; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1 25015; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 25016; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 25017; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 25018; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25019; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 25020; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 25021; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25022; GFX10-NEXT: s_setpc_b64 s[30:31] 25023; 25024; GFX11-LABEL: v_frexp_bf16_i16: 25025; GFX11: ; %bb.0: 25026; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25027; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 25028; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25029; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v1 25030; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 25031; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 25032; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25033; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 25034; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 25035; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 25036; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 25037; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 25038; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25039; GFX11-NEXT: s_setpc_b64 s[30:31] 25040 %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a) 25041 ret { bfloat, i16 } %op 25042} 25043 25044 25045declare bfloat @llvm.log.bf16(bfloat) 25046declare bfloat @llvm.log2.bf16(bfloat) 25047declare bfloat @llvm.log10.bf16(bfloat) 25048 25049define bfloat @v_log_bf16(bfloat %a) { 25050; GCN-LABEL: v_log_bf16: 25051; GCN: ; %bb.0: 25052; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25053; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 25054; GCN-NEXT: s_mov_b32 
s4, 0x800000 25055; GCN-NEXT: s_mov_b32 s5, 0x7f800000 25056; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218 25057; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25058; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25059; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 25060; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2 25061; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 25062; GCN-NEXT: v_log_f32_e32 v0, v0 25063; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 25064; GCN-NEXT: v_sub_f32_e32 v3, v0, v2 25065; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 25066; GCN-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 25067; GCN-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 25068; GCN-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 25069; GCN-NEXT: v_add_f32_e32 v3, v4, v3 25070; GCN-NEXT: v_add_f32_e32 v3, v5, v3 25071; GCN-NEXT: v_add_f32_e32 v2, v2, v3 25072; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5 25073; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] 25074; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25075; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 25076; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25077; GCN-NEXT: s_setpc_b64 s[30:31] 25078; 25079; GFX7-LABEL: v_log_bf16: 25080; GFX7: ; %bb.0: 25081; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25082; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 25083; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25084; GFX7-NEXT: s_mov_b32 s4, 0x800000 25085; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25086; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 25087; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25088; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 25089; GFX7-NEXT: v_log_f32_e32 v0, v0 25090; GFX7-NEXT: s_mov_b32 s4, 0x3f317217 25091; GFX7-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 25092; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1 25093; GFX7-NEXT: s_mov_b32 s4, 0x3377d1cf 25094; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2 25095; GFX7-NEXT: s_mov_b32 s4, 0x7f800000 25096; GFX7-NEXT: v_add_f32_e32 v1, v1, v2 25097; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 25098; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] 25099; GFX7-NEXT: 
v_mov_b32_e32 v1, 0x41b17218 25100; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25101; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 25102; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25103; GFX7-NEXT: s_setpc_b64 s[30:31] 25104; 25105; GFX8-LABEL: v_log_bf16: 25106; GFX8: ; %bb.0: 25107; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25108; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25109; GFX8-NEXT: s_mov_b32 s4, 0x800000 25110; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25111; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 25112; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25113; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 25114; GFX8-NEXT: v_log_f32_e32 v0, v0 25115; GFX8-NEXT: s_mov_b32 s4, 0x7f800000 25116; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 25117; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1 25118; GFX8-NEXT: v_mul_f32_e32 v3, 0x3f317000, v2 25119; GFX8-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v2 25120; GFX8-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 25121; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 25122; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 25123; GFX8-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 25124; GFX8-NEXT: v_add_f32_e32 v1, v1, v2 25125; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 25126; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] 25127; GFX8-NEXT: v_mov_b32_e32 v1, 0x41b17218 25128; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25129; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 25130; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 25131; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 25132; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 25133; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 25134; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25135; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25136; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25137; GFX8-NEXT: s_setpc_b64 s[30:31] 25138; 25139; GFX9-LABEL: v_log_bf16: 25140; GFX9: ; %bb.0: 25141; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25142; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25143; GFX9-NEXT: s_mov_b32 s4, 0x800000 25144; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25145; 
GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 25146; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25147; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 25148; GFX9-NEXT: v_log_f32_e32 v0, v0 25149; GFX9-NEXT: s_mov_b32 s4, 0x3f317217 25150; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 25151; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1 25152; GFX9-NEXT: s_mov_b32 s4, 0x3377d1cf 25153; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2 25154; GFX9-NEXT: s_mov_b32 s4, 0x7f800000 25155; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 25156; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 25157; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] 25158; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218 25159; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25160; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 25161; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 25162; GFX9-NEXT: s_movk_i32 s4, 0x7fff 25163; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 25164; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 25165; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25166; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25167; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25168; GFX9-NEXT: s_setpc_b64 s[30:31] 25169; 25170; GFX10-LABEL: v_log_bf16: 25171; GFX10: ; %bb.0: 25172; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25173; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25174; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 25175; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 25176; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25177; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 25178; GFX10-NEXT: v_log_f32_e32 v0, v0 25179; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 25180; GFX10-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 25181; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 25182; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 25183; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo 25184; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| 25185; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 25186; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 25187; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 25188; GFX10-NEXT: v_or_b32_e32 v2, 
0x400000, v0 25189; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25190; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 25191; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25192; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25193; GFX10-NEXT: s_setpc_b64 s[30:31] 25194; 25195; GFX11-LABEL: v_log_bf16: 25196; GFX11: ; %bb.0: 25197; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25198; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25199; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 25200; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 25201; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 25202; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25203; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25204; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 25205; GFX11-NEXT: v_log_f32_e32 v0, v0 25206; GFX11-NEXT: s_waitcnt_depctr 0xfff 25207; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 25208; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25209; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 25210; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 25211; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 25212; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 25213; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo 25214; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| 25215; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 25216; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25217; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 25218; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 25219; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 25220; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25221; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 25222; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 25223; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25224; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 25225; GFX11-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 25226; GFX11-NEXT: s_setpc_b64 s[30:31] 25227 %op = call bfloat @llvm.log.bf16(bfloat %a) 25228 ret bfloat %op 25229} 25230 25231define bfloat @v_log2_bf16(bfloat %a) { 25232; GCN-LABEL: v_log2_bf16: 25233; GCN: ; %bb.0: 25234; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25235; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 25236; GCN-NEXT: s_mov_b32 s4, 0x800000 25237; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000 25238; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25239; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25240; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 25241; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2 25242; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 25243; GCN-NEXT: v_log_f32_e32 v0, v0 25244; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25245; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 25246; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25247; GCN-NEXT: s_setpc_b64 s[30:31] 25248; 25249; GFX7-LABEL: v_log2_bf16: 25250; GFX7: ; %bb.0: 25251; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25252; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 25253; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25254; GFX7-NEXT: s_mov_b32 s4, 0x800000 25255; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25256; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 25257; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25258; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 25259; GFX7-NEXT: v_log_f32_e32 v0, v0 25260; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000 25261; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25262; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 25263; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25264; GFX7-NEXT: s_setpc_b64 s[30:31] 25265; 25266; GFX8-LABEL: v_log2_bf16: 25267; GFX8: ; %bb.0: 25268; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25269; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25270; GFX8-NEXT: s_mov_b32 s4, 0x800000 25271; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25272; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 25273; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25274; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 25275; 
GFX8-NEXT: v_log_f32_e32 v0, v0 25276; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 25277; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25278; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 25279; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 25280; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 25281; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 25282; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 25283; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25284; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25285; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25286; GFX8-NEXT: s_setpc_b64 s[30:31] 25287; 25288; GFX9-LABEL: v_log2_bf16: 25289; GFX9: ; %bb.0: 25290; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25291; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25292; GFX9-NEXT: s_mov_b32 s4, 0x800000 25293; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25294; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 25295; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 25296; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 25297; GFX9-NEXT: v_log_f32_e32 v0, v0 25298; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 25299; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25300; GFX9-NEXT: s_movk_i32 s4, 0x7fff 25301; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 25302; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 25303; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 25304; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 25305; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25306; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25307; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25308; GFX9-NEXT: s_setpc_b64 s[30:31] 25309; 25310; GFX10-LABEL: v_log2_bf16: 25311; GFX10: ; %bb.0: 25312; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25313; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25314; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 25315; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 25316; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo 25317; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 25318; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 25319; GFX10-NEXT: v_log_f32_e32 v0, v0 25320; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 25321; 
GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 25322; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 25323; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25324; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 25325; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25326; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25327; GFX10-NEXT: s_setpc_b64 s[30:31] 25328; 25329; GFX11-LABEL: v_log2_bf16: 25330; GFX11: ; %bb.0: 25331; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25332; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25333; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 25334; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 25335; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 25336; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo 25337; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 25338; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25339; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 25340; GFX11-NEXT: v_log_f32_e32 v0, v0 25341; GFX11-NEXT: s_waitcnt_depctr 0xfff 25342; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 25343; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 25344; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 25345; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 25346; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25347; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 25348; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25349; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25350; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25351; GFX11-NEXT: s_setpc_b64 s[30:31] 25352 %op = call bfloat @llvm.log2.bf16(bfloat %a) 25353 ret bfloat %op 25354} 25355 25356define bfloat @v_log10_bf16(bfloat %a) { 25357; GCN-LABEL: v_log10_bf16: 25358; GCN: ; %bb.0: 25359; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25360; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 25361; GCN-NEXT: s_mov_b32 s4, 0x800000 25362; GCN-NEXT: s_mov_b32 s5, 0x7f800000 25363; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b 
25364; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25365; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25366; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 25367; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2 25368; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 25369; GCN-NEXT: v_log_f32_e32 v0, v0 25370; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 25371; GCN-NEXT: v_sub_f32_e32 v3, v0, v2 25372; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 25373; GCN-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 25374; GCN-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 25375; GCN-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 25376; GCN-NEXT: v_add_f32_e32 v3, v4, v3 25377; GCN-NEXT: v_add_f32_e32 v3, v5, v3 25378; GCN-NEXT: v_add_f32_e32 v2, v2, v3 25379; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5 25380; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] 25381; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25382; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 25383; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25384; GCN-NEXT: s_setpc_b64 s[30:31] 25385; 25386; GFX7-LABEL: v_log10_bf16: 25387; GFX7: ; %bb.0: 25388; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25389; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 25390; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25391; GFX7-NEXT: s_mov_b32 s4, 0x800000 25392; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25393; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 25394; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25395; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 25396; GFX7-NEXT: v_log_f32_e32 v0, v0 25397; GFX7-NEXT: s_mov_b32 s4, 0x3e9a209a 25398; GFX7-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 25399; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1 25400; GFX7-NEXT: s_mov_b32 s4, 0x3284fbcf 25401; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2 25402; GFX7-NEXT: s_mov_b32 s4, 0x7f800000 25403; GFX7-NEXT: v_add_f32_e32 v1, v1, v2 25404; GFX7-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 25405; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] 25406; GFX7-NEXT: v_mov_b32_e32 v1, 0x411a209b 25407; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25408; GFX7-NEXT: 
v_sub_f32_e32 v0, v0, v1 25409; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25410; GFX7-NEXT: s_setpc_b64 s[30:31] 25411; 25412; GFX8-LABEL: v_log10_bf16: 25413; GFX8: ; %bb.0: 25414; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25415; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25416; GFX8-NEXT: s_mov_b32 s4, 0x800000 25417; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25418; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 25419; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25420; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 25421; GFX8-NEXT: v_log_f32_e32 v0, v0 25422; GFX8-NEXT: s_mov_b32 s4, 0x7f800000 25423; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 25424; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1 25425; GFX8-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v2 25426; GFX8-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v2 25427; GFX8-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 25428; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 25429; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 25430; GFX8-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 25431; GFX8-NEXT: v_add_f32_e32 v1, v1, v2 25432; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 25433; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] 25434; GFX8-NEXT: v_mov_b32_e32 v1, 0x411a209b 25435; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25436; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 25437; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 25438; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 25439; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 25440; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 25441; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25442; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25443; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25444; GFX8-NEXT: s_setpc_b64 s[30:31] 25445; 25446; GFX9-LABEL: v_log10_bf16: 25447; GFX9: ; %bb.0: 25448; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25449; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25450; GFX9-NEXT: s_mov_b32 s4, 0x800000 25451; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25452; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 25453; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25454; 
GFX9-NEXT: v_ldexp_f32 v0, v0, v1 25455; GFX9-NEXT: v_log_f32_e32 v0, v0 25456; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a 25457; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 25458; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1 25459; GFX9-NEXT: s_mov_b32 s4, 0x3284fbcf 25460; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2 25461; GFX9-NEXT: s_mov_b32 s4, 0x7f800000 25462; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 25463; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 25464; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] 25465; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b 25466; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25467; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 25468; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 25469; GFX9-NEXT: s_movk_i32 s4, 0x7fff 25470; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 25471; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 25472; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25473; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25474; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25475; GFX9-NEXT: s_setpc_b64 s[30:31] 25476; 25477; GFX10-LABEL: v_log10_bf16: 25478; GFX10: ; %bb.0: 25479; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25480; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25481; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 25482; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 25483; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25484; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 25485; GFX10-NEXT: v_log_f32_e32 v0, v0 25486; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 25487; GFX10-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 25488; GFX10-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 25489; GFX10-NEXT: v_add_f32_e32 v1, v1, v2 25490; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo 25491; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| 25492; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 25493; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 25494; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 25495; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 25496; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25497; GFX10-NEXT: v_add3_u32 
v1, v1, v0, 0x7fff 25498; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25499; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25500; GFX10-NEXT: s_setpc_b64 s[30:31] 25501; 25502; GFX11-LABEL: v_log10_bf16: 25503; GFX11: ; %bb.0: 25504; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25505; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25506; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 25507; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 25508; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 25509; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 25510; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25511; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 25512; GFX11-NEXT: v_log_f32_e32 v0, v0 25513; GFX11-NEXT: s_waitcnt_depctr 0xfff 25514; GFX11-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 25515; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25516; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 25517; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 25518; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 25519; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 25520; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo 25521; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| 25522; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 25523; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25524; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 25525; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 25526; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 25527; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25528; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 25529; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 25530; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25531; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 25532; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25533; GFX11-NEXT: s_setpc_b64 s[30:31] 25534 %op = call bfloat 
@llvm.log10.bf16(bfloat %a) 25535 ret bfloat %op 25536} 25537 25538declare bfloat @llvm.exp.bf16(bfloat) 25539declare bfloat @llvm.exp2.bf16(bfloat) 25540declare bfloat @llvm.exp10.bf16(bfloat) 25541 25542define bfloat @v_exp_bf16(bfloat %a) { 25543; GCN-LABEL: v_exp_bf16: 25544; GCN: ; %bb.0: 25545; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25546; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 25547; GCN-NEXT: s_mov_b32 s4, 0xc2ce8ed0 25548; GCN-NEXT: s_mov_b32 s5, 0x42b17218 25549; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000 25550; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25551; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v0 25552; GCN-NEXT: v_sub_f32_e32 v3, v0, v0 25553; GCN-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 25554; GCN-NEXT: v_rndne_f32_e32 v5, v2 25555; GCN-NEXT: v_mul_f32_e32 v6, 0x39a3b295, v3 25556; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3 25557; GCN-NEXT: v_sub_f32_e32 v2, v2, v5 25558; GCN-NEXT: v_add_f32_e32 v3, v3, v6 25559; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 25560; GCN-NEXT: v_add_f32_e32 v3, v4, v3 25561; GCN-NEXT: v_add_f32_e32 v2, v2, v3 25562; GCN-NEXT: v_exp_f32_e32 v2, v2 25563; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v5 25564; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 25565; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 25566; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 25567; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25568; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25569; GCN-NEXT: s_setpc_b64 s[30:31] 25570; 25571; GFX7-LABEL: v_exp_bf16: 25572; GFX7: ; %bb.0: 25573; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25574; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 25575; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25576; GFX7-NEXT: s_mov_b32 s4, 0x3fb8aa3b 25577; GFX7-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 25578; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1 25579; GFX7-NEXT: s_mov_b32 s4, 0x32a5705f 25580; GFX7-NEXT: v_rndne_f32_e32 v3, v1 25581; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2 25582; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 25583; GFX7-NEXT: v_add_f32_e32 v1, v1, v2 
25584; GFX7-NEXT: v_exp_f32_e32 v1, v1 25585; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v3 25586; GFX7-NEXT: s_mov_b32 s4, 0xc2ce8ed0 25587; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 25588; GFX7-NEXT: s_mov_b32 s4, 0x42b17218 25589; GFX7-NEXT: v_ldexp_f32_e32 v1, v1, v2 25590; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25591; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000 25592; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 25593; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 25594; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25595; GFX7-NEXT: s_setpc_b64 s[30:31] 25596; 25597; GFX8-LABEL: v_exp_bf16: 25598; GFX8: ; %bb.0: 25599; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25600; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25601; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0 25602; GFX8-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v0 25603; GFX8-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v3 25604; GFX8-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3 25605; GFX8-NEXT: v_rndne_f32_e32 v2, v1 25606; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 25607; GFX8-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 25608; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2 25609; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 25610; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 25611; GFX8-NEXT: v_exp_f32_e32 v1, v1 25612; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2 25613; GFX8-NEXT: s_mov_b32 s4, 0xc2ce8ed0 25614; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 25615; GFX8-NEXT: s_mov_b32 s4, 0x42b17218 25616; GFX8-NEXT: v_ldexp_f32 v1, v1, v2 25617; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25618; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 25619; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 25620; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 25621; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 25622; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 25623; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 25624; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 25625; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25626; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25627; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25628; GFX8-NEXT: s_setpc_b64 
s[30:31] 25629; 25630; GFX9-LABEL: v_exp_bf16: 25631; GFX9: ; %bb.0: 25632; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25633; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25634; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 25635; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b 25636; GFX9-NEXT: v_rndne_f32_e32 v2, v1 25637; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2 25638; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1 25639; GFX9-NEXT: s_mov_b32 s4, 0x32a5705f 25640; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1 25641; GFX9-NEXT: v_add_f32_e32 v1, v3, v1 25642; GFX9-NEXT: v_exp_f32_e32 v1, v1 25643; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 25644; GFX9-NEXT: s_mov_b32 s4, 0xc2ce8ed0 25645; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 25646; GFX9-NEXT: s_mov_b32 s4, 0x42b17218 25647; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 25648; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25649; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 25650; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 25651; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 25652; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 25653; GFX9-NEXT: s_movk_i32 s4, 0x7fff 25654; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 25655; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 25656; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25657; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25658; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25659; GFX9-NEXT: s_setpc_b64 s[30:31] 25660; 25661; GFX10-LABEL: v_exp_bf16: 25662; GFX10: ; %bb.0: 25663; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25664; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25665; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 25666; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0 25667; GFX10-NEXT: v_rndne_f32_e32 v2, v1 25668; GFX10-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1 25669; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2 25670; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3 25671; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 25672; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 25673; GFX10-NEXT: v_exp_f32_e32 v1, v1 25674; GFX10-NEXT: v_ldexp_f32 v1, v1, v2 25675; 
GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo 25676; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0 25677; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo 25678; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 25679; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 25680; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25681; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 25682; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25683; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25684; GFX10-NEXT: s_setpc_b64 s[30:31] 25685; 25686; GFX11-LABEL: v_exp_bf16: 25687; GFX11: ; %bb.0: 25688; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25689; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25690; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25691; GFX11-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 25692; GFX11-NEXT: v_rndne_f32_e32 v2, v1 25693; GFX11-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1 25694; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 25695; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2 25696; GFX11-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3 25697; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2 25698; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0 25699; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 25700; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 25701; GFX11-NEXT: v_exp_f32_e32 v1, v1 25702; GFX11-NEXT: s_waitcnt_depctr 0xfff 25703; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 25704; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 25705; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo 25706; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0 25707; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo 25708; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 25709; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 25710; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 25711; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25712; GFX11-NEXT: v_add3_u32 
v1, v1, v0, 0x7fff 25713; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25714; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25715; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25716; GFX11-NEXT: s_setpc_b64 s[30:31] 25717 %op = call bfloat @llvm.exp.bf16(bfloat %a) 25718 ret bfloat %op 25719} 25720 25721define bfloat @v_exp2_bf16(bfloat %a) { 25722; GCN-LABEL: v_exp2_bf16: 25723; GCN: ; %bb.0: 25724; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25725; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 25726; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000 25727; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000 25728; GCN-NEXT: v_not_b32_e32 v2, 63 25729; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25730; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25731; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25732; GCN-NEXT: v_add_f32_e32 v0, v0, v1 25733; GCN-NEXT: v_exp_f32_e32 v0, v0 25734; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc 25735; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 25736; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25737; GCN-NEXT: s_setpc_b64 s[30:31] 25738; 25739; GFX7-LABEL: v_exp2_bf16: 25740; GFX7: ; %bb.0: 25741; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25742; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 25743; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25744; GFX7-NEXT: s_mov_b32 s4, 0xc2fc0000 25745; GFX7-NEXT: v_mov_b32_e32 v1, 0x42800000 25746; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25747; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25748; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 25749; GFX7-NEXT: v_exp_f32_e32 v0, v0 25750; GFX7-NEXT: v_not_b32_e32 v1, 63 25751; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25752; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 25753; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25754; GFX7-NEXT: s_setpc_b64 s[30:31] 25755; 25756; GFX8-LABEL: v_exp2_bf16: 25757; GFX8: ; %bb.0: 25758; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25759; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25760; GFX8-NEXT: s_mov_b32 s4, 0xc2fc0000 25761; 
GFX8-NEXT: v_mov_b32_e32 v1, 0x42800000 25762; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25763; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25764; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 25765; GFX8-NEXT: v_exp_f32_e32 v0, v0 25766; GFX8-NEXT: v_not_b32_e32 v1, 63 25767; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25768; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 25769; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 25770; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 25771; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 25772; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 25773; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25774; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25775; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25776; GFX8-NEXT: s_setpc_b64 s[30:31] 25777; 25778; GFX9-LABEL: v_exp2_bf16: 25779; GFX9: ; %bb.0: 25780; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25781; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25782; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 25783; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 25784; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 25785; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 25786; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 25787; GFX9-NEXT: v_exp_f32_e32 v0, v0 25788; GFX9-NEXT: v_not_b32_e32 v1, 63 25789; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25790; GFX9-NEXT: s_movk_i32 s4, 0x7fff 25791; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 25792; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 25793; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 25794; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 25795; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25796; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25797; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25798; GFX9-NEXT: s_setpc_b64 s[30:31] 25799; 25800; GFX10-LABEL: v_exp2_bf16: 25801; GFX10: ; %bb.0: 25802; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25803; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25804; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 25805; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo 25806; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, 
vcc_lo 25807; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 25808; GFX10-NEXT: v_exp_f32_e32 v0, v0 25809; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 25810; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 25811; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 25812; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25813; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 25814; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25815; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25816; GFX10-NEXT: s_setpc_b64 s[30:31] 25817; 25818; GFX11-LABEL: v_exp2_bf16: 25819; GFX11: ; %bb.0: 25820; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25821; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25822; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 25823; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 25824; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo 25825; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo 25826; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 25827; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 25828; GFX11-NEXT: v_exp_f32_e32 v0, v0 25829; GFX11-NEXT: s_waitcnt_depctr 0xfff 25830; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 25831; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 25832; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 25833; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25834; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 25835; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 25836; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25837; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 25838; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25839; GFX11-NEXT: s_setpc_b64 s[30:31] 25840 %op = call bfloat @llvm.exp2.bf16(bfloat %a) 25841 ret bfloat %op 25842} 25843 25844define bfloat @v_exp10_bf16(bfloat %a) { 25845; GCN-LABEL: v_exp10_bf16: 25846; GCN: ; %bb.0: 25847; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25848; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 25849; GCN-NEXT: s_mov_b32 s4, 0xc23369f4 25850; GCN-NEXT: 
s_mov_b32 s5, 0x421a209b 25851; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000 25852; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25853; GCN-NEXT: v_mul_f32_e32 v2, 0x40549000, v0 25854; GCN-NEXT: v_sub_f32_e32 v3, v0, v0 25855; GCN-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 25856; GCN-NEXT: v_rndne_f32_e32 v5, v2 25857; GCN-NEXT: v_mul_f32_e32 v6, 0x3a2784bc, v3 25858; GCN-NEXT: v_mul_f32_e32 v3, 0x40549000, v3 25859; GCN-NEXT: v_sub_f32_e32 v2, v2, v5 25860; GCN-NEXT: v_add_f32_e32 v3, v3, v6 25861; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 25862; GCN-NEXT: v_add_f32_e32 v3, v4, v3 25863; GCN-NEXT: v_add_f32_e32 v2, v2, v3 25864; GCN-NEXT: v_exp_f32_e32 v2, v2 25865; GCN-NEXT: v_ldexp_f32_e32 v2, v2, v5 25866; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 25867; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 25868; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 25869; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25870; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25871; GCN-NEXT: s_setpc_b64 s[30:31] 25872; 25873; GFX7-LABEL: v_exp10_bf16: 25874; GFX7: ; %bb.0: 25875; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25876; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 25877; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25878; GFX7-NEXT: s_mov_b32 s4, 0x40549a78 25879; GFX7-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 25880; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1 25881; GFX7-NEXT: s_mov_b32 s4, 0x33979a37 25882; GFX7-NEXT: v_rndne_f32_e32 v3, v1 25883; GFX7-NEXT: v_fma_f32 v2, v0, s4, v2 25884; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 25885; GFX7-NEXT: v_add_f32_e32 v1, v1, v2 25886; GFX7-NEXT: v_exp_f32_e32 v1, v1 25887; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v3 25888; GFX7-NEXT: s_mov_b32 s4, 0xc23369f4 25889; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 25890; GFX7-NEXT: s_mov_b32 s4, 0x421a209b 25891; GFX7-NEXT: v_ldexp_f32_e32 v1, v1, v2 25892; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25893; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000 25894; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 25895; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, 
vcc 25896; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 25897; GFX7-NEXT: s_setpc_b64 s[30:31] 25898; 25899; GFX8-LABEL: v_exp10_bf16: 25900; GFX8: ; %bb.0: 25901; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25902; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25903; GFX8-NEXT: v_sub_f32_e32 v3, v0, v0 25904; GFX8-NEXT: v_mul_f32_e32 v1, 0x40549000, v0 25905; GFX8-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v3 25906; GFX8-NEXT: v_mul_f32_e32 v3, 0x40549000, v3 25907; GFX8-NEXT: v_rndne_f32_e32 v2, v1 25908; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 25909; GFX8-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 25910; GFX8-NEXT: v_sub_f32_e32 v1, v1, v2 25911; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 25912; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 25913; GFX8-NEXT: v_exp_f32_e32 v1, v1 25914; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2 25915; GFX8-NEXT: s_mov_b32 s4, 0xc23369f4 25916; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 25917; GFX8-NEXT: s_mov_b32 s4, 0x421a209b 25918; GFX8-NEXT: v_ldexp_f32 v1, v1, v2 25919; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25920; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 25921; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 25922; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 25923; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 25924; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 25925; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 25926; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 25927; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25928; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25929; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25930; GFX8-NEXT: s_setpc_b64 s[30:31] 25931; 25932; GFX9-LABEL: v_exp10_bf16: 25933; GFX9: ; %bb.0: 25934; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25935; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25936; GFX9-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 25937; GFX9-NEXT: s_mov_b32 s4, 0x40549a78 25938; GFX9-NEXT: v_rndne_f32_e32 v2, v1 25939; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2 25940; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1 25941; GFX9-NEXT: s_mov_b32 s4, 0x33979a37 25942; GFX9-NEXT: 
v_fma_f32 v1, v0, s4, v1 25943; GFX9-NEXT: v_add_f32_e32 v1, v3, v1 25944; GFX9-NEXT: v_exp_f32_e32 v1, v1 25945; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 25946; GFX9-NEXT: s_mov_b32 s4, 0xc23369f4 25947; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 25948; GFX9-NEXT: s_mov_b32 s4, 0x421a209b 25949; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 25950; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 25951; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 25952; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 25953; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 25954; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 25955; GFX9-NEXT: s_movk_i32 s4, 0x7fff 25956; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 25957; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 25958; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 25959; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 25960; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 25961; GFX9-NEXT: s_setpc_b64 s[30:31] 25962; 25963; GFX10-LABEL: v_exp10_bf16: 25964; GFX10: ; %bb.0: 25965; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25966; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25967; GFX10-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 25968; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0 25969; GFX10-NEXT: v_rndne_f32_e32 v2, v1 25970; GFX10-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1 25971; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2 25972; GFX10-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3 25973; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 25974; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 25975; GFX10-NEXT: v_exp_f32_e32 v1, v1 25976; GFX10-NEXT: v_ldexp_f32 v1, v1, v2 25977; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo 25978; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0 25979; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo 25980; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 25981; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 25982; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 25983; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 25984; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 25985; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, 
v0 25986; GFX10-NEXT: s_setpc_b64 s[30:31] 25987; 25988; GFX11-LABEL: v_exp10_bf16: 25989; GFX11: ; %bb.0: 25990; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25991; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 25992; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 25993; GFX11-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 25994; GFX11-NEXT: v_rndne_f32_e32 v2, v1 25995; GFX11-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1 25996; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 25997; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2 25998; GFX11-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3 25999; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2 26000; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0 26001; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 26002; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 26003; GFX11-NEXT: v_exp_f32_e32 v1, v1 26004; GFX11-NEXT: s_waitcnt_depctr 0xfff 26005; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 26006; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 26007; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo 26008; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0 26009; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo 26010; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 26011; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 26012; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 26013; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26014; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26015; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26016; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26017; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26018; GFX11-NEXT: s_setpc_b64 s[30:31] 26019 %op = call bfloat @llvm.exp10.bf16(bfloat %a) 26020 ret bfloat %op 26021} 26022 26023declare bfloat @llvm.ceil.bf16(bfloat) 26024 26025define bfloat @v_ceil_bf16(bfloat %a) { 26026; GCN-LABEL: 
v_ceil_bf16: 26027; GCN: ; %bb.0: 26028; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26029; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26030; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26031; GCN-NEXT: v_ceil_f32_e32 v0, v0 26032; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26033; GCN-NEXT: s_setpc_b64 s[30:31] 26034; 26035; GFX7-LABEL: v_ceil_bf16: 26036; GFX7: ; %bb.0: 26037; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26038; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26039; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26040; GFX7-NEXT: v_ceil_f32_e32 v0, v0 26041; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26042; GFX7-NEXT: s_setpc_b64 s[30:31] 26043; 26044; GFX8-LABEL: v_ceil_bf16: 26045; GFX8: ; %bb.0: 26046; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26047; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26048; GFX8-NEXT: v_ceil_f32_e32 v0, v0 26049; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 26050; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 26051; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 26052; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 26053; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26054; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26055; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26056; GFX8-NEXT: s_setpc_b64 s[30:31] 26057; 26058; GFX9-LABEL: v_ceil_bf16: 26059; GFX9: ; %bb.0: 26060; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26061; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26062; GFX9-NEXT: v_ceil_f32_e32 v0, v0 26063; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 26064; GFX9-NEXT: s_movk_i32 s4, 0x7fff 26065; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 26066; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 26067; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26068; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26069; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26070; GFX9-NEXT: s_setpc_b64 s[30:31] 26071; 26072; GFX10-LABEL: v_ceil_bf16: 26073; GFX10: ; %bb.0: 26074; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26075; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26076; GFX10-NEXT: 
v_ceil_f32_e32 v0, v0 26077; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 26078; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 26079; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26080; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26081; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26082; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26083; GFX10-NEXT: s_setpc_b64 s[30:31] 26084; 26085; GFX11-LABEL: v_ceil_bf16: 26086; GFX11: ; %bb.0: 26087; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26088; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26089; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26090; GFX11-NEXT: v_ceil_f32_e32 v0, v0 26091; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 26092; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 26093; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26094; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 26095; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26096; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26097; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26098; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26099; GFX11-NEXT: s_setpc_b64 s[30:31] 26100 %op = call bfloat @llvm.ceil.bf16(bfloat %a) 26101 ret bfloat %op 26102} 26103 26104declare bfloat @llvm.trunc.bf16(bfloat) 26105 26106define bfloat @v_trunc_bf16(bfloat %a) { 26107; GCN-LABEL: v_trunc_bf16: 26108; GCN: ; %bb.0: 26109; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26110; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26111; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26112; GCN-NEXT: v_trunc_f32_e32 v0, v0 26113; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26114; GCN-NEXT: s_setpc_b64 s[30:31] 26115; 26116; GFX7-LABEL: v_trunc_bf16: 26117; GFX7: ; %bb.0: 26118; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26119; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26120; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26121; GFX7-NEXT: v_trunc_f32_e32 v0, v0 26122; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26123; GFX7-NEXT: s_setpc_b64 s[30:31] 26124; 
26125; GFX8-LABEL: v_trunc_bf16: 26126; GFX8: ; %bb.0: 26127; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26128; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26129; GFX8-NEXT: v_trunc_f32_e32 v0, v0 26130; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 26131; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 26132; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 26133; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 26134; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26135; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26136; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26137; GFX8-NEXT: s_setpc_b64 s[30:31] 26138; 26139; GFX9-LABEL: v_trunc_bf16: 26140; GFX9: ; %bb.0: 26141; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26142; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26143; GFX9-NEXT: v_trunc_f32_e32 v0, v0 26144; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 26145; GFX9-NEXT: s_movk_i32 s4, 0x7fff 26146; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 26147; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 26148; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26149; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26150; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26151; GFX9-NEXT: s_setpc_b64 s[30:31] 26152; 26153; GFX10-LABEL: v_trunc_bf16: 26154; GFX10: ; %bb.0: 26155; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26156; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26157; GFX10-NEXT: v_trunc_f32_e32 v0, v0 26158; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 26159; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 26160; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26161; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26162; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26163; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26164; GFX10-NEXT: s_setpc_b64 s[30:31] 26165; 26166; GFX11-LABEL: v_trunc_bf16: 26167; GFX11: ; %bb.0: 26168; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26169; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26170; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26171; GFX11-NEXT: v_trunc_f32_e32 v0, v0 26172; 
GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 26173; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 26174; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26175; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 26176; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26177; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26178; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26179; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26180; GFX11-NEXT: s_setpc_b64 s[30:31] 26181 %op = call bfloat @llvm.trunc.bf16(bfloat %a) 26182 ret bfloat %op 26183} 26184 26185declare bfloat @llvm.rint.bf16(bfloat) 26186 26187define bfloat @v_rint_bf16(bfloat %a) { 26188; GCN-LABEL: v_rint_bf16: 26189; GCN: ; %bb.0: 26190; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26191; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26192; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26193; GCN-NEXT: v_rndne_f32_e32 v0, v0 26194; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26195; GCN-NEXT: s_setpc_b64 s[30:31] 26196; 26197; GFX7-LABEL: v_rint_bf16: 26198; GFX7: ; %bb.0: 26199; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26200; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26201; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26202; GFX7-NEXT: v_rndne_f32_e32 v0, v0 26203; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26204; GFX7-NEXT: s_setpc_b64 s[30:31] 26205; 26206; GFX8-LABEL: v_rint_bf16: 26207; GFX8: ; %bb.0: 26208; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26209; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26210; GFX8-NEXT: v_rndne_f32_e32 v0, v0 26211; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 26212; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 26213; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 26214; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 26215; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26216; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26217; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26218; GFX8-NEXT: s_setpc_b64 s[30:31] 26219; 26220; GFX9-LABEL: v_rint_bf16: 26221; GFX9: ; %bb.0: 26222; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 26223; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26224; GFX9-NEXT: v_rndne_f32_e32 v0, v0 26225; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 26226; GFX9-NEXT: s_movk_i32 s4, 0x7fff 26227; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 26228; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 26229; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26230; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26231; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26232; GFX9-NEXT: s_setpc_b64 s[30:31] 26233; 26234; GFX10-LABEL: v_rint_bf16: 26235; GFX10: ; %bb.0: 26236; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26237; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26238; GFX10-NEXT: v_rndne_f32_e32 v0, v0 26239; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 26240; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 26241; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26242; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26243; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26244; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26245; GFX10-NEXT: s_setpc_b64 s[30:31] 26246; 26247; GFX11-LABEL: v_rint_bf16: 26248; GFX11: ; %bb.0: 26249; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26250; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26251; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26252; GFX11-NEXT: v_rndne_f32_e32 v0, v0 26253; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 26254; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 26255; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26256; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 26257; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26258; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26259; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26260; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26261; GFX11-NEXT: s_setpc_b64 s[30:31] 26262 %op = call bfloat @llvm.rint.bf16(bfloat %a) 26263 ret bfloat %op 26264} 26265 26266declare bfloat @llvm.nearbyint.bf16(bfloat) 26267 26268define bfloat @v_nearbyint_bf16(bfloat %a) { 26269; 
GCN-LABEL: v_nearbyint_bf16: 26270; GCN: ; %bb.0: 26271; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26272; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26273; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26274; GCN-NEXT: v_rndne_f32_e32 v0, v0 26275; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26276; GCN-NEXT: s_setpc_b64 s[30:31] 26277; 26278; GFX7-LABEL: v_nearbyint_bf16: 26279; GFX7: ; %bb.0: 26280; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26281; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26282; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26283; GFX7-NEXT: v_rndne_f32_e32 v0, v0 26284; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26285; GFX7-NEXT: s_setpc_b64 s[30:31] 26286; 26287; GFX8-LABEL: v_nearbyint_bf16: 26288; GFX8: ; %bb.0: 26289; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26290; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26291; GFX8-NEXT: v_rndne_f32_e32 v0, v0 26292; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 26293; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 26294; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 26295; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 26296; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26297; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26298; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26299; GFX8-NEXT: s_setpc_b64 s[30:31] 26300; 26301; GFX9-LABEL: v_nearbyint_bf16: 26302; GFX9: ; %bb.0: 26303; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26304; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26305; GFX9-NEXT: v_rndne_f32_e32 v0, v0 26306; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 26307; GFX9-NEXT: s_movk_i32 s4, 0x7fff 26308; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 26309; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 26310; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26311; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26312; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26313; GFX9-NEXT: s_setpc_b64 s[30:31] 26314; 26315; GFX10-LABEL: v_nearbyint_bf16: 26316; GFX10: ; %bb.0: 26317; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26318; GFX10-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 26319; GFX10-NEXT: v_rndne_f32_e32 v0, v0 26320; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 26321; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 26322; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26323; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26324; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26325; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26326; GFX10-NEXT: s_setpc_b64 s[30:31] 26327; 26328; GFX11-LABEL: v_nearbyint_bf16: 26329; GFX11: ; %bb.0: 26330; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26331; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26332; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26333; GFX11-NEXT: v_rndne_f32_e32 v0, v0 26334; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 26335; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 26336; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26337; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 26338; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26339; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26340; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26341; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26342; GFX11-NEXT: s_setpc_b64 s[30:31] 26343 %op = call bfloat @llvm.nearbyint.bf16(bfloat %a) 26344 ret bfloat %op 26345} 26346 26347declare bfloat @llvm.round.bf16(bfloat) 26348 26349define bfloat @v_round_bf16(bfloat %a) { 26350; GCN-LABEL: v_round_bf16: 26351; GCN: ; %bb.0: 26352; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26353; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26354; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26355; GCN-NEXT: v_trunc_f32_e32 v1, v0 26356; GCN-NEXT: v_sub_f32_e32 v2, v0, v1 26357; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 26358; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] 26359; GCN-NEXT: s_brev_b32 s4, -2 26360; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0 26361; GCN-NEXT: v_add_f32_e32 v0, v1, v0 26362; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26363; GCN-NEXT: s_setpc_b64 s[30:31] 26364; 26365; GFX7-LABEL: v_round_bf16: 
26366; GFX7: ; %bb.0: 26367; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26368; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26369; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26370; GFX7-NEXT: v_trunc_f32_e32 v1, v0 26371; GFX7-NEXT: v_sub_f32_e32 v2, v0, v1 26372; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 26373; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] 26374; GFX7-NEXT: s_brev_b32 s4, -2 26375; GFX7-NEXT: v_bfi_b32 v0, s4, v2, v0 26376; GFX7-NEXT: v_add_f32_e32 v0, v1, v0 26377; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26378; GFX7-NEXT: s_setpc_b64 s[30:31] 26379; 26380; GFX8-LABEL: v_round_bf16: 26381; GFX8: ; %bb.0: 26382; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26383; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26384; GFX8-NEXT: v_trunc_f32_e32 v1, v0 26385; GFX8-NEXT: v_sub_f32_e32 v2, v0, v1 26386; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 26387; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] 26388; GFX8-NEXT: s_brev_b32 s4, -2 26389; GFX8-NEXT: v_bfi_b32 v0, s4, v2, v0 26390; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 26391; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 26392; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 26393; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 26394; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 26395; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26396; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26397; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26398; GFX8-NEXT: s_setpc_b64 s[30:31] 26399; 26400; GFX9-LABEL: v_round_bf16: 26401; GFX9: ; %bb.0: 26402; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26403; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26404; GFX9-NEXT: v_trunc_f32_e32 v1, v0 26405; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1 26406; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 26407; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] 26408; GFX9-NEXT: s_brev_b32 s4, -2 26409; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0 26410; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 26411; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 26412; GFX9-NEXT: s_movk_i32 s4, 0x7fff 
26413; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 26414; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 26415; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26416; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26417; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26418; GFX9-NEXT: s_setpc_b64 s[30:31] 26419; 26420; GFX10-LABEL: v_round_bf16: 26421; GFX10: ; %bb.0: 26422; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26423; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26424; GFX10-NEXT: v_trunc_f32_e32 v1, v0 26425; GFX10-NEXT: v_sub_f32_e32 v2, v0, v1 26426; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5 26427; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4 26428; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 26429; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 26430; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 26431; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 26432; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26433; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26434; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26435; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26436; GFX10-NEXT: s_setpc_b64 s[30:31] 26437; 26438; GFX11-LABEL: v_round_bf16: 26439; GFX11: ; %bb.0: 26440; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26441; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26442; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26443; GFX11-NEXT: v_trunc_f32_e32 v1, v0 26444; GFX11-NEXT: v_sub_f32_e32 v2, v0, v1 26445; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26446; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5 26447; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 26448; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26449; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 26450; GFX11-NEXT: v_add_f32_e32 v0, v1, v0 26451; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 26452; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 26453; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 26454; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v0, v0 26455; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26456; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26457; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26458; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26459; GFX11-NEXT: s_setpc_b64 s[30:31] 26460 %op = call bfloat @llvm.round.bf16(bfloat %a) 26461 ret bfloat %op 26462} 26463 26464declare bfloat @llvm.roundeven.bf16(bfloat) 26465 26466define bfloat @v_roundeven_bf16(bfloat %a) { 26467; GCN-LABEL: v_roundeven_bf16: 26468; GCN: ; %bb.0: 26469; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26470; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26471; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26472; GCN-NEXT: v_rndne_f32_e32 v0, v0 26473; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26474; GCN-NEXT: s_setpc_b64 s[30:31] 26475; 26476; GFX7-LABEL: v_roundeven_bf16: 26477; GFX7: ; %bb.0: 26478; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26479; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26480; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26481; GFX7-NEXT: v_rndne_f32_e32 v0, v0 26482; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26483; GFX7-NEXT: s_setpc_b64 s[30:31] 26484; 26485; GFX8-LABEL: v_roundeven_bf16: 26486; GFX8: ; %bb.0: 26487; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26488; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26489; GFX8-NEXT: v_rndne_f32_e32 v0, v0 26490; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 26491; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 26492; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 26493; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 26494; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26495; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26496; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26497; GFX8-NEXT: s_setpc_b64 s[30:31] 26498; 26499; GFX9-LABEL: v_roundeven_bf16: 26500; GFX9: ; %bb.0: 26501; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26502; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26503; GFX9-NEXT: v_rndne_f32_e32 v0, v0 26504; 
GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 26505; GFX9-NEXT: s_movk_i32 s4, 0x7fff 26506; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 26507; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 26508; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26509; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26510; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26511; GFX9-NEXT: s_setpc_b64 s[30:31] 26512; 26513; GFX10-LABEL: v_roundeven_bf16: 26514; GFX10: ; %bb.0: 26515; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26516; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26517; GFX10-NEXT: v_rndne_f32_e32 v0, v0 26518; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 26519; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 26520; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26521; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26522; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26523; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26524; GFX10-NEXT: s_setpc_b64 s[30:31] 26525; 26526; GFX11-LABEL: v_roundeven_bf16: 26527; GFX11: ; %bb.0: 26528; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26529; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26530; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26531; GFX11-NEXT: v_rndne_f32_e32 v0, v0 26532; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 26533; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 26534; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26535; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 26536; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26537; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26538; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26539; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26540; GFX11-NEXT: s_setpc_b64 s[30:31] 26541 %op = call bfloat @llvm.roundeven.bf16(bfloat %a) 26542 ret bfloat %op 26543} 26544 26545declare bfloat @llvm.floor.bf16(bfloat) 26546 26547define bfloat @v_floor_bf16(bfloat %a) { 26548; GCN-LABEL: v_floor_bf16: 26549; GCN: ; %bb.0: 26550; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26551; GCN-NEXT: 
v_mul_f32_e32 v0, 1.0, v0 26552; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26553; GCN-NEXT: v_floor_f32_e32 v0, v0 26554; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26555; GCN-NEXT: s_setpc_b64 s[30:31] 26556; 26557; GFX7-LABEL: v_floor_bf16: 26558; GFX7: ; %bb.0: 26559; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26560; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26561; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26562; GFX7-NEXT: v_floor_f32_e32 v0, v0 26563; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26564; GFX7-NEXT: s_setpc_b64 s[30:31] 26565; 26566; GFX8-LABEL: v_floor_bf16: 26567; GFX8: ; %bb.0: 26568; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26569; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26570; GFX8-NEXT: v_floor_f32_e32 v0, v0 26571; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 26572; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 26573; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 26574; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 26575; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26576; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26577; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26578; GFX8-NEXT: s_setpc_b64 s[30:31] 26579; 26580; GFX9-LABEL: v_floor_bf16: 26581; GFX9: ; %bb.0: 26582; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26583; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26584; GFX9-NEXT: v_floor_f32_e32 v0, v0 26585; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 26586; GFX9-NEXT: s_movk_i32 s4, 0x7fff 26587; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 26588; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 26589; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26590; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26591; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26592; GFX9-NEXT: s_setpc_b64 s[30:31] 26593; 26594; GFX10-LABEL: v_floor_bf16: 26595; GFX10: ; %bb.0: 26596; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26597; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26598; GFX10-NEXT: v_floor_f32_e32 v0, v0 26599; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 26600; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, 
v0 26601; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26602; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26603; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26604; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26605; GFX10-NEXT: s_setpc_b64 s[30:31] 26606; 26607; GFX11-LABEL: v_floor_bf16: 26608; GFX11: ; %bb.0: 26609; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26610; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26611; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26612; GFX11-NEXT: v_floor_f32_e32 v0, v0 26613; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 26614; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 26615; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26616; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 26617; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26618; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26619; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26620; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26621; GFX11-NEXT: s_setpc_b64 s[30:31] 26622 %op = call bfloat @llvm.floor.bf16(bfloat %a) 26623 ret bfloat %op 26624} 26625 26626declare bfloat @llvm.canonicalize.bf16(bfloat) 26627 26628define bfloat @v_canonicalize_bf16(bfloat %a) { 26629; GCN-LABEL: v_canonicalize_bf16: 26630; GCN: ; %bb.0: 26631; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26632; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26633; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26634; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26635; GCN-NEXT: s_setpc_b64 s[30:31] 26636; 26637; GFX7-LABEL: v_canonicalize_bf16: 26638; GFX7: ; %bb.0: 26639; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26640; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26641; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26642; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26643; GFX7-NEXT: s_setpc_b64 s[30:31] 26644; 26645; GFX8-LABEL: v_canonicalize_bf16: 26646; GFX8: ; %bb.0: 26647; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26648; GFX8-NEXT: v_lshlrev_b32_e32 v0, 
16, v0 26649; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 26650; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 26651; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 26652; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 26653; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 26654; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26655; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26656; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26657; GFX8-NEXT: s_setpc_b64 s[30:31] 26658; 26659; GFX9-LABEL: v_canonicalize_bf16: 26660; GFX9: ; %bb.0: 26661; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26662; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26663; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 26664; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 26665; GFX9-NEXT: s_movk_i32 s4, 0x7fff 26666; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 26667; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 26668; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 26669; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 26670; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26671; GFX9-NEXT: s_setpc_b64 s[30:31] 26672; 26673; GFX10-LABEL: v_canonicalize_bf16: 26674; GFX10: ; %bb.0: 26675; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26676; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26677; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 26678; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 26679; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 26680; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 26681; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26682; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26683; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26684; GFX10-NEXT: s_setpc_b64 s[30:31] 26685; 26686; GFX11-LABEL: v_canonicalize_bf16: 26687; GFX11: ; %bb.0: 26688; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26689; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26690; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 26691; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 26692; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 26693; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 26694; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v0, v0 26695; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 26696; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 26697; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 26698; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26699; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 26700; GFX11-NEXT: s_setpc_b64 s[30:31] 26701 %op = call bfloat @llvm.canonicalize.bf16(bfloat %a) 26702 ret bfloat %op 26703} 26704 26705declare bfloat @llvm.arithmetic.fence.bf16(bfloat) 26706 26707; FIXME: Promotion broken 26708; define bfloat @v_arithmetic_fence_bf16(bfloat %a) { 26709; %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a) 26710; ret bfloat %op 26711; } 26712 26713define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) { 26714; GCN-LABEL: v_fcmp_false_bf16: 26715; GCN: ; %bb.0: 26716; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26717; GCN-NEXT: v_mov_b32_e32 v0, 0 26718; GCN-NEXT: s_setpc_b64 s[30:31] 26719; 26720; GFX7-LABEL: v_fcmp_false_bf16: 26721; GFX7: ; %bb.0: 26722; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26723; GFX7-NEXT: v_mov_b32_e32 v0, 0 26724; GFX7-NEXT: s_setpc_b64 s[30:31] 26725; 26726; GFX8-LABEL: v_fcmp_false_bf16: 26727; GFX8: ; %bb.0: 26728; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26729; GFX8-NEXT: v_mov_b32_e32 v0, 0 26730; GFX8-NEXT: s_setpc_b64 s[30:31] 26731; 26732; GFX9-LABEL: v_fcmp_false_bf16: 26733; GFX9: ; %bb.0: 26734; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26735; GFX9-NEXT: v_mov_b32_e32 v0, 0 26736; GFX9-NEXT: s_setpc_b64 s[30:31] 26737; 26738; GFX10-LABEL: v_fcmp_false_bf16: 26739; GFX10: ; %bb.0: 26740; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26741; GFX10-NEXT: v_mov_b32_e32 v0, 0 26742; GFX10-NEXT: s_setpc_b64 s[30:31] 26743; 26744; GFX11-LABEL: v_fcmp_false_bf16: 26745; GFX11: ; %bb.0: 26746; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26747; GFX11-NEXT: v_mov_b32_e32 v0, 0 26748; GFX11-NEXT: s_setpc_b64 s[30:31] 26749 %op = fcmp false bfloat %a, %b 
26750 ret i1 %op 26751} 26752 26753define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { 26754; GCN-LABEL: v_fcmp_oeq_bf16: 26755; GCN: ; %bb.0: 26756; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26757; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26758; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 26759; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 26760; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26761; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 26762; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26763; GCN-NEXT: s_setpc_b64 s[30:31] 26764; 26765; GFX7-LABEL: v_fcmp_oeq_bf16: 26766; GFX7: ; %bb.0: 26767; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26768; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26769; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 26770; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 26771; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26772; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 26773; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26774; GFX7-NEXT: s_setpc_b64 s[30:31] 26775; 26776; GFX8-LABEL: v_fcmp_oeq_bf16: 26777; GFX8: ; %bb.0: 26778; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26779; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26780; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26781; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 26782; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26783; GFX8-NEXT: s_setpc_b64 s[30:31] 26784; 26785; GFX9-LABEL: v_fcmp_oeq_bf16: 26786; GFX9: ; %bb.0: 26787; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26788; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26789; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26790; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 26791; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26792; GFX9-NEXT: s_setpc_b64 s[30:31] 26793; 26794; GFX10-LABEL: v_fcmp_oeq_bf16: 26795; GFX10: ; %bb.0: 26796; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26797; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26798; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26799; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 26800; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 
26801; GFX10-NEXT: s_setpc_b64 s[30:31] 26802; 26803; GFX11-LABEL: v_fcmp_oeq_bf16: 26804; GFX11: ; %bb.0: 26805; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26806; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26807; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26808; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26809; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 26810; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 26811; GFX11-NEXT: s_setpc_b64 s[30:31] 26812 %op = fcmp oeq bfloat %a, %b 26813 ret i1 %op 26814} 26815 26816define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { 26817; GCN-LABEL: v_fcmp_ogt_bf16: 26818; GCN: ; %bb.0: 26819; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26820; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26821; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 26822; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 26823; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26824; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 26825; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26826; GCN-NEXT: s_setpc_b64 s[30:31] 26827; 26828; GFX7-LABEL: v_fcmp_ogt_bf16: 26829; GFX7: ; %bb.0: 26830; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26831; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26832; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 26833; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 26834; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26835; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 26836; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26837; GFX7-NEXT: s_setpc_b64 s[30:31] 26838; 26839; GFX8-LABEL: v_fcmp_ogt_bf16: 26840; GFX8: ; %bb.0: 26841; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26842; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26843; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26844; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 26845; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26846; GFX8-NEXT: s_setpc_b64 s[30:31] 26847; 26848; GFX9-LABEL: v_fcmp_ogt_bf16: 26849; GFX9: ; %bb.0: 26850; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26851; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26852; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26853; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 26854; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26855; GFX9-NEXT: s_setpc_b64 s[30:31] 26856; 26857; GFX10-LABEL: v_fcmp_ogt_bf16: 26858; GFX10: ; %bb.0: 26859; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26860; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26861; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26862; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 26863; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 26864; GFX10-NEXT: s_setpc_b64 s[30:31] 26865; 26866; GFX11-LABEL: v_fcmp_ogt_bf16: 26867; GFX11: ; %bb.0: 26868; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26869; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26870; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26871; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26872; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 26873; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 26874; GFX11-NEXT: s_setpc_b64 s[30:31] 26875 %op = fcmp ogt bfloat %a, %b 26876 ret i1 %op 26877} 26878 26879define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { 26880; GCN-LABEL: v_fcmp_oge_bf16: 26881; GCN: ; %bb.0: 26882; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26883; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26884; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 26885; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 26886; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26887; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 26888; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26889; GCN-NEXT: s_setpc_b64 s[30:31] 26890; 26891; GFX7-LABEL: v_fcmp_oge_bf16: 26892; GFX7: ; %bb.0: 26893; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26894; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26895; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 26896; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 26897; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26898; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 26899; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26900; GFX7-NEXT: s_setpc_b64 s[30:31] 26901; 26902; GFX8-LABEL: 
v_fcmp_oge_bf16: 26903; GFX8: ; %bb.0: 26904; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26905; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26906; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26907; GFX8-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 26908; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26909; GFX8-NEXT: s_setpc_b64 s[30:31] 26910; 26911; GFX9-LABEL: v_fcmp_oge_bf16: 26912; GFX9: ; %bb.0: 26913; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26914; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26915; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26916; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 26917; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26918; GFX9-NEXT: s_setpc_b64 s[30:31] 26919; 26920; GFX10-LABEL: v_fcmp_oge_bf16: 26921; GFX10: ; %bb.0: 26922; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26923; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26924; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26925; GFX10-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 26926; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 26927; GFX10-NEXT: s_setpc_b64 s[30:31] 26928; 26929; GFX11-LABEL: v_fcmp_oge_bf16: 26930; GFX11: ; %bb.0: 26931; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26932; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26933; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26934; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26935; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 26936; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 26937; GFX11-NEXT: s_setpc_b64 s[30:31] 26938 %op = fcmp oge bfloat %a, %b 26939 ret i1 %op 26940} 26941 26942define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { 26943; GCN-LABEL: v_fcmp_olt_bf16: 26944; GCN: ; %bb.0: 26945; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26946; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 26947; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 26948; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 26949; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26950; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 26951; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26952; GCN-NEXT: 
s_setpc_b64 s[30:31] 26953; 26954; GFX7-LABEL: v_fcmp_olt_bf16: 26955; GFX7: ; %bb.0: 26956; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26957; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 26958; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 26959; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 26960; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 26961; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 26962; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26963; GFX7-NEXT: s_setpc_b64 s[30:31] 26964; 26965; GFX8-LABEL: v_fcmp_olt_bf16: 26966; GFX8: ; %bb.0: 26967; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26968; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26969; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26970; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 26971; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26972; GFX8-NEXT: s_setpc_b64 s[30:31] 26973; 26974; GFX9-LABEL: v_fcmp_olt_bf16: 26975; GFX9: ; %bb.0: 26976; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26977; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26978; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26979; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 26980; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 26981; GFX9-NEXT: s_setpc_b64 s[30:31] 26982; 26983; GFX10-LABEL: v_fcmp_olt_bf16: 26984; GFX10: ; %bb.0: 26985; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26986; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26987; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26988; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 26989; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 26990; GFX10-NEXT: s_setpc_b64 s[30:31] 26991; 26992; GFX11-LABEL: v_fcmp_olt_bf16: 26993; GFX11: ; %bb.0: 26994; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26995; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 26996; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 26997; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 26998; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 26999; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27000; GFX11-NEXT: s_setpc_b64 s[30:31] 27001 %op = fcmp olt bfloat %a, %b 
27002 ret i1 %op 27003} 27004 27005define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { 27006; GCN-LABEL: v_fcmp_ole_bf16: 27007; GCN: ; %bb.0: 27008; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27009; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27010; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 27011; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27012; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27013; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 27014; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27015; GCN-NEXT: s_setpc_b64 s[30:31] 27016; 27017; GFX7-LABEL: v_fcmp_ole_bf16: 27018; GFX7: ; %bb.0: 27019; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27020; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27021; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 27022; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27023; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27024; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 27025; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27026; GFX7-NEXT: s_setpc_b64 s[30:31] 27027; 27028; GFX8-LABEL: v_fcmp_ole_bf16: 27029; GFX8: ; %bb.0: 27030; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27031; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27032; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27033; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 27034; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27035; GFX8-NEXT: s_setpc_b64 s[30:31] 27036; 27037; GFX9-LABEL: v_fcmp_ole_bf16: 27038; GFX9: ; %bb.0: 27039; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27040; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27041; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27042; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 27043; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27044; GFX9-NEXT: s_setpc_b64 s[30:31] 27045; 27046; GFX10-LABEL: v_fcmp_ole_bf16: 27047; GFX10: ; %bb.0: 27048; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27049; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27050; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27051; GFX10-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 27052; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 
27053; GFX10-NEXT: s_setpc_b64 s[30:31] 27054; 27055; GFX11-LABEL: v_fcmp_ole_bf16: 27056; GFX11: ; %bb.0: 27057; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27058; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27059; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27060; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27061; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 27062; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27063; GFX11-NEXT: s_setpc_b64 s[30:31] 27064 %op = fcmp ole bfloat %a, %b 27065 ret i1 %op 27066} 27067 27068define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { 27069; GCN-LABEL: v_fcmp_one_bf16: 27070; GCN: ; %bb.0: 27071; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27072; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27073; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 27074; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27075; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27076; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 27077; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27078; GCN-NEXT: s_setpc_b64 s[30:31] 27079; 27080; GFX7-LABEL: v_fcmp_one_bf16: 27081; GFX7: ; %bb.0: 27082; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27083; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27084; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 27085; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27086; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27087; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 27088; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27089; GFX7-NEXT: s_setpc_b64 s[30:31] 27090; 27091; GFX8-LABEL: v_fcmp_one_bf16: 27092; GFX8: ; %bb.0: 27093; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27094; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27095; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27096; GFX8-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 27097; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27098; GFX8-NEXT: s_setpc_b64 s[30:31] 27099; 27100; GFX9-LABEL: v_fcmp_one_bf16: 27101; GFX9: ; %bb.0: 27102; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27103; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27104; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27105; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 27106; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27107; GFX9-NEXT: s_setpc_b64 s[30:31] 27108; 27109; GFX10-LABEL: v_fcmp_one_bf16: 27110; GFX10: ; %bb.0: 27111; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27112; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27113; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27114; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 27115; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27116; GFX10-NEXT: s_setpc_b64 s[30:31] 27117; 27118; GFX11-LABEL: v_fcmp_one_bf16: 27119; GFX11: ; %bb.0: 27120; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27121; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27122; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27123; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27124; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 27125; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27126; GFX11-NEXT: s_setpc_b64 s[30:31] 27127 %op = fcmp one bfloat %a, %b 27128 ret i1 %op 27129} 27130 27131define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { 27132; GCN-LABEL: v_fcmp_uno_bf16: 27133; GCN: ; %bb.0: 27134; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27135; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27136; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 27137; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27138; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27139; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 27140; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27141; GCN-NEXT: s_setpc_b64 s[30:31] 27142; 27143; GFX7-LABEL: v_fcmp_uno_bf16: 27144; GFX7: ; %bb.0: 27145; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27146; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27147; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 27148; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27149; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27150; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 27151; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27152; GFX7-NEXT: s_setpc_b64 s[30:31] 27153; 27154; GFX8-LABEL: 
v_fcmp_uno_bf16: 27155; GFX8: ; %bb.0: 27156; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27157; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27158; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27159; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 27160; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27161; GFX8-NEXT: s_setpc_b64 s[30:31] 27162; 27163; GFX9-LABEL: v_fcmp_uno_bf16: 27164; GFX9: ; %bb.0: 27165; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27166; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27167; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27168; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 27169; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27170; GFX9-NEXT: s_setpc_b64 s[30:31] 27171; 27172; GFX10-LABEL: v_fcmp_uno_bf16: 27173; GFX10: ; %bb.0: 27174; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27175; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27176; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27177; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 27178; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27179; GFX10-NEXT: s_setpc_b64 s[30:31] 27180; 27181; GFX11-LABEL: v_fcmp_uno_bf16: 27182; GFX11: ; %bb.0: 27183; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27184; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27185; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27186; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27187; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 27188; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27189; GFX11-NEXT: s_setpc_b64 s[30:31] 27190 %op = fcmp uno bfloat %a, %b 27191 ret i1 %op 27192} 27193 27194define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { 27195; GCN-LABEL: v_fcmp_ueq_bf16: 27196; GCN: ; %bb.0: 27197; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27198; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27199; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 27200; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27201; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27202; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 27203; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27204; GCN-NEXT: 
s_setpc_b64 s[30:31] 27205; 27206; GFX7-LABEL: v_fcmp_ueq_bf16: 27207; GFX7: ; %bb.0: 27208; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27209; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27210; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 27211; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27212; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27213; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 27214; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27215; GFX7-NEXT: s_setpc_b64 s[30:31] 27216; 27217; GFX8-LABEL: v_fcmp_ueq_bf16: 27218; GFX8: ; %bb.0: 27219; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27220; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27221; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27222; GFX8-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 27223; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27224; GFX8-NEXT: s_setpc_b64 s[30:31] 27225; 27226; GFX9-LABEL: v_fcmp_ueq_bf16: 27227; GFX9: ; %bb.0: 27228; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27229; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27230; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27231; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 27232; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27233; GFX9-NEXT: s_setpc_b64 s[30:31] 27234; 27235; GFX10-LABEL: v_fcmp_ueq_bf16: 27236; GFX10: ; %bb.0: 27237; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27238; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27239; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27240; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 27241; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27242; GFX10-NEXT: s_setpc_b64 s[30:31] 27243; 27244; GFX11-LABEL: v_fcmp_ueq_bf16: 27245; GFX11: ; %bb.0: 27246; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27247; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27248; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27249; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27250; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 27251; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27252; GFX11-NEXT: s_setpc_b64 s[30:31] 27253 %op = fcmp ueq bfloat %a, 
%b 27254 ret i1 %op 27255} 27256 27257define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { 27258; GCN-LABEL: v_fcmp_ugt_bf16: 27259; GCN: ; %bb.0: 27260; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27261; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27262; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 27263; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27264; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27265; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 27266; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27267; GCN-NEXT: s_setpc_b64 s[30:31] 27268; 27269; GFX7-LABEL: v_fcmp_ugt_bf16: 27270; GFX7: ; %bb.0: 27271; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27272; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27273; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 27274; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27275; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27276; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 27277; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27278; GFX7-NEXT: s_setpc_b64 s[30:31] 27279; 27280; GFX8-LABEL: v_fcmp_ugt_bf16: 27281; GFX8: ; %bb.0: 27282; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27283; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27284; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27285; GFX8-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 27286; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27287; GFX8-NEXT: s_setpc_b64 s[30:31] 27288; 27289; GFX9-LABEL: v_fcmp_ugt_bf16: 27290; GFX9: ; %bb.0: 27291; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27292; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27293; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27294; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 27295; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27296; GFX9-NEXT: s_setpc_b64 s[30:31] 27297; 27298; GFX10-LABEL: v_fcmp_ugt_bf16: 27299; GFX10: ; %bb.0: 27300; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27301; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27302; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27303; GFX10-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 27304; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 
1, vcc_lo 27305; GFX10-NEXT: s_setpc_b64 s[30:31] 27306; 27307; GFX11-LABEL: v_fcmp_ugt_bf16: 27308; GFX11: ; %bb.0: 27309; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27310; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27311; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27312; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27313; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 27314; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27315; GFX11-NEXT: s_setpc_b64 s[30:31] 27316 %op = fcmp ugt bfloat %a, %b 27317 ret i1 %op 27318} 27319 27320define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { 27321; GCN-LABEL: v_fcmp_uge_bf16: 27322; GCN: ; %bb.0: 27323; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27324; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27325; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 27326; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27327; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27328; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 27329; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27330; GCN-NEXT: s_setpc_b64 s[30:31] 27331; 27332; GFX7-LABEL: v_fcmp_uge_bf16: 27333; GFX7: ; %bb.0: 27334; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27335; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27336; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 27337; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27338; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27339; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 27340; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27341; GFX7-NEXT: s_setpc_b64 s[30:31] 27342; 27343; GFX8-LABEL: v_fcmp_uge_bf16: 27344; GFX8: ; %bb.0: 27345; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27346; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27347; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27348; GFX8-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 27349; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27350; GFX8-NEXT: s_setpc_b64 s[30:31] 27351; 27352; GFX9-LABEL: v_fcmp_uge_bf16: 27353; GFX9: ; %bb.0: 27354; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27355; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
27356; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27357; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 27358; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27359; GFX9-NEXT: s_setpc_b64 s[30:31] 27360; 27361; GFX10-LABEL: v_fcmp_uge_bf16: 27362; GFX10: ; %bb.0: 27363; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27364; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27365; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27366; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 27367; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27368; GFX10-NEXT: s_setpc_b64 s[30:31] 27369; 27370; GFX11-LABEL: v_fcmp_uge_bf16: 27371; GFX11: ; %bb.0: 27372; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27373; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27374; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27375; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27376; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 27377; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27378; GFX11-NEXT: s_setpc_b64 s[30:31] 27379 %op = fcmp uge bfloat %a, %b 27380 ret i1 %op 27381} 27382 27383define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { 27384; GCN-LABEL: v_fcmp_ult_bf16: 27385; GCN: ; %bb.0: 27386; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27387; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27388; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 27389; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27390; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27391; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 27392; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27393; GCN-NEXT: s_setpc_b64 s[30:31] 27394; 27395; GFX7-LABEL: v_fcmp_ult_bf16: 27396; GFX7: ; %bb.0: 27397; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27398; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27399; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 27400; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27401; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27402; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 27403; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27404; GFX7-NEXT: s_setpc_b64 s[30:31] 27405; 27406; 
GFX8-LABEL: v_fcmp_ult_bf16: 27407; GFX8: ; %bb.0: 27408; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27409; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27410; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27411; GFX8-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 27412; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27413; GFX8-NEXT: s_setpc_b64 s[30:31] 27414; 27415; GFX9-LABEL: v_fcmp_ult_bf16: 27416; GFX9: ; %bb.0: 27417; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27418; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27419; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27420; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 27421; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27422; GFX9-NEXT: s_setpc_b64 s[30:31] 27423; 27424; GFX10-LABEL: v_fcmp_ult_bf16: 27425; GFX10: ; %bb.0: 27426; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27427; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27428; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27429; GFX10-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 27430; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27431; GFX10-NEXT: s_setpc_b64 s[30:31] 27432; 27433; GFX11-LABEL: v_fcmp_ult_bf16: 27434; GFX11: ; %bb.0: 27435; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27436; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27437; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27438; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27439; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 27440; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27441; GFX11-NEXT: s_setpc_b64 s[30:31] 27442 %op = fcmp ult bfloat %a, %b 27443 ret i1 %op 27444} 27445 27446define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { 27447; GCN-LABEL: v_fcmp_ule_bf16: 27448; GCN: ; %bb.0: 27449; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27450; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27451; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 27452; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27453; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27454; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 27455; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 
27456; GCN-NEXT: s_setpc_b64 s[30:31] 27457; 27458; GFX7-LABEL: v_fcmp_ule_bf16: 27459; GFX7: ; %bb.0: 27460; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27461; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27462; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 27463; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27464; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27465; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 27466; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27467; GFX7-NEXT: s_setpc_b64 s[30:31] 27468; 27469; GFX8-LABEL: v_fcmp_ule_bf16: 27470; GFX8: ; %bb.0: 27471; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27472; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27473; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27474; GFX8-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 27475; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27476; GFX8-NEXT: s_setpc_b64 s[30:31] 27477; 27478; GFX9-LABEL: v_fcmp_ule_bf16: 27479; GFX9: ; %bb.0: 27480; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27481; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27482; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27483; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 27484; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27485; GFX9-NEXT: s_setpc_b64 s[30:31] 27486; 27487; GFX10-LABEL: v_fcmp_ule_bf16: 27488; GFX10: ; %bb.0: 27489; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27490; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27491; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27492; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 27493; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27494; GFX10-NEXT: s_setpc_b64 s[30:31] 27495; 27496; GFX11-LABEL: v_fcmp_ule_bf16: 27497; GFX11: ; %bb.0: 27498; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27499; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27500; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27501; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27502; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 27503; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27504; GFX11-NEXT: s_setpc_b64 s[30:31] 27505 %op = fcmp 
ule bfloat %a, %b 27506 ret i1 %op 27507} 27508 27509define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { 27510; GCN-LABEL: v_fcmp_une_bf16: 27511; GCN: ; %bb.0: 27512; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27513; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27514; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 27515; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27516; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27517; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 27518; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27519; GCN-NEXT: s_setpc_b64 s[30:31] 27520; 27521; GFX7-LABEL: v_fcmp_une_bf16: 27522; GFX7: ; %bb.0: 27523; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27524; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27525; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 27526; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 27527; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 27528; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 27529; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27530; GFX7-NEXT: s_setpc_b64 s[30:31] 27531; 27532; GFX8-LABEL: v_fcmp_une_bf16: 27533; GFX8: ; %bb.0: 27534; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27535; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27536; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27537; GFX8-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 27538; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27539; GFX8-NEXT: s_setpc_b64 s[30:31] 27540; 27541; GFX9-LABEL: v_fcmp_une_bf16: 27542; GFX9: ; %bb.0: 27543; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27544; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27545; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27546; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 27547; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 27548; GFX9-NEXT: s_setpc_b64 s[30:31] 27549; 27550; GFX10-LABEL: v_fcmp_une_bf16: 27551; GFX10: ; %bb.0: 27552; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27553; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27554; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27555; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 27556; GFX10-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27557; GFX10-NEXT: s_setpc_b64 s[30:31] 27558; 27559; GFX11-LABEL: v_fcmp_une_bf16: 27560; GFX11: ; %bb.0: 27561; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27562; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 27563; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27564; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27565; GFX11-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 27566; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 27567; GFX11-NEXT: s_setpc_b64 s[30:31] 27568 %op = fcmp une bfloat %a, %b 27569 ret i1 %op 27570} 27571 27572define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) { 27573; GCN-LABEL: v_fcmp_true_bf16: 27574; GCN: ; %bb.0: 27575; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27576; GCN-NEXT: v_mov_b32_e32 v0, 1 27577; GCN-NEXT: s_setpc_b64 s[30:31] 27578; 27579; GFX7-LABEL: v_fcmp_true_bf16: 27580; GFX7: ; %bb.0: 27581; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27582; GFX7-NEXT: v_mov_b32_e32 v0, 1 27583; GFX7-NEXT: s_setpc_b64 s[30:31] 27584; 27585; GFX8-LABEL: v_fcmp_true_bf16: 27586; GFX8: ; %bb.0: 27587; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27588; GFX8-NEXT: v_mov_b32_e32 v0, 1 27589; GFX8-NEXT: s_setpc_b64 s[30:31] 27590; 27591; GFX9-LABEL: v_fcmp_true_bf16: 27592; GFX9: ; %bb.0: 27593; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27594; GFX9-NEXT: v_mov_b32_e32 v0, 1 27595; GFX9-NEXT: s_setpc_b64 s[30:31] 27596; 27597; GFX10-LABEL: v_fcmp_true_bf16: 27598; GFX10: ; %bb.0: 27599; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27600; GFX10-NEXT: v_mov_b32_e32 v0, 1 27601; GFX10-NEXT: s_setpc_b64 s[30:31] 27602; 27603; GFX11-LABEL: v_fcmp_true_bf16: 27604; GFX11: ; %bb.0: 27605; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27606; GFX11-NEXT: v_mov_b32_e32 v0, 1 27607; GFX11-NEXT: s_setpc_b64 s[30:31] 27608 %op = fcmp true bfloat %a, %b 27609 ret i1 %op 27610} 27611 27612declare bfloat @llvm.copysign.bf16(bfloat, bfloat) 27613 27614define bfloat @v_copysign_bf16_bf16(bfloat 
%mag, bfloat %sign) { 27615; GCN-LABEL: v_copysign_bf16_bf16: 27616; GCN: ; %bb.0: 27617; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27618; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27619; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 27620; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27621; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 27622; GCN-NEXT: v_or_b32_e32 v0, v0, v1 27623; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27624; GCN-NEXT: s_setpc_b64 s[30:31] 27625; 27626; GFX7-LABEL: v_copysign_bf16_bf16: 27627; GFX7: ; %bb.0: 27628; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27629; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27630; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 27631; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27632; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 27633; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 27634; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27635; GFX7-NEXT: s_setpc_b64 s[30:31] 27636; 27637; GFX8-LABEL: v_copysign_bf16_bf16: 27638; GFX8: ; %bb.0: 27639; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27640; GFX8-NEXT: s_movk_i32 s4, 0x7fff 27641; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 27642; GFX8-NEXT: s_setpc_b64 s[30:31] 27643; 27644; GFX9-LABEL: v_copysign_bf16_bf16: 27645; GFX9: ; %bb.0: 27646; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27647; GFX9-NEXT: s_movk_i32 s4, 0x7fff 27648; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 27649; GFX9-NEXT: s_setpc_b64 s[30:31] 27650; 27651; GFX10-LABEL: v_copysign_bf16_bf16: 27652; GFX10: ; %bb.0: 27653; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27654; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 27655; GFX10-NEXT: s_setpc_b64 s[30:31] 27656; 27657; GFX11-LABEL: v_copysign_bf16_bf16: 27658; GFX11: ; %bb.0: 27659; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27660; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 27661; GFX11-NEXT: s_setpc_b64 s[30:31] 27662 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) 27663 ret bfloat %op 27664} 27665 27666define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg 
%sign) { 27667; GCN-LABEL: v_copysign_bf16_s_bf16: 27668; GCN: ; %bb.0: 27669; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27670; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27671; GCN-NEXT: s_and_b32 s4, s16, 0x80000000 27672; GCN-NEXT: s_lshr_b32 s4, s4, 16 27673; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 27674; GCN-NEXT: v_or_b32_e32 v0, s4, v0 27675; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27676; GCN-NEXT: s_setpc_b64 s[30:31] 27677; 27678; GFX7-LABEL: v_copysign_bf16_s_bf16: 27679; GFX7: ; %bb.0: 27680; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27681; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27682; GFX7-NEXT: s_and_b32 s4, s16, 0x80000000 27683; GFX7-NEXT: s_lshr_b32 s4, s4, 16 27684; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 27685; GFX7-NEXT: v_or_b32_e32 v0, s4, v0 27686; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27687; GFX7-NEXT: s_setpc_b64 s[30:31] 27688; 27689; GFX8-LABEL: v_copysign_bf16_s_bf16: 27690; GFX8: ; %bb.0: 27691; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27692; GFX8-NEXT: s_movk_i32 s4, 0x7fff 27693; GFX8-NEXT: v_mov_b32_e32 v1, s16 27694; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 27695; GFX8-NEXT: s_setpc_b64 s[30:31] 27696; 27697; GFX9-LABEL: v_copysign_bf16_s_bf16: 27698; GFX9: ; %bb.0: 27699; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27700; GFX9-NEXT: s_movk_i32 s4, 0x7fff 27701; GFX9-NEXT: v_mov_b32_e32 v1, s16 27702; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 27703; GFX9-NEXT: s_setpc_b64 s[30:31] 27704; 27705; GFX10-LABEL: v_copysign_bf16_s_bf16: 27706; GFX10: ; %bb.0: 27707; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27708; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s16 27709; GFX10-NEXT: s_setpc_b64 s[30:31] 27710; 27711; GFX11-LABEL: v_copysign_bf16_s_bf16: 27712; GFX11: ; %bb.0: 27713; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27714; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 27715; GFX11-NEXT: s_setpc_b64 s[30:31] 27716 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) 27717 ret bfloat %op 27718} 27719 
27720define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { 27721; GCN-LABEL: v_copysign_s_bf16_bf16: 27722; GCN: ; %bb.0: 27723; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27724; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s16 27725; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 27726; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 27727; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 27728; GCN-NEXT: v_or_b32_e32 v0, v1, v0 27729; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27730; GCN-NEXT: s_setpc_b64 s[30:31] 27731; 27732; GFX7-LABEL: v_copysign_s_bf16_bf16: 27733; GFX7: ; %bb.0: 27734; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27735; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16 27736; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 27737; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 27738; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 27739; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 27740; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27741; GFX7-NEXT: s_setpc_b64 s[30:31] 27742; 27743; GFX8-LABEL: v_copysign_s_bf16_bf16: 27744; GFX8: ; %bb.0: 27745; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27746; GFX8-NEXT: s_movk_i32 s4, 0x7fff 27747; GFX8-NEXT: v_mov_b32_e32 v1, s16 27748; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0 27749; GFX8-NEXT: s_setpc_b64 s[30:31] 27750; 27751; GFX9-LABEL: v_copysign_s_bf16_bf16: 27752; GFX9: ; %bb.0: 27753; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27754; GFX9-NEXT: s_movk_i32 s4, 0x7fff 27755; GFX9-NEXT: v_mov_b32_e32 v1, s16 27756; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0 27757; GFX9-NEXT: s_setpc_b64 s[30:31] 27758; 27759; GFX10-LABEL: v_copysign_s_bf16_bf16: 27760; GFX10: ; %bb.0: 27761; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27762; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s16, v0 27763; GFX10-NEXT: s_setpc_b64 s[30:31] 27764; 27765; GFX11-LABEL: v_copysign_s_bf16_bf16: 27766; GFX11: ; %bb.0: 27767; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27768; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 27769; GFX11-NEXT: s_setpc_b64 s[30:31] 27770 %op = call bfloat 
@llvm.copysign.bf16(bfloat %mag, bfloat %sign) 27771 ret bfloat %op 27772} 27773 27774define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) { 27775; GCN-LABEL: v_copysign_bf16_f32: 27776; GCN: ; %bb.0: 27777; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27778; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27779; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 27780; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27781; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 27782; GCN-NEXT: v_or_b32_e32 v0, v0, v1 27783; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27784; GCN-NEXT: s_setpc_b64 s[30:31] 27785; 27786; GFX7-LABEL: v_copysign_bf16_f32: 27787; GFX7: ; %bb.0: 27788; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27789; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27790; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 27791; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27792; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 27793; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 27794; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27795; GFX7-NEXT: s_setpc_b64 s[30:31] 27796; 27797; GFX8-LABEL: v_copysign_bf16_f32: 27798; GFX8: ; %bb.0: 27799; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27800; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27801; GFX8-NEXT: s_movk_i32 s4, 0x7fff 27802; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 27803; GFX8-NEXT: s_setpc_b64 s[30:31] 27804; 27805; GFX9-LABEL: v_copysign_bf16_f32: 27806; GFX9: ; %bb.0: 27807; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27808; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27809; GFX9-NEXT: s_movk_i32 s4, 0x7fff 27810; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 27811; GFX9-NEXT: s_setpc_b64 s[30:31] 27812; 27813; GFX10-LABEL: v_copysign_bf16_f32: 27814; GFX10: ; %bb.0: 27815; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27816; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27817; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 27818; GFX10-NEXT: s_setpc_b64 s[30:31] 27819; 27820; GFX11-LABEL: v_copysign_bf16_f32: 27821; GFX11: ; %bb.0: 27822; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 27823; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27824; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27825; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 27826; GFX11-NEXT: s_setpc_b64 s[30:31] 27827 %sign = fptrunc float %sign.f32 to bfloat 27828 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) 27829 ret bfloat %op 27830} 27831 27832define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) { 27833; GCN-LABEL: v_copysign_bf16_f64: 27834; GCN: ; %bb.0: 27835; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27836; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27837; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v2 27838; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27839; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 27840; GCN-NEXT: v_or_b32_e32 v0, v0, v1 27841; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27842; GCN-NEXT: s_setpc_b64 s[30:31] 27843; 27844; GFX7-LABEL: v_copysign_bf16_f64: 27845; GFX7: ; %bb.0: 27846; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27847; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27848; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v2 27849; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 27850; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 27851; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 27852; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27853; GFX7-NEXT: s_setpc_b64 s[30:31] 27854; 27855; GFX8-LABEL: v_copysign_bf16_f64: 27856; GFX8: ; %bb.0: 27857; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27858; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 27859; GFX8-NEXT: s_movk_i32 s4, 0x7fff 27860; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 27861; GFX8-NEXT: s_setpc_b64 s[30:31] 27862; 27863; GFX9-LABEL: v_copysign_bf16_f64: 27864; GFX9: ; %bb.0: 27865; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27866; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 27867; GFX9-NEXT: s_movk_i32 s4, 0x7fff 27868; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 27869; GFX9-NEXT: s_setpc_b64 s[30:31] 27870; 27871; GFX10-LABEL: v_copysign_bf16_f64: 27872; GFX10: ; %bb.0: 27873; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 27874; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2 27875; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 27876; GFX10-NEXT: s_setpc_b64 s[30:31] 27877; 27878; GFX11-LABEL: v_copysign_bf16_f64: 27879; GFX11: ; %bb.0: 27880; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27881; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 27882; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27883; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 27884; GFX11-NEXT: s_setpc_b64 s[30:31] 27885 %sign = fptrunc double %sign.f64 to bfloat 27886 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) 27887 ret bfloat %op 27888} 27889 27890define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) { 27891; GCN-LABEL: v_copysign_bf16_f16: 27892; GCN: ; %bb.0: 27893; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27894; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 27895; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 27896; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 27897; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 27898; GCN-NEXT: v_or_b32_e32 v0, v0, v1 27899; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27900; GCN-NEXT: s_setpc_b64 s[30:31] 27901; 27902; GFX7-LABEL: v_copysign_bf16_f16: 27903; GFX7: ; %bb.0: 27904; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27905; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 27906; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 27907; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 27908; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 27909; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 27910; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 27911; GFX7-NEXT: s_setpc_b64 s[30:31] 27912; 27913; GFX8-LABEL: v_copysign_bf16_f16: 27914; GFX8: ; %bb.0: 27915; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27916; GFX8-NEXT: s_movk_i32 s4, 0x7fff 27917; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 27918; GFX8-NEXT: s_setpc_b64 s[30:31] 27919; 27920; GFX9-LABEL: v_copysign_bf16_f16: 27921; GFX9: ; %bb.0: 27922; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27923; GFX9-NEXT: s_movk_i32 s4, 0x7fff 27924; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 
27925; GFX9-NEXT: s_setpc_b64 s[30:31] 27926; 27927; GFX10-LABEL: v_copysign_bf16_f16: 27928; GFX10: ; %bb.0: 27929; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27930; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 27931; GFX10-NEXT: s_setpc_b64 s[30:31] 27932; 27933; GFX11-LABEL: v_copysign_bf16_f16: 27934; GFX11: ; %bb.0: 27935; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27936; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 27937; GFX11-NEXT: s_setpc_b64 s[30:31] 27938 %sign = bitcast half %sign.f16 to bfloat 27939 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) 27940 ret bfloat %op 27941} 27942 27943define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) { 27944; GCN-LABEL: s_copysign_bf16_bf16: 27945; GCN: ; %bb.0: 27946; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 27947; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 27948; GCN-NEXT: s_lshr_b32 s0, s0, 16 27949; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 27950; GCN-NEXT: v_or_b32_e32 v0, s0, v0 27951; GCN-NEXT: v_readfirstlane_b32 s0, v0 27952; GCN-NEXT: ; return to shader part epilog 27953; 27954; GFX7-LABEL: s_copysign_bf16_bf16: 27955; GFX7: ; %bb.0: 27956; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 27957; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 27958; GFX7-NEXT: s_lshr_b32 s0, s0, 16 27959; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 27960; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 27961; GFX7-NEXT: v_readfirstlane_b32 s0, v0 27962; GFX7-NEXT: ; return to shader part epilog 27963; 27964; GFX8-LABEL: s_copysign_bf16_bf16: 27965; GFX8: ; %bb.0: 27966; GFX8-NEXT: s_movk_i32 s2, 0x7fff 27967; GFX8-NEXT: v_mov_b32_e32 v0, s0 27968; GFX8-NEXT: v_mov_b32_e32 v1, s1 27969; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 27970; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 27971; GFX8-NEXT: v_readfirstlane_b32 s0, v0 27972; GFX8-NEXT: ; return to shader part epilog 27973; 27974; GFX9-LABEL: s_copysign_bf16_bf16: 27975; GFX9: ; %bb.0: 27976; GFX9-NEXT: s_movk_i32 s2, 0x7fff 27977; GFX9-NEXT: v_mov_b32_e32 v0, s0 27978; GFX9-NEXT: 
v_mov_b32_e32 v1, s1 27979; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 27980; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 27981; GFX9-NEXT: v_readfirstlane_b32 s0, v0 27982; GFX9-NEXT: ; return to shader part epilog 27983; 27984; GFX10-LABEL: s_copysign_bf16_bf16: 27985; GFX10: ; %bb.0: 27986; GFX10-NEXT: v_mov_b32_e32 v0, s1 27987; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 27988; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 27989; GFX10-NEXT: v_readfirstlane_b32 s0, v0 27990; GFX10-NEXT: ; return to shader part epilog 27991; 27992; GFX11-LABEL: s_copysign_bf16_bf16: 27993; GFX11: ; %bb.0: 27994; GFX11-NEXT: v_mov_b32_e32 v0, s1 27995; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 27996; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 27997; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 27998; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 27999; GFX11-NEXT: v_readfirstlane_b32 s0, v0 28000; GFX11-NEXT: ; return to shader part epilog 28001 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) 28002 %cast = bitcast bfloat %op to i16 28003 %zext = zext i16 %cast to i32 28004 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) 28005 ret i32 %readlane 28006} 28007 28008define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) { 28009; GCN-LABEL: s_copysign_bf16_f32: 28010; GCN: ; %bb.0: 28011; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 28012; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 28013; GCN-NEXT: s_lshr_b32 s0, s0, 16 28014; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 28015; GCN-NEXT: v_or_b32_e32 v0, s0, v0 28016; GCN-NEXT: v_readfirstlane_b32 s0, v0 28017; GCN-NEXT: ; return to shader part epilog 28018; 28019; GFX7-LABEL: s_copysign_bf16_f32: 28020; GFX7: ; %bb.0: 28021; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 28022; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 28023; GFX7-NEXT: s_lshr_b32 s0, s0, 16 28024; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 28025; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 28026; GFX7-NEXT: v_readfirstlane_b32 s0, v0 28027; 
GFX7-NEXT: ; return to shader part epilog 28028; 28029; GFX8-LABEL: s_copysign_bf16_f32: 28030; GFX8: ; %bb.0: 28031; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s1 28032; GFX8-NEXT: s_movk_i32 s1, 0x7fff 28033; GFX8-NEXT: v_mov_b32_e32 v1, s0 28034; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 28035; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 28036; GFX8-NEXT: v_readfirstlane_b32 s0, v0 28037; GFX8-NEXT: ; return to shader part epilog 28038; 28039; GFX9-LABEL: s_copysign_bf16_f32: 28040; GFX9: ; %bb.0: 28041; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s1 28042; GFX9-NEXT: s_movk_i32 s1, 0x7fff 28043; GFX9-NEXT: v_mov_b32_e32 v1, s0 28044; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 28045; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 28046; GFX9-NEXT: v_readfirstlane_b32 s0, v0 28047; GFX9-NEXT: ; return to shader part epilog 28048; 28049; GFX10-LABEL: s_copysign_bf16_f32: 28050; GFX10: ; %bb.0: 28051; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s1 28052; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 28053; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 28054; GFX10-NEXT: v_readfirstlane_b32 s0, v0 28055; GFX10-NEXT: ; return to shader part epilog 28056; 28057; GFX11-LABEL: s_copysign_bf16_f32: 28058; GFX11: ; %bb.0: 28059; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1 28060; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 28061; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 28062; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 28063; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 28064; GFX11-NEXT: v_readfirstlane_b32 s0, v0 28065; GFX11-NEXT: ; return to shader part epilog 28066 %sign = fptrunc float %sign.f32 to bfloat 28067 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) 28068 %cast = bitcast bfloat %op to i16 28069 %zext = zext i16 %cast to i32 28070 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) 28071 ret i32 %readlane 28072} 28073 28074define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) { 28075; GCN-LABEL: s_copysign_bf16_f64: 28076; 
GCN: ; %bb.0: 28077; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 28078; GCN-NEXT: s_and_b32 s0, s2, 0x80000000 28079; GCN-NEXT: s_lshr_b32 s0, s0, 16 28080; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 28081; GCN-NEXT: v_or_b32_e32 v0, s0, v0 28082; GCN-NEXT: v_readfirstlane_b32 s0, v0 28083; GCN-NEXT: ; return to shader part epilog 28084; 28085; GFX7-LABEL: s_copysign_bf16_f64: 28086; GFX7: ; %bb.0: 28087; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 28088; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000 28089; GFX7-NEXT: s_lshr_b32 s0, s0, 16 28090; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 28091; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 28092; GFX7-NEXT: v_readfirstlane_b32 s0, v0 28093; GFX7-NEXT: ; return to shader part epilog 28094; 28095; GFX8-LABEL: s_copysign_bf16_f64: 28096; GFX8: ; %bb.0: 28097; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s2 28098; GFX8-NEXT: s_movk_i32 s1, 0x7fff 28099; GFX8-NEXT: v_mov_b32_e32 v1, s0 28100; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 28101; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 28102; GFX8-NEXT: v_readfirstlane_b32 s0, v0 28103; GFX8-NEXT: ; return to shader part epilog 28104; 28105; GFX9-LABEL: s_copysign_bf16_f64: 28106; GFX9: ; %bb.0: 28107; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s2 28108; GFX9-NEXT: s_movk_i32 s1, 0x7fff 28109; GFX9-NEXT: v_mov_b32_e32 v1, s0 28110; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 28111; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 28112; GFX9-NEXT: v_readfirstlane_b32 s0, v0 28113; GFX9-NEXT: ; return to shader part epilog 28114; 28115; GFX10-LABEL: s_copysign_bf16_f64: 28116; GFX10: ; %bb.0: 28117; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s2 28118; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 28119; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 28120; GFX10-NEXT: v_readfirstlane_b32 s0, v0 28121; GFX10-NEXT: ; return to shader part epilog 28122; 28123; GFX11-LABEL: s_copysign_bf16_f64: 28124; GFX11: ; %bb.0: 28125; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2 28126; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 28127; GFX11-NEXT: 
v_bfi_b32 v0, 0x7fff, s0, v0 28128; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 28129; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 28130; GFX11-NEXT: v_readfirstlane_b32 s0, v0 28131; GFX11-NEXT: ; return to shader part epilog 28132 %sign = fptrunc double %sign.f64 to bfloat 28133 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) 28134 %cast = bitcast bfloat %op to i16 28135 %zext = zext i16 %cast to i32 28136 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) 28137 ret i32 %readlane 28138} 28139 28140define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) { 28141; GCN-LABEL: s_copysign_bf16_f16: 28142; GCN: ; %bb.0: 28143; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 28144; GCN-NEXT: v_cvt_f16_f32_e32 v1, s1 28145; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 28146; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 28147; GCN-NEXT: v_or_b32_e32 v0, v0, v1 28148; GCN-NEXT: v_readfirstlane_b32 s0, v0 28149; GCN-NEXT: ; return to shader part epilog 28150; 28151; GFX7-LABEL: s_copysign_bf16_f16: 28152; GFX7: ; %bb.0: 28153; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s1 28154; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 28155; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 28156; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 28157; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 28158; GFX7-NEXT: v_readfirstlane_b32 s0, v0 28159; GFX7-NEXT: ; return to shader part epilog 28160; 28161; GFX8-LABEL: s_copysign_bf16_f16: 28162; GFX8: ; %bb.0: 28163; GFX8-NEXT: s_movk_i32 s2, 0x7fff 28164; GFX8-NEXT: v_mov_b32_e32 v0, s0 28165; GFX8-NEXT: v_mov_b32_e32 v1, s1 28166; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 28167; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 28168; GFX8-NEXT: v_readfirstlane_b32 s0, v0 28169; GFX8-NEXT: ; return to shader part epilog 28170; 28171; GFX9-LABEL: s_copysign_bf16_f16: 28172; GFX9: ; %bb.0: 28173; GFX9-NEXT: s_movk_i32 s2, 0x7fff 28174; GFX9-NEXT: v_mov_b32_e32 v0, s0 28175; GFX9-NEXT: v_mov_b32_e32 v1, s1 28176; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 28177; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff, v0 28178; GFX9-NEXT: v_readfirstlane_b32 s0, v0 28179; GFX9-NEXT: ; return to shader part epilog 28180; 28181; GFX10-LABEL: s_copysign_bf16_f16: 28182; GFX10: ; %bb.0: 28183; GFX10-NEXT: v_mov_b32_e32 v0, s1 28184; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 28185; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 28186; GFX10-NEXT: v_readfirstlane_b32 s0, v0 28187; GFX10-NEXT: ; return to shader part epilog 28188; 28189; GFX11-LABEL: s_copysign_bf16_f16: 28190; GFX11: ; %bb.0: 28191; GFX11-NEXT: v_mov_b32_e32 v0, s1 28192; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 28193; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 28194; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 28195; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 28196; GFX11-NEXT: v_readfirstlane_b32 s0, v0 28197; GFX11-NEXT: ; return to shader part epilog 28198 %sign = bitcast half %sign.f16 to bfloat 28199 %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) 28200 %cast = bitcast bfloat %op to i16 28201 %zext = zext i16 %cast to i32 28202 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) 28203 ret i32 %readlane 28204} 28205 28206declare float @llvm.copysign.f32(float, float) 28207 28208define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { 28209; GCN-LABEL: v_copysign_f32_bf16: 28210; GCN: ; %bb.0: 28211; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28212; GCN-NEXT: s_brev_b32 s4, -2 28213; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 28214; GCN-NEXT: s_setpc_b64 s[30:31] 28215; 28216; GFX7-LABEL: v_copysign_f32_bf16: 28217; GFX7: ; %bb.0: 28218; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28219; GFX7-NEXT: s_brev_b32 s4, -2 28220; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 28221; GFX7-NEXT: s_setpc_b64 s[30:31] 28222; 28223; GFX8-LABEL: v_copysign_f32_bf16: 28224; GFX8: ; %bb.0: 28225; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28226; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28227; GFX8-NEXT: s_brev_b32 s4, -2 28228; GFX8-NEXT: v_bfi_b32 v0, 
s4, v0, v1 28229; GFX8-NEXT: s_setpc_b64 s[30:31] 28230; 28231; GFX9-LABEL: v_copysign_f32_bf16: 28232; GFX9: ; %bb.0: 28233; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28234; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28235; GFX9-NEXT: s_brev_b32 s4, -2 28236; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 28237; GFX9-NEXT: s_setpc_b64 s[30:31] 28238; 28239; GFX10-LABEL: v_copysign_f32_bf16: 28240; GFX10: ; %bb.0: 28241; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28242; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28243; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 28244; GFX10-NEXT: s_setpc_b64 s[30:31] 28245; 28246; GFX11-LABEL: v_copysign_f32_bf16: 28247; GFX11: ; %bb.0: 28248; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28249; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28250; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 28251; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 28252; GFX11-NEXT: s_setpc_b64 s[30:31] 28253 %sign = fpext bfloat %sign.bf16 to float 28254 %op = call float @llvm.copysign.f32(float %mag, float %sign) 28255 ret float %op 28256} 28257 28258define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) { 28259; GCN-LABEL: s_copysign_f32_bf16: 28260; GCN: ; %bb.0: 28261; GCN-NEXT: s_brev_b32 s2, -2 28262; GCN-NEXT: v_mov_b32_e32 v0, s0 28263; GCN-NEXT: v_mov_b32_e32 v1, s1 28264; GCN-NEXT: v_bfi_b32 v0, s2, v0, v1 28265; GCN-NEXT: v_readfirstlane_b32 s0, v0 28266; GCN-NEXT: ; return to shader part epilog 28267; 28268; GFX7-LABEL: s_copysign_f32_bf16: 28269; GFX7: ; %bb.0: 28270; GFX7-NEXT: s_brev_b32 s2, -2 28271; GFX7-NEXT: v_mov_b32_e32 v0, s0 28272; GFX7-NEXT: v_mov_b32_e32 v1, s1 28273; GFX7-NEXT: v_bfi_b32 v0, s2, v0, v1 28274; GFX7-NEXT: v_readfirstlane_b32 s0, v0 28275; GFX7-NEXT: ; return to shader part epilog 28276; 28277; GFX8-LABEL: s_copysign_f32_bf16: 28278; GFX8: ; %bb.0: 28279; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s1 28280; GFX8-NEXT: s_brev_b32 s1, -2 28281; GFX8-NEXT: v_mov_b32_e32 v1, s0 28282; 
GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 28283; GFX8-NEXT: v_readfirstlane_b32 s0, v0 28284; GFX8-NEXT: ; return to shader part epilog 28285; 28286; GFX9-LABEL: s_copysign_f32_bf16: 28287; GFX9: ; %bb.0: 28288; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s1 28289; GFX9-NEXT: s_brev_b32 s1, -2 28290; GFX9-NEXT: v_mov_b32_e32 v1, s0 28291; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 28292; GFX9-NEXT: v_readfirstlane_b32 s0, v0 28293; GFX9-NEXT: ; return to shader part epilog 28294; 28295; GFX10-LABEL: s_copysign_f32_bf16: 28296; GFX10: ; %bb.0: 28297; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s1 28298; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 28299; GFX10-NEXT: v_readfirstlane_b32 s0, v0 28300; GFX10-NEXT: ; return to shader part epilog 28301; 28302; GFX11-LABEL: s_copysign_f32_bf16: 28303; GFX11: ; %bb.0: 28304; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s1 28305; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 28306; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 28307; GFX11-NEXT: v_readfirstlane_b32 s0, v0 28308; GFX11-NEXT: ; return to shader part epilog 28309 %sign = fpext bfloat %sign.bf16 to float 28310 %op = call float @llvm.copysign.f32(float %mag, float %sign) 28311 %cast = bitcast float %op to i32 28312 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast) 28313 ret i32 %readlane 28314} 28315 28316declare half @llvm.copysign.f16(half, half) 28317 28318define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { 28319; GCN-LABEL: v_copysign_f16_bf16: 28320; GCN: ; %bb.0: 28321; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28322; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 28323; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 28324; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 28325; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 28326; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 28327; GCN-NEXT: s_brev_b32 s4, -2 28328; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 28329; GCN-NEXT: s_setpc_b64 s[30:31] 28330; 28331; GFX7-LABEL: v_copysign_f16_bf16: 28332; GFX7: ; %bb.0: 28333; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28334; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 28335; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 28336; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 28337; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 28338; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 28339; GFX7-NEXT: s_brev_b32 s4, -2 28340; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 28341; GFX7-NEXT: s_setpc_b64 s[30:31] 28342; 28343; GFX8-LABEL: v_copysign_f16_bf16: 28344; GFX8: ; %bb.0: 28345; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28346; GFX8-NEXT: s_movk_i32 s4, 0x7fff 28347; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 28348; GFX8-NEXT: s_setpc_b64 s[30:31] 28349; 28350; GFX9-LABEL: v_copysign_f16_bf16: 28351; GFX9: ; %bb.0: 28352; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28353; GFX9-NEXT: s_movk_i32 s4, 0x7fff 28354; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 28355; GFX9-NEXT: s_setpc_b64 s[30:31] 28356; 28357; GFX10-LABEL: v_copysign_f16_bf16: 28358; GFX10: ; %bb.0: 28359; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28360; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 28361; GFX10-NEXT: s_setpc_b64 s[30:31] 28362; 28363; GFX11-LABEL: v_copysign_f16_bf16: 28364; GFX11: ; %bb.0: 28365; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28366; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 28367; GFX11-NEXT: s_setpc_b64 s[30:31] 28368 %sign = bitcast bfloat %sign.bf16 to half 28369 %op = call half @llvm.copysign.f16(half %mag, half %sign) 28370 ret half %op 28371} 28372 28373define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) { 28374; GCN-LABEL: s_copysign_f16_bf16: 28375; GCN: ; %bb.0: 28376; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 28377; GCN-NEXT: v_cvt_f16_f32_e32 v1, s0 28378; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 28379; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 28380; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 28381; GCN-NEXT: s_brev_b32 s0, -2 28382; GCN-NEXT: v_bfi_b32 v0, s0, v1, v0 28383; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 28384; GCN-NEXT: v_readfirstlane_b32 s0, v0 28385; 
GCN-NEXT: ; return to shader part epilog 28386; 28387; GFX7-LABEL: s_copysign_f16_bf16: 28388; GFX7: ; %bb.0: 28389; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s0 28390; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 28391; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 28392; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 28393; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 28394; GFX7-NEXT: s_brev_b32 s0, -2 28395; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1 28396; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 28397; GFX7-NEXT: v_readfirstlane_b32 s0, v0 28398; GFX7-NEXT: ; return to shader part epilog 28399; 28400; GFX8-LABEL: s_copysign_f16_bf16: 28401; GFX8: ; %bb.0: 28402; GFX8-NEXT: s_movk_i32 s2, 0x7fff 28403; GFX8-NEXT: v_mov_b32_e32 v0, s0 28404; GFX8-NEXT: v_mov_b32_e32 v1, s1 28405; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 28406; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 28407; GFX8-NEXT: v_readfirstlane_b32 s0, v0 28408; GFX8-NEXT: ; return to shader part epilog 28409; 28410; GFX9-LABEL: s_copysign_f16_bf16: 28411; GFX9: ; %bb.0: 28412; GFX9-NEXT: s_movk_i32 s2, 0x7fff 28413; GFX9-NEXT: v_mov_b32_e32 v0, s0 28414; GFX9-NEXT: v_mov_b32_e32 v1, s1 28415; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 28416; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 28417; GFX9-NEXT: v_readfirstlane_b32 s0, v0 28418; GFX9-NEXT: ; return to shader part epilog 28419; 28420; GFX10-LABEL: s_copysign_f16_bf16: 28421; GFX10: ; %bb.0: 28422; GFX10-NEXT: v_mov_b32_e32 v0, s1 28423; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 28424; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 28425; GFX10-NEXT: v_readfirstlane_b32 s0, v0 28426; GFX10-NEXT: ; return to shader part epilog 28427; 28428; GFX11-LABEL: s_copysign_f16_bf16: 28429; GFX11: ; %bb.0: 28430; GFX11-NEXT: v_mov_b32_e32 v0, s1 28431; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 28432; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 28433; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 28434; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 28435; GFX11-NEXT: v_readfirstlane_b32 s0, v0 28436; 
GFX11-NEXT: ; return to shader part epilog 28437 %sign = bitcast bfloat %sign.bf16 to half 28438 %op = call half @llvm.copysign.f16(half %mag, half %sign) 28439 %cast = bitcast half %op to i16 28440 %zext = zext i16 %cast to i32 28441 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) 28442 ret i32 %readlane 28443} 28444 28445declare double @llvm.copysign.f64(double, double) 28446 28447define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { 28448; GCN-LABEL: v_copysign_f64_bf16: 28449; GCN: ; %bb.0: 28450; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28451; GCN-NEXT: s_brev_b32 s4, -2 28452; GCN-NEXT: v_bfi_b32 v1, s4, v1, v2 28453; GCN-NEXT: s_setpc_b64 s[30:31] 28454; 28455; GFX7-LABEL: v_copysign_f64_bf16: 28456; GFX7: ; %bb.0: 28457; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28458; GFX7-NEXT: s_brev_b32 s4, -2 28459; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 28460; GFX7-NEXT: s_setpc_b64 s[30:31] 28461; 28462; GFX8-LABEL: v_copysign_f64_bf16: 28463; GFX8: ; %bb.0: 28464; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28465; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 28466; GFX8-NEXT: s_brev_b32 s4, -2 28467; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 28468; GFX8-NEXT: s_setpc_b64 s[30:31] 28469; 28470; GFX9-LABEL: v_copysign_f64_bf16: 28471; GFX9: ; %bb.0: 28472; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28473; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 28474; GFX9-NEXT: s_brev_b32 s4, -2 28475; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 28476; GFX9-NEXT: s_setpc_b64 s[30:31] 28477; 28478; GFX10-LABEL: v_copysign_f64_bf16: 28479; GFX10: ; %bb.0: 28480; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28481; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 28482; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 28483; GFX10-NEXT: s_setpc_b64 s[30:31] 28484; 28485; GFX11-LABEL: v_copysign_f64_bf16: 28486; GFX11: ; %bb.0: 28487; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28488; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 28489; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) 28490; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 28491; GFX11-NEXT: s_setpc_b64 s[30:31] 28492 %sign = fpext bfloat %sign.bf16 to double 28493 %op = call double @llvm.copysign.f64(double %mag, double %sign) 28494 ret double %op 28495} 28496 28497define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) { 28498; GCN-LABEL: s_copysign_f64_bf16: 28499; GCN: ; %bb.0: 28500; GCN-NEXT: s_brev_b32 s3, -2 28501; GCN-NEXT: v_mov_b32_e32 v0, s1 28502; GCN-NEXT: v_mov_b32_e32 v1, s2 28503; GCN-NEXT: v_bfi_b32 v0, s3, v0, v1 28504; GCN-NEXT: v_readfirstlane_b32 s1, v0 28505; GCN-NEXT: ; return to shader part epilog 28506; 28507; GFX7-LABEL: s_copysign_f64_bf16: 28508; GFX7: ; %bb.0: 28509; GFX7-NEXT: s_brev_b32 s3, -2 28510; GFX7-NEXT: v_mov_b32_e32 v0, s1 28511; GFX7-NEXT: v_mov_b32_e32 v1, s2 28512; GFX7-NEXT: v_bfi_b32 v0, s3, v0, v1 28513; GFX7-NEXT: v_readfirstlane_b32 s1, v0 28514; GFX7-NEXT: ; return to shader part epilog 28515; 28516; GFX8-LABEL: s_copysign_f64_bf16: 28517; GFX8: ; %bb.0: 28518; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s2 28519; GFX8-NEXT: s_brev_b32 s2, -2 28520; GFX8-NEXT: v_mov_b32_e32 v1, s1 28521; GFX8-NEXT: v_bfi_b32 v0, s2, v1, v0 28522; GFX8-NEXT: v_readfirstlane_b32 s1, v0 28523; GFX8-NEXT: ; return to shader part epilog 28524; 28525; GFX9-LABEL: s_copysign_f64_bf16: 28526; GFX9: ; %bb.0: 28527; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s2 28528; GFX9-NEXT: s_brev_b32 s2, -2 28529; GFX9-NEXT: v_mov_b32_e32 v1, s1 28530; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0 28531; GFX9-NEXT: v_readfirstlane_b32 s1, v0 28532; GFX9-NEXT: ; return to shader part epilog 28533; 28534; GFX10-LABEL: s_copysign_f64_bf16: 28535; GFX10: ; %bb.0: 28536; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s2 28537; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 28538; GFX10-NEXT: v_readfirstlane_b32 s1, v0 28539; GFX10-NEXT: ; return to shader part epilog 28540; 28541; GFX11-LABEL: s_copysign_f64_bf16: 28542; GFX11: ; %bb.0: 
28543; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 28544; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 28545; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 28546; GFX11-NEXT: v_readfirstlane_b32 s1, v0 28547; GFX11-NEXT: ; return to shader part epilog 28548 %sign = fpext bfloat %sign.bf16 to double 28549 %op = call double @llvm.copysign.f64(double %mag, double %sign) 28550 %cast = bitcast double %op to <2 x i32> 28551 %cast.0 = extractelement <2 x i32> %cast, i32 0 28552 %cast.1 = extractelement <2 x i32> %cast, i32 1 28553 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) 28554 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) 28555 %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0 28556 %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1 28557 ret <2 x i32> %ins.1 28558} 28559 28560define i16 @v_fptosi_bf16_to_i16(bfloat %x) { 28561; GCN-LABEL: v_fptosi_bf16_to_i16: 28562; GCN: ; %bb.0: 28563; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28564; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 28565; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28566; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 28567; GCN-NEXT: s_setpc_b64 s[30:31] 28568; 28569; GFX7-LABEL: v_fptosi_bf16_to_i16: 28570; GFX7: ; %bb.0: 28571; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28572; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 28573; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28574; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 28575; GFX7-NEXT: s_setpc_b64 s[30:31] 28576; 28577; GFX8-LABEL: v_fptosi_bf16_to_i16: 28578; GFX8: ; %bb.0: 28579; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28580; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28581; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0 28582; GFX8-NEXT: s_setpc_b64 s[30:31] 28583; 28584; GFX9-LABEL: v_fptosi_bf16_to_i16: 28585; GFX9: ; %bb.0: 28586; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28587; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28588; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 
28589; GFX9-NEXT: s_setpc_b64 s[30:31] 28590; 28591; GFX10-LABEL: v_fptosi_bf16_to_i16: 28592; GFX10: ; %bb.0: 28593; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28594; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28595; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 28596; GFX10-NEXT: s_setpc_b64 s[30:31] 28597; 28598; GFX11-LABEL: v_fptosi_bf16_to_i16: 28599; GFX11: ; %bb.0: 28600; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28601; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28602; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 28603; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 28604; GFX11-NEXT: s_setpc_b64 s[30:31] 28605 %op = fptosi bfloat %x to i16 28606 ret i16 %op 28607} 28608 28609define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { 28610; GCN-LABEL: v_fptosi_v2bf16_to_v2i16: 28611; GCN: ; %bb.0: 28612; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28613; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 28614; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 28615; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28616; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28617; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 28618; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 28619; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 28620; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 28621; GCN-NEXT: v_or_b32_e32 v0, v0, v2 28622; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 28623; GCN-NEXT: s_setpc_b64 s[30:31] 28624; 28625; GFX7-LABEL: v_fptosi_v2bf16_to_v2i16: 28626; GFX7: ; %bb.0: 28627; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28628; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 28629; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 28630; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28631; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28632; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 28633; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 28634; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 28635; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 28636; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 28637; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 28638; GFX7-NEXT: s_setpc_b64 s[30:31] 28639; 
28640; GFX8-LABEL: v_fptosi_v2bf16_to_v2i16: 28641; GFX8: ; %bb.0: 28642; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28643; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 28644; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28645; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 28646; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0 28647; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 28648; GFX8-NEXT: s_setpc_b64 s[30:31] 28649; 28650; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16: 28651; GFX9: ; %bb.0: 28652; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28653; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 28654; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28655; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 28656; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 28657; GFX9-NEXT: s_mov_b32 s4, 0x5040100 28658; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 28659; GFX9-NEXT: s_setpc_b64 s[30:31] 28660; 28661; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16: 28662; GFX10: ; %bb.0: 28663; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28664; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 28665; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28666; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1 28667; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 28668; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 28669; GFX10-NEXT: s_setpc_b64 s[30:31] 28670; 28671; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16: 28672; GFX11TRUE16: ; %bb.0: 28673; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28674; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 28675; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28676; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 28677; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 28678; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 28679; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 28680; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 28681; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 28682; 
28683; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16: 28684; GFX11FAKE16: ; %bb.0: 28685; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28686; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 28687; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28688; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 28689; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 28690; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 28691; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 28692; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 28693; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 28694 %op = fptosi <2 x bfloat> %x to <2 x i16> 28695 ret <2 x i16> %op 28696} 28697 28698define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { 28699; GCN-LABEL: v_fptosi_v3bf16_to_v3i16: 28700; GCN: ; %bb.0: 28701; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28702; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 28703; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 28704; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 28705; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28706; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28707; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 28708; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 28709; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 28710; GCN-NEXT: v_cvt_i32_f32_e32 v3, v2 28711; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28712; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 28713; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3 28714; GCN-NEXT: v_or_b32_e32 v0, v0, v1 28715; GCN-NEXT: v_alignbit_b32 v1, v3, v1, 16 28716; GCN-NEXT: s_setpc_b64 s[30:31] 28717; 28718; GFX7-LABEL: v_fptosi_v3bf16_to_v3i16: 28719; GFX7: ; %bb.0: 28720; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28721; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 28722; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 28723; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 28724; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28725; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28726; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 28727; GFX7-NEXT: 
v_cvt_i32_f32_e32 v0, v0 28728; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 28729; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v2 28730; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28731; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 28732; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 28733; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v3 28734; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16 28735; GFX7-NEXT: s_setpc_b64 s[30:31] 28736; 28737; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16: 28738; GFX8: ; %bb.0: 28739; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28740; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 28741; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28742; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2 28743; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 28744; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28745; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v1 28746; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 28747; GFX8-NEXT: s_setpc_b64 s[30:31] 28748; 28749; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16: 28750; GFX9: ; %bb.0: 28751; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28752; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 28753; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28754; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 28755; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 28756; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28757; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 28758; GFX9-NEXT: s_mov_b32 s4, 0x5040100 28759; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 28760; GFX9-NEXT: s_setpc_b64 s[30:31] 28761; 28762; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16: 28763; GFX10: ; %bb.0: 28764; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28765; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 28766; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28767; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28768; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 28769; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 28770; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1 28771; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 
28772; GFX10-NEXT: s_setpc_b64 s[30:31] 28773; 28774; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16: 28775; GFX11TRUE16: ; %bb.0: 28776; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28777; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 28778; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28779; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28780; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 28781; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 28782; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 28783; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 28784; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 28785; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 28786; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 28787; 28788; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16: 28789; GFX11FAKE16: ; %bb.0: 28790; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28791; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 28792; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28793; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28794; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 28795; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2 28796; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 28797; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 28798; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 28799; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 28800; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 28801 %op = fptosi <3 x bfloat> %x to <3 x i16> 28802 ret <3 x i16> %op 28803} 28804 28805define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { 28806; GCN-LABEL: v_fptosi_v4bf16_to_v4i16: 28807; GCN: ; %bb.0: 28808; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28809; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 28810; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 28811; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 28812; GCN-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 28813; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28814; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28815; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 28816; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 28817; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 28818; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 28819; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 28820; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 28821; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28822; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 28823; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 28824; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 28825; GCN-NEXT: v_or_b32_e32 v0, v0, v1 28826; GCN-NEXT: v_or_b32_e32 v2, v2, v4 28827; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 28828; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 28829; GCN-NEXT: s_setpc_b64 s[30:31] 28830; 28831; GFX7-LABEL: v_fptosi_v4bf16_to_v4i16: 28832; GFX7: ; %bb.0: 28833; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28834; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 28835; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 28836; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 28837; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 28838; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 28839; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 28840; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28841; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28842; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3 28843; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2 28844; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 28845; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 28846; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3 28847; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 28848; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28849; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 28850; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 28851; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 28852; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 28853; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 28854; GFX7-NEXT: s_setpc_b64 s[30:31] 28855; 28856; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16: 28857; GFX8: ; %bb.0: 28858; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 28859; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 28860; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28861; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 28862; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28863; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v2 28864; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v3 28865; GFX8-NEXT: v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 28866; GFX8-NEXT: v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 28867; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 28868; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 28869; GFX8-NEXT: s_setpc_b64 s[30:31] 28870; 28871; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16: 28872; GFX9: ; %bb.0: 28873; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28874; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 28875; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28876; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 28877; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28878; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 28879; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 28880; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 28881; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 28882; GFX9-NEXT: s_mov_b32 s4, 0x5040100 28883; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 28884; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 28885; GFX9-NEXT: s_setpc_b64 s[30:31] 28886; 28887; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16: 28888; GFX10: ; %bb.0: 28889; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28890; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 28891; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 28892; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28893; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28894; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 28895; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v3 28896; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 28897; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v1 28898; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 28899; 
GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 28900; GFX10-NEXT: s_setpc_b64 s[30:31] 28901; 28902; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16: 28903; GFX11TRUE16: ; %bb.0: 28904; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28905; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 28906; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28907; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 28908; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 28909; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 28910; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 28911; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 28912; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 28913; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3 28914; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 28915; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 28916; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 28917; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 28918; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 28919; 28920; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16: 28921; GFX11FAKE16: ; %bb.0: 28922; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28923; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 28924; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 28925; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28926; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28927; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 28928; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2 28929; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3 28930; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 28931; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 28932; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 28933; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 28934; 
GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 28935; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 28936; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 28937 %op = fptosi <4 x bfloat> %x to <4 x i16> 28938 ret <4 x i16> %op 28939} 28940 28941define i32 @v_fptosi_bf16_to_i32(bfloat %x) { 28942; GCN-LABEL: v_fptosi_bf16_to_i32: 28943; GCN: ; %bb.0: 28944; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28945; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 28946; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28947; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 28948; GCN-NEXT: s_setpc_b64 s[30:31] 28949; 28950; GFX7-LABEL: v_fptosi_bf16_to_i32: 28951; GFX7: ; %bb.0: 28952; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28953; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 28954; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28955; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 28956; GFX7-NEXT: s_setpc_b64 s[30:31] 28957; 28958; GFX8-LABEL: v_fptosi_bf16_to_i32: 28959; GFX8: ; %bb.0: 28960; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28961; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28962; GFX8-NEXT: v_cvt_i32_f32_e32 v0, v0 28963; GFX8-NEXT: s_setpc_b64 s[30:31] 28964; 28965; GFX9-LABEL: v_fptosi_bf16_to_i32: 28966; GFX9: ; %bb.0: 28967; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28968; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28969; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 28970; GFX9-NEXT: s_setpc_b64 s[30:31] 28971; 28972; GFX10-LABEL: v_fptosi_bf16_to_i32: 28973; GFX10: ; %bb.0: 28974; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28975; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28976; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 28977; GFX10-NEXT: s_setpc_b64 s[30:31] 28978; 28979; GFX11-LABEL: v_fptosi_bf16_to_i32: 28980; GFX11: ; %bb.0: 28981; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28982; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 28983; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 28984; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 28985; GFX11-NEXT: s_setpc_b64 s[30:31] 28986 %op = fptosi 
bfloat %x to i32 28987 ret i32 %op 28988} 28989 28990define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) { 28991; GCN-LABEL: v_fptosi_v2bf16_to_v2i32: 28992; GCN: ; %bb.0: 28993; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 28994; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 28995; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 28996; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 28997; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 28998; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 28999; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 29000; GCN-NEXT: s_setpc_b64 s[30:31] 29001; 29002; GFX7-LABEL: v_fptosi_v2bf16_to_v2i32: 29003; GFX7: ; %bb.0: 29004; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29005; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 29006; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 29007; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29008; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29009; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 29010; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 29011; GFX7-NEXT: s_setpc_b64 s[30:31] 29012; 29013; GFX8-LABEL: v_fptosi_v2bf16_to_v2i32: 29014; GFX8: ; %bb.0: 29015; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29016; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 29017; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v1 29018; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29019; GFX8-NEXT: v_cvt_i32_f32_e32 v1, v0 29020; GFX8-NEXT: v_mov_b32_e32 v0, v2 29021; GFX8-NEXT: s_setpc_b64 s[30:31] 29022; 29023; GFX9-LABEL: v_fptosi_v2bf16_to_v2i32: 29024; GFX9: ; %bb.0: 29025; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29026; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 29027; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v1 29028; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29029; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v0 29030; GFX9-NEXT: v_mov_b32_e32 v0, v2 29031; GFX9-NEXT: s_setpc_b64 s[30:31] 29032; 29033; GFX10-LABEL: v_fptosi_v2bf16_to_v2i32: 29034; GFX10: ; %bb.0: 29035; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29036; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 29037; GFX10-NEXT: v_and_b32_e32 v2, 
0xffff0000, v0 29038; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v1 29039; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v2 29040; GFX10-NEXT: s_setpc_b64 s[30:31] 29041; 29042; GFX11-LABEL: v_fptosi_v2bf16_to_v2i32: 29043; GFX11: ; %bb.0: 29044; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29045; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 29046; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 29047; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 29048; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v1 29049; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v2 29050; GFX11-NEXT: s_setpc_b64 s[30:31] 29051 %op = fptosi <2 x bfloat> %x to <2 x i32> 29052 ret <2 x i32> %op 29053} 29054 29055define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) { 29056; GCN-LABEL: v_fptosi_v3bf16_to_v3i32: 29057; GCN: ; %bb.0: 29058; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29059; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 29060; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 29061; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 29062; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29063; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29064; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 29065; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 29066; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 29067; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 29068; GCN-NEXT: s_setpc_b64 s[30:31] 29069; 29070; GFX7-LABEL: v_fptosi_v3bf16_to_v3i32: 29071; GFX7: ; %bb.0: 29072; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29073; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 29074; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 29075; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 29076; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29077; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29078; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 29079; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 29080; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 29081; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2 29082; GFX7-NEXT: s_setpc_b64 s[30:31] 29083; 29084; GFX8-LABEL: v_fptosi_v3bf16_to_v3i32: 29085; GFX8: ; %bb.0: 29086; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 29087; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29088; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29089; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2 29090; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0 29091; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 29092; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0 29093; GFX8-NEXT: v_mov_b32_e32 v0, v4 29094; GFX8-NEXT: v_mov_b32_e32 v1, v3 29095; GFX8-NEXT: s_setpc_b64 s[30:31] 29096; 29097; GFX9-LABEL: v_fptosi_v3bf16_to_v3i32: 29098; GFX9: ; %bb.0: 29099; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29100; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29101; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29102; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2 29103; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0 29104; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 29105; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0 29106; GFX9-NEXT: v_mov_b32_e32 v0, v4 29107; GFX9-NEXT: v_mov_b32_e32 v1, v3 29108; GFX9-NEXT: s_setpc_b64 s[30:31] 29109; 29110; GFX10-LABEL: v_fptosi_v3bf16_to_v3i32: 29111; GFX10: ; %bb.0: 29112; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29113; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29114; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 29115; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 29116; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2 29117; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3 29118; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4 29119; GFX10-NEXT: s_setpc_b64 s[30:31] 29120; 29121; GFX11-LABEL: v_fptosi_v3bf16_to_v3i32: 29122; GFX11: ; %bb.0: 29123; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29124; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29125; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 29126; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 29127; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 29128; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2 29129; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3 29130; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 29131; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4 29132; GFX11-NEXT: s_setpc_b64 s[30:31] 
29133 %op = fptosi <3 x bfloat> %x to <3 x i32> 29134 ret <3 x i32> %op 29135} 29136 29137define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) { 29138; GCN-LABEL: v_fptosi_v4bf16_to_v4i32: 29139; GCN: ; %bb.0: 29140; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29141; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 29142; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 29143; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 29144; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 29145; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29146; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29147; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 29148; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 29149; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 29150; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 29151; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 29152; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 29153; GCN-NEXT: s_setpc_b64 s[30:31] 29154; 29155; GFX7-LABEL: v_fptosi_v4bf16_to_v4i32: 29156; GFX7: ; %bb.0: 29157; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29158; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 29159; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 29160; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 29161; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 29162; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29163; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29164; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 29165; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 29166; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 29167; GFX7-NEXT: v_cvt_i32_f32_e32 v1, v1 29168; GFX7-NEXT: v_cvt_i32_f32_e32 v2, v2 29169; GFX7-NEXT: v_cvt_i32_f32_e32 v3, v3 29170; GFX7-NEXT: s_setpc_b64 s[30:31] 29171; 29172; GFX8-LABEL: v_fptosi_v4bf16_to_v4i32: 29173; GFX8: ; %bb.0: 29174; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29175; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29176; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29177; GFX8-NEXT: v_cvt_i32_f32_e32 v5, v0 29178; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 29179; GFX8-NEXT: v_cvt_i32_f32_e32 v4, v2 29180; GFX8-NEXT: v_cvt_i32_f32_e32 v2, v0 29181; GFX8-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v1 29182; GFX8-NEXT: v_cvt_i32_f32_e32 v3, v0 29183; GFX8-NEXT: v_mov_b32_e32 v0, v4 29184; GFX8-NEXT: v_mov_b32_e32 v1, v5 29185; GFX8-NEXT: s_setpc_b64 s[30:31] 29186; 29187; GFX9-LABEL: v_fptosi_v4bf16_to_v4i32: 29188; GFX9: ; %bb.0: 29189; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29190; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29191; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29192; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v0 29193; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 29194; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2 29195; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v0 29196; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 29197; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v0 29198; GFX9-NEXT: v_mov_b32_e32 v0, v4 29199; GFX9-NEXT: v_mov_b32_e32 v1, v5 29200; GFX9-NEXT: s_setpc_b64 s[30:31] 29201; 29202; GFX10-LABEL: v_fptosi_v4bf16_to_v4i32: 29203; GFX10: ; %bb.0: 29204; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29205; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29206; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 29207; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 29208; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 29209; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v2 29210; GFX10-NEXT: v_cvt_i32_f32_e32 v1, v3 29211; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v4 29212; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v5 29213; GFX10-NEXT: s_setpc_b64 s[30:31] 29214; 29215; GFX11-LABEL: v_fptosi_v4bf16_to_v4i32: 29216; GFX11: ; %bb.0: 29217; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29218; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29219; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 29220; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 29221; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 29222; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 29223; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v2 29224; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v3 29225; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 29226; GFX11-NEXT: 
v_cvt_i32_f32_e32 v2, v4 29227; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v5 29228; GFX11-NEXT: s_setpc_b64 s[30:31] 29229 %op = fptosi <4 x bfloat> %x to <4 x i32> 29230 ret <4 x i32> %op 29231} 29232 29233define i64 @v_fptosi_bf16_to_i64(bfloat %x) { 29234; GCN-LABEL: v_fptosi_bf16_to_i64: 29235; GCN: ; %bb.0: 29236; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29237; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 29238; GCN-NEXT: s_mov_b32 s4, 0x2f800000 29239; GCN-NEXT: s_mov_b32 s5, 0xcf800000 29240; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29241; GCN-NEXT: v_trunc_f32_e32 v0, v0 29242; GCN-NEXT: v_mul_f32_e64 v1, |v0|, s4 29243; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 29244; GCN-NEXT: v_floor_f32_e32 v1, v1 29245; GCN-NEXT: v_fma_f32 v0, v1, s5, |v0| 29246; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 29247; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 29248; GCN-NEXT: v_xor_b32_e32 v1, v1, v2 29249; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 29250; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 29251; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 29252; GCN-NEXT: s_setpc_b64 s[30:31] 29253; 29254; GFX7-LABEL: v_fptosi_bf16_to_i64: 29255; GFX7: ; %bb.0: 29256; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29257; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 29258; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29259; GFX7-NEXT: v_trunc_f32_e32 v0, v0 29260; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 29261; GFX7-NEXT: v_mul_f32_e64 v1, |v0|, s4 29262; GFX7-NEXT: v_floor_f32_e32 v1, v1 29263; GFX7-NEXT: s_mov_b32 s4, 0xcf800000 29264; GFX7-NEXT: v_fma_f32 v2, v1, s4, |v0| 29265; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2 29266; GFX7-NEXT: v_cvt_u32_f32_e32 v1, v1 29267; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v0 29268; GFX7-NEXT: v_xor_b32_e32 v0, v2, v3 29269; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3 29270; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 29271; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc 29272; GFX7-NEXT: s_setpc_b64 s[30:31] 29273; 29274; GFX8-LABEL: v_fptosi_bf16_to_i64: 29275; GFX8: ; %bb.0: 29276; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 29277; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 29278; GFX8-NEXT: v_trunc_f32_e32 v0, v0 29279; GFX8-NEXT: s_mov_b32 s4, 0x2f800000 29280; GFX8-NEXT: v_mul_f32_e64 v1, |v0|, s4 29281; GFX8-NEXT: v_floor_f32_e32 v1, v1 29282; GFX8-NEXT: s_mov_b32 s4, 0xcf800000 29283; GFX8-NEXT: v_fma_f32 v2, v1, s4, |v0| 29284; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 29285; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 29286; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v0 29287; GFX8-NEXT: v_xor_b32_e32 v0, v2, v3 29288; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3 29289; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 29290; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc 29291; GFX8-NEXT: s_setpc_b64 s[30:31] 29292; 29293; GFX9-LABEL: v_fptosi_bf16_to_i64: 29294; GFX9: ; %bb.0: 29295; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29296; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 29297; GFX9-NEXT: v_trunc_f32_e32 v0, v0 29298; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 29299; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, s4 29300; GFX9-NEXT: v_floor_f32_e32 v1, v1 29301; GFX9-NEXT: s_mov_b32 s4, 0xcf800000 29302; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 29303; GFX9-NEXT: v_fma_f32 v1, v1, s4, |v0| 29304; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 29305; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0 29306; GFX9-NEXT: v_xor_b32_e32 v2, v2, v3 29307; GFX9-NEXT: v_xor_b32_e32 v0, v1, v3 29308; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 29309; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc 29310; GFX9-NEXT: s_setpc_b64 s[30:31] 29311; 29312; GFX10-LABEL: v_fptosi_bf16_to_i64: 29313; GFX10: ; %bb.0: 29314; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29315; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 29316; GFX10-NEXT: v_trunc_f32_e32 v0, v0 29317; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| 29318; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v0 29319; GFX10-NEXT: v_floor_f32_e32 v1, v1 29320; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| 29321; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 29322; GFX10-NEXT: v_cvt_u32_f32_e32 v0, 
v2 29323; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3 29324; GFX10-NEXT: v_xor_b32_e32 v0, v0, v3 29325; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3 29326; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 29327; GFX10-NEXT: s_setpc_b64 s[30:31] 29328; 29329; GFX11-LABEL: v_fptosi_bf16_to_i64: 29330; GFX11: ; %bb.0: 29331; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29332; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 29333; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 29334; GFX11-NEXT: v_trunc_f32_e32 v0, v0 29335; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| 29336; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v0 29337; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 29338; GFX11-NEXT: v_floor_f32_e32 v1, v1 29339; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| 29340; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 29341; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 29342; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v2 29343; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3 29344; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 29345; GFX11-NEXT: v_xor_b32_e32 v0, v0, v3 29346; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3 29347; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 29348; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 29349; GFX11-NEXT: s_setpc_b64 s[30:31] 29350 %op = fptosi bfloat %x to i64 29351 ret i64 %op 29352} 29353 29354define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { 29355; GCN-LABEL: v_fptosi_v2bf16_to_v2i64: 29356; GCN: ; %bb.0: 29357; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29358; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 29359; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 29360; GCN-NEXT: s_mov_b32 s4, 0x2f800000 29361; GCN-NEXT: s_mov_b32 s5, 0xcf800000 29362; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29363; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29364; GCN-NEXT: v_trunc_f32_e32 v0, v0 29365; GCN-NEXT: 
v_trunc_f32_e32 v1, v1 29366; GCN-NEXT: v_mul_f32_e64 v2, |v0|, s4 29367; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v0 29368; GCN-NEXT: v_mul_f32_e64 v4, |v1|, s4 29369; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 29370; GCN-NEXT: v_floor_f32_e32 v2, v2 29371; GCN-NEXT: v_floor_f32_e32 v4, v4 29372; GCN-NEXT: v_fma_f32 v0, v2, s5, |v0| 29373; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 29374; GCN-NEXT: v_fma_f32 v1, v4, s5, |v1| 29375; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 29376; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 29377; GCN-NEXT: v_xor_b32_e32 v2, v2, v3 29378; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 29379; GCN-NEXT: v_xor_b32_e32 v4, v4, v5 29380; GCN-NEXT: v_xor_b32_e32 v0, v0, v3 29381; GCN-NEXT: v_xor_b32_e32 v6, v1, v5 29382; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 29383; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc 29384; GCN-NEXT: v_sub_i32_e32 v2, vcc, v6, v5 29385; GCN-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc 29386; GCN-NEXT: s_setpc_b64 s[30:31] 29387; 29388; GFX7-LABEL: v_fptosi_v2bf16_to_v2i64: 29389; GFX7: ; %bb.0: 29390; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29391; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 29392; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29393; GFX7-NEXT: v_trunc_f32_e32 v0, v0 29394; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 29395; GFX7-NEXT: v_mul_f32_e64 v2, |v0|, s4 29396; GFX7-NEXT: v_floor_f32_e32 v2, v2 29397; GFX7-NEXT: s_mov_b32 s5, 0xcf800000 29398; GFX7-NEXT: v_fma_f32 v3, v2, s5, |v0| 29399; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 29400; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 29401; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v0 29402; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29403; GFX7-NEXT: v_xor_b32_e32 v0, v3, v4 29404; GFX7-NEXT: v_trunc_f32_e32 v3, v1 29405; GFX7-NEXT: v_mul_f32_e64 v1, |v3|, s4 29406; GFX7-NEXT: v_floor_f32_e32 v1, v1 29407; GFX7-NEXT: v_cvt_u32_f32_e32 v2, v2 29408; GFX7-NEXT: v_fma_f32 v5, v1, s5, |v3| 29409; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5 29410; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v1 29411; GFX7-NEXT: v_xor_b32_e32 v2, v2, 
v4 29412; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 29413; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v3 29414; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc 29415; GFX7-NEXT: v_xor_b32_e32 v2, v5, v3 29416; GFX7-NEXT: v_xor_b32_e32 v4, v6, v3 29417; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 29418; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc 29419; GFX7-NEXT: s_setpc_b64 s[30:31] 29420; 29421; GFX8-LABEL: v_fptosi_v2bf16_to_v2i64: 29422; GFX8: ; %bb.0: 29423; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29424; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 29425; GFX8-NEXT: v_trunc_f32_e32 v1, v1 29426; GFX8-NEXT: s_mov_b32 s4, 0x2f800000 29427; GFX8-NEXT: v_mul_f32_e64 v2, |v1|, s4 29428; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29429; GFX8-NEXT: v_floor_f32_e32 v2, v2 29430; GFX8-NEXT: s_mov_b32 s5, 0xcf800000 29431; GFX8-NEXT: v_trunc_f32_e32 v4, v0 29432; GFX8-NEXT: v_fma_f32 v3, v2, s5, |v1| 29433; GFX8-NEXT: v_mul_f32_e64 v0, |v4|, s4 29434; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 29435; GFX8-NEXT: v_floor_f32_e32 v0, v0 29436; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 29437; GFX8-NEXT: v_fma_f32 v5, v0, s5, |v4| 29438; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5 29439; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 29440; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 29441; GFX8-NEXT: v_xor_b32_e32 v3, v3, v1 29442; GFX8-NEXT: v_xor_b32_e32 v2, v2, v1 29443; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v3, v1 29444; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 29445; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 29446; GFX8-NEXT: v_xor_b32_e32 v2, v5, v3 29447; GFX8-NEXT: v_xor_b32_e32 v4, v6, v3 29448; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 29449; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc 29450; GFX8-NEXT: s_setpc_b64 s[30:31] 29451; 29452; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64: 29453; GFX9: ; %bb.0: 29454; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29455; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 29456; GFX9-NEXT: v_trunc_f32_e32 v1, v1 29457; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 29458; 
GFX9-NEXT: v_mul_f32_e64 v2, |v1|, s4 29459; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29460; GFX9-NEXT: v_floor_f32_e32 v2, v2 29461; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 29462; GFX9-NEXT: v_trunc_f32_e32 v4, v0 29463; GFX9-NEXT: v_fma_f32 v3, v2, s5, |v1| 29464; GFX9-NEXT: v_mul_f32_e64 v0, |v4|, s4 29465; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 29466; GFX9-NEXT: v_floor_f32_e32 v0, v0 29467; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 29468; GFX9-NEXT: v_fma_f32 v5, v0, s5, |v4| 29469; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 29470; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 29471; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 29472; GFX9-NEXT: v_xor_b32_e32 v3, v3, v1 29473; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1 29474; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 29475; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v4 29476; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 29477; GFX9-NEXT: v_xor_b32_e32 v2, v5, v3 29478; GFX9-NEXT: v_xor_b32_e32 v4, v6, v3 29479; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 29480; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc 29481; GFX9-NEXT: s_setpc_b64 s[30:31] 29482; 29483; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64: 29484; GFX10: ; %bb.0: 29485; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29486; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 29487; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29488; GFX10-NEXT: v_trunc_f32_e32 v1, v1 29489; GFX10-NEXT: v_trunc_f32_e32 v0, v0 29490; GFX10-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1| 29491; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0| 29492; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v0 29493; GFX10-NEXT: v_floor_f32_e32 v2, v2 29494; GFX10-NEXT: v_floor_f32_e32 v3, v3 29495; GFX10-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1| 29496; GFX10-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0| 29497; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v1 29498; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 29499; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 29500; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v4 29501; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v5 29502; GFX10-NEXT: 
v_xor_b32_e32 v2, v2, v1 29503; GFX10-NEXT: v_xor_b32_e32 v3, v3, v6 29504; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 29505; GFX10-NEXT: v_xor_b32_e32 v4, v4, v6 29506; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1 29507; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo 29508; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6 29509; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo 29510; GFX10-NEXT: s_setpc_b64 s[30:31] 29511; 29512; GFX11-LABEL: v_fptosi_v2bf16_to_v2i64: 29513; GFX11: ; %bb.0: 29514; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29515; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 29516; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29517; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 29518; GFX11-NEXT: v_trunc_f32_e32 v1, v1 29519; GFX11-NEXT: v_trunc_f32_e32 v0, v0 29520; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 29521; GFX11-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v1| 29522; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v0| 29523; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v0 29524; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 29525; GFX11-NEXT: v_floor_f32_e32 v2, v2 29526; GFX11-NEXT: v_floor_f32_e32 v3, v3 29527; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 29528; GFX11-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v1| 29529; GFX11-NEXT: v_fma_f32 v5, 0xcf800000, v3, |v0| 29530; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v1 29531; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2 29532; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 29533; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v4 29534; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v5 29535; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 29536; GFX11-NEXT: v_xor_b32_e32 v2, v2, v1 29537; GFX11-NEXT: v_xor_b32_e32 v3, v3, v6 29538; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 29539; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1 
29540; GFX11-NEXT: v_xor_b32_e32 v4, v4, v6 29541; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 29542; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1 29543; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo 29544; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6 29545; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo 29546; GFX11-NEXT: s_setpc_b64 s[30:31] 29547 %op = fptosi <2 x bfloat> %x to <2 x i64> 29548 ret <2 x i64> %op 29549} 29550 29551define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { 29552; GCN-LABEL: v_fptosi_v3bf16_to_v3i64: 29553; GCN: ; %bb.0: 29554; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29555; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 29556; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 29557; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 29558; GCN-NEXT: s_mov_b32 s4, 0x2f800000 29559; GCN-NEXT: s_mov_b32 s5, 0xcf800000 29560; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29561; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29562; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 29563; GCN-NEXT: v_trunc_f32_e32 v0, v0 29564; GCN-NEXT: v_trunc_f32_e32 v1, v1 29565; GCN-NEXT: v_trunc_f32_e32 v2, v2 29566; GCN-NEXT: v_mul_f32_e64 v3, |v0|, s4 29567; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 29568; GCN-NEXT: v_mul_f32_e64 v5, |v1|, s4 29569; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1 29570; GCN-NEXT: v_mul_f32_e64 v7, |v2|, s4 29571; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v2 29572; GCN-NEXT: v_floor_f32_e32 v3, v3 29573; GCN-NEXT: v_floor_f32_e32 v5, v5 29574; GCN-NEXT: v_floor_f32_e32 v7, v7 29575; GCN-NEXT: v_fma_f32 v0, v3, s5, |v0| 29576; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 29577; GCN-NEXT: v_fma_f32 v1, v5, s5, |v1| 29578; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 29579; GCN-NEXT: v_fma_f32 v2, v7, s5, |v2| 29580; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 29581; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 29582; GCN-NEXT: v_xor_b32_e32 v3, v3, v4 29583; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 29584; GCN-NEXT: v_xor_b32_e32 v5, v5, v6 29585; GCN-NEXT: 
v_cvt_u32_f32_e32 v2, v2 29586; GCN-NEXT: v_xor_b32_e32 v7, v7, v8 29587; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 29588; GCN-NEXT: v_xor_b32_e32 v9, v1, v6 29589; GCN-NEXT: v_xor_b32_e32 v10, v2, v8 29590; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 29591; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc 29592; GCN-NEXT: v_sub_i32_e32 v2, vcc, v9, v6 29593; GCN-NEXT: v_subb_u32_e32 v3, vcc, v5, v6, vcc 29594; GCN-NEXT: v_sub_i32_e32 v4, vcc, v10, v8 29595; GCN-NEXT: v_subb_u32_e32 v5, vcc, v7, v8, vcc 29596; GCN-NEXT: s_setpc_b64 s[30:31] 29597; 29598; GFX7-LABEL: v_fptosi_v3bf16_to_v3i64: 29599; GFX7: ; %bb.0: 29600; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29601; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 29602; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29603; GFX7-NEXT: v_trunc_f32_e32 v0, v0 29604; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 29605; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4 29606; GFX7-NEXT: v_floor_f32_e32 v3, v3 29607; GFX7-NEXT: s_mov_b32 s5, 0xcf800000 29608; GFX7-NEXT: v_fma_f32 v4, v3, s5, |v0| 29609; GFX7-NEXT: v_cvt_u32_f32_e32 v4, v4 29610; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 29611; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v0 29612; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29613; GFX7-NEXT: v_xor_b32_e32 v0, v4, v5 29614; GFX7-NEXT: v_trunc_f32_e32 v4, v1 29615; GFX7-NEXT: v_mul_f32_e64 v1, |v4|, s4 29616; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 29617; GFX7-NEXT: v_floor_f32_e32 v1, v1 29618; GFX7-NEXT: v_fma_f32 v6, v1, s5, |v4| 29619; GFX7-NEXT: v_cvt_u32_f32_e32 v6, v6 29620; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 29621; GFX7-NEXT: v_xor_b32_e32 v3, v3, v5 29622; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 29623; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v1 29624; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v5, vcc 29625; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v4 29626; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 29627; GFX7-NEXT: v_xor_b32_e32 v5, v6, v3 29628; GFX7-NEXT: v_trunc_f32_e32 v6, v2 29629; GFX7-NEXT: v_mul_f32_e64 v2, |v6|, s4 29630; GFX7-NEXT: v_floor_f32_e32 
v2, v2 29631; GFX7-NEXT: v_xor_b32_e32 v4, v7, v3 29632; GFX7-NEXT: v_fma_f32 v7, v2, s5, |v6| 29633; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7 29634; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v2 29635; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v5, v3 29636; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v6 29637; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc 29638; GFX7-NEXT: v_xor_b32_e32 v4, v7, v5 29639; GFX7-NEXT: v_xor_b32_e32 v6, v8, v5 29640; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 29641; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc 29642; GFX7-NEXT: s_setpc_b64 s[30:31] 29643; 29644; GFX8-LABEL: v_fptosi_v3bf16_to_v3i64: 29645; GFX8: ; %bb.0: 29646; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29647; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29648; GFX8-NEXT: v_trunc_f32_e32 v2, v2 29649; GFX8-NEXT: s_mov_b32 s4, 0x2f800000 29650; GFX8-NEXT: v_mul_f32_e64 v3, |v2|, s4 29651; GFX8-NEXT: v_floor_f32_e32 v3, v3 29652; GFX8-NEXT: s_mov_b32 s5, 0xcf800000 29653; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2| 29654; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29655; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4 29656; GFX8-NEXT: v_trunc_f32_e32 v5, v0 29657; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 29658; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4 29659; GFX8-NEXT: v_floor_f32_e32 v0, v0 29660; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2 29661; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5| 29662; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2 29663; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v6 29664; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 29665; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2 29666; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v0 29667; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2 29668; GFX8-NEXT: v_trunc_f32_e32 v1, v1 29669; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v3, v2, vcc 29670; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5 29671; GFX8-NEXT: v_mul_f32_e64 v5, |v1|, s4 29672; GFX8-NEXT: v_floor_f32_e32 v5, v5 29673; GFX8-NEXT: v_xor_b32_e32 v2, v7, v3 29674; GFX8-NEXT: v_fma_f32 v7, v5, s5, |v1| 29675; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7 29676; GFX8-NEXT: 
v_cvt_u32_f32_e32 v5, v5 29677; GFX8-NEXT: v_xor_b32_e32 v4, v8, v3 29678; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 29679; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 29680; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc 29681; GFX8-NEXT: v_xor_b32_e32 v4, v7, v1 29682; GFX8-NEXT: v_xor_b32_e32 v5, v5, v1 29683; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v1 29684; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc 29685; GFX8-NEXT: v_mov_b32_e32 v1, v6 29686; GFX8-NEXT: s_setpc_b64 s[30:31] 29687; 29688; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64: 29689; GFX9: ; %bb.0: 29690; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29691; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29692; GFX9-NEXT: v_trunc_f32_e32 v2, v2 29693; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 29694; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4 29695; GFX9-NEXT: v_floor_f32_e32 v3, v3 29696; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 29697; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2| 29698; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29699; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 29700; GFX9-NEXT: v_trunc_f32_e32 v5, v0 29701; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 29702; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4 29703; GFX9-NEXT: v_floor_f32_e32 v0, v0 29704; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 29705; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5| 29706; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2 29707; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 29708; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 29709; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2 29710; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v0 29711; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 29712; GFX9-NEXT: v_trunc_f32_e32 v1, v1 29713; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc 29714; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 29715; GFX9-NEXT: v_mul_f32_e64 v5, |v1|, s4 29716; GFX9-NEXT: v_floor_f32_e32 v5, v5 29717; GFX9-NEXT: v_xor_b32_e32 v2, v7, v3 29718; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v1| 29719; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 29720; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 29721; GFX9-NEXT: v_xor_b32_e32 v4, v8, v3 
29722; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 29723; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 29724; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc 29725; GFX9-NEXT: v_xor_b32_e32 v4, v7, v1 29726; GFX9-NEXT: v_xor_b32_e32 v5, v5, v1 29727; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 29728; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc 29729; GFX9-NEXT: v_mov_b32_e32 v1, v6 29730; GFX9-NEXT: s_setpc_b64 s[30:31] 29731; 29732; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64: 29733; GFX10: ; %bb.0: 29734; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29735; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29736; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29737; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 29738; GFX10-NEXT: v_trunc_f32_e32 v2, v2 29739; GFX10-NEXT: v_trunc_f32_e32 v0, v0 29740; GFX10-NEXT: v_trunc_f32_e32 v1, v1 29741; GFX10-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2| 29742; GFX10-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0| 29743; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1| 29744; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2 29745; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0 29746; GFX10-NEXT: v_floor_f32_e32 v3, v3 29747; GFX10-NEXT: v_floor_f32_e32 v4, v4 29748; GFX10-NEXT: v_floor_f32_e32 v6, v6 29749; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v1 29750; GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2| 29751; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0| 29752; GFX10-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1| 29753; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 29754; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4 29755; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 29756; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 29757; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 29758; GFX10-NEXT: v_xor_b32_e32 v3, v3, v5 29759; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 29760; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5 29761; GFX10-NEXT: v_xor_b32_e32 v9, v0, v7 29762; GFX10-NEXT: v_xor_b32_e32 v4, v4, v7 29763; GFX10-NEXT: v_xor_b32_e32 v10, v1, v8 29764; GFX10-NEXT: v_xor_b32_e32 v6, v6, v8 29765; GFX10-NEXT: v_sub_co_u32 v0, 
vcc_lo, v2, v5 29766; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo 29767; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7 29768; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo 29769; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8 29770; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo 29771; GFX10-NEXT: s_setpc_b64 s[30:31] 29772; 29773; GFX11-LABEL: v_fptosi_v3bf16_to_v3i64: 29774; GFX11: ; %bb.0: 29775; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29776; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29777; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29778; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 29779; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 29780; GFX11-NEXT: v_trunc_f32_e32 v2, v2 29781; GFX11-NEXT: v_trunc_f32_e32 v0, v0 29782; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 29783; GFX11-NEXT: v_trunc_f32_e32 v1, v1 29784; GFX11-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v2| 29785; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 29786; GFX11-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v0| 29787; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v1| 29788; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2 29789; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0 29790; GFX11-NEXT: v_floor_f32_e32 v3, v3 29791; GFX11-NEXT: v_floor_f32_e32 v4, v4 29792; GFX11-NEXT: v_floor_f32_e32 v6, v6 29793; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v1 29794; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 29795; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v3, |v2| 29796; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v4, |v0| 29797; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 29798; GFX11-NEXT: v_fma_f32 v1, 0xcf800000, v6, |v1| 29799; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 29800; GFX11-NEXT: v_cvt_u32_f32_e32 v4, v4 29801; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2 29802; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 29803; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 29804; 
GFX11-NEXT: v_xor_b32_e32 v3, v3, v5 29805; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6 29806; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5 29807; GFX11-NEXT: v_xor_b32_e32 v9, v0, v7 29808; GFX11-NEXT: v_xor_b32_e32 v4, v4, v7 29809; GFX11-NEXT: v_xor_b32_e32 v10, v1, v8 29810; GFX11-NEXT: v_xor_b32_e32 v6, v6, v8 29811; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5 29812; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo 29813; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7 29814; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo 29815; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8 29816; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo 29817; GFX11-NEXT: s_setpc_b64 s[30:31] 29818 %op = fptosi <3 x bfloat> %x to <3 x i64> 29819 ret <3 x i64> %op 29820} 29821 29822define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { 29823; GCN-LABEL: v_fptosi_v4bf16_to_v4i64: 29824; GCN: ; %bb.0: 29825; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29826; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 29827; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 29828; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 29829; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 29830; GCN-NEXT: s_mov_b32 s4, 0x2f800000 29831; GCN-NEXT: s_mov_b32 s5, 0xcf800000 29832; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29833; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29834; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 29835; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 29836; GCN-NEXT: v_trunc_f32_e32 v0, v0 29837; GCN-NEXT: v_trunc_f32_e32 v1, v1 29838; GCN-NEXT: v_trunc_f32_e32 v2, v2 29839; GCN-NEXT: v_trunc_f32_e32 v3, v3 29840; GCN-NEXT: v_mul_f32_e64 v4, |v0|, s4 29841; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0 29842; GCN-NEXT: v_mul_f32_e64 v6, |v1|, s4 29843; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v1 29844; GCN-NEXT: v_mul_f32_e64 v8, |v2|, s4 29845; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v2 29846; GCN-NEXT: v_mul_f32_e64 v10, |v3|, s4 29847; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v3 29848; GCN-NEXT: v_floor_f32_e32 v4, v4 29849; 
GCN-NEXT: v_floor_f32_e32 v6, v6 29850; GCN-NEXT: v_floor_f32_e32 v8, v8 29851; GCN-NEXT: v_floor_f32_e32 v10, v10 29852; GCN-NEXT: v_fma_f32 v0, v4, s5, |v0| 29853; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 29854; GCN-NEXT: v_fma_f32 v1, v6, s5, |v1| 29855; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 29856; GCN-NEXT: v_fma_f32 v2, v8, s5, |v2| 29857; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 29858; GCN-NEXT: v_fma_f32 v3, v10, s5, |v3| 29859; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 29860; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 29861; GCN-NEXT: v_xor_b32_e32 v4, v4, v5 29862; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 29863; GCN-NEXT: v_xor_b32_e32 v6, v6, v7 29864; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 29865; GCN-NEXT: v_xor_b32_e32 v8, v8, v9 29866; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 29867; GCN-NEXT: v_xor_b32_e32 v10, v10, v11 29868; GCN-NEXT: v_xor_b32_e32 v0, v0, v5 29869; GCN-NEXT: v_xor_b32_e32 v12, v1, v7 29870; GCN-NEXT: v_xor_b32_e32 v13, v2, v9 29871; GCN-NEXT: v_xor_b32_e32 v14, v3, v11 29872; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 29873; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v5, vcc 29874; GCN-NEXT: v_sub_i32_e32 v2, vcc, v12, v7 29875; GCN-NEXT: v_subb_u32_e32 v3, vcc, v6, v7, vcc 29876; GCN-NEXT: v_sub_i32_e32 v4, vcc, v13, v9 29877; GCN-NEXT: v_subb_u32_e32 v5, vcc, v8, v9, vcc 29878; GCN-NEXT: v_sub_i32_e32 v6, vcc, v14, v11 29879; GCN-NEXT: v_subb_u32_e32 v7, vcc, v10, v11, vcc 29880; GCN-NEXT: s_setpc_b64 s[30:31] 29881; 29882; GFX7-LABEL: v_fptosi_v4bf16_to_v4i64: 29883; GFX7: ; %bb.0: 29884; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29885; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 29886; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29887; GFX7-NEXT: v_trunc_f32_e32 v0, v0 29888; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 29889; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v3 29890; GFX7-NEXT: v_mul_f32_e64 v3, |v0|, s4 29891; GFX7-NEXT: v_floor_f32_e32 v3, v3 29892; GFX7-NEXT: s_mov_b32 s5, 0xcf800000 29893; GFX7-NEXT: v_fma_f32 v5, v3, s5, |v0| 29894; GFX7-NEXT: v_cvt_u32_f32_e32 v5, v5 29895; 
GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 29896; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v0 29897; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29898; GFX7-NEXT: v_xor_b32_e32 v0, v5, v6 29899; GFX7-NEXT: v_trunc_f32_e32 v5, v1 29900; GFX7-NEXT: v_mul_f32_e64 v1, |v5|, s4 29901; GFX7-NEXT: v_cvt_u32_f32_e32 v3, v3 29902; GFX7-NEXT: v_floor_f32_e32 v1, v1 29903; GFX7-NEXT: v_fma_f32 v7, v1, s5, |v5| 29904; GFX7-NEXT: v_cvt_u32_f32_e32 v7, v7 29905; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 29906; GFX7-NEXT: v_xor_b32_e32 v3, v3, v6 29907; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 29908; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v1 29909; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc 29910; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v5 29911; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 29912; GFX7-NEXT: v_xor_b32_e32 v6, v7, v3 29913; GFX7-NEXT: v_trunc_f32_e32 v7, v2 29914; GFX7-NEXT: v_mul_f32_e64 v2, |v7|, s4 29915; GFX7-NEXT: v_floor_f32_e32 v2, v2 29916; GFX7-NEXT: v_xor_b32_e32 v5, v8, v3 29917; GFX7-NEXT: v_fma_f32 v8, v2, s5, |v7| 29918; GFX7-NEXT: v_cvt_u32_f32_e32 v8, v8 29919; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v2 29920; GFX7-NEXT: v_sub_i32_e32 v2, vcc, v6, v3 29921; GFX7-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc 29922; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v7 29923; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 29924; GFX7-NEXT: v_xor_b32_e32 v7, v8, v5 29925; GFX7-NEXT: v_trunc_f32_e32 v8, v4 29926; GFX7-NEXT: v_mul_f32_e64 v4, |v8|, s4 29927; GFX7-NEXT: v_floor_f32_e32 v4, v4 29928; GFX7-NEXT: v_xor_b32_e32 v6, v9, v5 29929; GFX7-NEXT: v_fma_f32 v9, v4, s5, |v8| 29930; GFX7-NEXT: v_cvt_u32_f32_e32 v9, v9 29931; GFX7-NEXT: v_cvt_u32_f32_e32 v10, v4 29932; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v7, v5 29933; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v8 29934; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc 29935; GFX7-NEXT: v_xor_b32_e32 v6, v9, v7 29936; GFX7-NEXT: v_xor_b32_e32 v8, v10, v7 29937; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v6, v7 29938; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc 29939; 
GFX7-NEXT: s_setpc_b64 s[30:31] 29940; 29941; GFX8-LABEL: v_fptosi_v4bf16_to_v4i64: 29942; GFX8: ; %bb.0: 29943; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 29944; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 29945; GFX8-NEXT: v_trunc_f32_e32 v2, v2 29946; GFX8-NEXT: s_mov_b32 s4, 0x2f800000 29947; GFX8-NEXT: v_mul_f32_e64 v3, |v2|, s4 29948; GFX8-NEXT: v_floor_f32_e32 v3, v3 29949; GFX8-NEXT: s_mov_b32 s5, 0xcf800000 29950; GFX8-NEXT: v_fma_f32 v4, v3, s5, |v2| 29951; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 29952; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v4 29953; GFX8-NEXT: v_trunc_f32_e32 v5, v0 29954; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 29955; GFX8-NEXT: v_mul_f32_e64 v0, |v5|, s4 29956; GFX8-NEXT: v_floor_f32_e32 v0, v0 29957; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2 29958; GFX8-NEXT: v_fma_f32 v6, v0, s5, |v5| 29959; GFX8-NEXT: v_xor_b32_e32 v4, v4, v2 29960; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 29961; GFX8-NEXT: v_xor_b32_e32 v3, v3, v2 29962; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v0 29963; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v2 29964; GFX8-NEXT: v_subb_u32_e32 v8, vcc, v3, v2, vcc 29965; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5 29966; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 29967; GFX8-NEXT: v_trunc_f32_e32 v5, v5 29968; GFX8-NEXT: v_xor_b32_e32 v2, v6, v3 29969; GFX8-NEXT: v_mul_f32_e64 v6, |v5|, s4 29970; GFX8-NEXT: v_floor_f32_e32 v6, v6 29971; GFX8-NEXT: v_xor_b32_e32 v4, v7, v3 29972; GFX8-NEXT: v_fma_f32 v7, v6, s5, |v5| 29973; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7 29974; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 29975; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 29976; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 29977; GFX8-NEXT: v_trunc_f32_e32 v1, v1 29978; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc 29979; GFX8-NEXT: v_xor_b32_e32 v4, v7, v5 29980; GFX8-NEXT: v_mul_f32_e64 v7, |v1|, s4 29981; GFX8-NEXT: v_floor_f32_e32 v7, v7 29982; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 29983; GFX8-NEXT: v_fma_f32 v9, v7, s5, |v1| 29984; GFX8-NEXT: v_cvt_u32_f32_e32 v9, v9 
29985; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7 29986; GFX8-NEXT: v_xor_b32_e32 v6, v6, v5 29987; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 29988; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 29989; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc 29990; GFX8-NEXT: v_xor_b32_e32 v6, v9, v1 29991; GFX8-NEXT: v_xor_b32_e32 v7, v7, v1 29992; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v1 29993; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v1, vcc 29994; GFX8-NEXT: v_mov_b32_e32 v1, v8 29995; GFX8-NEXT: s_setpc_b64 s[30:31] 29996; 29997; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64: 29998; GFX9: ; %bb.0: 29999; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30000; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 30001; GFX9-NEXT: v_trunc_f32_e32 v2, v2 30002; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 30003; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4 30004; GFX9-NEXT: v_floor_f32_e32 v3, v3 30005; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 30006; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2| 30007; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30008; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 30009; GFX9-NEXT: v_trunc_f32_e32 v5, v0 30010; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 30011; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4 30012; GFX9-NEXT: v_floor_f32_e32 v0, v0 30013; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 30014; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5| 30015; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2 30016; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 30017; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2 30018; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 30019; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 30020; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc 30021; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 30022; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 30023; GFX9-NEXT: v_trunc_f32_e32 v5, v5 30024; GFX9-NEXT: v_xor_b32_e32 v2, v6, v3 30025; GFX9-NEXT: v_mul_f32_e64 v6, |v5|, s4 30026; GFX9-NEXT: v_floor_f32_e32 v6, v6 30027; GFX9-NEXT: v_xor_b32_e32 v4, v7, v3 30028; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v5| 30029; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 30030; GFX9-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 30031; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 30032; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 30033; GFX9-NEXT: v_trunc_f32_e32 v1, v1 30034; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc 30035; GFX9-NEXT: v_xor_b32_e32 v4, v7, v5 30036; GFX9-NEXT: v_mul_f32_e64 v7, |v1|, s4 30037; GFX9-NEXT: v_floor_f32_e32 v7, v7 30038; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 30039; GFX9-NEXT: v_fma_f32 v9, v7, s5, |v1| 30040; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 30041; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 30042; GFX9-NEXT: v_xor_b32_e32 v6, v6, v5 30043; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 30044; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 30045; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc 30046; GFX9-NEXT: v_xor_b32_e32 v6, v9, v1 30047; GFX9-NEXT: v_xor_b32_e32 v7, v7, v1 30048; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 30049; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc 30050; GFX9-NEXT: v_mov_b32_e32 v1, v8 30051; GFX9-NEXT: s_setpc_b64 s[30:31] 30052; 30053; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64: 30054; GFX10: ; %bb.0: 30055; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30056; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 30057; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30058; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 30059; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30060; GFX10-NEXT: v_trunc_f32_e32 v2, v2 30061; GFX10-NEXT: v_trunc_f32_e32 v0, v0 30062; GFX10-NEXT: v_trunc_f32_e32 v3, v3 30063; GFX10-NEXT: v_trunc_f32_e32 v4, v1 30064; GFX10-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2| 30065; GFX10-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0| 30066; GFX10-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3| 30067; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v2 30068; GFX10-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4| 30069; GFX10-NEXT: v_floor_f32_e32 v1, v1 30070; GFX10-NEXT: v_floor_f32_e32 v6, v6 30071; GFX10-NEXT: v_floor_f32_e32 v8, v8 30072; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v0 30073; GFX10-NEXT: v_floor_f32_e32 v9, v9 30074; 
GFX10-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2| 30075; GFX10-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0| 30076; GFX10-NEXT: v_ashrrev_i32_e32 v10, 31, v3 30077; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 30078; GFX10-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3| 30079; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 30080; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 30081; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 30082; GFX10-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4| 30083; GFX10-NEXT: v_xor_b32_e32 v1, v1, v5 30084; GFX10-NEXT: v_xor_b32_e32 v2, v2, v5 30085; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v3 30086; GFX10-NEXT: v_xor_b32_e32 v3, v0, v7 30087; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8 30088; GFX10-NEXT: v_xor_b32_e32 v6, v6, v7 30089; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11 30090; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5 30091; GFX10-NEXT: v_ashrrev_i32_e32 v13, 31, v4 30092; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v9 30093; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo 30094; GFX10-NEXT: v_xor_b32_e32 v4, v12, v10 30095; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7 30096; GFX10-NEXT: v_xor_b32_e32 v5, v8, v10 30097; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo 30098; GFX10-NEXT: v_xor_b32_e32 v6, v11, v13 30099; GFX10-NEXT: v_xor_b32_e32 v7, v9, v13 30100; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10 30101; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo 30102; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13 30103; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo 30104; GFX10-NEXT: s_setpc_b64 s[30:31] 30105; 30106; GFX11-LABEL: v_fptosi_v4bf16_to_v4i64: 30107; GFX11: ; %bb.0: 30108; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30109; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 30110; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30111; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 30112; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30113; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 30114; GFX11-NEXT: v_trunc_f32_e32 v2, v2 
30115; GFX11-NEXT: v_trunc_f32_e32 v0, v0 30116; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 30117; GFX11-NEXT: v_trunc_f32_e32 v3, v3 30118; GFX11-NEXT: v_trunc_f32_e32 v4, v1 30119; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 30120; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2| 30121; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0| 30122; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 30123; GFX11-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3| 30124; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2 30125; GFX11-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4| 30126; GFX11-NEXT: v_floor_f32_e32 v1, v1 30127; GFX11-NEXT: v_floor_f32_e32 v6, v6 30128; GFX11-NEXT: v_floor_f32_e32 v8, v8 30129; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0 30130; GFX11-NEXT: v_floor_f32_e32 v9, v9 30131; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2| 30132; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0| 30133; GFX11-NEXT: v_ashrrev_i32_e32 v10, 31, v3 30134; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 30135; GFX11-NEXT: v_fma_f32 v3, 0xcf800000, v8, |v3| 30136; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2 30137; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 30138; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6 30139; GFX11-NEXT: v_fma_f32 v11, 0xcf800000, v9, |v4| 30140; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5 30141; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5 30142; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v3 30143; GFX11-NEXT: v_xor_b32_e32 v3, v0, v7 30144; GFX11-NEXT: v_cvt_u32_f32_e32 v8, v8 30145; GFX11-NEXT: v_xor_b32_e32 v6, v6, v7 30146; GFX11-NEXT: v_cvt_u32_f32_e32 v11, v11 30147; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5 30148; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v4 30149; GFX11-NEXT: v_cvt_u32_f32_e32 v9, v9 30150; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo 30151; GFX11-NEXT: v_xor_b32_e32 v4, v12, v10 30152; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7 30153; GFX11-NEXT: v_xor_b32_e32 v5, v8, v10 30154; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, 
v7, vcc_lo 30155; GFX11-NEXT: v_xor_b32_e32 v6, v11, v13 30156; GFX11-NEXT: v_xor_b32_e32 v7, v9, v13 30157; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10 30158; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo 30159; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 30160; GFX11-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13 30161; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo 30162; GFX11-NEXT: s_setpc_b64 s[30:31] 30163 %op = fptosi <4 x bfloat> %x to <4 x i64> 30164 ret <4 x i64> %op 30165} 30166 30167define bfloat @v_sitofp_i16_to_bf16(i16 %x) { 30168; GCN-LABEL: v_sitofp_i16_to_bf16: 30169; GCN: ; %bb.0: 30170; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30171; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 30172; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 30173; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30174; GCN-NEXT: s_setpc_b64 s[30:31] 30175; 30176; GFX7-LABEL: v_sitofp_i16_to_bf16: 30177; GFX7: ; %bb.0: 30178; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30179; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 30180; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 30181; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30182; GFX7-NEXT: s_setpc_b64 s[30:31] 30183; 30184; GFX8-LABEL: v_sitofp_i16_to_bf16: 30185; GFX8: ; %bb.0: 30186; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30187; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30188; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 30189; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 30190; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 30191; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0 30192; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30193; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 30194; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30195; GFX8-NEXT: s_setpc_b64 s[30:31] 30196; 30197; GFX9-LABEL: v_sitofp_i16_to_bf16: 30198; GFX9: ; %bb.0: 30199; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30200; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 30201; GFX9-NEXT: s_movk_i32 s4, 0x7fff 30202; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 30203; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 30204; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 30205; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30206; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 30207; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30208; GFX9-NEXT: s_setpc_b64 s[30:31] 30209; 30210; GFX10-LABEL: v_sitofp_i16_to_bf16: 30211; GFX10: ; %bb.0: 30212; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30213; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30214; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 30215; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 30216; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30217; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 30218; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 30219; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30220; GFX10-NEXT: s_setpc_b64 s[30:31] 30221; 30222; GFX11-LABEL: v_sitofp_i16_to_bf16: 30223; GFX11: ; %bb.0: 30224; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30225; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 30226; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 30227; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 30228; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 30229; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 30230; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30231; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 30232; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 30233; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 30234; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 30235; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30236; GFX11-NEXT: s_setpc_b64 s[30:31] 30237 %op = sitofp i16 %x to bfloat 30238 ret bfloat %op 30239} 30240 30241define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { 30242; GCN-LABEL: v_sitofp_v2i16_to_v2bf16: 30243; GCN: ; %bb.0: 30244; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 30245; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 30246; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16 30247; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 30248; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 30249; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30250; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30251; GCN-NEXT: s_setpc_b64 s[30:31] 30252; 30253; GFX7-LABEL: v_sitofp_v2i16_to_v2bf16: 30254; GFX7: ; %bb.0: 30255; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30256; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 30257; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 30258; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 30259; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 30260; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30261; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30262; GFX7-NEXT: s_setpc_b64 s[30:31] 30263; 30264; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16: 30265; GFX8: ; %bb.0: 30266; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30267; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30268; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30269; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 30270; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 30271; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 30272; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 30273; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30274; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 30275; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 30276; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 30277; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 30278; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 30279; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30280; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 30281; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30282; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 30283; GFX8-NEXT: s_setpc_b64 s[30:31] 30284; 30285; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16: 30286; GFX9: ; %bb.0: 30287; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30288; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 30289; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30290; GFX9-NEXT: s_movk_i32 s4, 0x7fff 30291; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 30292; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 30293; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 30294; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30295; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 30296; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 30297; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 30298; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 30299; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30300; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 30301; GFX9-NEXT: s_mov_b32 s4, 0x7060302 30302; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 30303; GFX9-NEXT: s_setpc_b64 s[30:31] 30304; 30305; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16: 30306; GFX10: ; %bb.0: 30307; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30308; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30309; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30310; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 30311; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 30312; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 30313; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 30314; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 30315; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 30316; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 30317; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo 30318; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30319; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 30320; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 30321; GFX10-NEXT: s_setpc_b64 s[30:31] 30322; 30323; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16: 30324; GFX11: ; %bb.0: 30325; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30326; GFX11-NEXT: v_bfe_i32 v1, v0, 0, 16 30327; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 30328; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) 30329; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 30330; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 30331; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 30332; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 30333; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 30334; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 30335; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 30336; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 30337; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 30338; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 30339; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 30340; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo 30341; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30342; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 30343; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 30344; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 30345; GFX11-NEXT: s_setpc_b64 s[30:31] 30346 %op = sitofp <2 x i16> %x to <2 x bfloat> 30347 ret <2 x bfloat> %op 30348} 30349 30350define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { 30351; GCN-LABEL: v_sitofp_v3i16_to_v3bf16: 30352; GCN: ; %bb.0: 30353; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30354; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 30355; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16 30356; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16 30357; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 30358; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 30359; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 30360; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30361; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30362; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 30363; GCN-NEXT: s_setpc_b64 s[30:31] 30364; 30365; GFX7-LABEL: v_sitofp_v3i16_to_v3bf16: 30366; GFX7: ; %bb.0: 30367; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30368; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 30369; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 30370; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 30371; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 30372; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 30373; 
GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 30374; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30375; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30376; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 30377; GFX7-NEXT: s_setpc_b64 s[30:31] 30378; 30379; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16: 30380; GFX8: ; %bb.0: 30381; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30382; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30383; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30384; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30385; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 30386; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 30387; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 30388; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 30389; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30390; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 30391; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1 30392; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 30393; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 30394; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4 30395; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 30396; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 30397; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 30398; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 30399; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 30400; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 30401; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30402; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 30403; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30404; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 30405; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 30406; GFX8-NEXT: s_setpc_b64 s[30:31] 30407; 30408; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16: 30409; GFX9: ; %bb.0: 30410; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30411; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30412; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30413; GFX9-NEXT: s_movk_i32 s4, 0x7fff 30414; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30415; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 30416; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 30417; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 30418; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30419; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 30420; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 30421; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 30422; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 30423; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 30424; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 30425; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 30426; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 30427; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 30428; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30429; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 30430; GFX9-NEXT: s_mov_b32 s4, 0x7060302 30431; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 30432; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 30433; GFX9-NEXT: s_setpc_b64 s[30:31] 30434; 30435; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16: 30436; GFX10: ; %bb.0: 30437; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30438; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30439; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30440; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30441; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 30442; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 30443; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 30444; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 30445; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 30446; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 30447; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 30448; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 30449; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 30450; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 30451; GFX10-NEXT: 
v_cndmask_b32_e32 v2, v3, v7, vcc_lo 30452; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30453; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 30454; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 30455; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 30456; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 30457; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 30458; GFX10-NEXT: s_setpc_b64 s[30:31] 30459; 30460; GFX11TRUE16-LABEL: v_sitofp_v3i16_to_v3bf16: 30461; GFX11TRUE16: ; %bb.0: 30462; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30463; GFX11TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 16 30464; GFX11TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 16 30465; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0 30466; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 30467; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 30468; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 30469; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 30470; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 30471; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 30472; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 30473; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 30474; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 30475; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 30476; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 30477; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 30478; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 30479; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 30480; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 30481; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 30482; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo 30483; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30484; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) 30485; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 30486; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 
30487; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 30488; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 30489; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 30490; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 30491; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 30492; 30493; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16: 30494; GFX11FAKE16: ; %bb.0: 30495; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30496; GFX11FAKE16-NEXT: v_bfe_i32 v2, v0, 0, 16 30497; GFX11FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 16 30498; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0 30499; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 30500; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 30501; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 30502; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 30503; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 30504; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 30505; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 30506; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 30507; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 30508; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 30509; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 30510; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 30511; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 30512; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 30513; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 30514; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 30515; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo 30516; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30517; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) 30518; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 30519; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 30520; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 30521; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 
0x7060302 30522; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 30523; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 30524; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 30525 %op = sitofp <3 x i16> %x to <3 x bfloat> 30526 ret <3 x bfloat> %op 30527} 30528 30529define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { 30530; GCN-LABEL: v_sitofp_v4i16_to_v4bf16: 30531; GCN: ; %bb.0: 30532; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30533; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 30534; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16 30535; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16 30536; GCN-NEXT: v_bfe_i32 v3, v3, 0, 16 30537; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3 30538; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 30539; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 30540; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 30541; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30542; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30543; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 30544; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 30545; GCN-NEXT: s_setpc_b64 s[30:31] 30546; 30547; GFX7-LABEL: v_sitofp_v4i16_to_v4bf16: 30548; GFX7: ; %bb.0: 30549; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30550; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 30551; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16 30552; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 30553; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16 30554; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 30555; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 30556; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 30557; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3 30558; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30559; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30560; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 30561; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 30562; GFX7-NEXT: s_setpc_b64 s[30:31] 30563; 30564; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16: 30565; GFX8: ; %bb.0: 30566; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30567; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30568; GFX8-NEXT: 
v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30569; GFX8-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30570; GFX8-NEXT: s_movk_i32 s4, 0x7fff 30571; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 30572; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 30573; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 30574; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2 30575; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 30576; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc 30577; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 30578; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 30579; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 30580; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 30581; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30582; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 30583; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1 30584; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30585; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 30586; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 30587; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v5 30588; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 30589; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 30590; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 30591; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 30592; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 30593; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 30594; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30595; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 30596; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 30597; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30598; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 30599; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 30600; GFX8-NEXT: s_setpc_b64 s[30:31] 30601; 30602; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16: 30603; GFX9: ; %bb.0: 30604; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30605; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30606; GFX9-NEXT: s_movk_i32 s4, 0x7fff 30607; GFX9-NEXT: 
v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30608; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 30609; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 30610; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 30611; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 30612; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 30613; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30614; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30615; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 30616; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 30617; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 30618; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30619; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 30620; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 30621; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 30622; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 30623; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 30624; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 30625; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 30626; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 30627; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 30628; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30629; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 30630; GFX9-NEXT: s_mov_b32 s4, 0x7060302 30631; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 30632; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 30633; GFX9-NEXT: s_setpc_b64 s[30:31] 30634; 30635; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16: 30636; GFX10: ; %bb.0: 30637; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30638; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30639; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 30640; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30641; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 30642; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 30643; GFX10-NEXT: 
v_or_b32_e32 v5, 0x400000, v2 30644; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1 30645; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 30646; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 30647; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 30648; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1 30649; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff 30650; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 30651; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0 30652; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 30653; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 30654; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff 30655; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 30656; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff 30657; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo 30658; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30659; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo 30660; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 30661; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 30662; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo 30663; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 30664; GFX10-NEXT: s_setpc_b64 s[30:31] 30665; 30666; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16: 30667; GFX11: ; %bb.0: 30668; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30669; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16 30670; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1 30671; GFX11-NEXT: v_bfe_i32 v3, v0, 0, 16 30672; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0 30673; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 30674; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 30675; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 30676; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 30677; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 30678; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 30679; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 30680; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 30681; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 30682; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 30683; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 30684; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 30685; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 30686; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 30687; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 30688; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1 30689; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff 30690; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff 30691; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 30692; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 30693; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0 30694; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff 30695; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo 30696; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30697; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 30698; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo 30699; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 30700; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 30701; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo 30702; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 30703; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 30704; GFX11-NEXT: s_setpc_b64 s[30:31] 30705 %op = sitofp <4 x i16> %x to <4 x bfloat> 30706 ret <4 x bfloat> %op 30707} 30708 30709define bfloat @v_sitofp_i32_to_bf16(i32 %x) { 30710; GCN-LABEL: v_sitofp_i32_to_bf16: 30711; GCN: ; %bb.0: 30712; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30713; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 30714; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30715; GCN-NEXT: s_setpc_b64 s[30:31] 30716; 30717; GFX7-LABEL: v_sitofp_i32_to_bf16: 30718; GFX7: ; %bb.0: 30719; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30720; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 30721; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30722; GFX7-NEXT: s_setpc_b64 s[30:31] 30723; 30724; GFX8-LABEL: v_sitofp_i32_to_bf16: 30725; GFX8: ; %bb.0: 30726; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30727; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 30728; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 30729; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 30730; 
GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 30731; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0 30732; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30733; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 30734; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30735; GFX8-NEXT: s_setpc_b64 s[30:31] 30736; 30737; GFX9-LABEL: v_sitofp_i32_to_bf16: 30738; GFX9: ; %bb.0: 30739; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30740; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 30741; GFX9-NEXT: s_movk_i32 s4, 0x7fff 30742; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 30743; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 30744; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 30745; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30746; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 30747; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30748; GFX9-NEXT: s_setpc_b64 s[30:31] 30749; 30750; GFX10-LABEL: v_sitofp_i32_to_bf16: 30751; GFX10: ; %bb.0: 30752; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30753; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 30754; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 30755; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 30756; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30757; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 30758; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 30759; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30760; GFX10-NEXT: s_setpc_b64 s[30:31] 30761; 30762; GFX11-LABEL: v_sitofp_i32_to_bf16: 30763; GFX11: ; %bb.0: 30764; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30765; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 30766; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 30767; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 30768; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 30769; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30770; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 30771; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 30772; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 30773; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 30774; GFX11-NEXT: s_setpc_b64 
s[30:31] 30775 %op = sitofp i32 %x to bfloat 30776 ret bfloat %op 30777} 30778 30779define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { 30780; GCN-LABEL: v_sitofp_v2i32_to_v2bf16: 30781; GCN: ; %bb.0: 30782; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30783; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 30784; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 30785; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30786; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30787; GCN-NEXT: s_setpc_b64 s[30:31] 30788; 30789; GFX7-LABEL: v_sitofp_v2i32_to_v2bf16: 30790; GFX7: ; %bb.0: 30791; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30792; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 30793; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 30794; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30795; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30796; GFX7-NEXT: s_setpc_b64 s[30:31] 30797; 30798; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16: 30799; GFX8: ; %bb.0: 30800; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30801; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 30802; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 30803; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 30804; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 30805; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 30806; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 30807; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30808; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 30809; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 30810; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 30811; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 30812; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 30813; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30814; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 30815; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 30816; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 30817; GFX8-NEXT: s_setpc_b64 s[30:31] 30818; 30819; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16: 30820; GFX9: ; %bb.0: 30821; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30822; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 30823; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 
30824; GFX9-NEXT: s_movk_i32 s4, 0x7fff 30825; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 30826; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 30827; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 30828; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30829; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 30830; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 30831; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 30832; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 30833; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30834; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 30835; GFX9-NEXT: s_mov_b32 s4, 0x7060302 30836; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 30837; GFX9-NEXT: s_setpc_b64 s[30:31] 30838; 30839; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16: 30840; GFX10: ; %bb.0: 30841; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30842; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 30843; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 30844; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 30845; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 30846; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 30847; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30848; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1 30849; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 30850; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 30851; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 30852; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 30853; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo 30854; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 30855; GFX10-NEXT: s_setpc_b64 s[30:31] 30856; 30857; GFX11-LABEL: v_sitofp_v2i32_to_v2bf16: 30858; GFX11: ; %bb.0: 30859; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30860; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 30861; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 30862; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 30863; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 30864; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 30865; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 30866; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30867; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 30868; 
GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 30869; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 30870; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 30871; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 30872; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 30873; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo 30874; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 30875; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 30876; GFX11-NEXT: s_setpc_b64 s[30:31] 30877 %op = sitofp <2 x i32> %x to <2 x bfloat> 30878 ret <2 x bfloat> %op 30879} 30880 30881define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { 30882; GCN-LABEL: v_sitofp_v3i32_to_v3bf16: 30883; GCN: ; %bb.0: 30884; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30885; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 30886; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 30887; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 30888; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30889; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30890; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 30891; GCN-NEXT: s_setpc_b64 s[30:31] 30892; 30893; GFX7-LABEL: v_sitofp_v3i32_to_v3bf16: 30894; GFX7: ; %bb.0: 30895; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30896; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 30897; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 30898; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 30899; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 30900; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 30901; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 30902; GFX7-NEXT: s_setpc_b64 s[30:31] 30903; 30904; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16: 30905; GFX8: ; %bb.0: 30906; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30907; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 30908; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 30909; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 30910; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 30911; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 30912; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 30913; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2 30914; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 30915; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc 30916; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 30917; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 30918; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 30919; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 30920; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30921; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc 30922; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 30923; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 30924; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 30925; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 30926; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30927; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 30928; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 30929; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 30930; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 30931; GFX8-NEXT: v_mov_b32_e32 v1, v2 30932; GFX8-NEXT: s_setpc_b64 s[30:31] 30933; 30934; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16: 30935; GFX9: ; %bb.0: 30936; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30937; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 30938; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 30939; GFX9-NEXT: s_movk_i32 s4, 0x7fff 30940; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 30941; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 30942; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 30943; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 30944; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 30945; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 30946; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 30947; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 30948; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 30949; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 30950; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 30951; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 30952; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 30953; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 30954; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 30955; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 30956; GFX9-NEXT: s_mov_b32 s4, 0x7060302 30957; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 30958; GFX9-NEXT: 
v_alignbit_b32 v1, s4, v2, 16 30959; GFX9-NEXT: s_setpc_b64 s[30:31] 30960; 30961; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16: 30962; GFX10: ; %bb.0: 30963; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30964; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 30965; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 30966; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 30967; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 30968; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 30969; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0 30970; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30971; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 30972; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 30973; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 30974; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff 30975; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 30976; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 30977; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 30978; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 30979; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo 30980; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 30981; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 30982; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo 30983; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 30984; GFX10-NEXT: s_setpc_b64 s[30:31] 30985; 30986; GFX11TRUE16-LABEL: v_sitofp_v3i32_to_v3bf16: 30987; GFX11TRUE16: ; %bb.0: 30988; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 30989; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 30990; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 30991; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 30992; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 30993; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 30994; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 30995; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 30996; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 30997; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 30998; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 30999; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 31000; 
GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff 31001; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 31002; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 31003; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 31004; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 31005; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo 31006; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 31007; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 31008; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 31009; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo 31010; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16 31011; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 31012; 31013; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16: 31014; GFX11FAKE16: ; %bb.0: 31015; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31016; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 31017; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 31018; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 31019; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 31020; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 31021; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 31022; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 31023; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 31024; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 31025; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 31026; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 31027; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff 31028; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 31029; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 31030; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 31031; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 31032; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo 31033; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 31034; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 31035; 
GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 31036; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo 31037; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16 31038; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 31039 %op = sitofp <3 x i32> %x to <3 x bfloat> 31040 ret <3 x bfloat> %op 31041} 31042 31043define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { 31044; GCN-LABEL: v_sitofp_v4i32_to_v4bf16: 31045; GCN: ; %bb.0: 31046; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31047; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3 31048; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 31049; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 31050; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 31051; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 31052; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 31053; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 31054; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 31055; GCN-NEXT: s_setpc_b64 s[30:31] 31056; 31057; GFX7-LABEL: v_sitofp_v4i32_to_v4bf16: 31058; GFX7: ; %bb.0: 31059; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31060; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 31061; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 31062; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 31063; GFX7-NEXT: v_cvt_f32_i32_e32 v3, v3 31064; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 31065; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 31066; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 31067; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 31068; GFX7-NEXT: s_setpc_b64 s[30:31] 31069; 31070; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16: 31071; GFX8: ; %bb.0: 31072; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31073; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 31074; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 31075; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 31076; GFX8-NEXT: s_movk_i32 s4, 0x7fff 31077; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 31078; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 31079; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 31080; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 31081; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 31082; GFX8-NEXT: 
v_cndmask_b32_e32 v2, v5, v4, vcc 31083; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 31084; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 31085; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 31086; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3 31087; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 31088; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc 31089; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 31090; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 31091; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 31092; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 31093; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 31094; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 31095; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc 31096; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 31097; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 31098; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 31099; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 31100; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 31101; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc 31102; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 31103; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 31104; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 31105; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 31106; GFX8-NEXT: s_setpc_b64 s[30:31] 31107; 31108; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16: 31109; GFX9: ; %bb.0: 31110; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31111; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 31112; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 31113; GFX9-NEXT: s_movk_i32 s4, 0x7fff 31114; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 31115; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 31116; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 31117; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 31118; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 31119; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc 31120; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 31121; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 31122; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 31123; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 31124; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 31125; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 31126; GFX9-NEXT: 
v_bfe_u32 v4, v0, 16, 1 31127; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 31128; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 31129; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 31130; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 31131; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 31132; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 31133; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 31134; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 31135; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc 31136; GFX9-NEXT: s_mov_b32 s4, 0x7060302 31137; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 31138; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 31139; GFX9-NEXT: s_setpc_b64 s[30:31] 31140; 31141; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16: 31142; GFX10: ; %bb.0: 31143; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31144; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 31145; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 31146; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 31147; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 31148; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 31149; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 31150; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 31151; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 31152; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 31153; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 31154; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 31155; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 31156; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 31157; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 31158; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 31159; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 31160; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff 31161; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 31162; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3 31163; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 31164; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 31165; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo 31166; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 31167; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 31168; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo 31169; GFX10-NEXT: v_perm_b32 
v1, v3, v2, 0x7060302 31170; GFX10-NEXT: s_setpc_b64 s[30:31] 31171; 31172; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16: 31173; GFX11: ; %bb.0: 31174; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31175; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 31176; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 31177; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 31178; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 31179; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 31180; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 31181; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 31182; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 31183; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 31184; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 31185; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 31186; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 31187; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 31188; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 31189; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff 31190; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 31191; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 31192; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 31193; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 31194; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 31195; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 31196; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) 31197; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 31198; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo 31199; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 31200; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 31201; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 31202; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo 31203; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 31204; GFX11-NEXT: s_setpc_b64 s[30:31] 31205 %op = sitofp <4 x i32> %x to <4 x bfloat> 31206 ret <4 x bfloat> %op 31207} 31208 31209define bfloat @v_sitofp_i64_to_bf16(i64 %x) { 31210; GCN-LABEL: v_sitofp_i64_to_bf16: 31211; GCN: ; %bb.0: 31212; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
31213; GCN-NEXT: v_xor_b32_e32 v2, v0, v1 31214; GCN-NEXT: v_ffbh_i32_e32 v3, v1 31215; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v2 31216; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v3 31217; GCN-NEXT: v_add_i32_e32 v2, vcc, 32, v2 31218; GCN-NEXT: v_min_u32_e32 v2, v3, v2 31219; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 31220; GCN-NEXT: v_min_u32_e32 v0, 1, v0 31221; GCN-NEXT: v_or_b32_e32 v0, v1, v0 31222; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 31223; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 31224; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 31225; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 31226; GCN-NEXT: s_setpc_b64 s[30:31] 31227; 31228; GFX7-LABEL: v_sitofp_i64_to_bf16: 31229; GFX7: ; %bb.0: 31230; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31231; GFX7-NEXT: v_xor_b32_e32 v2, v0, v1 31232; GFX7-NEXT: v_ashrrev_i32_e32 v2, 31, v2 31233; GFX7-NEXT: v_ffbh_i32_e32 v3, v1 31234; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v2 31235; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3 31236; GFX7-NEXT: v_min_u32_e32 v2, v3, v2 31237; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 31238; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 31239; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 31240; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 31241; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 31242; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 31243; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 31244; GFX7-NEXT: s_setpc_b64 s[30:31] 31245; 31246; GFX8-LABEL: v_sitofp_i64_to_bf16: 31247; GFX8: ; %bb.0: 31248; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31249; GFX8-NEXT: v_xor_b32_e32 v2, v0, v1 31250; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v2 31251; GFX8-NEXT: v_ffbh_i32_e32 v3, v1 31252; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v2 31253; GFX8-NEXT: v_add_u32_e32 v3, vcc, -1, v3 31254; GFX8-NEXT: v_min_u32_e32 v2, v3, v2 31255; GFX8-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 31256; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 31257; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 31258; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 31259; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 31260; 
GFX8-NEXT: v_ldexp_f32 v0, v0, v1 31261; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 31262; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 31263; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 31264; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 31265; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 31266; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 31267; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 31268; GFX8-NEXT: s_setpc_b64 s[30:31] 31269; 31270; GFX9-LABEL: v_sitofp_i64_to_bf16: 31271; GFX9: ; %bb.0: 31272; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31273; GFX9-NEXT: v_xor_b32_e32 v2, v0, v1 31274; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 31275; GFX9-NEXT: v_ffbh_i32_e32 v3, v1 31276; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 31277; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 31278; GFX9-NEXT: v_min_u32_e32 v2, v3, v2 31279; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 31280; GFX9-NEXT: s_movk_i32 s4, 0x7fff 31281; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 31282; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 31283; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 31284; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 31285; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 31286; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 31287; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 31288; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 31289; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 31290; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 31291; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 31292; GFX9-NEXT: s_setpc_b64 s[30:31] 31293; 31294; GFX10-LABEL: v_sitofp_i64_to_bf16: 31295; GFX10: ; %bb.0: 31296; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31297; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1 31298; GFX10-NEXT: v_ffbh_i32_e32 v3, v1 31299; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2 31300; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3 31301; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2 31302; GFX10-NEXT: v_min_u32_e32 v2, v3, v2 31303; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 31304; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 31305; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 31306; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, 
v2 31307; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 31308; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 31309; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 31310; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 31311; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 31312; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 31313; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 31314; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 31315; GFX10-NEXT: s_setpc_b64 s[30:31] 31316; 31317; GFX11-LABEL: v_sitofp_i64_to_bf16: 31318; GFX11: ; %bb.0: 31319; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31320; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1 31321; GFX11-NEXT: v_cls_i32_e32 v3, v1 31322; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 31323; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2 31324; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3 31325; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 31326; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2 31327; GFX11-NEXT: v_min_u32_e32 v2, v3, v2 31328; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 31329; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 31330; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 31331; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 31332; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 31333; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 31334; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 31335; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 31336; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 31337; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 31338; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 31339; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 31340; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 31341; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 31342; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 31343; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 31344; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 31345; 
GFX11-NEXT: s_setpc_b64 s[30:31] 31346 %op = sitofp i64 %x to bfloat 31347 ret bfloat %op 31348} 31349 31350define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { 31351; GCN-LABEL: v_sitofp_v2i64_to_v2bf16: 31352; GCN: ; %bb.0: 31353; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31354; GCN-NEXT: v_ffbh_i32_e32 v4, v3 31355; GCN-NEXT: v_xor_b32_e32 v5, v2, v3 31356; GCN-NEXT: v_ffbh_i32_e32 v6, v1 31357; GCN-NEXT: v_xor_b32_e32 v7, v0, v1 31358; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v4 31359; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v5 31360; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6 31361; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31362; GCN-NEXT: v_add_i32_e32 v5, vcc, 32, v5 31363; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v7 31364; GCN-NEXT: v_min_u32_e32 v4, v4, v5 31365; GCN-NEXT: v_min_u32_e32 v5, v6, v7 31366; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 31367; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4 31368; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 31369; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5 31370; GCN-NEXT: v_min_u32_e32 v2, 1, v2 31371; GCN-NEXT: v_min_u32_e32 v0, 1, v0 31372; GCN-NEXT: v_or_b32_e32 v2, v3, v2 31373; GCN-NEXT: v_or_b32_e32 v0, v1, v0 31374; GCN-NEXT: v_cvt_f32_i32_e32 v1, v2 31375; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 31376; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4 31377; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5 31378; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 31379; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 31380; GCN-NEXT: s_setpc_b64 s[30:31] 31381; 31382; GFX7-LABEL: v_sitofp_v2i64_to_v2bf16: 31383; GFX7: ; %bb.0: 31384; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31385; GFX7-NEXT: v_xor_b32_e32 v5, v2, v3 31386; GFX7-NEXT: v_ffbh_i32_e32 v4, v3 31387; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 31388; GFX7-NEXT: v_add_i32_e32 v4, vcc, -1, v4 31389; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5 31390; GFX7-NEXT: v_min_u32_e32 v4, v4, v5 31391; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 31392; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1 31393; GFX7-NEXT: 
v_min_u32_e32 v2, 1, v2 31394; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 31395; GFX7-NEXT: v_ffbh_i32_e32 v3, v1 31396; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 31397; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3 31398; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5 31399; GFX7-NEXT: v_min_u32_e32 v3, v3, v5 31400; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 31401; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 31402; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 31403; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 31404; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 31405; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4 31406; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4 31407; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 31408; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 31409; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 31410; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 31411; GFX7-NEXT: s_setpc_b64 s[30:31] 31412; 31413; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16: 31414; GFX8: ; %bb.0: 31415; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31416; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1 31417; GFX8-NEXT: v_ffbh_i32_e32 v4, v1 31418; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 31419; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 31420; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 31421; GFX8-NEXT: v_min_u32_e32 v4, v4, v5 31422; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 31423; GFX8-NEXT: s_movk_i32 s4, 0x7fff 31424; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 31425; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 31426; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 31427; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4 31428; GFX8-NEXT: v_ldexp_f32 v4, v0, v1 31429; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1 31430; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 31431; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 31432; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v0 31433; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 31434; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 31435; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 31436; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 31437; GFX8-NEXT: v_min_u32_e32 v6, v0, v1 31438; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, 
v[2:3] 31439; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4 31440; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 31441; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 31442; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 31443; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 31444; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 31445; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 31446; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 31447; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 31448; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 31449; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 31450; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 31451; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 31452; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 31453; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 31454; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 31455; GFX8-NEXT: s_setpc_b64 s[30:31] 31456; 31457; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16: 31458; GFX9: ; %bb.0: 31459; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31460; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1 31461; GFX9-NEXT: v_ffbh_i32_e32 v4, v1 31462; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 31463; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 31464; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 31465; GFX9-NEXT: v_min_u32_e32 v4, v4, v5 31466; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 31467; GFX9-NEXT: s_movk_i32 s4, 0x7fff 31468; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 31469; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 31470; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 31471; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 31472; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 31473; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 31474; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 31475; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 31476; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 31477; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 31478; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 31479; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 31480; GFX9-NEXT: v_min_u32_e32 v6, v0, v1 31481; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] 31482; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 31483; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 31484; GFX9-NEXT: 
v_or_b32_e32 v0, v1, v0 31485; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 31486; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 31487; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 31488; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 31489; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 31490; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 31491; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 31492; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 31493; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 31494; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 31495; GFX9-NEXT: s_mov_b32 s4, 0x7060302 31496; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 31497; GFX9-NEXT: s_setpc_b64 s[30:31] 31498; 31499; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16: 31500; GFX10: ; %bb.0: 31501; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31502; GFX10-NEXT: v_xor_b32_e32 v4, v0, v1 31503; GFX10-NEXT: v_xor_b32_e32 v5, v2, v3 31504; GFX10-NEXT: v_ffbh_i32_e32 v6, v1 31505; GFX10-NEXT: v_ffbh_i32_e32 v7, v3 31506; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4 31507; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v5 31508; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 31509; GFX10-NEXT: v_add_nc_u32_e32 v7, -1, v7 31510; GFX10-NEXT: v_add_nc_u32_e32 v4, 32, v4 31511; GFX10-NEXT: v_add_nc_u32_e32 v5, 32, v5 31512; GFX10-NEXT: v_min_u32_e32 v4, v6, v4 31513; GFX10-NEXT: v_min_u32_e32 v5, v7, v5 31514; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 31515; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] 31516; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 31517; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 31518; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 31519; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 31520; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4 31521; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5 31522; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 31523; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 31524; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 31525; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 31526; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 31527; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 31528; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 31529; GFX10-NEXT: v_cmp_u_f32_e32 
vcc_lo, v0, v0 31530; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1 31531; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 31532; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 31533; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 31534; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 31535; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo 31536; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 31537; GFX10-NEXT: s_setpc_b64 s[30:31] 31538; 31539; GFX11-LABEL: v_sitofp_v2i64_to_v2bf16: 31540; GFX11: ; %bb.0: 31541; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31542; GFX11-NEXT: v_xor_b32_e32 v4, v0, v1 31543; GFX11-NEXT: v_xor_b32_e32 v5, v2, v3 31544; GFX11-NEXT: v_cls_i32_e32 v6, v1 31545; GFX11-NEXT: v_cls_i32_e32 v7, v3 31546; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 31547; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v4 31548; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5 31549; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 31550; GFX11-NEXT: v_add_nc_u32_e32 v6, -1, v6 31551; GFX11-NEXT: v_add_nc_u32_e32 v7, -1, v7 31552; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 31553; GFX11-NEXT: v_add_nc_u32_e32 v4, 32, v4 31554; GFX11-NEXT: v_add_nc_u32_e32 v5, 32, v5 31555; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 31556; GFX11-NEXT: v_min_u32_e32 v4, v6, v4 31557; GFX11-NEXT: v_min_u32_e32 v5, v7, v5 31558; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 31559; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 31560; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] 31561; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 31562; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 31563; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 31564; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 31565; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 31566; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 
31567; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v4 31568; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v5 31569; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 31570; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 31571; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 31572; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 31573; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 31574; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 31575; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 31576; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 31577; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 31578; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 31579; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 31580; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 31581; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 31582; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 31583; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 31584; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 31585; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 31586; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo 31587; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 31588; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 31589; GFX11-NEXT: s_setpc_b64 s[30:31] 31590 %op = sitofp <2 x i64> %x to <2 x bfloat> 31591 ret <2 x bfloat> %op 31592} 31593 31594define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { 31595; GCN-LABEL: v_sitofp_v3i64_to_v3bf16: 31596; GCN: ; %bb.0: 31597; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31598; GCN-NEXT: v_ffbh_i32_e32 v6, v5 31599; GCN-NEXT: v_xor_b32_e32 v7, v4, v5 31600; GCN-NEXT: v_ffbh_i32_e32 v8, v3 31601; GCN-NEXT: v_xor_b32_e32 v9, v2, v3 31602; GCN-NEXT: v_ffbh_i32_e32 v10, v1 31603; GCN-NEXT: v_xor_b32_e32 v11, v0, v1 31604; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v6 31605; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31606; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8 31607; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v9 31608; GCN-NEXT: 
v_add_i32_e32 v10, vcc, -1, v10 31609; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11 31610; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v7 31611; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v9 31612; GCN-NEXT: v_add_i32_e32 v11, vcc, 32, v11 31613; GCN-NEXT: v_min_u32_e32 v6, v6, v7 31614; GCN-NEXT: v_min_u32_e32 v7, v8, v9 31615; GCN-NEXT: v_min_u32_e32 v8, v10, v11 31616; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6 31617; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6 31618; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 31619; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7 31620; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 31621; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8 31622; GCN-NEXT: v_min_u32_e32 v4, 1, v4 31623; GCN-NEXT: v_min_u32_e32 v2, 1, v2 31624; GCN-NEXT: v_min_u32_e32 v0, 1, v0 31625; GCN-NEXT: v_or_b32_e32 v4, v5, v4 31626; GCN-NEXT: v_or_b32_e32 v2, v3, v2 31627; GCN-NEXT: v_or_b32_e32 v0, v1, v0 31628; GCN-NEXT: v_cvt_f32_i32_e32 v1, v4 31629; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 31630; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 31631; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6 31632; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7 31633; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8 31634; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 31635; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 31636; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 31637; GCN-NEXT: s_setpc_b64 s[30:31] 31638; 31639; GFX7-LABEL: v_sitofp_v3i64_to_v3bf16: 31640; GFX7: ; %bb.0: 31641; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31642; GFX7-NEXT: v_xor_b32_e32 v7, v4, v5 31643; GFX7-NEXT: v_ffbh_i32_e32 v6, v5 31644; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31645; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6 31646; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7 31647; GFX7-NEXT: v_min_u32_e32 v6, v6, v7 31648; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6 31649; GFX7-NEXT: v_xor_b32_e32 v7, v2, v3 31650; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 31651; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 31652; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 31653; GFX7-NEXT: v_ffbh_i32_e32 v6, v3 31654; 
GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31655; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4 31656; GFX7-NEXT: v_add_i32_e32 v6, vcc, -1, v6 31657; GFX7-NEXT: v_add_i32_e32 v7, vcc, 32, v7 31658; GFX7-NEXT: v_min_u32_e32 v6, v6, v7 31659; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 31660; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5 31661; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 31662; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1 31663; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 31664; GFX7-NEXT: v_ffbh_i32_e32 v3, v1 31665; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 31666; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3 31667; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5 31668; GFX7-NEXT: v_min_u32_e32 v3, v3, v5 31669; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 31670; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 31671; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 31672; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 31673; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 31674; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 31675; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5 31676; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 31677; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 31678; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 31679; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 31680; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 31681; GFX7-NEXT: s_setpc_b64 s[30:31] 31682; 31683; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16: 31684; GFX8: ; %bb.0: 31685; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31686; GFX8-NEXT: v_xor_b32_e32 v7, v4, v5 31687; GFX8-NEXT: v_ffbh_i32_e32 v6, v5 31688; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31689; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6 31690; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7 31691; GFX8-NEXT: v_min_u32_e32 v6, v6, v7 31692; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] 31693; GFX8-NEXT: v_xor_b32_e32 v8, v0, v1 31694; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 31695; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 31696; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 31697; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v6 31698; GFX8-NEXT: v_ffbh_i32_e32 v7, v1 31699; GFX8-NEXT: 
v_ldexp_f32 v4, v4, v5 31700; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v8 31701; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 31702; GFX8-NEXT: v_add_u32_e32 v7, vcc, -1, v7 31703; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v8 31704; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 31705; GFX8-NEXT: s_movk_i32 s4, 0x7fff 31706; GFX8-NEXT: v_min_u32_e32 v7, v7, v8 31707; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 31708; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] 31709; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 31710; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 31711; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 31712; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 31713; GFX8-NEXT: v_xor_b32_e32 v6, v2, v3 31714; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 31715; GFX8-NEXT: v_ffbh_i32_e32 v5, v3 31716; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6 31717; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 31718; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v5 31719; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6 31720; GFX8-NEXT: v_min_u32_e32 v5, v5, v6 31721; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] 31722; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 31723; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7 31724; GFX8-NEXT: v_ldexp_f32 v0, v0, v4 31725; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 31726; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 31727; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 31728; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 31729; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 31730; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 31731; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0 31732; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 31733; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc 31734; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5 31735; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 31736; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 31737; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 31738; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 31739; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 31740; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 31741; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 31742; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, 
v2 31743; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 31744; GFX8-NEXT: s_setpc_b64 s[30:31] 31745; 31746; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16: 31747; GFX9: ; %bb.0: 31748; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31749; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5 31750; GFX9-NEXT: v_ffbh_i32_e32 v6, v5 31751; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31752; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 31753; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 31754; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 31755; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] 31756; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1 31757; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 31758; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 31759; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 31760; GFX9-NEXT: v_ffbh_i32_e32 v6, v1 31761; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31762; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 31763; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 31764; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 31765; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] 31766; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 31767; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 31768; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 31769; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 31770; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 31771; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 31772; GFX9-NEXT: s_movk_i32 s4, 0x7fff 31773; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 31774; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 31775; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 31776; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 31777; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc 31778; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 31779; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 31780; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 31781; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 31782; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 31783; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 31784; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 31785; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 31786; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 31787; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] 31788; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 31789; GFX9-NEXT: 
v_min_u32_e32 v0, 1, v0 31790; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 31791; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 31792; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 31793; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 31794; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 31795; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 31796; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 31797; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 31798; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 31799; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 31800; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 31801; GFX9-NEXT: s_mov_b32 s4, 0x7060302 31802; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 31803; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 31804; GFX9-NEXT: s_setpc_b64 s[30:31] 31805; 31806; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16: 31807; GFX10: ; %bb.0: 31808; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31809; GFX10-NEXT: v_xor_b32_e32 v8, v0, v1 31810; GFX10-NEXT: v_xor_b32_e32 v7, v4, v5 31811; GFX10-NEXT: v_xor_b32_e32 v9, v2, v3 31812; GFX10-NEXT: v_ffbh_i32_e32 v10, v1 31813; GFX10-NEXT: v_ffbh_i32_e32 v6, v5 31814; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 31815; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31816; GFX10-NEXT: v_ffbh_i32_e32 v11, v3 31817; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v9 31818; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10 31819; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 31820; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6 31821; GFX10-NEXT: v_add_nc_u32_e32 v7, 32, v7 31822; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11 31823; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 31824; GFX10-NEXT: v_min_u32_e32 v8, v10, v8 31825; GFX10-NEXT: v_min_u32_e32 v6, v6, v7 31826; GFX10-NEXT: v_min_u32_e32 v7, v11, v9 31827; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 31828; GFX10-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] 31829; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v6 31830; GFX10-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] 31831; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 31832; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 31833; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 31834; 
GFX10-NEXT: v_or_b32_e32 v0, v1, v0 31835; GFX10-NEXT: v_or_b32_e32 v1, v5, v4 31836; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v7 31837; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 31838; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v8 31839; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 31840; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 31841; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 31842; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 31843; GFX10-NEXT: v_ldexp_f32 v1, v1, v6 31844; GFX10-NEXT: v_ldexp_f32 v2, v2, v4 31845; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 31846; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0 31847; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 31848; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 31849; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 31850; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 31851; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 31852; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 31853; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 31854; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 31855; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 31856; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 31857; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo 31858; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 31859; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 31860; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 31861; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 31862; GFX10-NEXT: s_setpc_b64 s[30:31] 31863; 31864; GFX11TRUE16-LABEL: v_sitofp_v3i64_to_v3bf16: 31865; GFX11TRUE16: ; %bb.0: 31866; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31867; GFX11TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1 31868; GFX11TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5 31869; GFX11TRUE16-NEXT: v_xor_b32_e32 v9, v2, v3 31870; GFX11TRUE16-NEXT: v_cls_i32_e32 v10, v1 31871; GFX11TRUE16-NEXT: v_cls_i32_e32 v6, v5 31872; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8 31873; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31874; GFX11TRUE16-NEXT: v_cls_i32_e32 v11, v3 31875; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9 31876; GFX11TRUE16-NEXT: 
v_add_nc_u32_e32 v10, -1, v10 31877; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8 31878; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v6, -1, v6 31879; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v7, 32, v7 31880; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v11, -1, v11 31881; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9 31882; GFX11TRUE16-NEXT: v_min_u32_e32 v8, v10, v8 31883; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 31884; GFX11TRUE16-NEXT: v_min_u32_e32 v6, v6, v7 31885; GFX11TRUE16-NEXT: v_min_u32_e32 v7, v11, v9 31886; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 31887; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 31888; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] 31889; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6 31890; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 31891; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] 31892; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0 31893; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 31894; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4 31895; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2 31896; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 31897; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 31898; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v4 31899; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7 31900; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) 31901; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 31902; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v8 31903; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 31904; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 31905; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 31906; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 31907; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 31908; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v6 
31909; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 31910; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 31911; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 31912; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 31913; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 31914; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 31915; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 31916; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 31917; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 31918; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 31919; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 31920; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 31921; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 31922; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 31923; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 31924; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 31925; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo 31926; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 31927; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 31928; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 31929; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 31930; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 31931; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 31932; 31933; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16: 31934; GFX11FAKE16: ; %bb.0: 31935; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31936; GFX11FAKE16-NEXT: v_xor_b32_e32 v8, v0, v1 31937; GFX11FAKE16-NEXT: v_xor_b32_e32 v7, v4, v5 31938; GFX11FAKE16-NEXT: v_xor_b32_e32 v9, v2, v3 31939; GFX11FAKE16-NEXT: v_cls_i32_e32 v10, v1 31940; GFX11FAKE16-NEXT: v_cls_i32_e32 v6, v5 31941; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8 31942; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7 31943; GFX11FAKE16-NEXT: v_cls_i32_e32 v11, v3 31944; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9 31945; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v10, 
-1, v10 31946; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8 31947; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v6, -1, v6 31948; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v7, 32, v7 31949; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v11, -1, v11 31950; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9 31951; GFX11FAKE16-NEXT: v_min_u32_e32 v8, v10, v8 31952; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 31953; GFX11FAKE16-NEXT: v_min_u32_e32 v6, v6, v7 31954; GFX11FAKE16-NEXT: v_min_u32_e32 v7, v11, v9 31955; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 31956; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 31957; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] 31958; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6 31959; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 31960; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] 31961; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0 31962; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 31963; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4 31964; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2 31965; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 31966; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0 31967; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4 31968; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7 31969; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) 31970; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 31971; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v8 31972; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 31973; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 31974; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 31975; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 31976; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3 31977; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v6 31978; GFX11FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 31979; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 31980; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 31981; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 31982; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 31983; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 31984; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 31985; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 31986; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 31987; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 31988; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 31989; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 31990; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 31991; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 31992; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 31993; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 31994; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo 31995; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 31996; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 31997; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 31998; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 31999; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 32000; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 32001 %op = sitofp <3 x i64> %x to <3 x bfloat> 32002 ret <3 x bfloat> %op 32003} 32004 32005define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { 32006; GCN-LABEL: v_sitofp_v4i64_to_v4bf16: 32007; GCN: ; %bb.0: 32008; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32009; GCN-NEXT: v_ffbh_i32_e32 v8, v7 32010; GCN-NEXT: v_xor_b32_e32 v9, v6, v7 32011; GCN-NEXT: v_ffbh_i32_e32 v10, v5 32012; GCN-NEXT: v_xor_b32_e32 v11, v4, v5 32013; GCN-NEXT: v_ffbh_i32_e32 v12, v3 32014; GCN-NEXT: v_xor_b32_e32 v13, v2, v3 32015; GCN-NEXT: v_ffbh_i32_e32 v14, v1 32016; GCN-NEXT: v_xor_b32_e32 v15, v0, v1 32017; GCN-NEXT: v_add_i32_e32 v8, vcc, -1, v8 32018; GCN-NEXT: 
v_ashrrev_i32_e32 v9, 31, v9 32019; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v10 32020; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11 32021; GCN-NEXT: v_add_i32_e32 v12, vcc, -1, v12 32022; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v13 32023; GCN-NEXT: v_add_i32_e32 v14, vcc, -1, v14 32024; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v15 32025; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v9 32026; GCN-NEXT: v_add_i32_e32 v11, vcc, 32, v11 32027; GCN-NEXT: v_add_i32_e32 v13, vcc, 32, v13 32028; GCN-NEXT: v_add_i32_e32 v15, vcc, 32, v15 32029; GCN-NEXT: v_min_u32_e32 v8, v8, v9 32030; GCN-NEXT: v_min_u32_e32 v9, v10, v11 32031; GCN-NEXT: v_min_u32_e32 v10, v12, v13 32032; GCN-NEXT: v_min_u32_e32 v11, v14, v15 32033; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8 32034; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8 32035; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9 32036; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9 32037; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 32038; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10 32039; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11 32040; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11 32041; GCN-NEXT: v_min_u32_e32 v6, 1, v6 32042; GCN-NEXT: v_min_u32_e32 v4, 1, v4 32043; GCN-NEXT: v_min_u32_e32 v2, 1, v2 32044; GCN-NEXT: v_min_u32_e32 v0, 1, v0 32045; GCN-NEXT: v_or_b32_e32 v6, v7, v6 32046; GCN-NEXT: v_or_b32_e32 v4, v5, v4 32047; GCN-NEXT: v_or_b32_e32 v2, v3, v2 32048; GCN-NEXT: v_or_b32_e32 v0, v1, v0 32049; GCN-NEXT: v_cvt_f32_i32_e32 v1, v6 32050; GCN-NEXT: v_cvt_f32_i32_e32 v3, v4 32051; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2 32052; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 32053; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8 32054; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9 32055; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10 32056; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11 32057; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 32058; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 32059; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 32060; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 32061; GCN-NEXT: s_setpc_b64 s[30:31] 32062; 32063; 
GFX7-LABEL: v_sitofp_v4i64_to_v4bf16: 32064; GFX7: ; %bb.0: 32065; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32066; GFX7-NEXT: v_xor_b32_e32 v9, v6, v7 32067; GFX7-NEXT: v_ffbh_i32_e32 v8, v7 32068; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9 32069; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8 32070; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9 32071; GFX7-NEXT: v_min_u32_e32 v8, v8, v9 32072; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8 32073; GFX7-NEXT: v_xor_b32_e32 v9, v4, v5 32074; GFX7-NEXT: v_min_u32_e32 v6, 1, v6 32075; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 32076; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8 32077; GFX7-NEXT: v_ffbh_i32_e32 v8, v5 32078; GFX7-NEXT: v_ashrrev_i32_e32 v9, 31, v9 32079; GFX7-NEXT: v_add_i32_e32 v8, vcc, -1, v8 32080; GFX7-NEXT: v_add_i32_e32 v9, vcc, 32, v9 32081; GFX7-NEXT: v_min_u32_e32 v8, v8, v9 32082; GFX7-NEXT: v_cvt_f32_i32_e32 v6, v6 32083; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8 32084; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 32085; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 32086; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8 32087; GFX7-NEXT: v_xor_b32_e32 v8, v2, v3 32088; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7 32089; GFX7-NEXT: v_ffbh_i32_e32 v7, v3 32090; GFX7-NEXT: v_ashrrev_i32_e32 v8, 31, v8 32091; GFX7-NEXT: v_cvt_f32_i32_e32 v4, v4 32092; GFX7-NEXT: v_add_i32_e32 v7, vcc, -1, v7 32093; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v8 32094; GFX7-NEXT: v_min_u32_e32 v7, v7, v8 32095; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 32096; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5 32097; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 32098; GFX7-NEXT: v_xor_b32_e32 v5, v0, v1 32099; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 32100; GFX7-NEXT: v_ffbh_i32_e32 v3, v1 32101; GFX7-NEXT: v_ashrrev_i32_e32 v5, 31, v5 32102; GFX7-NEXT: v_add_i32_e32 v3, vcc, -1, v3 32103; GFX7-NEXT: v_add_i32_e32 v5, vcc, 32, v5 32104; GFX7-NEXT: v_min_u32_e32 v3, v3, v5 32105; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 32106; GFX7-NEXT: v_cvt_f32_i32_e32 v2, v2 32107; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 
32108; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 32109; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 32110; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7 32111; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5 32112; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 32113; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 32114; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 32115; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 32116; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 32117; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 32118; GFX7-NEXT: s_setpc_b64 s[30:31] 32119; 32120; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16: 32121; GFX8: ; %bb.0: 32122; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32123; GFX8-NEXT: v_xor_b32_e32 v9, v4, v5 32124; GFX8-NEXT: v_ffbh_i32_e32 v8, v5 32125; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9 32126; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8 32127; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9 32128; GFX8-NEXT: v_min_u32_e32 v8, v8, v9 32129; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] 32130; GFX8-NEXT: s_movk_i32 s4, 0x7fff 32131; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 32132; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 32133; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 32134; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8 32135; GFX8-NEXT: v_ldexp_f32 v8, v4, v5 32136; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1 32137; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 32138; GFX8-NEXT: v_xor_b32_e32 v5, v6, v7 32139; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4 32140; GFX8-NEXT: v_ffbh_i32_e32 v4, v7 32141; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 32142; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 32143; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 32144; GFX8-NEXT: v_min_u32_e32 v10, v4, v5 32145; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] 32146; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8 32147; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 32148; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 32149; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 32150; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 32151; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1 32152; GFX8-NEXT: v_ffbh_i32_e32 v8, v1 
32153; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9 32154; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 32155; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8 32156; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9 32157; GFX8-NEXT: v_min_u32_e32 v8, v8, v9 32158; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 32159; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 32160; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 32161; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 32162; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 32163; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 32164; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 32165; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 32166; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 32167; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 32168; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 32169; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc 32170; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 32171; GFX8-NEXT: v_ldexp_f32 v6, v0, v1 32172; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1 32173; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6 32174; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 32175; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0 32176; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 32177; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 32178; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 32179; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 32180; GFX8-NEXT: v_min_u32_e32 v8, v0, v1 32181; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] 32182; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6 32183; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 32184; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 32185; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 32186; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 32187; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc 32188; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 32189; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 32190; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 32191; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 32192; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 32193; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 32194; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32195; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 32196; GFX8-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 32197; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32198; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 32199; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 32200; GFX8-NEXT: s_setpc_b64 s[30:31] 32201; 32202; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16: 32203; GFX9: ; %bb.0: 32204; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32205; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5 32206; GFX9-NEXT: v_ffbh_i32_e32 v8, v5 32207; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9 32208; GFX9-NEXT: v_add_u32_e32 v8, -1, v8 32209; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 32210; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 32211; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] 32212; GFX9-NEXT: s_movk_i32 s4, 0x7fff 32213; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 32214; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 32215; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 32216; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 32217; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 32218; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 32219; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7 32220; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 32221; GFX9-NEXT: v_ffbh_i32_e32 v4, v7 32222; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 32223; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 32224; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 32225; GFX9-NEXT: v_min_u32_e32 v10, v4, v5 32226; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] 32227; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 32228; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 32229; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1 32230; GFX9-NEXT: v_ffbh_i32_e32 v7, v1 32231; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 32232; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 32233; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 32234; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 32235; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 32236; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 32237; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] 32238; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 32239; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 32240; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 32241; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 32242; GFX9-NEXT: 
v_sub_u32_e32 v6, 32, v10 32243; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 32244; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 32245; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 32246; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 32247; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 32248; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 32249; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 32250; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc 32251; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 32252; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 32253; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 32254; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 32255; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 32256; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 32257; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 32258; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 32259; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 32260; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] 32261; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 32262; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 32263; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 32264; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 32265; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 32266; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc 32267; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 32268; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 32269; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 32270; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 32271; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 32272; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32273; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 32274; GFX9-NEXT: s_mov_b32 s4, 0x7060302 32275; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 32276; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4 32277; GFX9-NEXT: s_setpc_b64 s[30:31] 32278; 32279; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16: 32280; GFX10: ; %bb.0: 32281; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32282; GFX10-NEXT: v_xor_b32_e32 v8, v4, v5 32283; GFX10-NEXT: v_ffbh_i32_e32 v9, v5 32284; GFX10-NEXT: v_xor_b32_e32 v11, v6, v7 32285; GFX10-NEXT: v_xor_b32_e32 v13, v0, v1 32286; GFX10-NEXT: v_ffbh_i32_e32 v10, v7 32287; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v8 32288; GFX10-NEXT: 
v_add_nc_u32_e32 v9, -1, v9 32289; GFX10-NEXT: v_ffbh_i32_e32 v12, v1 32290; GFX10-NEXT: v_xor_b32_e32 v14, v2, v3 32291; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v11 32292; GFX10-NEXT: v_add_nc_u32_e32 v8, 32, v8 32293; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v10 32294; GFX10-NEXT: v_add_nc_u32_e32 v12, -1, v12 32295; GFX10-NEXT: v_ashrrev_i32_e32 v14, 31, v14 32296; GFX10-NEXT: v_add_nc_u32_e32 v11, 32, v11 32297; GFX10-NEXT: v_min_u32_e32 v8, v9, v8 32298; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v13 32299; GFX10-NEXT: v_ffbh_i32_e32 v13, v3 32300; GFX10-NEXT: v_add_nc_u32_e32 v14, 32, v14 32301; GFX10-NEXT: v_min_u32_e32 v10, v10, v11 32302; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] 32303; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9 32304; GFX10-NEXT: v_add_nc_u32_e32 v13, -1, v13 32305; GFX10-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] 32306; GFX10-NEXT: v_min_u32_e32 v9, v12, v9 32307; GFX10-NEXT: v_min_u32_e32 v11, v13, v14 32308; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 32309; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] 32310; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] 32311; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 32312; GFX10-NEXT: v_min_u32_e32 v5, 1, v6 32313; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v8 32314; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 32315; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 32316; GFX10-NEXT: v_cvt_f32_i32_e32 v4, v4 32317; GFX10-NEXT: v_or_b32_e32 v5, v7, v5 32318; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 32319; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 32320; GFX10-NEXT: v_ldexp_f32 v2, v4, v6 32321; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v5 32322; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v10 32323; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 32324; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v9 32325; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 32326; GFX10-NEXT: v_sub_nc_u32_e32 v6, 32, v11 32327; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 32328; GFX10-NEXT: v_ldexp_f32 v3, v3, v4 32329; GFX10-NEXT: v_ldexp_f32 v0, v0, v5 32330; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 32331; GFX10-NEXT: v_ldexp_f32 
v1, v1, v6 32332; GFX10-NEXT: v_add3_u32 v4, v7, v2, 0x7fff 32333; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 32334; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 32335; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 32336; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 32337; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 32338; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 32339; GFX10-NEXT: v_add3_u32 v4, v6, v3, 0x7fff 32340; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff 32341; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 32342; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32343; GFX10-NEXT: v_add3_u32 v7, v8, v1, 0x7fff 32344; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 32345; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo 32346; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 32347; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo 32348; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 32349; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 32350; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo 32351; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 32352; GFX10-NEXT: s_setpc_b64 s[30:31] 32353; 32354; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16: 32355; GFX11: ; %bb.0: 32356; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32357; GFX11-NEXT: v_xor_b32_e32 v8, v4, v5 32358; GFX11-NEXT: v_cls_i32_e32 v9, v5 32359; GFX11-NEXT: v_xor_b32_e32 v11, v6, v7 32360; GFX11-NEXT: v_xor_b32_e32 v13, v0, v1 32361; GFX11-NEXT: v_cls_i32_e32 v10, v7 32362; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v8 32363; GFX11-NEXT: v_add_nc_u32_e32 v9, -1, v9 32364; GFX11-NEXT: v_cls_i32_e32 v12, v1 32365; GFX11-NEXT: v_xor_b32_e32 v14, v2, v3 32366; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11 32367; GFX11-NEXT: v_add_nc_u32_e32 v8, 32, v8 32368; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10 32369; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12 32370; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v14 32371; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11 32372; GFX11-NEXT: v_min_u32_e32 v8, v9, v8 32373; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v13 32374; GFX11-NEXT: 
v_cls_i32_e32 v13, v3 32375; GFX11-NEXT: v_add_nc_u32_e32 v14, 32, v14 32376; GFX11-NEXT: v_min_u32_e32 v10, v10, v11 32377; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] 32378; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9 32379; GFX11-NEXT: v_add_nc_u32_e32 v13, -1, v13 32380; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 32381; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7] 32382; GFX11-NEXT: v_min_u32_e32 v9, v12, v9 32383; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 32384; GFX11-NEXT: v_min_u32_e32 v11, v13, v14 32385; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 32386; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] 32387; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 32388; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] 32389; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 32390; GFX11-NEXT: v_min_u32_e32 v5, 1, v6 32391; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v8 32392; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 32393; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 32394; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 32395; GFX11-NEXT: v_or_b32_e32 v5, v7, v5 32396; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 32397; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 32398; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 32399; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 32400; GFX11-NEXT: v_ldexp_f32 v2, v4, v6 32401; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v5 32402; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v10 32403; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 32404; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v9 32405; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 32406; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v11 32407; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 32408; GFX11-NEXT: v_ldexp_f32 v3, v3, v4 32409; GFX11-NEXT: v_ldexp_f32 v0, v0, v5 32410; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 32411; GFX11-NEXT: v_ldexp_f32 v1, v1, v6 32412; GFX11-NEXT: v_add3_u32 v4, v7, v2, 0x7fff 32413; 
GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 32414; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 32415; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 32416; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 32417; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 32418; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 32419; GFX11-NEXT: v_add3_u32 v4, v6, v3, 0x7fff 32420; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff 32421; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 32422; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32423; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff 32424; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 32425; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 32426; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo 32427; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 32428; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo 32429; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 32430; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 32431; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 32432; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo 32433; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 32434; GFX11-NEXT: s_setpc_b64 s[30:31] 32435 %op = sitofp <4 x i64> %x to <4 x bfloat> 32436 ret <4 x bfloat> %op 32437} 32438 32439define bfloat @v_uitofp_i16_to_bf16(i16 %x) { 32440; GCN-LABEL: v_uitofp_i16_to_bf16: 32441; GCN: ; %bb.0: 32442; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32443; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 32444; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 32445; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32446; GCN-NEXT: s_setpc_b64 s[30:31] 32447; 32448; GFX7-LABEL: v_uitofp_i16_to_bf16: 32449; GFX7: ; %bb.0: 32450; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32451; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 32452; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 32453; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32454; GFX7-NEXT: s_setpc_b64 s[30:31] 32455; 32456; GFX8-LABEL: v_uitofp_i16_to_bf16: 32457; GFX8: ; %bb.0: 32458; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32459; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32460; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 32461; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 32462; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 32463; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 32464; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32465; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 32466; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32467; GFX8-NEXT: s_setpc_b64 s[30:31] 32468; 32469; GFX9-LABEL: v_uitofp_i16_to_bf16: 32470; GFX9: ; %bb.0: 32471; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32472; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32473; GFX9-NEXT: s_movk_i32 s4, 0x7fff 32474; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 32475; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 32476; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 32477; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32478; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 32479; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32480; GFX9-NEXT: s_setpc_b64 s[30:31] 32481; 32482; GFX10-LABEL: v_uitofp_i16_to_bf16: 32483; GFX10: ; %bb.0: 32484; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32485; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32486; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 32487; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 32488; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32489; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 32490; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 32491; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32492; GFX10-NEXT: s_setpc_b64 s[30:31] 32493; 32494; GFX11-LABEL: v_uitofp_i16_to_bf16: 32495; GFX11: ; %bb.0: 32496; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32497; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 32498; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 32499; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 32500; GFX11-NEXT: v_bfe_u32 
v1, v0, 16, 1 32501; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 32502; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32503; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 32504; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 32505; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 32506; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 32507; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32508; GFX11-NEXT: s_setpc_b64 s[30:31] 32509 %op = uitofp i16 %x to bfloat 32510 ret bfloat %op 32511} 32512 32513define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { 32514; GCN-LABEL: v_uitofp_v2i16_to_v2bf16: 32515; GCN: ; %bb.0: 32516; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32517; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 32518; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 32519; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 32520; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 32521; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32522; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 32523; GCN-NEXT: s_setpc_b64 s[30:31] 32524; 32525; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16: 32526; GFX7: ; %bb.0: 32527; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32528; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 32529; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 32530; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 32531; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 32532; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32533; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 32534; GFX7-NEXT: s_setpc_b64 s[30:31] 32535; 32536; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16: 32537; GFX8: ; %bb.0: 32538; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32539; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32540; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32541; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 32542; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 32543; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 32544; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 32545; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 32546; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 32547; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 32548; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 32549; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 32550; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 32551; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32552; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 32553; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32554; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 32555; GFX8-NEXT: s_setpc_b64 s[30:31] 32556; 32557; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16: 32558; GFX9: ; %bb.0: 32559; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32560; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32561; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32562; GFX9-NEXT: s_movk_i32 s4, 0x7fff 32563; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 32564; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 32565; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 32566; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 32567; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 32568; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 32569; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 32570; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 32571; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32572; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 32573; GFX9-NEXT: s_mov_b32 s4, 0x7060302 32574; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 32575; GFX9-NEXT: s_setpc_b64 s[30:31] 32576; 32577; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16: 32578; GFX10: ; %bb.0: 32579; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32580; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32581; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32582; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 32583; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 32584; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 32585; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 32586; GFX10-NEXT: 
v_or_b32_e32 v5, 0x400000, v0 32587; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 32588; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 32589; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo 32590; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32591; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 32592; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 32593; GFX10-NEXT: s_setpc_b64 s[30:31] 32594; 32595; GFX11-LABEL: v_uitofp_v2i16_to_v2bf16: 32596; GFX11: ; %bb.0: 32597; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32598; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 32599; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32600; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 32601; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 32602; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 32603; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 32604; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 32605; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 32606; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 32607; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 32608; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 32609; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 32610; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 32611; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 32612; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo 32613; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32614; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 32615; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 32616; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 32617; GFX11-NEXT: s_setpc_b64 s[30:31] 32618 %op = uitofp <2 x i16> %x to <2 x bfloat> 32619 ret <2 x bfloat> %op 32620} 32621 32622define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { 32623; GCN-LABEL: v_uitofp_v3i16_to_v3bf16: 32624; GCN: ; %bb.0: 32625; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32626; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 32627; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 32628; GCN-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 32629; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 32630; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 32631; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 32632; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32633; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 32634; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 32635; GCN-NEXT: s_setpc_b64 s[30:31] 32636; 32637; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16: 32638; GFX7: ; %bb.0: 32639; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32640; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 32641; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 32642; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 32643; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 32644; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 32645; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 32646; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32647; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 32648; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 32649; GFX7-NEXT: s_setpc_b64 s[30:31] 32650; 32651; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16: 32652; GFX8: ; %bb.0: 32653; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32654; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32655; GFX8-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32656; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32657; GFX8-NEXT: s_movk_i32 s4, 0x7fff 32658; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 32659; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 32660; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 32661; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 32662; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 32663; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 32664; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 1 32665; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 32666; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 32667; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v4 32668; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 32669; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 32670; GFX8-NEXT: v_bfe_u32 
v3, v0, 16, 1 32671; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 32672; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 32673; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 32674; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32675; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 32676; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32677; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 32678; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 32679; GFX8-NEXT: s_setpc_b64 s[30:31] 32680; 32681; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16: 32682; GFX9: ; %bb.0: 32683; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32684; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32685; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32686; GFX9-NEXT: s_movk_i32 s4, 0x7fff 32687; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32688; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 32689; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 32690; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 32691; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 32692; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 32693; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 32694; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 32695; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 32696; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 32697; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 32698; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 32699; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 32700; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 32701; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32702; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 32703; GFX9-NEXT: s_mov_b32 s4, 0x7060302 32704; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 32705; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 32706; GFX9-NEXT: s_setpc_b64 s[30:31] 32707; 32708; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16: 32709; GFX10: ; %bb.0: 32710; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32711; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 32712; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32713; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32714; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 32715; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 32716; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 32717; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 32718; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 32719; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 32720; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 32721; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 32722; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 32723; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 32724; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo 32725; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32726; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 32727; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 32728; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 32729; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 32730; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 32731; GFX10-NEXT: s_setpc_b64 s[30:31] 32732; 32733; GFX11TRUE16-LABEL: v_uitofp_v3i16_to_v3bf16: 32734; GFX11TRUE16: ; %bb.0: 32735; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32736; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 32737; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 32738; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 32739; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 32740; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 32741; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 32742; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 32743; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v0 32744; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32745; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2 32746; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 32747; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 
v0, v0 32748; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 32749; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 32750; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 32751; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 32752; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 32753; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 32754; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 32755; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 32756; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 32757; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo 32758; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32759; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 32760; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 32761; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 32762; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 32763; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 32764; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 32765; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 32766; 32767; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16: 32768; GFX11FAKE16: ; %bb.0: 32769; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32770; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 32771; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 32772; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1 32773; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 32774; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 32775; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 32776; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 32777; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0 32778; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32779; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2 32780; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 32781; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 
v0, v0 32782; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 32783; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 32784; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 32785; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 32786; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 32787; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 32788; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 32789; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 32790; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 32791; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo 32792; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32793; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 32794; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 32795; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 32796; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 32797; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 32798; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 32799; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 32800 %op = uitofp <3 x i16> %x to <3 x bfloat> 32801 ret <3 x bfloat> %op 32802} 32803 32804define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { 32805; GCN-LABEL: v_uitofp_v4i16_to_v4bf16: 32806; GCN: ; %bb.0: 32807; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32808; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 32809; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 32810; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 32811; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 32812; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 32813; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 32814; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 32815; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 32816; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32817; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 32818; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 32819; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 32820; GCN-NEXT: s_setpc_b64 s[30:31] 32821; 32822; GFX7-LABEL: 
v_uitofp_v4i16_to_v4bf16: 32823; GFX7: ; %bb.0: 32824; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32825; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 32826; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 32827; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 32828; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 32829; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 32830; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 32831; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 32832; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3 32833; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32834; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 32835; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 32836; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 32837; GFX7-NEXT: s_setpc_b64 s[30:31] 32838; 32839; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16: 32840; GFX8: ; %bb.0: 32841; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32842; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32843; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32844; GFX8-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32845; GFX8-NEXT: s_movk_i32 s4, 0x7fff 32846; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 32847; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 32848; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 32849; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 32850; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 32851; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 32852; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 32853; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 32854; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 32855; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 32856; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 32857; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 32858; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1 32859; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32860; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 32861; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 32862; GFX8-NEXT: 
v_or_b32_e32 v4, 0x400000, v5 32863; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 32864; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 32865; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 32866; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 32867; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 32868; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 32869; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32870; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 32871; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 32872; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32873; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 32874; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 32875; GFX8-NEXT: s_setpc_b64 s[30:31] 32876; 32877; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16: 32878; GFX9: ; %bb.0: 32879; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32880; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32881; GFX9-NEXT: s_movk_i32 s4, 0x7fff 32882; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32883; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 32884; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 32885; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 32886; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 32887; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 32888; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32889; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32890; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 32891; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 32892; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 32893; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 32894; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 32895; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 32896; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 32897; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 32898; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 32899; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 32900; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 32901; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 32902; 
GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 32903; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 32904; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 32905; GFX9-NEXT: s_mov_b32 s4, 0x7060302 32906; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 32907; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 32908; GFX9-NEXT: s_setpc_b64 s[30:31] 32909; 32910; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16: 32911; GFX10: ; %bb.0: 32912; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32913; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32914; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 32915; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32916; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 32917; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 32918; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 32919; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1 32920; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 32921; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 32922; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 32923; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1 32924; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff 32925; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 32926; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0 32927; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 32928; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 32929; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff 32930; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 32931; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff 32932; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo 32933; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32934; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo 32935; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 32936; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 32937; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo 32938; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 32939; GFX10-NEXT: s_setpc_b64 s[30:31] 32940; 32941; GFX11-LABEL: 
v_uitofp_v4i16_to_v4bf16: 32942; GFX11: ; %bb.0: 32943; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32944; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 32945; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 32946; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 32947; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 32948; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 32949; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 32950; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 32951; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 32952; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 32953; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 32954; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 32955; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 32956; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v0 32957; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 32958; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff 32959; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 32960; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 32961; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 32962; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 32963; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 32964; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 32965; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 32966; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) 32967; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1 32968; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 32969; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0 32970; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff 32971; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff 32972; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 32973; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo 32974; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 32975; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo 32976; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 32977; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 32978; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 32979; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo 32980; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 32981; GFX11-NEXT: s_setpc_b64 s[30:31] 32982 %op = uitofp <4 x i16> %x to <4 x bfloat> 32983 ret <4 x bfloat> %op 32984} 32985 32986define bfloat @v_uitofp_i32_to_bf16(i32 %x) { 32987; GCN-LABEL: v_uitofp_i32_to_bf16: 32988; GCN: ; %bb.0: 32989; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32990; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 32991; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32992; GCN-NEXT: s_setpc_b64 s[30:31] 32993; 32994; GFX7-LABEL: v_uitofp_i32_to_bf16: 32995; GFX7: ; %bb.0: 32996; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 32997; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 32998; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 32999; GFX7-NEXT: s_setpc_b64 s[30:31] 33000; 33001; GFX8-LABEL: v_uitofp_i32_to_bf16: 33002; GFX8: ; %bb.0: 33003; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33004; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 33005; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 33006; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 33007; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 33008; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0 33009; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33010; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 33011; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 33012; GFX8-NEXT: s_setpc_b64 s[30:31] 33013; 33014; GFX9-LABEL: v_uitofp_i32_to_bf16: 33015; GFX9: ; %bb.0: 33016; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33017; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 33018; GFX9-NEXT: s_movk_i32 s4, 0x7fff 33019; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 33020; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 33021; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 33022; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33023; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 33024; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 33025; GFX9-NEXT: s_setpc_b64 s[30:31] 33026; 33027; 
GFX10-LABEL: v_uitofp_i32_to_bf16: 33028; GFX10: ; %bb.0: 33029; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33030; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 33031; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 33032; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 33033; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33034; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 33035; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 33036; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 33037; GFX10-NEXT: s_setpc_b64 s[30:31] 33038; 33039; GFX11-LABEL: v_uitofp_i32_to_bf16: 33040; GFX11: ; %bb.0: 33041; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33042; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 33043; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 33044; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 33045; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 33046; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33047; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 33048; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 33049; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 33050; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 33051; GFX11-NEXT: s_setpc_b64 s[30:31] 33052 %op = uitofp i32 %x to bfloat 33053 ret bfloat %op 33054} 33055 33056define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { 33057; GCN-LABEL: v_uitofp_v2i32_to_v2bf16: 33058; GCN: ; %bb.0: 33059; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33060; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 33061; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 33062; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 33063; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 33064; GCN-NEXT: s_setpc_b64 s[30:31] 33065; 33066; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16: 33067; GFX7: ; %bb.0: 33068; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33069; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 33070; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 33071; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 33072; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 33073; 
GFX7-NEXT: s_setpc_b64 s[30:31] 33074; 33075; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16: 33076; GFX8: ; %bb.0: 33077; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33078; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 33079; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 33080; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 33081; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 33082; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 33083; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 33084; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33085; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 33086; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 33087; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 33088; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 33089; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 33090; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 33091; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 33092; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 33093; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 33094; GFX8-NEXT: s_setpc_b64 s[30:31] 33095; 33096; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16: 33097; GFX9: ; %bb.0: 33098; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33099; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 33100; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 33101; GFX9-NEXT: s_movk_i32 s4, 0x7fff 33102; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 33103; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 33104; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 33105; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33106; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 33107; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 33108; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 33109; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 33110; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 33111; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 33112; GFX9-NEXT: s_mov_b32 s4, 0x7060302 33113; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 33114; GFX9-NEXT: s_setpc_b64 s[30:31] 33115; 33116; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16: 33117; GFX10: ; %bb.0: 33118; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33119; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 33120; 
GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 33121; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 33122; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 33123; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 33124; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33125; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1 33126; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 33127; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 33128; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 33129; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33130; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo 33131; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 33132; GFX10-NEXT: s_setpc_b64 s[30:31] 33133; 33134; GFX11-LABEL: v_uitofp_v2i32_to_v2bf16: 33135; GFX11: ; %bb.0: 33136; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33137; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 33138; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 33139; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 33140; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 33141; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 33142; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 33143; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33144; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 33145; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 33146; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 33147; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 33148; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 33149; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33150; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo 33151; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 33152; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 33153; GFX11-NEXT: s_setpc_b64 s[30:31] 33154 %op = uitofp <2 x i32> %x to <2 x bfloat> 33155 ret <2 x bfloat> %op 33156} 33157 33158define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { 33159; GCN-LABEL: v_uitofp_v3i32_to_v3bf16: 33160; GCN: ; %bb.0: 33161; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33162; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 33163; 
GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 33164; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 33165; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 33166; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 33167; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 33168; GCN-NEXT: s_setpc_b64 s[30:31] 33169; 33170; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16: 33171; GFX7: ; %bb.0: 33172; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33173; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 33174; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 33175; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 33176; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 33177; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 33178; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 33179; GFX7-NEXT: s_setpc_b64 s[30:31] 33180; 33181; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16: 33182; GFX8: ; %bb.0: 33183; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33184; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 33185; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 33186; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 33187; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 33188; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 33189; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 33190; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2 33191; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 33192; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc 33193; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 33194; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 33195; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 33196; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 33197; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33198; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc 33199; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 33200; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 33201; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 33202; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 33203; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 33204; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 33205; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 33206; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 33207; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 33208; 
GFX8-NEXT: v_mov_b32_e32 v1, v2 33209; GFX8-NEXT: s_setpc_b64 s[30:31] 33210; 33211; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16: 33212; GFX9: ; %bb.0: 33213; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33214; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 33215; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 33216; GFX9-NEXT: s_movk_i32 s4, 0x7fff 33217; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 33218; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 33219; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 33220; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 33221; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 33222; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 33223; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 33224; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 33225; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 33226; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33227; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 33228; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 33229; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 33230; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 33231; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 33232; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 33233; GFX9-NEXT: s_mov_b32 s4, 0x7060302 33234; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 33235; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 33236; GFX9-NEXT: s_setpc_b64 s[30:31] 33237; 33238; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16: 33239; GFX10: ; %bb.0: 33240; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33241; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 33242; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 33243; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 33244; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 33245; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 33246; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0 33247; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33248; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 33249; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 33250; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1 33251; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff 33252; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 33253; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 33254; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v3, v7, vcc_lo 33255; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33256; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo 33257; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 33258; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 33259; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo 33260; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 33261; GFX10-NEXT: s_setpc_b64 s[30:31] 33262; 33263; GFX11TRUE16-LABEL: v_uitofp_v3i32_to_v3bf16: 33264; GFX11TRUE16: ; %bb.0: 33265; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33266; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 33267; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 33268; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2 33269; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 33270; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 33271; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 33272; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 33273; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33274; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 33275; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 33276; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 33277; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff 33278; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 33279; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 33280; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 33281; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33282; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo 33283; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 33284; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 33285; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 33286; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo 33287; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16 33288; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 33289; 33290; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16: 33291; GFX11FAKE16: ; %bb.0: 33292; GFX11FAKE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 33293; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0 33294; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1 33295; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2 33296; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 33297; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 33298; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1 33299; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 33300; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33301; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 33302; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 33303; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 33304; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff 33305; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 33306; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 33307; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 33308; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33309; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo 33310; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 33311; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 33312; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 33313; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo 33314; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16 33315; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 33316 %op = uitofp <3 x i32> %x to <3 x bfloat> 33317 ret <3 x bfloat> %op 33318} 33319 33320define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { 33321; GCN-LABEL: v_uitofp_v4i32_to_v4bf16: 33322; GCN: ; %bb.0: 33323; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33324; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 33325; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 33326; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 33327; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 33328; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 33329; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 33330; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 33331; GCN-NEXT: v_and_b32_e32 v3, 
0x7fff0000, v3 33332; GCN-NEXT: s_setpc_b64 s[30:31] 33333; 33334; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16: 33335; GFX7: ; %bb.0: 33336; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33337; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 33338; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 33339; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 33340; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3 33341; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 33342; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 33343; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 33344; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 33345; GFX7-NEXT: s_setpc_b64 s[30:31] 33346; 33347; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16: 33348; GFX8: ; %bb.0: 33349; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33350; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 33351; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 33352; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 33353; GFX8-NEXT: s_movk_i32 s4, 0x7fff 33354; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 33355; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 33356; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 33357; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 33358; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 33359; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc 33360; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 33361; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 33362; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 33363; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3 33364; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 33365; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc 33366; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 33367; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 33368; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 33369; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 33370; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 33371; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33372; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc 33373; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 33374; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 33375; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 33376; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 
33377; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 33378; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc 33379; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 33380; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 33381; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 33382; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 33383; GFX8-NEXT: s_setpc_b64 s[30:31] 33384; 33385; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16: 33386; GFX9: ; %bb.0: 33387; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33388; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 33389; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 33390; GFX9-NEXT: s_movk_i32 s4, 0x7fff 33391; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 33392; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 33393; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 33394; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 33395; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 33396; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc 33397; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 33398; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 33399; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 33400; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 33401; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 33402; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 33403; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 33404; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 33405; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 33406; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33407; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 33408; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 33409; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 33410; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 33411; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 33412; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc 33413; GFX9-NEXT: s_mov_b32 s4, 0x7060302 33414; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 33415; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 33416; GFX9-NEXT: s_setpc_b64 s[30:31] 33417; 33418; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16: 33419; GFX10: ; %bb.0: 33420; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33421; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 33422; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 33423; 
GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 33424; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 33425; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 33426; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 33427; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 33428; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 33429; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 33430; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 33431; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 33432; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 33433; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 33434; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 33435; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 33436; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33437; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff 33438; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 33439; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3 33440; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 33441; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33442; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo 33443; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 33444; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 33445; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo 33446; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 33447; GFX10-NEXT: s_setpc_b64 s[30:31] 33448; 33449; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16: 33450; GFX11: ; %bb.0: 33451; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33452; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 33453; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 33454; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 33455; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 33456; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 33457; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 33458; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 33459; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 33460; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 33461; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 33462; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff 33463; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 33464; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 33465; GFX11-NEXT: v_add3_u32 v7, v7, v0, 
; Review note for @v_uitofp_i64_to_bf16 (uitofp i64 -> bfloat). The per-target check lines
; are autogenerated (see the file header); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; Every target shown counts leading zeros of v1 (v_ffbh_u32 or v_clz_i32_u32), clamps the
; count to 32, shifts v[0:1] left by it, folds v0 into a single bit with min(1, v0) or'd into
; v1, converts with v_cvt_f32_u32 and rescales with v_ldexp_f32 by (32 - shift).
; The GCN and GFX7 checks then mask with 0xffff0000. GFX8 and later add bit 16 plus 0x7fff,
; select the 0x400000-or'd value when v_cmp_u_f32 reports the value unordered with itself,
; and shift right by 16 (presumably round-to-nearest-even with NaN quieting; confirm).
0x7fff 33466; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff 33467; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo 33468; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33469; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 33470; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 33471; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 33472; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33473; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) 33474; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff 33475; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo 33476; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 33477; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 33478; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 33479; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo 33480; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 33481; GFX11-NEXT: s_setpc_b64 s[30:31] 33482 %op = uitofp <4 x i32> %x to <4 x bfloat> 33483 ret <4 x bfloat> %op 33484} 33485 33486define bfloat @v_uitofp_i64_to_bf16(i64 %x) { 33487; GCN-LABEL: v_uitofp_i64_to_bf16: 33488; GCN: ; %bb.0: 33489; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33490; GCN-NEXT: v_ffbh_u32_e32 v2, v1 33491; GCN-NEXT: v_min_u32_e32 v2, 32, v2 33492; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 33493; GCN-NEXT: v_min_u32_e32 v0, 1, v0 33494; GCN-NEXT: v_or_b32_e32 v0, v1, v0 33495; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 33496; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 33497; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 33498; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 33499; GCN-NEXT: s_setpc_b64 s[30:31] 33500; 33501; GFX7-LABEL: v_uitofp_i64_to_bf16: 33502; GFX7: ; %bb.0: 33503; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33504; GFX7-NEXT: v_ffbh_u32_e32 v2, v1 33505; GFX7-NEXT: v_min_u32_e32 v2, 32, v2 33506; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 33507; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 33508; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 33509; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 33510; GFX7-NEXT: 
v_sub_i32_e32 v1, vcc, 32, v2 33511; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 33512; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 33513; GFX7-NEXT: s_setpc_b64 s[30:31] 33514; 33515; GFX8-LABEL: v_uitofp_i64_to_bf16: 33516; GFX8: ; %bb.0: 33517; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33518; GFX8-NEXT: v_ffbh_u32_e32 v2, v1 33519; GFX8-NEXT: v_min_u32_e32 v2, 32, v2 33520; GFX8-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 33521; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 33522; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 33523; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 33524; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 33525; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 33526; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 33527; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 33528; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 33529; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 33530; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33531; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 33532; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 33533; GFX8-NEXT: s_setpc_b64 s[30:31] 33534; 33535; GFX9-LABEL: v_uitofp_i64_to_bf16: 33536; GFX9: ; %bb.0: 33537; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33538; GFX9-NEXT: v_ffbh_u32_e32 v2, v1 33539; GFX9-NEXT: v_min_u32_e32 v2, 32, v2 33540; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 33541; GFX9-NEXT: s_movk_i32 s4, 0x7fff 33542; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 33543; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 33544; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 33545; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 33546; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 33547; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 33548; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 33549; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 33550; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33551; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 33552; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 33553; GFX9-NEXT: s_setpc_b64 s[30:31] 33554; 33555; GFX10-LABEL: v_uitofp_i64_to_bf16: 33556; GFX10: ; %bb.0: 33557; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33558; GFX10-NEXT: 
v_ffbh_u32_e32 v2, v1 33559; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 33560; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 33561; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 33562; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 33563; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 33564; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 33565; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 33566; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 33567; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 33568; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33569; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 33570; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 33571; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 33572; GFX10-NEXT: s_setpc_b64 s[30:31] 33573; 33574; GFX11-LABEL: v_uitofp_i64_to_bf16: 33575; GFX11: ; %bb.0: 33576; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33577; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 33578; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 33579; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 33580; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 33581; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 33582; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 33583; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 33584; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 33585; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 33586; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 33587; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 33588; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 33589; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 33590; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 33591; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33592; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 33593; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 33594; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 33595; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 33596; GFX11-NEXT: s_setpc_b64 s[30:31] 33597 %op = uitofp i64 %x to bfloat 33598 ret 
; Review note for @v_uitofp_v2i64_to_v2bf16 (uitofp <2 x i64> -> <2 x bfloat>). The check
; lines are autogenerated (see the file header); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; In the GCN and GFX7 checks the two results end up in v0 and v1, each masked with
; 0xffff0000. The GFX8 checks combine both halves into v0 with v_alignbit_b32; GFX9 and
; later combine them with v_perm_b32 using selector 0x7060302.
bfloat %op 33599} 33600 33601define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { 33602; GCN-LABEL: v_uitofp_v2i64_to_v2bf16: 33603; GCN: ; %bb.0: 33604; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33605; GCN-NEXT: v_ffbh_u32_e32 v4, v3 33606; GCN-NEXT: v_ffbh_u32_e32 v5, v1 33607; GCN-NEXT: v_min_u32_e32 v4, 32, v4 33608; GCN-NEXT: v_min_u32_e32 v5, 32, v5 33609; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 33610; GCN-NEXT: v_sub_i32_e32 v4, vcc, 32, v4 33611; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 33612; GCN-NEXT: v_sub_i32_e32 v5, vcc, 32, v5 33613; GCN-NEXT: v_min_u32_e32 v2, 1, v2 33614; GCN-NEXT: v_min_u32_e32 v0, 1, v0 33615; GCN-NEXT: v_or_b32_e32 v2, v3, v2 33616; GCN-NEXT: v_or_b32_e32 v0, v1, v0 33617; GCN-NEXT: v_cvt_f32_u32_e32 v1, v2 33618; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 33619; GCN-NEXT: v_ldexp_f32_e32 v1, v1, v4 33620; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v5 33621; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 33622; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 33623; GCN-NEXT: s_setpc_b64 s[30:31] 33624; 33625; GFX7-LABEL: v_uitofp_v2i64_to_v2bf16: 33626; GFX7: ; %bb.0: 33627; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33628; GFX7-NEXT: v_ffbh_u32_e32 v4, v3 33629; GFX7-NEXT: v_min_u32_e32 v4, 32, v4 33630; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 33631; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 32, v4 33632; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 33633; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 33634; GFX7-NEXT: v_ffbh_u32_e32 v3, v1 33635; GFX7-NEXT: v_min_u32_e32 v3, 32, v3 33636; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 33637; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 33638; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 33639; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 33640; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 33641; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v4 33642; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 33643; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 33644; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 33645; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 33646; GFX7-NEXT: 
s_setpc_b64 s[30:31] 33647; 33648; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16: 33649; GFX8: ; %bb.0: 33650; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33651; GFX8-NEXT: v_ffbh_u32_e32 v4, v1 33652; GFX8-NEXT: v_min_u32_e32 v4, 32, v4 33653; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 33654; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 33655; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 33656; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 33657; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4 33658; GFX8-NEXT: v_ldexp_f32 v4, v0, v1 33659; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1 33660; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 33661; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v0 33662; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 33663; GFX8-NEXT: v_min_u32_e32 v6, 32, v0 33664; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] 33665; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4 33666; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 33667; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 33668; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 33669; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 33670; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 33671; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 33672; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 33673; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 33674; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 33675; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 33676; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 33677; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33678; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 33679; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 33680; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 33681; GFX8-NEXT: s_setpc_b64 s[30:31] 33682; 33683; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16: 33684; GFX9: ; %bb.0: 33685; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33686; GFX9-NEXT: v_ffbh_u32_e32 v4, v1 33687; GFX9-NEXT: v_min_u32_e32 v4, 32, v4 33688; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 33689; GFX9-NEXT: s_movk_i32 s4, 0x7fff 33690; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 33691; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 33692; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 
33693; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 33694; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 33695; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 33696; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 33697; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 33698; GFX9-NEXT: v_min_u32_e32 v6, 32, v0 33699; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] 33700; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 33701; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 33702; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 33703; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 33704; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 33705; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 33706; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 33707; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 33708; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 33709; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 33710; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 33711; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33712; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 33713; GFX9-NEXT: s_mov_b32 s4, 0x7060302 33714; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 33715; GFX9-NEXT: s_setpc_b64 s[30:31] 33716; 33717; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16: 33718; GFX10: ; %bb.0: 33719; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33720; GFX10-NEXT: v_ffbh_u32_e32 v4, v1 33721; GFX10-NEXT: v_ffbh_u32_e32 v5, v3 33722; GFX10-NEXT: v_min_u32_e32 v4, 32, v4 33723; GFX10-NEXT: v_min_u32_e32 v5, 32, v5 33724; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 33725; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] 33726; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 33727; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 33728; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 33729; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 33730; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v4 33731; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v5 33732; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 33733; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 33734; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 33735; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 33736; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 33737; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 33738; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 
33739; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33740; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1 33741; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 33742; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 33743; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 33744; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33745; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo 33746; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 33747; GFX10-NEXT: s_setpc_b64 s[30:31] 33748; 33749; GFX11-LABEL: v_uitofp_v2i64_to_v2bf16: 33750; GFX11: ; %bb.0: 33751; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33752; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1 33753; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3 33754; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 33755; GFX11-NEXT: v_min_u32_e32 v4, 32, v4 33756; GFX11-NEXT: v_min_u32_e32 v5, 32, v5 33757; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 33758; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 33759; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] 33760; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 33761; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 33762; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 33763; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 33764; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 33765; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 33766; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v4 33767; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v5 33768; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 33769; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 33770; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 33771; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 33772; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 33773; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 33774; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 33775; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 33776; GFX11-NEXT: 
; Review note for @v_uitofp_v3i64_to_v3bf16 (uitofp <3 x i64> -> <3 x bfloat>). The check
; lines are autogenerated (see the file header); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; For this function the gfx1100 checks are split into separate GFX11TRUE16 and GFX11FAKE16
; bodies. In the lines shown they appear to differ only in the operand of the last
; s_delay_alu and in the first source of the final v_alignbit_b32 (v0 versus s0).
; The GFX9, GFX10 and GFX11FAKE16 bodies feed an SGPR (s4 or s0) into that final
; v_alignbit_b32; presumably only the low 16 bits of v1 carry the third element, so the
; upper half is a don't-care (confirm before relying on it).
v_bfe_u32 v3, v1, 16, 1 33777; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 33778; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33779; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 33780; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 33781; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff 33782; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 33783; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 33784; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33785; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo 33786; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 33787; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 33788; GFX11-NEXT: s_setpc_b64 s[30:31] 33789 %op = uitofp <2 x i64> %x to <2 x bfloat> 33790 ret <2 x bfloat> %op 33791} 33792 33793define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { 33794; GCN-LABEL: v_uitofp_v3i64_to_v3bf16: 33795; GCN: ; %bb.0: 33796; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33797; GCN-NEXT: v_ffbh_u32_e32 v6, v5 33798; GCN-NEXT: v_ffbh_u32_e32 v7, v3 33799; GCN-NEXT: v_ffbh_u32_e32 v8, v1 33800; GCN-NEXT: v_min_u32_e32 v6, 32, v6 33801; GCN-NEXT: v_min_u32_e32 v7, 32, v7 33802; GCN-NEXT: v_min_u32_e32 v8, 32, v8 33803; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v6 33804; GCN-NEXT: v_sub_i32_e32 v6, vcc, 32, v6 33805; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 33806; GCN-NEXT: v_sub_i32_e32 v7, vcc, 32, v7 33807; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 33808; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8 33809; GCN-NEXT: v_min_u32_e32 v4, 1, v4 33810; GCN-NEXT: v_min_u32_e32 v2, 1, v2 33811; GCN-NEXT: v_min_u32_e32 v0, 1, v0 33812; GCN-NEXT: v_or_b32_e32 v4, v5, v4 33813; GCN-NEXT: v_or_b32_e32 v2, v3, v2 33814; GCN-NEXT: v_or_b32_e32 v0, v1, v0 33815; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4 33816; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 33817; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 33818; GCN-NEXT: v_ldexp_f32_e32 v3, v1, v6 33819; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v7 33820; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v8 33821; GCN-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 33822; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 33823; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 33824; GCN-NEXT: s_setpc_b64 s[30:31] 33825; 33826; GFX7-LABEL: v_uitofp_v3i64_to_v3bf16: 33827; GFX7: ; %bb.0: 33828; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33829; GFX7-NEXT: v_ffbh_u32_e32 v6, v5 33830; GFX7-NEXT: v_min_u32_e32 v6, 32, v6 33831; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v6 33832; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 33833; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 33834; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 33835; GFX7-NEXT: v_ffbh_u32_e32 v6, v3 33836; GFX7-NEXT: v_min_u32_e32 v6, 32, v6 33837; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 33838; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4 33839; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 33840; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 33841; GFX7-NEXT: v_ffbh_u32_e32 v3, v1 33842; GFX7-NEXT: v_min_u32_e32 v3, 32, v3 33843; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 33844; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 33845; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 33846; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 33847; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 33848; GFX7-NEXT: v_ldexp_f32_e32 v4, v4, v5 33849; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v6 33850; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5 33851; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 33852; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 33853; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 33854; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 33855; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 33856; GFX7-NEXT: s_setpc_b64 s[30:31] 33857; 33858; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16: 33859; GFX8: ; %bb.0: 33860; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33861; GFX8-NEXT: v_ffbh_u32_e32 v6, v5 33862; GFX8-NEXT: v_min_u32_e32 v6, 32, v6 33863; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] 33864; GFX8-NEXT: v_ffbh_u32_e32 v7, v1 33865; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 33866; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 33867; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 33868; GFX8-NEXT: 
v_sub_u32_e32 v5, vcc, 32, v6 33869; GFX8-NEXT: v_min_u32_e32 v7, 32, v7 33870; GFX8-NEXT: v_ldexp_f32 v4, v4, v5 33871; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 33872; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] 33873; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 33874; GFX8-NEXT: s_movk_i32 s4, 0x7fff 33875; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 33876; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 33877; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 33878; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 33879; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 33880; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 33881; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 33882; GFX8-NEXT: v_ffbh_u32_e32 v5, v3 33883; GFX8-NEXT: v_min_u32_e32 v5, 32, v5 33884; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] 33885; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 33886; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7 33887; GFX8-NEXT: v_ldexp_f32 v0, v0, v4 33888; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 33889; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 33890; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 33891; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 33892; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 33893; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 33894; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0 33895; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33896; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc 33897; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5 33898; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 33899; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 33900; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 33901; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 33902; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 33903; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 33904; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc 33905; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 33906; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 33907; GFX8-NEXT: s_setpc_b64 s[30:31] 33908; 33909; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16: 33910; GFX9: ; %bb.0: 33911; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33912; GFX9-NEXT: v_ffbh_u32_e32 v6, v5 33913; GFX9-NEXT: 
v_min_u32_e32 v6, 32, v6 33914; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] 33915; GFX9-NEXT: s_movk_i32 s4, 0x7fff 33916; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 33917; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 33918; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 33919; GFX9-NEXT: v_ffbh_u32_e32 v6, v1 33920; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 33921; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] 33922; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 33923; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 33924; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 33925; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 33926; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 33927; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 33928; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 33929; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 33930; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 33931; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 33932; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc 33933; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 33934; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 33935; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 33936; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 33937; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 33938; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] 33939; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 33940; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 33941; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 33942; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 33943; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 33944; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 33945; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 33946; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 33947; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 33948; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 33949; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 33950; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 33951; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 33952; GFX9-NEXT: s_mov_b32 s4, 0x7060302 33953; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 33954; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 33955; GFX9-NEXT: s_setpc_b64 s[30:31] 33956; 33957; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16: 33958; GFX10: ; %bb.0: 33959; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 33960; GFX10-NEXT: v_ffbh_u32_e32 v6, v1 33961; GFX10-NEXT: v_ffbh_u32_e32 v8, v3 33962; GFX10-NEXT: v_ffbh_u32_e32 v7, v5 33963; GFX10-NEXT: v_min_u32_e32 v6, 32, v6 33964; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 33965; GFX10-NEXT: v_min_u32_e32 v7, 32, v7 33966; GFX10-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] 33967; GFX10-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] 33968; GFX10-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] 33969; GFX10-NEXT: v_sub_nc_u32_e32 v7, 32, v7 33970; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 33971; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 33972; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 33973; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 33974; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 33975; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v6 33976; GFX10-NEXT: v_or_b32_e32 v1, v5, v4 33977; GFX10-NEXT: v_sub_nc_u32_e32 v4, 32, v8 33978; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 33979; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 33980; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 33981; GFX10-NEXT: v_ldexp_f32 v0, v0, v3 33982; GFX10-NEXT: v_ldexp_f32 v2, v2, v4 33983; GFX10-NEXT: v_ldexp_f32 v1, v1, v7 33984; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 33985; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 33986; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0 33987; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 33988; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 33989; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 33990; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 33991; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 33992; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 33993; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 33994; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 33995; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 33996; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo 33997; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 33998; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 33999; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 34000; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 34001; GFX10-NEXT: s_setpc_b64 s[30:31] 34002; 34003; 
GFX11TRUE16-LABEL: v_uitofp_v3i64_to_v3bf16: 34004; GFX11TRUE16: ; %bb.0: 34005; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34006; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v6, v1 34007; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v7, v5 34008; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v8, v3 34009; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34010; GFX11TRUE16-NEXT: v_min_u32_e32 v6, 32, v6 34011; GFX11TRUE16-NEXT: v_min_u32_e32 v7, 32, v7 34012; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34013; GFX11TRUE16-NEXT: v_min_u32_e32 v8, 32, v8 34014; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] 34015; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34016; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] 34017; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] 34018; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v7, 32, v7 34019; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34020; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0 34021; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4 34022; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 34023; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2 34024; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 34025; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34026; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v4 34027; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 34028; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6 34029; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v8 34030; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 34031; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 34032; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2 34033; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34034; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 34035; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v7 34036; 
GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34037; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 34038; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 34039; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34040; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 34041; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 34042; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 34043; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 34044; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 34045; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 34046; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 34047; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 34048; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 34049; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 34050; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 34051; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) 34052; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo 34053; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 34054; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 34055; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 34056; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 34057; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 34058; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 34059; 34060; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16: 34061; GFX11FAKE16: ; %bb.0: 34062; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34063; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v6, v1 34064; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v7, v5 34065; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v8, v3 34066; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34067; GFX11FAKE16-NEXT: v_min_u32_e32 v6, 32, v6 34068; GFX11FAKE16-NEXT: v_min_u32_e32 v7, 32, v7 34069; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34070; GFX11FAKE16-NEXT: 
v_min_u32_e32 v8, 32, v8 34071; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] 34072; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34073; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] 34074; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] 34075; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v7, 32, v7 34076; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34077; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0 34078; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4 34079; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 34080; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2 34081; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0 34082; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34083; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4 34084; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 34085; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6 34086; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v8 34087; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0 34088; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1 34089; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2 34090; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34091; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3 34092; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v7 34093; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34094; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 34095; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 34096; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34097; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 34098; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 34099; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 34100; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 34101; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 34102; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 34103; 
GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 34104; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 34105; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 34106; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo 34107; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 34108; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) 34109; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo 34110; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 34111; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 34112; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 34113; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 34114; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 34115; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 34116 %op = uitofp <3 x i64> %x to <3 x bfloat> 34117 ret <3 x bfloat> %op 34118} 34119 34120define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { 34121; GCN-LABEL: v_uitofp_v4i64_to_v4bf16: 34122; GCN: ; %bb.0: 34123; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34124; GCN-NEXT: v_ffbh_u32_e32 v8, v7 34125; GCN-NEXT: v_ffbh_u32_e32 v9, v5 34126; GCN-NEXT: v_ffbh_u32_e32 v10, v3 34127; GCN-NEXT: v_ffbh_u32_e32 v11, v1 34128; GCN-NEXT: v_min_u32_e32 v8, 32, v8 34129; GCN-NEXT: v_min_u32_e32 v9, 32, v9 34130; GCN-NEXT: v_min_u32_e32 v10, 32, v10 34131; GCN-NEXT: v_min_u32_e32 v11, 32, v11 34132; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v8 34133; GCN-NEXT: v_sub_i32_e32 v8, vcc, 32, v8 34134; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v9 34135; GCN-NEXT: v_sub_i32_e32 v9, vcc, 32, v9 34136; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 34137; GCN-NEXT: v_sub_i32_e32 v10, vcc, 32, v10 34138; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11 34139; GCN-NEXT: v_sub_i32_e32 v11, vcc, 32, v11 34140; GCN-NEXT: v_min_u32_e32 v6, 1, v6 34141; GCN-NEXT: v_min_u32_e32 v4, 1, v4 34142; GCN-NEXT: v_min_u32_e32 v2, 1, v2 34143; GCN-NEXT: v_min_u32_e32 v0, 1, v0 34144; GCN-NEXT: v_or_b32_e32 v6, v7, v6 34145; GCN-NEXT: v_or_b32_e32 v4, 
v5, v4 34146; GCN-NEXT: v_or_b32_e32 v2, v3, v2 34147; GCN-NEXT: v_or_b32_e32 v0, v1, v0 34148; GCN-NEXT: v_cvt_f32_u32_e32 v1, v6 34149; GCN-NEXT: v_cvt_f32_u32_e32 v3, v4 34150; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 34151; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 34152; GCN-NEXT: v_ldexp_f32_e32 v4, v1, v8 34153; GCN-NEXT: v_ldexp_f32_e32 v3, v3, v9 34154; GCN-NEXT: v_ldexp_f32_e32 v1, v2, v10 34155; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v11 34156; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34157; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 34158; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 34159; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 34160; GCN-NEXT: s_setpc_b64 s[30:31] 34161; 34162; GFX7-LABEL: v_uitofp_v4i64_to_v4bf16: 34163; GFX7: ; %bb.0: 34164; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34165; GFX7-NEXT: v_ffbh_u32_e32 v8, v7 34166; GFX7-NEXT: v_min_u32_e32 v8, 32, v8 34167; GFX7-NEXT: v_lshl_b64 v[6:7], v[6:7], v8 34168; GFX7-NEXT: v_min_u32_e32 v6, 1, v6 34169; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 34170; GFX7-NEXT: v_cvt_f32_u32_e32 v6, v6 34171; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 32, v8 34172; GFX7-NEXT: v_ffbh_u32_e32 v8, v5 34173; GFX7-NEXT: v_ldexp_f32_e32 v6, v6, v7 34174; GFX7-NEXT: v_ffbh_u32_e32 v7, v3 34175; GFX7-NEXT: v_min_u32_e32 v7, 32, v7 34176; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 34177; GFX7-NEXT: v_min_u32_e32 v8, 32, v8 34178; GFX7-NEXT: v_min_u32_e32 v2, 1, v2 34179; GFX7-NEXT: v_lshl_b64 v[4:5], v[4:5], v8 34180; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 34181; GFX7-NEXT: v_ffbh_u32_e32 v3, v1 34182; GFX7-NEXT: v_min_u32_e32 v3, 32, v3 34183; GFX7-NEXT: v_min_u32_e32 v4, 1, v4 34184; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], v3 34185; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 34186; GFX7-NEXT: v_cvt_f32_u32_e32 v4, v4 34187; GFX7-NEXT: v_min_u32_e32 v0, 1, v0 34188; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 34189; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 34190; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 34191; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v8 34192; GFX7-NEXT: 
v_ldexp_f32_e32 v4, v4, v5 34193; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 32, v7 34194; GFX7-NEXT: v_ldexp_f32_e32 v1, v2, v5 34195; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 32, v3 34196; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 34197; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34198; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 34199; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 34200; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 34201; GFX7-NEXT: s_setpc_b64 s[30:31] 34202; 34203; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16: 34204; GFX8: ; %bb.0: 34205; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34206; GFX8-NEXT: v_ffbh_u32_e32 v8, v5 34207; GFX8-NEXT: v_min_u32_e32 v8, 32, v8 34208; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] 34209; GFX8-NEXT: s_movk_i32 s4, 0x7fff 34210; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 34211; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 34212; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 34213; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 32, v8 34214; GFX8-NEXT: v_ldexp_f32 v8, v4, v5 34215; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1 34216; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 34217; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4 34218; GFX8-NEXT: v_ffbh_u32_e32 v4, v7 34219; GFX8-NEXT: v_min_u32_e32 v10, 32, v4 34220; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] 34221; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8 34222; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 34223; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 34224; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 34225; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 34226; GFX8-NEXT: v_ffbh_u32_e32 v8, v1 34227; GFX8-NEXT: v_min_u32_e32 v8, 32, v8 34228; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 34229; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 34230; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 34231; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 34232; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 34233; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 34234; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 34235; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 34236; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 34237; GFX8-NEXT: v_add_u32_e32 
v6, vcc, s4, v6 34238; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 34239; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 34240; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc 34241; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 34242; GFX8-NEXT: v_ldexp_f32 v6, v0, v1 34243; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1 34244; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6 34245; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0 34246; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 34247; GFX8-NEXT: v_min_u32_e32 v8, 32, v0 34248; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] 34249; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6 34250; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 34251; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 34252; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 34253; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 34254; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc 34255; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8 34256; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 34257; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 34258; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 34259; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 34260; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 34261; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 34262; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 34263; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 34264; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 34265; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 34266; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 34267; GFX8-NEXT: s_setpc_b64 s[30:31] 34268; 34269; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16: 34270; GFX9: ; %bb.0: 34271; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34272; GFX9-NEXT: v_ffbh_u32_e32 v8, v5 34273; GFX9-NEXT: v_min_u32_e32 v8, 32, v8 34274; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] 34275; GFX9-NEXT: s_movk_i32 s4, 0x7fff 34276; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 34277; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 34278; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 34279; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 34280; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 34281; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 34282; GFX9-NEXT: v_add3_u32 v9, v4, 
v8, s4 34283; GFX9-NEXT: v_ffbh_u32_e32 v4, v7 34284; GFX9-NEXT: v_min_u32_e32 v10, 32, v4 34285; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] 34286; GFX9-NEXT: v_ffbh_u32_e32 v7, v1 34287; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 34288; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 34289; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 34290; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] 34291; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 34292; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 34293; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 34294; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 34295; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 34296; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 34297; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10 34298; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 34299; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 34300; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 34301; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 34302; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 34303; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 34304; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 34305; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc 34306; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 34307; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 34308; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 34309; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 34310; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 34311; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] 34312; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 34313; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 34314; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 34315; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 34316; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 34317; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc 34318; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 34319; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 34320; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 34321; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 34322; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 34323; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 34324; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 34325; GFX9-NEXT: s_mov_b32 s4, 0x7060302 34326; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 34327; GFX9-NEXT: 
v_perm_b32 v1, v4, v5, s4 34328; GFX9-NEXT: s_setpc_b64 s[30:31] 34329; 34330; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16: 34331; GFX10: ; %bb.0: 34332; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34333; GFX10-NEXT: v_ffbh_u32_e32 v8, v5 34334; GFX10-NEXT: v_ffbh_u32_e32 v10, v1 34335; GFX10-NEXT: v_ffbh_u32_e32 v11, v3 34336; GFX10-NEXT: v_ffbh_u32_e32 v9, v7 34337; GFX10-NEXT: v_min_u32_e32 v8, 32, v8 34338; GFX10-NEXT: v_min_u32_e32 v10, 32, v10 34339; GFX10-NEXT: v_min_u32_e32 v11, 32, v11 34340; GFX10-NEXT: v_min_u32_e32 v9, 32, v9 34341; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] 34342; GFX10-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] 34343; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] 34344; GFX10-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7] 34345; GFX10-NEXT: v_sub_nc_u32_e32 v8, 32, v8 34346; GFX10-NEXT: v_sub_nc_u32_e32 v9, 32, v9 34347; GFX10-NEXT: v_min_u32_e32 v4, 1, v4 34348; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 34349; GFX10-NEXT: v_min_u32_e32 v2, 1, v2 34350; GFX10-NEXT: v_min_u32_e32 v6, 1, v6 34351; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 34352; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 34353; GFX10-NEXT: v_or_b32_e32 v1, v3, v2 34354; GFX10-NEXT: v_sub_nc_u32_e32 v5, 32, v10 34355; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v11 34356; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v4 34357; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 34358; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 34359; GFX10-NEXT: v_or_b32_e32 v6, v7, v6 34360; GFX10-NEXT: v_ldexp_f32 v2, v2, v8 34361; GFX10-NEXT: v_ldexp_f32 v0, v0, v5 34362; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 34363; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v6 34364; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 34365; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 34366; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 34367; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 34368; GFX10-NEXT: v_ldexp_f32 v4, v4, v9 34369; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 34370; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1 34371; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 34372; GFX10-NEXT: v_add3_u32 v7, v7, 
v0, 0x7fff 34373; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 34374; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo 34375; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 34376; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff 34377; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v1 34378; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff 34379; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4 34380; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo 34381; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 34382; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo 34383; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 34384; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 34385; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 34386; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 34387; GFX10-NEXT: s_setpc_b64 s[30:31] 34388; 34389; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16: 34390; GFX11: ; %bb.0: 34391; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34392; GFX11-NEXT: v_clz_i32_u32_e32 v8, v5 34393; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 34394; GFX11-NEXT: v_clz_i32_u32_e32 v11, v3 34395; GFX11-NEXT: v_clz_i32_u32_e32 v9, v7 34396; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34397; GFX11-NEXT: v_min_u32_e32 v8, 32, v8 34398; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 34399; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34400; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 34401; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 34402; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34403; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] 34404; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] 34405; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34406; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] 34407; GFX11-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7] 34408; GFX11-NEXT: v_sub_nc_u32_e32 v8, 32, v8 34409; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 34410; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 34411; GFX11-NEXT: 
v_min_u32_e32 v0, 1, v0 34412; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 34413; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 34414; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34415; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 34416; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 34417; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 34418; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 34419; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v10 34420; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v11 34421; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v4 34422; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 34423; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 34424; GFX11-NEXT: v_or_b32_e32 v6, v7, v6 34425; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34426; GFX11-NEXT: v_ldexp_f32 v2, v2, v8 34427; GFX11-NEXT: v_ldexp_f32 v0, v0, v5 34428; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34429; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 34430; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v6 34431; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 34432; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 34433; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 34434; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 34435; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 34436; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 34437; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 34438; GFX11-NEXT: v_ldexp_f32 v4, v4, v9 34439; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 34440; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 34441; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff 34442; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo 34443; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 34444; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 34445; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v1 34446; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 34447; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo 34448; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 34449; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff 34450; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo 34451; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v4, v4 34452; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) 34453; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 34454; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo 34455; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 34456; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 34457; GFX11-NEXT: s_setpc_b64 s[30:31] 34458 %op = uitofp <4 x i64> %x to <4 x bfloat> 34459 ret <4 x bfloat> %op 34460} 34461
; v_select_bf16: scalar bfloat select. Returns %a when the i1 %cond is true,
; otherwise %b. The per-target assertions that follow are autogenerated by
; utils/update_llc_test_checks.py (see the file header); regenerate them
; instead of editing by hand.
34462define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { 34463; GCN-LABEL: v_select_bf16: 34464; GCN: ; %bb.0: 34465; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34466; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 34467; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 34468; GCN-NEXT: v_and_b32_e32 v0, 1, v0 34469; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34470; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34471; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34472; GCN-NEXT: s_setpc_b64 s[30:31] 34473; 34474; GFX7-LABEL: v_select_bf16: 34475; GFX7: ; %bb.0: 34476; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34477; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 34478; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 34479; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 34480; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34481; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34482; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34483; GFX7-NEXT: s_setpc_b64 s[30:31] 34484; 34485; GFX8-LABEL: v_select_bf16: 34486; GFX8: ; %bb.0: 34487; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34488; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 34489; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34490; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34491; GFX8-NEXT: s_setpc_b64 s[30:31] 34492; 34493; GFX9-LABEL: v_select_bf16: 34494; GFX9: ; %bb.0: 34495; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34496; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 34497; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34498; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34499; GFX9-NEXT: s_setpc_b64 s[30:31] 34500; 
34501; GFX10-LABEL: v_select_bf16: 34502; GFX10: ; %bb.0: 34503; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34504; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 34505; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34506; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo 34507; GFX10-NEXT: s_setpc_b64 s[30:31] 34508; 34509; GFX11TRUE16-LABEL: v_select_bf16: 34510; GFX11TRUE16: ; %bb.0: 34511; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34512; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0 34513; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l 34514; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l 34515; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 34516; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 34517; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo 34518; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 34519; 34520; GFX11FAKE16-LABEL: v_select_bf16: 34521; GFX11FAKE16: ; %bb.0: 34522; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34523; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 34524; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 34525; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34526; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo 34527; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 34528 %op = select i1 %cond, bfloat %a, bfloat %b 34529 ret bfloat %op 34530} 34531
; v_select_fneg_lhs_bf16: scalar bfloat select whose true operand is negated.
; Returns `fneg %a` when the i1 %cond is true, otherwise the unmodified %b.
; The per-target assertions are autogenerated (see the file header).
34532define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { 34533; GCN-LABEL: v_select_fneg_lhs_bf16: 34534; GCN: ; %bb.0: 34535; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34536; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 34537; GCN-NEXT: v_and_b32_e32 v0, 1, v0 34538; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 34539; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34540; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34541; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34542; GCN-NEXT: s_setpc_b64 s[30:31] 34543; 34544; GFX7-LABEL: v_select_fneg_lhs_bf16: 34545; GFX7: ; %bb.0: 34546; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
34547; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 34548; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 34549; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1 34550; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34551; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34552; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34553; GFX7-NEXT: s_setpc_b64 s[30:31] 34554; 34555; GFX8-LABEL: v_select_fneg_lhs_bf16: 34556; GFX8: ; %bb.0: 34557; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34558; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 34559; GFX8-NEXT: v_xor_b32_e32 v1, 0x8000, v1 34560; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34561; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34562; GFX8-NEXT: s_setpc_b64 s[30:31] 34563; 34564; GFX9-LABEL: v_select_fneg_lhs_bf16: 34565; GFX9: ; %bb.0: 34566; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34567; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 34568; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1 34569; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34570; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34571; GFX9-NEXT: s_setpc_b64 s[30:31] 34572; 34573; GFX10-LABEL: v_select_fneg_lhs_bf16: 34574; GFX10: ; %bb.0: 34575; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34576; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 34577; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1 34578; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34579; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo 34580; GFX10-NEXT: s_setpc_b64 s[30:31] 34581; 34582; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16: 34583; GFX11TRUE16: ; %bb.0: 34584; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34585; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0 34586; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l 34587; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l 34588; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34589; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 34590; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l 34591; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 34592; GFX11TRUE16-NEXT: 
v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo 34593; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 34594; 34595; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16: 34596; GFX11FAKE16: ; %bb.0: 34597; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34598; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 34599; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 34600; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 34601; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34602; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo 34603; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 34604 %neg.a = fneg bfloat %a 34605 %op = select i1 %cond, bfloat %neg.a, bfloat %b 34606 ret bfloat %op 34607} 34608
; v_select_fneg_rhs_bf16: scalar bfloat select whose false operand is negated.
; Returns the unmodified %a when the i1 %cond is true, otherwise `fneg %b`.
; The per-target assertions are autogenerated (see the file header).
34609define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { 34610; GCN-LABEL: v_select_fneg_rhs_bf16: 34611; GCN: ; %bb.0: 34612; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34613; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 34614; GCN-NEXT: v_and_b32_e32 v0, 1, v0 34615; GCN-NEXT: v_mul_f32_e32 v2, -1.0, v2 34616; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34617; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34618; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34619; GCN-NEXT: s_setpc_b64 s[30:31] 34620; 34621; GFX7-LABEL: v_select_fneg_rhs_bf16: 34622; GFX7: ; %bb.0: 34623; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34624; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 34625; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 34626; GFX7-NEXT: v_mul_f32_e32 v2, -1.0, v2 34627; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34628; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34629; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34630; GFX7-NEXT: s_setpc_b64 s[30:31] 34631; 34632; GFX8-LABEL: v_select_fneg_rhs_bf16: 34633; GFX8: ; %bb.0: 34634; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34635; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 34636; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2 34637; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34638; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34639; 
GFX8-NEXT: s_setpc_b64 s[30:31] 34640; 34641; GFX9-LABEL: v_select_fneg_rhs_bf16: 34642; GFX9: ; %bb.0: 34643; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34644; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 34645; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2 34646; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34647; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34648; GFX9-NEXT: s_setpc_b64 s[30:31] 34649; 34650; GFX10-LABEL: v_select_fneg_rhs_bf16: 34651; GFX10: ; %bb.0: 34652; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34653; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 34654; GFX10-NEXT: v_xor_b32_e32 v2, 0x8000, v2 34655; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34656; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo 34657; GFX10-NEXT: s_setpc_b64 s[30:31] 34658; 34659; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16: 34660; GFX11TRUE16: ; %bb.0: 34661; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34662; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0 34663; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l 34664; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l 34665; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 34666; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 34667; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l 34668; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 34669; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo 34670; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 34671; 34672; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16: 34673; GFX11FAKE16: ; %bb.0: 34674; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34675; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 34676; GFX11FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 34677; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 34678; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34679; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo 34680; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 34681 %neg.b = fneg bfloat %b 34682 %op = select i1 
%cond, bfloat %a, bfloat %neg.b 34683 ret bfloat %op 34684} 34685
; v_select_v2bf16: whole-vector select. A single scalar i1 %cond picks either
; the entire <2 x bfloat> %a (true) or the entire <2 x bfloat> %b (false).
; The per-target assertions are autogenerated (see the file header).
34686define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) { 34687; GCN-LABEL: v_select_v2bf16: 34688; GCN: ; %bb.0: 34689; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34690; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 34691; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 34692; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 34693; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 34694; GCN-NEXT: v_and_b32_e32 v0, 1, v0 34695; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 34696; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 34697; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 34698; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 34699; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34700; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 34701; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 34702; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 34703; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 34704; GCN-NEXT: s_setpc_b64 s[30:31] 34705; 34706; GFX7-LABEL: v_select_v2bf16: 34707; GFX7: ; %bb.0: 34708; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34709; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 34710; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 34711; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 34712; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 34713; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 34714; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 34715; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 34716; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 34717; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 34718; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34719; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 34720; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 34721; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 34722; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 34723; GFX7-NEXT: s_setpc_b64 s[30:31] 34724; 34725; GFX8-LABEL: v_select_v2bf16: 34726; GFX8: ; %bb.0: 34727; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34728; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 34729; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34730; GFX8-NEXT: v_cndmask_b32_e32 
v0, v2, v1, vcc 34731; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 34732; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 34733; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 34734; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 34735; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 34736; GFX8-NEXT: s_setpc_b64 s[30:31] 34737; 34738; GFX9-LABEL: v_select_v2bf16: 34739; GFX9: ; %bb.0: 34740; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34741; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 34742; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34743; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34744; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 34745; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 34746; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 34747; GFX9-NEXT: s_mov_b32 s4, 0x5040100 34748; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 34749; GFX9-NEXT: s_setpc_b64 s[30:31] 34750; 34751; GFX10-LABEL: v_select_v2bf16: 34752; GFX10: ; %bb.0: 34753; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34754; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 34755; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1 34756; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 34757; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34758; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo 34759; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo 34760; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 34761; GFX10-NEXT: s_setpc_b64 s[30:31] 34762; 34763; GFX11TRUE16-LABEL: v_select_v2bf16: 34764; GFX11TRUE16: ; %bb.0: 34765; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34766; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 34767; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 34768; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 34769; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 34770; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34771; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v3.l, vcc_lo 34772; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo 34773; 
GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 34774; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l 34775; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h 34776; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 34777; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 34778; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 34779; 34780; GFX11FAKE16-LABEL: v_select_v2bf16: 34781; GFX11FAKE16: ; %bb.0: 34782; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34783; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 34784; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 34785; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 34786; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 34787; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34788; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3 34789; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 34790; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 34791; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 34792 %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b 34793 ret <2 x bfloat> %op 34794} 34795
; v_vselect_v2bf16: element-wise vector select. Each lane of the <2 x i1> %cond
; independently picks the matching lane of %a (true) or %b (false), unlike the
; scalar-condition form in v_select_v2bf16.
; The per-target assertions are autogenerated (see the file header).
34796define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) { 34797; GCN-LABEL: v_vselect_v2bf16: 34798; GCN: ; %bb.0: 34799; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34800; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 34801; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 34802; GCN-NEXT: v_and_b32_e32 v0, 1, v0 34803; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 34804; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 34805; GCN-NEXT: v_and_b32_e32 v1, 1, v1 34806; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 34807; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 34808; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34809; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 34810; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34811; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 34812; GCN-NEXT: s_setpc_b64 s[30:31] 34813; 34814; GFX7-LABEL: v_vselect_v2bf16: 34815; 
GFX7: ; %bb.0: 34816; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34817; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 34818; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 34819; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 34820; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 34821; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 34822; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 34823; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 34824; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 34825; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34826; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 34827; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 34828; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 34829; GFX7-NEXT: s_setpc_b64 s[30:31] 34830; 34831; GFX8-LABEL: v_vselect_v2bf16: 34832; GFX8: ; %bb.0: 34833; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34834; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 34835; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 34836; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34837; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 34838; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 34839; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 34840; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 34841; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 34842; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 34843; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 34844; GFX8-NEXT: s_setpc_b64 s[30:31] 34845; 34846; GFX9-LABEL: v_vselect_v2bf16: 34847; GFX9: ; %bb.0: 34848; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34849; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 34850; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 34851; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 34852; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 34853; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 34854; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 34855; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 34856; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 34857; GFX9-NEXT: s_mov_b32 s4, 0x5040100 34858; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 34859; GFX9-NEXT: s_setpc_b64 s[30:31] 34860; 
34861; GFX10-LABEL: v_vselect_v2bf16: 34862; GFX10: ; %bb.0: 34863; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34864; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 34865; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 34866; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 34867; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3 34868; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34869; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo 34870; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 34871; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo 34872; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 34873; GFX10-NEXT: s_setpc_b64 s[30:31] 34874; 34875; GFX11TRUE16-LABEL: v_vselect_v2bf16: 34876; GFX11TRUE16: ; %bb.0: 34877; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34878; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 34879; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 34880; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 34881; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 34882; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 34883; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 34884; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 34885; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 34886; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v4.l, vcc_lo 34887; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v2.l, s0 34888; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 34889; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l 34890; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h 34891; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 34892; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 34893; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 34894; 34895; GFX11FAKE16-LABEL: v_vselect_v2bf16: 34896; GFX11FAKE16: ; %bb.0: 34897; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 34898; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 34899; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 34900; 
GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 34901; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) 34902; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 34903; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1 34904; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 34905; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 34906; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo 34907; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 34908; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 34909 %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b 34910 ret <2 x bfloat> %op 34911} 34912
; s_select_bf16: amdgpu_ps function taking both bfloat operands as `inreg`
; arguments. It selects %a when the i32 %c equals 0 and %b otherwise, bitcasts
; the chosen bfloat to i16, zero-extends it to i32, and returns the result of
; llvm.amdgcn.readfirstlane on that value.
; The per-target assertions are autogenerated (see the file header).
34913define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { 34914; GCN-LABEL: s_select_bf16: 34915; GCN: ; %bb.0: 34916; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 34917; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 34918; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 34919; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34920; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 34921; GCN-NEXT: v_readfirstlane_b32 s0, v0 34922; GCN-NEXT: ; return to shader part epilog 34923; 34924; GFX7-LABEL: s_select_bf16: 34925; GFX7: ; %bb.0: 34926; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 34927; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 34928; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 34929; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 34930; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 34931; GFX7-NEXT: v_readfirstlane_b32 s0, v0 34932; GFX7-NEXT: ; return to shader part epilog 34933; 34934; GFX8-LABEL: s_select_bf16: 34935; GFX8: ; %bb.0: 34936; GFX8-NEXT: v_mov_b32_e32 v1, s1 34937; GFX8-NEXT: v_mov_b32_e32 v2, s0 34938; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 34939; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 34940; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 34941; GFX8-NEXT: v_readfirstlane_b32 s0, v0 34942; GFX8-NEXT: ; return to shader part epilog 34943; 34944; GFX9-LABEL: s_select_bf16: 34945; GFX9: ; 
%bb.0: 34946; GFX9-NEXT: v_mov_b32_e32 v1, s1 34947; GFX9-NEXT: v_mov_b32_e32 v2, s0 34948; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 34949; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 34950; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 34951; GFX9-NEXT: v_readfirstlane_b32 s0, v0 34952; GFX9-NEXT: ; return to shader part epilog 34953; 34954; GFX10-LABEL: s_select_bf16: 34955; GFX10: ; %bb.0: 34956; GFX10-NEXT: v_mov_b32_e32 v1, s0 34957; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 34958; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo 34959; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 34960; GFX10-NEXT: v_readfirstlane_b32 s0, v0 34961; GFX10-NEXT: ; return to shader part epilog 34962; 34963; GFX11TRUE16-LABEL: s_select_bf16: 34964; GFX11TRUE16: ; %bb.0: 34965; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 34966; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 34967; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 34968; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo 34969; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 34970; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 34971; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 34972; GFX11TRUE16-NEXT: ; return to shader part epilog 34973; 34974; GFX11FAKE16-LABEL: s_select_bf16: 34975; GFX11FAKE16: ; %bb.0: 34976; GFX11FAKE16-NEXT: v_mov_b32_e32 v1, s0 34977; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 34978; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 34979; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo 34980; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 34981; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 34982; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 34983; GFX11FAKE16-NEXT: ; return to shader part epilog 34984 %cond = icmp eq i32 %c, 0 34985 %op = select i1 %cond, bfloat %a, bfloat %b 34986 %cast = bitcast bfloat %op to i16 34987 %zext = zext i16 %cast to i32 34988 %readlane = call i32 
@llvm.amdgcn.readfirstlane(i32 %zext) 34989 ret i32 %readlane 34990} 34991 34992define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) { 34993; GCN-LABEL: s_select_v2bf16: 34994; GCN: ; %bb.0: 34995; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 34996; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3 34997; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 34998; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s2 34999; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 35000; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35001; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 35002; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35003; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 35004; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc 35005; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35006; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 35007; GCN-NEXT: v_or_b32_e32 v0, v0, v1 35008; GCN-NEXT: v_readfirstlane_b32 s0, v0 35009; GCN-NEXT: ; return to shader part epilog 35010; 35011; GFX7-LABEL: s_select_v2bf16: 35012; GFX7: ; %bb.0: 35013; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 35014; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 35015; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 35016; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35017; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 35018; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2 35019; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 35020; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 35021; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35022; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35023; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc 35024; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 35025; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 35026; GFX7-NEXT: v_readfirstlane_b32 s0, v0 35027; GFX7-NEXT: ; return to shader part epilog 35028; 35029; GFX8-LABEL: s_select_v2bf16: 35030; GFX8: ; %bb.0: 35031; GFX8-NEXT: s_lshr_b32 s2, s0, 16 35032; GFX8-NEXT: s_lshr_b32 s3, s1, 16 35033; GFX8-NEXT: v_mov_b32_e32 v1, s3 35034; GFX8-NEXT: v_mov_b32_e32 v2, s2 35035; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 35036; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 
35037; GFX8-NEXT: v_mov_b32_e32 v1, s1 35038; GFX8-NEXT: v_mov_b32_e32 v2, s0 35039; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 35040; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 35041; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 35042; GFX8-NEXT: v_readfirstlane_b32 s0, v0 35043; GFX8-NEXT: ; return to shader part epilog 35044; 35045; GFX9-LABEL: s_select_v2bf16: 35046; GFX9: ; %bb.0: 35047; GFX9-NEXT: s_lshr_b32 s2, s0, 16 35048; GFX9-NEXT: s_lshr_b32 s3, s1, 16 35049; GFX9-NEXT: v_mov_b32_e32 v1, s3 35050; GFX9-NEXT: v_mov_b32_e32 v2, s2 35051; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 35052; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 35053; GFX9-NEXT: v_mov_b32_e32 v1, s1 35054; GFX9-NEXT: v_mov_b32_e32 v2, s0 35055; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 35056; GFX9-NEXT: s_mov_b32 s0, 0x5040100 35057; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0 35058; GFX9-NEXT: v_readfirstlane_b32 s0, v0 35059; GFX9-NEXT: ; return to shader part epilog 35060; 35061; GFX10-LABEL: s_select_v2bf16: 35062; GFX10: ; %bb.0: 35063; GFX10-NEXT: s_lshr_b32 s2, s0, 16 35064; GFX10-NEXT: v_mov_b32_e32 v2, s0 35065; GFX10-NEXT: v_mov_b32_e32 v1, s2 35066; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 35067; GFX10-NEXT: s_lshr_b32 s3, s1, 16 35068; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo 35069; GFX10-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo 35070; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 35071; GFX10-NEXT: v_readfirstlane_b32 s0, v0 35072; GFX10-NEXT: ; return to shader part epilog 35073; 35074; GFX11TRUE16-LABEL: s_select_v2bf16: 35075; GFX11TRUE16: ; %bb.0: 35076; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16 35077; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 35078; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 35079; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 35080; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 35081; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 35082; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 35083; GFX11TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 35084; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo 35085; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo 35086; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 35087; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l 35088; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h 35089; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 35090; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 35091; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 35092; GFX11TRUE16-NEXT: ; return to shader part epilog 35093; 35094; GFX11FAKE16-LABEL: s_select_v2bf16: 35095; GFX11FAKE16: ; %bb.0: 35096; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16 35097; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 35098; GFX11FAKE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0 35099; GFX11FAKE16-NEXT: s_lshr_b32 s3, s1, 16 35100; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 35101; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo 35102; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 35103; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo 35104; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 35105; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 35106; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 35107; GFX11FAKE16-NEXT: ; return to shader part epilog 35108 %cond = icmp eq i32 %c, 0 35109 %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b 35110 %cast = bitcast <2 x bfloat> %op to i32 35111 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast) 35112 ret i32 %readlane 35113} 35114 35115define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) { 35116; GCN-LABEL: s_vselect_v2bf16: 35117; GCN: ; %bb.0: 35118; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 35119; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2 35120; GCN-NEXT: 
v_mul_f32_e64 v4, 1.0, s1 35121; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3 35122; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 35123; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc 35124; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 35125; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 35126; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35127; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 35128; GCN-NEXT: v_or_b32_e32 v0, v0, v1 35129; GCN-NEXT: v_readfirstlane_b32 s0, v0 35130; GCN-NEXT: ; return to shader part epilog 35131; 35132; GFX7-LABEL: s_vselect_v2bf16: 35133; GFX7: ; %bb.0: 35134; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 35135; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s3 35136; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 35137; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 35138; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 35139; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc 35140; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 35141; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 35142; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35143; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 35144; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 35145; GFX7-NEXT: v_readfirstlane_b32 s0, v0 35146; GFX7-NEXT: ; return to shader part epilog 35147; 35148; GFX8-LABEL: s_vselect_v2bf16: 35149; GFX8: ; %bb.0: 35150; GFX8-NEXT: s_lshr_b32 s2, s0, 16 35151; GFX8-NEXT: s_lshr_b32 s3, s1, 16 35152; GFX8-NEXT: v_mov_b32_e32 v2, s3 35153; GFX8-NEXT: v_mov_b32_e32 v3, s2 35154; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 35155; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 35156; GFX8-NEXT: v_mov_b32_e32 v2, s1 35157; GFX8-NEXT: v_mov_b32_e32 v3, s0 35158; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 35159; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 35160; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 35161; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 35162; GFX8-NEXT: v_readfirstlane_b32 s0, v0 35163; GFX8-NEXT: ; return to shader part epilog 35164; 35165; GFX9-LABEL: s_vselect_v2bf16: 35166; GFX9: ; %bb.0: 35167; GFX9-NEXT: 
s_lshr_b32 s2, s0, 16 35168; GFX9-NEXT: s_lshr_b32 s3, s1, 16 35169; GFX9-NEXT: v_mov_b32_e32 v2, s3 35170; GFX9-NEXT: v_mov_b32_e32 v3, s2 35171; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 35172; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc 35173; GFX9-NEXT: v_mov_b32_e32 v2, s1 35174; GFX9-NEXT: v_mov_b32_e32 v3, s0 35175; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 35176; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 35177; GFX9-NEXT: s_mov_b32 s0, 0x5040100 35178; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0 35179; GFX9-NEXT: v_readfirstlane_b32 s0, v0 35180; GFX9-NEXT: ; return to shader part epilog 35181; 35182; GFX10-LABEL: s_vselect_v2bf16: 35183; GFX10: ; %bb.0: 35184; GFX10-NEXT: s_lshr_b32 s2, s0, 16 35185; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 35186; GFX10-NEXT: v_mov_b32_e32 v2, s2 35187; GFX10-NEXT: v_mov_b32_e32 v3, s0 35188; GFX10-NEXT: s_lshr_b32 s0, s1, 16 35189; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo 35190; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 35191; GFX10-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo 35192; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 35193; GFX10-NEXT: v_readfirstlane_b32 s0, v0 35194; GFX10-NEXT: ; return to shader part epilog 35195; 35196; GFX11TRUE16-LABEL: s_vselect_v2bf16: 35197; GFX11TRUE16: ; %bb.0: 35198; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 35199; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16 35200; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 35201; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 35202; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 35203; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4 35204; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 35205; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 35206; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 35207; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s2 35208; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo 35209; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 35210; GFX11TRUE16-NEXT: 
v_mov_b16_e32 v1.l, v0.l 35211; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h 35212; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 35213; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 35214; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 35215; GFX11TRUE16-NEXT: ; return to shader part epilog 35216; 35217; GFX11FAKE16-LABEL: s_vselect_v2bf16: 35218; GFX11FAKE16: ; %bb.0: 35219; GFX11FAKE16-NEXT: s_lshr_b32 s2, s0, 16 35220; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 35221; GFX11FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0 35222; GFX11FAKE16-NEXT: s_lshr_b32 s0, s1, 16 35223; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 35224; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo 35225; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 35226; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 35227; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo 35228; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 35229; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 35230; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 35231; GFX11FAKE16-NEXT: ; return to shader part epilog 35232 %cond = icmp eq <2 x i32> %c, zeroinitializer 35233 %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b 35234 %cast = bitcast <2 x bfloat> %op to i32 35235 %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast) 35236 ret i32 %readlane 35237} 35238 35239define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) { 35240; GCN-LABEL: v_select_v3bf16: 35241; GCN: ; %bb.0: 35242; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35243; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 35244; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 35245; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 35246; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 35247; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 35248; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 35249; GCN-NEXT: v_and_b32_e32 v0, 1, v0 35250; 
GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 35251; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35252; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35253; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 35254; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 35255; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 35256; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35257; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35258; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc 35259; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 35260; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35261; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35262; GCN-NEXT: s_setpc_b64 s[30:31] 35263; 35264; GFX7-LABEL: v_select_v3bf16: 35265; GFX7: ; %bb.0: 35266; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35267; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 35268; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35269; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 35270; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 35271; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 35272; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 35273; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 35274; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35275; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 35276; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 35277; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 35278; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35279; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 35280; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35281; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35282; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc 35283; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 35284; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35285; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35286; GFX7-NEXT: s_setpc_b64 s[30:31] 35287; 35288; GFX8-LABEL: v_select_v3bf16: 35289; GFX8: ; %bb.0: 35290; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35291; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 35292; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35293; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 35294; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc 35295; GFX8-NEXT: s_setpc_b64 s[30:31] 
35296; 35297; GFX9-LABEL: v_select_v3bf16: 35298; GFX9: ; %bb.0: 35299; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35300; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 35301; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35302; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 35303; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc 35304; GFX9-NEXT: s_setpc_b64 s[30:31] 35305; 35306; GFX10-LABEL: v_select_v3bf16: 35307; GFX10: ; %bb.0: 35308; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35309; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 35310; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35311; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo 35312; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo 35313; GFX10-NEXT: s_setpc_b64 s[30:31] 35314; 35315; GFX11-LABEL: v_select_v3bf16: 35316; GFX11: ; %bb.0: 35317; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35318; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 35319; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 35320; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35321; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 35322; GFX11-NEXT: s_setpc_b64 s[30:31] 35323 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b 35324 ret <3 x bfloat> %op 35325} 35326 35327define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) { 35328; GCN-LABEL: v_select_v4bf16: 35329; GCN: ; %bb.0: 35330; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35331; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 35332; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 35333; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 35334; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 35335; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 35336; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 35337; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 35338; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 35339; GCN-NEXT: v_and_b32_e32 v0, 1, v0 35340; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35341; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35342; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35343; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 35344; 
GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 35345; GCN-NEXT: v_alignbit_b32 v2, v6, v5, 16 35346; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 35347; GCN-NEXT: v_alignbit_b32 v4, v8, v7, 16 35348; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35349; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 35350; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35351; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35352; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35353; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 35354; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 35355; GCN-NEXT: s_setpc_b64 s[30:31] 35356; 35357; GFX7-LABEL: v_select_v4bf16: 35358; GFX7: ; %bb.0: 35359; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35360; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 35361; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35362; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 35363; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 35364; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 35365; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 35366; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35367; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 35368; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35369; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 35370; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 35371; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8 35372; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 35373; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35374; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 35375; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 35376; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 35377; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35378; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 35379; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35380; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35381; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35382; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 35383; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 35384; GFX7-NEXT: s_setpc_b64 s[30:31] 35385; 35386; GFX8-LABEL: v_select_v4bf16: 35387; GFX8: ; %bb.0: 35388; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35389; GFX8-NEXT: v_and_b32_e32 v0, 
1, v0 35390; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35391; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 35392; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc 35393; GFX8-NEXT: s_setpc_b64 s[30:31] 35394; 35395; GFX9-LABEL: v_select_v4bf16: 35396; GFX9: ; %bb.0: 35397; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35398; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 35399; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35400; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 35401; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc 35402; GFX9-NEXT: s_setpc_b64 s[30:31] 35403; 35404; GFX10-LABEL: v_select_v4bf16: 35405; GFX10: ; %bb.0: 35406; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35407; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 35408; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35409; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo 35410; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc_lo 35411; GFX10-NEXT: s_setpc_b64 s[30:31] 35412; 35413; GFX11-LABEL: v_select_v4bf16: 35414; GFX11: ; %bb.0: 35415; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35416; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 35417; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 35418; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35419; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 35420; GFX11-NEXT: s_setpc_b64 s[30:31] 35421 %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b 35422 ret <4 x bfloat> %op 35423} 35424 35425define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) { 35426; GCN-LABEL: v_select_v6bf16: 35427; GCN: ; %bb.0: 35428; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35429; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 35430; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 35431; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 35432; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 35433; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 35434; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 35435; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 35436; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 35437; GCN-NEXT: v_mul_f32_e32 v6, 
1.0, v6 35438; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 35439; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 35440; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 35441; GCN-NEXT: v_and_b32_e32 v0, 1, v0 35442; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35443; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 35444; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35445; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 35446; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35447; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 35448; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 35449; GCN-NEXT: v_alignbit_b32 v2, v8, v7, 16 35450; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 35451; GCN-NEXT: v_alignbit_b32 v4, v10, v9, 16 35452; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 35453; GCN-NEXT: v_alignbit_b32 v6, v12, v11, 16 35454; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35455; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 35456; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 35457; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35458; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35459; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35460; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 35461; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 35462; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 35463; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 35464; GCN-NEXT: s_setpc_b64 s[30:31] 35465; 35466; GFX7-LABEL: v_select_v6bf16: 35467; GFX7: ; %bb.0: 35468; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35469; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 35470; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35471; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 35472; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 35473; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 35474; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8 35475; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35476; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 35477; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 35478; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35479; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 35480; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 35481; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v10 35482; 
GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35483; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 35484; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 35485; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35486; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v9 35487; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 35488; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v12 35489; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 35490; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35491; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11 35492; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 35493; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 35494; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35495; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 35496; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 35497; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35498; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35499; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35500; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 35501; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 35502; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 35503; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 35504; GFX7-NEXT: s_setpc_b64 s[30:31] 35505; 35506; GFX8-LABEL: v_select_v6bf16: 35507; GFX8: ; %bb.0: 35508; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35509; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 35510; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35511; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc 35512; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 35513; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 35514; GFX8-NEXT: s_setpc_b64 s[30:31] 35515; 35516; GFX9-LABEL: v_select_v6bf16: 35517; GFX9: ; %bb.0: 35518; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35519; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 35520; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35521; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc 35522; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 35523; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 35524; GFX9-NEXT: s_setpc_b64 s[30:31] 35525; 35526; GFX10-LABEL: v_select_v6bf16: 35527; GFX10: ; %bb.0: 35528; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 35529; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 35530; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35531; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo 35532; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo 35533; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo 35534; GFX10-NEXT: s_setpc_b64 s[30:31] 35535; 35536; GFX11-LABEL: v_select_v6bf16: 35537; GFX11: ; %bb.0: 35538; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35539; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 35540; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 35541; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35542; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2 35543; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo 35544; GFX11-NEXT: s_setpc_b64 s[30:31] 35545 %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b 35546 ret <6 x bfloat> %op 35547} 35548 35549define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) { 35550; GCN-LABEL: v_select_v8bf16: 35551; GCN: ; %bb.0: 35552; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35553; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 35554; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 35555; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 35556; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 35557; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 35558; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 35559; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 35560; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 35561; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 35562; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 35563; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 35564; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 35565; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 35566; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 35567; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 35568; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 35569; GCN-NEXT: v_and_b32_e32 v0, 1, v0 35570; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35571; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 35572; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35573; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, 
v12 35574; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35575; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 35576; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 35577; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 35578; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 35579; GCN-NEXT: v_alignbit_b32 v2, v10, v9, 16 35580; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 35581; GCN-NEXT: v_alignbit_b32 v4, v12, v11, 16 35582; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 35583; GCN-NEXT: v_alignbit_b32 v6, v14, v13, 16 35584; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 35585; GCN-NEXT: v_alignbit_b32 v8, v16, v15, 16 35586; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35587; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc 35588; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 35589; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 35590; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35591; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35592; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35593; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 35594; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 35595; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 35596; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 35597; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 35598; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 35599; GCN-NEXT: s_setpc_b64 s[30:31] 35600; 35601; GFX7-LABEL: v_select_v8bf16: 35602; GFX7: ; %bb.0: 35603; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35604; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 35605; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35606; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 35607; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 35608; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 35609; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10 35610; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35611; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 35612; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 35613; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35614; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 35615; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 35616; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v12 35617; GFX7-NEXT: v_lshrrev_b32_e32 v6, 
16, v6 35618; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 35619; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 35620; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16 35621; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35622; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v11 35623; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 35624; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14 35625; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 35626; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 35627; GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16 35628; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35629; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v13 35630; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 35631; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v16 35632; GFX7-NEXT: v_alignbit_b32 v6, v6, v9, 16 35633; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 35634; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v15 35635; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 35636; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16 35637; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35638; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc 35639; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 35640; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 35641; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35642; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35643; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35644; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 35645; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 35646; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 35647; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 35648; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 35649; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 35650; GFX7-NEXT: s_setpc_b64 s[30:31] 35651; 35652; GFX8-LABEL: v_select_v8bf16: 35653; GFX8: ; %bb.0: 35654; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35655; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 35656; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35657; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc 35658; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 35659; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 35660; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc 35661; 
GFX8-NEXT: s_setpc_b64 s[30:31] 35662; 35663; GFX9-LABEL: v_select_v8bf16: 35664; GFX9: ; %bb.0: 35665; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35666; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 35667; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35668; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc 35669; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 35670; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 35671; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc 35672; GFX9-NEXT: s_setpc_b64 s[30:31] 35673; 35674; GFX10-LABEL: v_select_v8bf16: 35675; GFX10: ; %bb.0: 35676; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35677; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 35678; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35679; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo 35680; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo 35681; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc_lo 35682; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc_lo 35683; GFX10-NEXT: s_setpc_b64 s[30:31] 35684; 35685; GFX11-LABEL: v_select_v8bf16: 35686; GFX11: ; %bb.0: 35687; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35688; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 35689; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 35690; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35691; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2 35692; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4 35693; GFX11-NEXT: s_setpc_b64 s[30:31] 35694 %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b 35695 ret <8 x bfloat> %op 35696} 35697 ; v_select_v16bf16: selects between the two <16 x bfloat> operands %a and %b with one scalar i1 %cond and returns the chosen vector. The per-target CHECK lines that follow are autogenerated (see the NOTE at the top of the file); regenerate them with utils/update_llc_test_checks.py rather than editing them by hand. 35698define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) { 35699; GCN-LABEL: v_select_v16bf16: 35700; GCN: ; %bb.0: 35701; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35702; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 35703; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 35704; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35705; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 35706; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18 35707; GCN-NEXT: 
v_mul_f32_e32 v17, 1.0, v17 35708; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35709; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16 35710; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 35711; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 35712; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35713; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 35714; GCN-NEXT: v_and_b32_e32 v0, 1, v0 35715; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20 35716; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 35717; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 35718; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 35719; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 35720; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 35721; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 35722; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 35723; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24 35724; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23 35725; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 35726; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 35727; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 35728; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25 35729; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 35730; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 35731; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28 35732; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27 35733; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 35734; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 35735; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30 35736; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 35737; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 35738; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 35739; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35740; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35741; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16 35742; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 35743; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 35744; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 35745; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 35746; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 35747; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 35748; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 35749; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 35750; GCN-NEXT: v_lshrrev_b32_e32 v12, 
16, v12 35751; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 35752; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 35753; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 35754; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 35755; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 35756; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 35757; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16 35758; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 35759; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16 35760; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 35761; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16 35762; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 35763; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16 35764; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 35765; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35766; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc 35767; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc 35768; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc 35769; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc 35770; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc 35771; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 35772; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35773; GCN-NEXT: s_waitcnt vmcnt(1) 35774; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6 35775; GCN-NEXT: s_waitcnt vmcnt(0) 35776; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 35777; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35778; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35779; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 35780; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 35781; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 35782; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 35783; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 35784; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 35785; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 35786; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 35787; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 35788; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 35789; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 35790; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 35791; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 35792; 
GCN-NEXT: v_alignbit_b32 v14, v14, v16, 16 35793; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc 35794; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 35795; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 35796; GCN-NEXT: s_setpc_b64 s[30:31] 35797; 35798; GFX7-LABEL: v_select_v16bf16: 35799; GFX7: ; %bb.0: 35800; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35801; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 35802; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35803; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 35804; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 35805; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 35806; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18 35807; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35808; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 35809; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35810; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 35811; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 35812; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20 35813; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16 35814; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35815; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19 35816; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 35817; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16 35818; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35819; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 35820; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 35821; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 35822; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 35823; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 35824; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22 35825; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 35826; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 35827; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 35828; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 35829; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21 35830; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 35831; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24 35832; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 35833; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 35834; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 35835; GFX7-NEXT: v_alignbit_b32 v18, 
v18, v19, 16 35836; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 35837; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23 35838; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 35839; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26 35840; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 35841; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 35842; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 35843; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 35844; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16 35845; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 35846; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25 35847; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 35848; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28 35849; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 35850; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 35851; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 35852; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 35853; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16 35854; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 35855; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27 35856; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 35857; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30 35858; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 35859; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16 35860; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 35861; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29 35862; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 35863; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16 35864; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35865; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc 35866; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc 35867; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc 35868; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc 35869; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc 35870; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 35871; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 35872; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 35873; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 35874; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 35875; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 35876; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, 16, v5 35877; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 35878; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9 35879; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 35880; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 35881; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 35882; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 35883; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 35884; GFX7-NEXT: s_waitcnt vmcnt(1) 35885; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17 35886; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 35887; GFX7-NEXT: s_waitcnt vmcnt(0) 35888; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 35889; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 35890; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc 35891; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 35892; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 35893; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 35894; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 35895; GFX7-NEXT: s_setpc_b64 s[30:31] 35896; 35897; GFX8-LABEL: v_select_v16bf16: 35898; GFX8: ; %bb.0: 35899; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35900; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 35901; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35902; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc 35903; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc 35904; GFX8-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc 35905; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc 35906; GFX8-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc 35907; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc 35908; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc 35909; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc 35910; GFX8-NEXT: s_setpc_b64 s[30:31] 35911; 35912; GFX9-LABEL: v_select_v16bf16: 35913; GFX9: ; %bb.0: 35914; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35915; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 35916; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35917; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc 35918; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc 35919; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc 35920; GFX9-NEXT: 
v_cndmask_b32_e32 v3, v12, v4, vcc 35921; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc 35922; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc 35923; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc 35924; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc 35925; GFX9-NEXT: s_setpc_b64 s[30:31] 35926; 35927; GFX10-LABEL: v_select_v16bf16: 35928; GFX10: ; %bb.0: 35929; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35930; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 35931; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35932; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc_lo 35933; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc_lo 35934; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc_lo 35935; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc_lo 35936; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc_lo 35937; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc_lo 35938; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc_lo 35939; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc_lo 35940; GFX10-NEXT: s_setpc_b64 s[30:31] 35941; 35942; GFX11-LABEL: v_select_v16bf16: 35943; GFX11: ; %bb.0: 35944; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35945; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 35946; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 35947; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 35948; GFX11-NEXT: v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2 35949; GFX11-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4 35950; GFX11-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6 35951; GFX11-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8 35952; GFX11-NEXT: s_setpc_b64 s[30:31] 35953 %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b 35954 ret <16 x bfloat> %op 35955} 35956 35957define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) { 35958; GCN-LABEL: v_select_v32bf16: 35959; GCN: ; %bb.0: 35960; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 35961; GCN-NEXT: v_and_b32_e32 v0, 1, v0 
35962; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 35963; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2 35964; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 35965; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 35966; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16 35967; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 35968; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 35969; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 35970; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 35971; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6 35972; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5 35973; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 35974; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 35975; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8 35976; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7 35977; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 35978; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 35979; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10 35980; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9 35981; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 35982; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 35983; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12 35984; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11 35985; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 35986; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 35987; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v14 35988; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13 35989; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 35990; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 35991; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16 35992; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15 35993; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 35994; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 35995; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18 35996; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17 35997; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 35998; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 35999; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20 36000; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v19 36001; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 36002; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 36003; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12 36004; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22 36005; GCN-NEXT: v_mul_f32_e32 v11, 
1.0, v21 36006; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 36007; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 36008; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 36009; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24 36010; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23 36011; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 36012; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 36013; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 36014; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26 36015; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25 36016; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 36017; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 36018; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 36019; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28 36020; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27 36021; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 36022; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 36023; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28 36024; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30 36025; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29 36026; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 36027; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16 36028; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24 36029; GCN-NEXT: s_waitcnt vmcnt(5) 36030; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 36031; GCN-NEXT: s_waitcnt vmcnt(4) 36032; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 36033; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 36034; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 36035; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 36036; GCN-NEXT: s_waitcnt vmcnt(4) 36037; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 36038; GCN-NEXT: s_waitcnt vmcnt(3) 36039; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18 36040; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 36041; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 36042; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 36043; GCN-NEXT: s_waitcnt vmcnt(3) 36044; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 36045; GCN-NEXT: s_waitcnt vmcnt(2) 36046; GCN-NEXT: v_mul_f32_e32 v19, 
1.0, v20 36047; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 36048; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16 36049; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 36050; GCN-NEXT: s_waitcnt vmcnt(2) 36051; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 36052; GCN-NEXT: s_waitcnt vmcnt(1) 36053; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 36054; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 36055; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 36056; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16 36057; GCN-NEXT: s_waitcnt vmcnt(1) 36058; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 36059; GCN-NEXT: s_waitcnt vmcnt(0) 36060; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 36061; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 36062; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 36063; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 36064; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16 36065; GCN-NEXT: s_waitcnt vmcnt(1) 36066; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 36067; GCN-NEXT: s_waitcnt vmcnt(0) 36068; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 36069; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 36070; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56 36071; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 36072; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 36073; GCN-NEXT: s_waitcnt vmcnt(1) 36074; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 36075; GCN-NEXT: s_waitcnt vmcnt(0) 36076; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 36077; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 36078; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 36079; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 36080; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16 36081; GCN-NEXT: s_waitcnt vmcnt(1) 36082; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 36083; GCN-NEXT: s_waitcnt vmcnt(0) 36084; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 36085; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 36086; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 36087; GCN-NEXT: 
v_lshrrev_b32_e32 v22, 16, v22 36088; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 36089; GCN-NEXT: s_waitcnt vmcnt(1) 36090; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 36091; GCN-NEXT: s_waitcnt vmcnt(0) 36092; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 36093; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 36094; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 36095; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 36096; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16 36097; GCN-NEXT: s_waitcnt vmcnt(1) 36098; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 36099; GCN-NEXT: s_waitcnt vmcnt(0) 36100; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 36101; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 36102; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 36103; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 36104; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 36105; GCN-NEXT: s_waitcnt vmcnt(1) 36106; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 36107; GCN-NEXT: s_waitcnt vmcnt(0) 36108; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 36109; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 36110; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 36111; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 36112; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16 36113; GCN-NEXT: s_waitcnt vmcnt(1) 36114; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 36115; GCN-NEXT: s_waitcnt vmcnt(0) 36116; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 36117; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 36118; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 36119; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 36120; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 36121; GCN-NEXT: s_waitcnt vmcnt(1) 36122; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 36123; GCN-NEXT: s_waitcnt vmcnt(0) 36124; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29 36125; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 36126; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 36127; GCN-NEXT: v_lshrrev_b32_e32 v27, 
16, v27 36128; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16 36129; GCN-NEXT: s_waitcnt vmcnt(1) 36130; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29 36131; GCN-NEXT: s_waitcnt vmcnt(0) 36132; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30 36133; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 36134; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 36135; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 36136; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 36137; GCN-NEXT: s_waitcnt vmcnt(1) 36138; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30 36139; GCN-NEXT: s_waitcnt vmcnt(0) 36140; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31 36141; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 36142; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 36143; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 36144; GCN-NEXT: v_alignbit_b32 v29, v29, v30, 16 36145; GCN-NEXT: s_waitcnt vmcnt(1) 36146; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31 36147; GCN-NEXT: s_waitcnt vmcnt(0) 36148; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32 36149; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 36150; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 36151; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 36152; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 36153; GCN-NEXT: s_waitcnt vmcnt(1) 36154; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32 36155; GCN-NEXT: s_waitcnt vmcnt(0) 36156; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 36157; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 36158; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 36159; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc 36160; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc 36161; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc 36162; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v12, vcc 36163; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v11, vcc 36164; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v10, vcc 36165; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v9, vcc 36166; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v8, vcc 36167; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v7, vcc 36168; GCN-NEXT: 
v_cndmask_b32_e32 v13, v21, v6, vcc 36169; GCN-NEXT: v_cndmask_b32_e32 v11, v20, v5, vcc 36170; GCN-NEXT: v_cndmask_b32_e32 v9, v19, v4, vcc 36171; GCN-NEXT: v_cndmask_b32_e32 v7, v18, v3, vcc 36172; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v2, vcc 36173; GCN-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc 36174; GCN-NEXT: v_cndmask_b32_e32 v1, v15, v0, vcc 36175; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 36176; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 36177; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 36178; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 36179; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 36180; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 36181; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 36182; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 36183; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 36184; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 36185; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 36186; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 36187; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 36188; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 36189; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 36190; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v22 36191; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v23 36192; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 36193; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v24 36194; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v24 36195; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v25 36196; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v25 36197; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26 36198; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v26 36199; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v27 36200; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v27 36201; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v28 36202; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v28 36203; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29 36204; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 36205; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 36206; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 36207; GCN-NEXT: s_setpc_b64 s[30:31] 36208; 36209; GFX7-LABEL: 
v_select_v32bf16: 36210; GFX7: ; %bb.0: 36211; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 36212; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 36213; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 36214; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 36215; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 36216; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4 36217; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 36218; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 36219; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 36220; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 36221; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 36222; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 36223; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 36224; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8 36225; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 36226; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 36227; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 36228; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10 36229; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 36230; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9 36231; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 36232; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 36233; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 36234; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 36235; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 36236; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 36237; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 36238; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 36239; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16 36240; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 36241; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 36242; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 36243; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 36244; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 36245; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 36246; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 36247; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 36248; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 36249; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16 36250; 
GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 36251; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 36252; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 36253; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 36254; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 36255; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 36256; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 36257; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16 36258; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 36259; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 36260; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 36261; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 36262; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 36263; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 36264; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 36265; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16 36266; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 36267; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 36268; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 36269; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16 36270; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 36271; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 36272; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 36273; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16 36274; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 36275; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 36276; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 36277; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16 36278; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 36279; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 36280; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 36281; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 36282; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 36283; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 36284; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 36285; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 36286; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 36287; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 36288; GFX7-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:108 36289; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 36290; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 36291; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 36292; GFX7-NEXT: s_waitcnt vmcnt(14) 36293; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 36294; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 36295; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 36296; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 36297; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36298; GFX7-NEXT: s_waitcnt vmcnt(13) 36299; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 36300; GFX7-NEXT: s_waitcnt vmcnt(12) 36301; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 36302; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 36303; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 36304; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 36305; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc 36306; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 36307; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 36308; GFX7-NEXT: s_waitcnt vmcnt(12) 36309; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 36310; GFX7-NEXT: s_waitcnt vmcnt(11) 36311; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 36312; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 36313; GFX7-NEXT: s_waitcnt vmcnt(9) 36314; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 36315; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 36316; GFX7-NEXT: s_waitcnt vmcnt(7) 36317; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 36318; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 36319; GFX7-NEXT: s_waitcnt vmcnt(6) 36320; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 36321; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 36322; GFX7-NEXT: s_waitcnt vmcnt(5) 36323; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 36324; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 36325; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 36326; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 36327; GFX7-NEXT: s_waitcnt vmcnt(4) 36328; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 36329; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 36330; GFX7-NEXT: v_mul_f32_e32 v28, 
1.0, v28 36331; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 36332; GFX7-NEXT: s_waitcnt vmcnt(3) 36333; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 36334; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 36335; GFX7-NEXT: s_waitcnt vmcnt(1) 36336; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 36337; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 36338; GFX7-NEXT: s_waitcnt vmcnt(0) 36339; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 36340; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 36341; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16 36342; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 36343; GFX7-NEXT: s_waitcnt vmcnt(0) 36344; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 36345; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 36346; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16 36347; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 36348; GFX7-NEXT: s_waitcnt vmcnt(0) 36349; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 36350; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 36351; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16 36352; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 36353; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc 36354; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9 36355; GFX7-NEXT: s_waitcnt vmcnt(0) 36356; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 36357; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 36358; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16 36359; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 36360; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc 36361; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc 36362; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc 36363; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 36364; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 36365; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 36366; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 36367; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 36368; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10 36369; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 36370; GFX7-NEXT: s_waitcnt vmcnt(0) 36371; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36372; 
GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16 36373; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 36374; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc 36375; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 36376; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 36377; GFX7-NEXT: s_waitcnt vmcnt(0) 36378; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36379; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16 36380; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 36381; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc 36382; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 36383; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 36384; GFX7-NEXT: s_waitcnt vmcnt(0) 36385; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36386; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16 36387; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 36388; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc 36389; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 36390; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 36391; GFX7-NEXT: s_waitcnt vmcnt(0) 36392; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36393; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16 36394; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 36395; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc 36396; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17 36397; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 36398; GFX7-NEXT: s_waitcnt vmcnt(0) 36399; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36400; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16 36401; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 36402; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc 36403; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19 36404; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 36405; GFX7-NEXT: s_waitcnt vmcnt(0) 36406; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36407; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16 36408; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 36409; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc 36410; GFX7-NEXT: v_lshlrev_b32_e32 
v20, 16, v21 36411; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 36412; GFX7-NEXT: s_waitcnt vmcnt(0) 36413; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36414; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16 36415; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 36416; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc 36417; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 36418; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 36419; GFX7-NEXT: s_waitcnt vmcnt(0) 36420; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36421; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16 36422; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 36423; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc 36424; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25 36425; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 36426; GFX7-NEXT: s_waitcnt vmcnt(0) 36427; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36428; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16 36429; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 36430; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc 36431; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27 36432; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 36433; GFX7-NEXT: s_waitcnt vmcnt(0) 36434; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36435; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16 36436; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 36437; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc 36438; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29 36439; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 36440; GFX7-NEXT: s_waitcnt vmcnt(0) 36441; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 36442; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 36443; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16 36444; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 36445; GFX7-NEXT: s_waitcnt vmcnt(0) 36446; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 36447; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 36448; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16 36449; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc 
36450; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 36451; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 36452; GFX7-NEXT: s_setpc_b64 s[30:31] 36453; 36454; GFX8-LABEL: v_select_v32bf16: 36455; GFX8: ; %bb.0: 36456; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 36457; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 36458; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 36459; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc 36460; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc 36461; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 36462; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 36463; GFX8-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc 36464; GFX8-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc 36465; GFX8-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc 36466; GFX8-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc 36467; GFX8-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc 36468; GFX8-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc 36469; GFX8-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc 36470; GFX8-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc 36471; GFX8-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc 36472; GFX8-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc 36473; GFX8-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc 36474; GFX8-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc 36475; GFX8-NEXT: s_waitcnt vmcnt(1) 36476; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc 36477; GFX8-NEXT: s_waitcnt vmcnt(0) 36478; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc 36479; GFX8-NEXT: s_setpc_b64 s[30:31] 36480; 36481; GFX9-LABEL: v_select_v32bf16: 36482; GFX9: ; %bb.0: 36483; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 36484; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 36485; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 36486; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc 36487; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc 36488; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 36489; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 36490; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc 36491; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, 
vcc 36492; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc 36493; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc 36494; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc 36495; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc 36496; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc 36497; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc 36498; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc 36499; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc 36500; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc 36501; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc 36502; GFX9-NEXT: s_waitcnt vmcnt(1) 36503; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc 36504; GFX9-NEXT: s_waitcnt vmcnt(0) 36505; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc 36506; GFX9-NEXT: s_setpc_b64 s[30:31] 36507; 36508; GFX10-LABEL: v_select_v32bf16: 36509; GFX10: ; %bb.0: 36510; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 36511; GFX10-NEXT: s_clause 0x1 36512; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 36513; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 36514; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 36515; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 36516; GFX10-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc_lo 36517; GFX10-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc_lo 36518; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc_lo 36519; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc_lo 36520; GFX10-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc_lo 36521; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc_lo 36522; GFX10-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc_lo 36523; GFX10-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc_lo 36524; GFX10-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc_lo 36525; GFX10-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc_lo 36526; GFX10-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc_lo 36527; GFX10-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc_lo 36528; GFX10-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc_lo 36529; GFX10-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc_lo 36530; GFX10-NEXT: s_waitcnt vmcnt(1) 36531; 
GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc_lo 36532; GFX10-NEXT: s_waitcnt vmcnt(0) 36533; GFX10-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc_lo 36534; GFX10-NEXT: s_setpc_b64 s[30:31] 36535; 36536; GFX11-LABEL: v_select_v32bf16: 36537; GFX11: ; %bb.0: 36538; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 36539; GFX11-NEXT: s_clause 0x1 36540; GFX11-NEXT: scratch_load_b32 v31, off, s32 36541; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 36542; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 36543; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 36544; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 36545; GFX11-NEXT: v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2 36546; GFX11-NEXT: v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4 36547; GFX11-NEXT: v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6 36548; GFX11-NEXT: v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8 36549; GFX11-NEXT: v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10 36550; GFX11-NEXT: v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12 36551; GFX11-NEXT: v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14 36552; GFX11-NEXT: s_waitcnt vmcnt(0) 36553; GFX11-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16 36554; GFX11-NEXT: s_setpc_b64 s[30:31] 36555 %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b 36556 ret <32 x bfloat> %op 36557} 36558 36559define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) { 36560; GCN-LABEL: s_select_v3bf16: 36561; GCN: ; %bb.0: 36562; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 36563; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 36564; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 36565; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3 36566; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 36567; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5 36568; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 36569; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 36570; GCN-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 36571; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 36572; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 36573; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 36574; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36575; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc 36576; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 36577; GCN-NEXT: v_readfirstlane_b32 s0, v1 36578; GCN-NEXT: v_readfirstlane_b32 s1, v0 36579; GCN-NEXT: ; return to shader part epilog 36580; 36581; GFX7-LABEL: s_select_v3bf16: 36582; GFX7: ; %bb.0: 36583; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 36584; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 36585; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 36586; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 36587; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 36588; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 36589; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 36590; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 36591; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 36592; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5 36593; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 36594; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 36595; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36596; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc 36597; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 36598; GFX7-NEXT: v_readfirstlane_b32 s0, v1 36599; GFX7-NEXT: v_readfirstlane_b32 s1, v0 36600; GFX7-NEXT: ; return to shader part epilog 36601; 36602; GFX8-LABEL: s_select_v3bf16: 36603; GFX8: ; %bb.0: 36604; GFX8-NEXT: v_mov_b32_e32 v1, s2 36605; GFX8-NEXT: v_mov_b32_e32 v2, s0 36606; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36607; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 36608; GFX8-NEXT: v_mov_b32_e32 v1, s3 36609; GFX8-NEXT: v_mov_b32_e32 v2, s1 36610; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 36611; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 36612; GFX8-NEXT: v_readfirstlane_b32 s0, v0 36613; GFX8-NEXT: v_readfirstlane_b32 s1, v1 36614; GFX8-NEXT: ; return to shader part epilog 36615; 36616; GFX9-LABEL: s_select_v3bf16: 36617; GFX9: ; %bb.0: 36618; GFX9-NEXT: 
v_mov_b32_e32 v1, s2 36619; GFX9-NEXT: v_mov_b32_e32 v2, s0 36620; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36621; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 36622; GFX9-NEXT: v_mov_b32_e32 v1, s3 36623; GFX9-NEXT: v_mov_b32_e32 v2, s1 36624; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 36625; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 36626; GFX9-NEXT: v_readfirstlane_b32 s0, v0 36627; GFX9-NEXT: v_readfirstlane_b32 s1, v1 36628; GFX9-NEXT: ; return to shader part epilog 36629; 36630; GFX10-LABEL: s_select_v3bf16: 36631; GFX10: ; %bb.0: 36632; GFX10-NEXT: v_mov_b32_e32 v1, s0 36633; GFX10-NEXT: v_mov_b32_e32 v2, s1 36634; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 36635; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo 36636; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo 36637; GFX10-NEXT: v_readfirstlane_b32 s0, v0 36638; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 36639; GFX10-NEXT: v_readfirstlane_b32 s1, v1 36640; GFX10-NEXT: ; return to shader part epilog 36641; 36642; GFX11-LABEL: s_select_v3bf16: 36643; GFX11: ; %bb.0: 36644; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 36645; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 36646; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 36647; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo 36648; GFX11-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo 36649; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 36650; GFX11-NEXT: v_readfirstlane_b32 s0, v0 36651; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 36652; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 36653; GFX11-NEXT: v_readfirstlane_b32 s1, v1 36654; GFX11-NEXT: ; return to shader part epilog 36655 %cond = icmp eq i32 %c, 0 36656 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b 36657 %cast = bitcast <3 x bfloat> %op to i48 36658 %elt0 = trunc i48 %cast to i32 36659 %elt1.hi = lshr i48 %cast, 32 36660 %elt1 = trunc i48 %elt1.hi to i32 36661 %readlane0 = call i32 
@llvm.amdgcn.readfirstlane(i32 %elt0) 36662 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) 36663 %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0 36664 %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1 36665 ret <2 x i32> %bv.1 36666} 36667 36668define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) { 36669; GCN-LABEL: s_select_v4bf16: 36670; GCN: ; %bb.0: 36671; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 36672; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 36673; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5 36674; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4 36675; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3 36676; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2 36677; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7 36678; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6 36679; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 36680; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 36681; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 36682; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 36683; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 36684; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 36685; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16 36686; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16 36687; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36688; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc 36689; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 36690; GCN-NEXT: v_readfirstlane_b32 s0, v1 36691; GCN-NEXT: v_readfirstlane_b32 s1, v0 36692; GCN-NEXT: ; return to shader part epilog 36693; 36694; GFX7-LABEL: s_select_v4bf16: 36695; GFX7: ; %bb.0: 36696; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 36697; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 36698; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 36699; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 36700; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 36701; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 36702; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4 36703; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 36704; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 36705; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 36706; GFX7-NEXT: v_mul_f32_e64 v4, 
1.0, s2 36707; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 36708; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7 36709; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 36710; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6 36711; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 36712; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36713; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc 36714; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 36715; GFX7-NEXT: v_readfirstlane_b32 s0, v1 36716; GFX7-NEXT: v_readfirstlane_b32 s1, v0 36717; GFX7-NEXT: ; return to shader part epilog 36718; 36719; GFX8-LABEL: s_select_v4bf16: 36720; GFX8: ; %bb.0: 36721; GFX8-NEXT: v_mov_b32_e32 v1, s3 36722; GFX8-NEXT: v_mov_b32_e32 v2, s1 36723; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36724; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 36725; GFX8-NEXT: v_mov_b32_e32 v1, s2 36726; GFX8-NEXT: v_mov_b32_e32 v2, s0 36727; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 36728; GFX8-NEXT: v_readfirstlane_b32 s0, v1 36729; GFX8-NEXT: v_readfirstlane_b32 s1, v0 36730; GFX8-NEXT: ; return to shader part epilog 36731; 36732; GFX9-LABEL: s_select_v4bf16: 36733; GFX9: ; %bb.0: 36734; GFX9-NEXT: v_mov_b32_e32 v1, s3 36735; GFX9-NEXT: v_mov_b32_e32 v2, s1 36736; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36737; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 36738; GFX9-NEXT: v_mov_b32_e32 v1, s2 36739; GFX9-NEXT: v_mov_b32_e32 v2, s0 36740; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 36741; GFX9-NEXT: v_readfirstlane_b32 s0, v1 36742; GFX9-NEXT: v_readfirstlane_b32 s1, v0 36743; GFX9-NEXT: ; return to shader part epilog 36744; 36745; GFX10-LABEL: s_select_v4bf16: 36746; GFX10: ; %bb.0: 36747; GFX10-NEXT: v_mov_b32_e32 v1, s1 36748; GFX10-NEXT: v_mov_b32_e32 v2, s0 36749; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 36750; GFX10-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo 36751; GFX10-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo 36752; GFX10-NEXT: v_readfirstlane_b32 s1, v0 36753; GFX10-NEXT: v_readfirstlane_b32 s0, v1 36754; GFX10-NEXT: ; return to shader part epilog 
36755; 36756; GFX11-LABEL: s_select_v4bf16: 36757; GFX11: ; %bb.0: 36758; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0 36759; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 36760; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 36761; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo 36762; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo 36763; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 36764; GFX11-NEXT: v_readfirstlane_b32 s1, v0 36765; GFX11-NEXT: v_readfirstlane_b32 s0, v1 36766; GFX11-NEXT: ; return to shader part epilog 36767 %cond = icmp eq i32 %c, 0 36768 %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b 36769 %cast = bitcast <4 x bfloat> %op to <2 x i32> 36770 %elt0 = extractelement <2 x i32> %cast, i32 0 36771 %elt1 = extractelement <2 x i32> %cast, i32 1 36772 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) 36773 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) 36774 %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0 36775 %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1 36776 ret <2 x i32> %bv.1 36777} 36778 36779define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) { 36780; GCN-LABEL: s_vselect_v4bf16: 36781; GCN: ; %bb.0: 36782; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0 36783; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s4 36784; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s1 36785; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s5 36786; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s2 36787; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s6 36788; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3 36789; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s7 36790; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 36791; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc 36792; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 36793; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc 36794; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 36795; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc 36796; GCN-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 36797; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc 36798; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 36799; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 36800; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 36801; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 36802; GCN-NEXT: v_or_b32_e32 v2, v2, v3 36803; GCN-NEXT: v_or_b32_e32 v0, v0, v1 36804; GCN-NEXT: v_readfirstlane_b32 s0, v0 36805; GCN-NEXT: v_readfirstlane_b32 s1, v2 36806; GCN-NEXT: ; return to shader part epilog 36807; 36808; GFX7-LABEL: s_vselect_v4bf16: 36809; GFX7: ; %bb.0: 36810; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s3 36811; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7 36812; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 36813; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s2 36814; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6 36815; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc 36816; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 36817; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s1 36818; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s5 36819; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc 36820; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 36821; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0 36822; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s4 36823; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc 36824; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36825; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc 36826; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 36827; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 36828; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 36829; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 36830; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 36831; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 36832; GFX7-NEXT: v_readfirstlane_b32 s0, v0 36833; GFX7-NEXT: v_readfirstlane_b32 s1, v2 36834; GFX7-NEXT: ; return to shader part epilog 36835; 36836; GFX8-LABEL: s_vselect_v4bf16: 36837; GFX8: ; %bb.0: 36838; GFX8-NEXT: s_lshr_b32 s4, s1, 16 36839; GFX8-NEXT: s_lshr_b32 s5, s3, 16 36840; GFX8-NEXT: v_mov_b32_e32 v4, s5 36841; GFX8-NEXT: v_mov_b32_e32 v5, s4 36842; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 36843; 
GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 36844; GFX8-NEXT: v_mov_b32_e32 v4, s3 36845; GFX8-NEXT: v_mov_b32_e32 v5, s1 36846; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 36847; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 36848; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc 36849; GFX8-NEXT: s_lshr_b32 s1, s0, 16 36850; GFX8-NEXT: s_lshr_b32 s3, s2, 16 36851; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 36852; GFX8-NEXT: v_mov_b32_e32 v3, s3 36853; GFX8-NEXT: v_mov_b32_e32 v4, s1 36854; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 36855; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 36856; GFX8-NEXT: v_mov_b32_e32 v3, s2 36857; GFX8-NEXT: v_mov_b32_e32 v4, s0 36858; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36859; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 36860; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 36861; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 36862; GFX8-NEXT: v_readfirstlane_b32 s0, v0 36863; GFX8-NEXT: v_readfirstlane_b32 s1, v2 36864; GFX8-NEXT: ; return to shader part epilog 36865; 36866; GFX9-LABEL: s_vselect_v4bf16: 36867; GFX9: ; %bb.0: 36868; GFX9-NEXT: s_lshr_b32 s4, s1, 16 36869; GFX9-NEXT: s_lshr_b32 s5, s3, 16 36870; GFX9-NEXT: v_mov_b32_e32 v4, s5 36871; GFX9-NEXT: v_mov_b32_e32 v5, s4 36872; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 36873; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 36874; GFX9-NEXT: v_mov_b32_e32 v4, s3 36875; GFX9-NEXT: v_mov_b32_e32 v5, s1 36876; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 36877; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc 36878; GFX9-NEXT: s_mov_b32 s1, 0x5040100 36879; GFX9-NEXT: s_lshr_b32 s3, s0, 16 36880; GFX9-NEXT: s_lshr_b32 s4, s2, 16 36881; GFX9-NEXT: v_perm_b32 v2, v3, v2, s1 36882; GFX9-NEXT: v_mov_b32_e32 v3, s4 36883; GFX9-NEXT: v_mov_b32_e32 v4, s3 36884; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 36885; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 36886; GFX9-NEXT: v_mov_b32_e32 v3, s2 36887; 
GFX9-NEXT: v_mov_b32_e32 v4, s0 36888; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 36889; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc 36890; GFX9-NEXT: v_perm_b32 v0, v1, v0, s1 36891; GFX9-NEXT: v_readfirstlane_b32 s0, v0 36892; GFX9-NEXT: v_readfirstlane_b32 s1, v2 36893; GFX9-NEXT: ; return to shader part epilog 36894; 36895; GFX10-LABEL: s_vselect_v4bf16: 36896; GFX10: ; %bb.0: 36897; GFX10-NEXT: s_lshr_b32 s4, s1, 16 36898; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 36899; GFX10-NEXT: v_mov_b32_e32 v4, s4 36900; GFX10-NEXT: s_lshr_b32 s4, s3, 16 36901; GFX10-NEXT: s_lshr_b32 s5, s0, 16 36902; GFX10-NEXT: v_mov_b32_e32 v6, s0 36903; GFX10-NEXT: s_lshr_b32 s0, s2, 16 36904; GFX10-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo 36905; GFX10-NEXT: v_mov_b32_e32 v4, s5 36906; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 36907; GFX10-NEXT: v_mov_b32_e32 v5, s1 36908; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo 36909; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 36910; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo 36911; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 36912; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 36913; GFX10-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo 36914; GFX10-NEXT: v_readfirstlane_b32 s0, v0 36915; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 36916; GFX10-NEXT: v_readfirstlane_b32 s1, v1 36917; GFX10-NEXT: ; return to shader part epilog 36918; 36919; GFX11TRUE16-LABEL: s_vselect_v4bf16: 36920; GFX11TRUE16: ; %bb.0: 36921; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16 36922; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 36923; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 36924; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16 36925; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 36926; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 36927; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16 36928; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16 36929; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2 36930; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3 36931; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8 
36932; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 36933; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 36934; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2 36935; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 36936; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1 36937; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s6 36938; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v2.l, s4 36939; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 36940; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v2.h, v3.l, vcc_lo 36941; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s5 36942; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 36943; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l 36944; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h 36945; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 36946; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h 36947; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 36948; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 36949; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 36950; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v1 36951; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 36952; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 36953; GFX11TRUE16-NEXT: ; return to shader part epilog 36954; 36955; GFX11FAKE16-LABEL: s_vselect_v4bf16: 36956; GFX11FAKE16: ; %bb.0: 36957; GFX11FAKE16-NEXT: s_lshr_b32 s4, s1, 16 36958; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 36959; GFX11FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1 36960; GFX11FAKE16-NEXT: s_lshr_b32 s4, s3, 16 36961; GFX11FAKE16-NEXT: s_lshr_b32 s5, s0, 16 36962; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 36963; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo 36964; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, s5 36965; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 36966; GFX11FAKE16-NEXT: v_mov_b32_e32 v6, s0 36967; GFX11FAKE16-NEXT: 
s_lshr_b32 s0, s2, 16 36968; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1) 36969; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo 36970; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 36971; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 36972; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo 36973; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 36974; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 36975; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo 36976; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 36977; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 36978; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 36979; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 36980; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v1 36981; GFX11FAKE16-NEXT: ; return to shader part epilog 36982 %cond = icmp eq <4 x i32> %c, zeroinitializer 36983 %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b 36984 %cast = bitcast <4 x bfloat> %op to <2 x i32> 36985 %elt0 = extractelement <2 x i32> %cast, i32 0 36986 %elt1 = extractelement <2 x i32> %cast, i32 1 36987 %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) 36988 %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) 36989 %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0 36990 %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1 36991 ret <2 x i32> %bv.1 36992} 36993 36994define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) { 36995; GCN-LABEL: v_vselect_v4bf16: 36996; GCN: ; %bb.0: 36997; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 36998; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 36999; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 37000; GCN-NEXT: v_and_b32_e32 v0, 1, v0 37001; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 37002; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 37003; GCN-NEXT: v_and_b32_e32 v1, 1, v1 37004; GCN-NEXT: 
v_mul_f32_e32 v6, 1.0, v6 37005; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 37006; GCN-NEXT: v_and_b32_e32 v2, 1, v2 37007; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 37008; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 37009; GCN-NEXT: v_and_b32_e32 v3, 1, v3 37010; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37011; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc 37012; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37013; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc 37014; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37015; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc 37016; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37017; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc 37018; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 37019; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 37020; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 37021; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 37022; GCN-NEXT: s_setpc_b64 s[30:31] 37023; 37024; GFX7-LABEL: v_vselect_v4bf16: 37025; GFX7: ; %bb.0: 37026; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37027; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 37028; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 37029; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 37030; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 37031; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37032; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 37033; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 37034; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 37035; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc 37036; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37037; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 37038; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 37039; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 37040; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc 37041; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37042; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 37043; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37044; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc 37045; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37046; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc 37047; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 37048; GFX7-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 37049; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 37050; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 37051; GFX7-NEXT: s_setpc_b64 s[30:31] 37052; 37053; GFX8-LABEL: v_vselect_v4bf16: 37054; GFX8: ; %bb.0: 37055; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37056; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 37057; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 37058; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v5 37059; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 37060; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37061; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 37062; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc 37063; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37064; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 37065; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 37066; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 37067; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6 37068; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37069; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc 37070; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37071; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc 37072; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 37073; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37074; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 37075; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37076; GFX8-NEXT: s_setpc_b64 s[30:31] 37077; 37078; GFX9-LABEL: v_vselect_v4bf16: 37079; GFX9: ; %bb.0: 37080; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37081; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 37082; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 37083; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37084; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 37085; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 37086; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 37087; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 37088; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37089; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 37090; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc 37091; GFX9-NEXT: v_cmp_eq_u32_e32 
vcc, 1, v0 37092; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc 37093; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 37094; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6 37095; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37096; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc 37097; GFX9-NEXT: s_mov_b32 s4, 0x5040100 37098; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 37099; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 37100; GFX9-NEXT: s_setpc_b64 s[30:31] 37101; 37102; GFX10-LABEL: v_vselect_v4bf16: 37103; GFX10: ; %bb.0: 37104; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37105; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 37106; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 37107; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 37108; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 37109; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4 37110; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 37111; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6 37112; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo 37113; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 37114; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5 37115; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7 37116; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo 37117; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 37118; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo 37119; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 37120; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 37121; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo 37122; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 37123; GFX10-NEXT: s_setpc_b64 s[30:31] 37124; 37125; GFX11TRUE16-LABEL: v_vselect_v4bf16: 37126; GFX11TRUE16: ; %bb.0: 37127; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37128; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 37129; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 37130; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 37131; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7 37132; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 37133; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 37134; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v2 37135; 
GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 37136; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 37137; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 37138; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 37139; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 37140; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 37141; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 37142; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 37143; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo 37144; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v8.l, v3.l, s1 37145; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 37146; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v5.l, s2 37147; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l 37148; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h 37149; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 37150; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l 37151; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h 37152; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 37153; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 37154; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 37155; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 37156; 37157; GFX11FAKE16-LABEL: v_vselect_v4bf16: 37158; GFX11FAKE16: ; %bb.0: 37159; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37160; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 37161; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 37162; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 37163; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 37164; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 37165; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3 37166; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 37167; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 37168; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, 
v0 37169; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 37170; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1 37171; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 37172; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 37173; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo 37174; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 37175; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 37176; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo 37177; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 37178; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 37179; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 37180 %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b 37181 ret <4 x bfloat> %op 37182} 37183
; Vector select on <8 x bfloat> operands driven by an <8 x i1> mask. The body of
; v_vselect_v8bf16 is a single select that yields the lane of %a where the
; matching lane of %cond is set and the lane of %b otherwise, then returns it.
; The assertion lines that follow hold the expected llc output for each check
; prefix used by this file. They are produced by utils/update_llc_test_checks.py,
; so regenerate them with that script instead of editing them by hand.
37184define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) { 37185; GCN-LABEL: v_vselect_v8bf16: 37186; GCN: ; %bb.0: 37187; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37188; GCN-NEXT: v_and_b32_e32 v7, 1, v7 37189; GCN-NEXT: v_and_b32_e32 v6, 1, v6 37190; GCN-NEXT: v_and_b32_e32 v5, 1, v5 37191; GCN-NEXT: v_and_b32_e32 v4, 1, v4 37192; GCN-NEXT: v_and_b32_e32 v3, 1, v3 37193; GCN-NEXT: v_and_b32_e32 v2, 1, v2 37194; GCN-NEXT: v_and_b32_e32 v1, 1, v1 37195; GCN-NEXT: v_and_b32_e32 v0, 1, v0 37196; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 37197; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 37198; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 37199; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 37200; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 37201; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 37202; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 37203; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 37204; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 37205; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 37206; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 37207; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 37208; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 37209; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 37210; GCN-NEXT: v_mul_f32_e32 
v12, 1.0, v12 37211; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 37212; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 37213; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc 37214; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 37215; GCN-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc 37216; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 37217; GCN-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc 37218; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 37219; GCN-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc 37220; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37221; GCN-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc 37222; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37223; GCN-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc 37224; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37225; GCN-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc 37226; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37227; GCN-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc 37228; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 37229; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 37230; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 37231; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 37232; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 37233; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 37234; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 37235; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 37236; GCN-NEXT: s_setpc_b64 s[30:31] 37237; 37238; GFX7-LABEL: v_vselect_v8bf16: 37239; GFX7: ; %bb.0: 37240; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37241; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 37242; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 37243; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 37244; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 37245; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 37246; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc 37247; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 37248; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22 37249; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 37250; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 37251; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v14, vcc 37252; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 37253; GFX7-NEXT: v_mul_f32_e32 v14, 
1.0, v21 37254; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 37255; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 37256; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc 37257; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 37258; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20 37259; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 37260; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 37261; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v12, vcc 37262; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 37263; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19 37264; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37265; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 37266; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc 37267; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 37268; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 37269; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v18 37270; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37271; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 37272; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 37273; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17 37274; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc 37275; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37276; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37277; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v16 37278; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc 37279; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37280; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v8, vcc 37281; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 37282; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 37283; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 37284; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 37285; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 37286; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 37287; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 37288; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 37289; GFX7-NEXT: s_setpc_b64 s[30:31] 37290; 37291; GFX8-LABEL: v_vselect_v8bf16: 37292; GFX8: ; %bb.0: 37293; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37294; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 37295; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 37296; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v11 37297; GFX8-NEXT: 
v_lshrrev_b32_e32 v17, 16, v15 37298; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 37299; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 37300; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc 37301; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 37302; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 37303; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc 37304; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10 37305; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14 37306; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 37307; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 37308; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v11, vcc 37309; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 37310; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 37311; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc 37312; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9 37313; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v13 37314; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37315; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 37316; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc 37317; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37318; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 37319; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc 37320; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v8 37321; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v12 37322; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37323; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc 37324; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37325; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc 37326; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 37327; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37328; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 37329; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37330; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 37331; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7 37332; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37333; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
37334; GFX8-NEXT: s_setpc_b64 s[30:31] 37335; 37336; GFX9-LABEL: v_vselect_v8bf16: 37337; GFX9: ; %bb.0: 37338; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37339; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 37340; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 37341; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 37342; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 37343; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc 37344; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 37345; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 37346; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 37347; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 37348; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc 37349; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 37350; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 37351; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc 37352; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 37353; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14 37354; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 37355; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 37356; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc 37357; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37358; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 37359; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc 37360; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 37361; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 37362; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37363; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 37364; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc 37365; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37366; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc 37367; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 37368; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12 37369; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37370; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc 37371; GFX9-NEXT: s_mov_b32 s4, 0x5040100 37372; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 37373; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 37374; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 37375; GFX9-NEXT: v_perm_b32 v3, v7, v6, s4 37376; GFX9-NEXT: s_setpc_b64 s[30:31] 37377; 37378; GFX10-LABEL: v_vselect_v8bf16: 37379; GFX10: ; 
%bb.0: 37380; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37381; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 37382; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 37383; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 37384; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 37385; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v10 37386; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 37387; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v14 37388; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 37389; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 37390; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 37391; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo 37392; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 37393; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 37394; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11 37395; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15 37396; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo 37397; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 37398; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8 37399; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v12 37400; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo 37401; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 37402; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo 37403; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 37404; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9 37405; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13 37406; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo 37407; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 37408; GFX10-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo 37409; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 37410; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 37411; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo 37412; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 37413; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 37414; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo 37415; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 37416; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 37417; GFX10-NEXT: s_setpc_b64 s[30:31] 37418; 37419; GFX11TRUE16-LABEL: v_vselect_v8bf16: 37420; GFX11TRUE16: ; %bb.0: 
37421; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37422; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 37423; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 37424; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 37425; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 37426; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 37427; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 37428; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 37429; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v7 37430; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v6 37431; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 37432; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5 37433; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v15 37434; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 37435; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1 37436; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v11 37437; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 37438; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v2 37439; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v3 37440; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 37441; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, s2 37442; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v12 37443; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v9 37444; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v13 37445; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v10 37446; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v14 37447; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v15.l, v11.l, s3 37448; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v14.l, v10.l, s4 37449; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v3.l, v2.l, vcc_lo 37450; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v12.l, v8.l, s0 37451; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v5.l, v4.l, s1 37452; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v13.l, v9.l, s5 37453; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v7.l, v6.l, s6 37454; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h 37455; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l 37456; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h 37457; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, 
v3.l 37458; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h 37459; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l 37460; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.h 37461; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l 37462; GFX11TRUE16-NEXT: v_perm_b32 v0, v4, v5, 0x5040100 37463; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v6, 0x5040100 37464; GFX11TRUE16-NEXT: v_perm_b32 v2, v3, v7, 0x5040100 37465; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 37466; GFX11TRUE16-NEXT: v_perm_b32 v3, v8, v9, 0x5040100 37467; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 37468; 37469; GFX11FAKE16-LABEL: v_vselect_v8bf16: 37470; GFX11FAKE16: ; %bb.0: 37471; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37472; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v10 37473; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v14 37474; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 37475; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5 37476; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 37477; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 37478; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 37479; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1 37480; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 37481; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15 37482; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 37483; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 37484; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3 37485; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 37486; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 37487; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 37488; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 37489; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo 37490; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 37491; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7 37492; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 37493; GFX11FAKE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v0 37494; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 37495; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 37496; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo 37497; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 37498; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo 37499; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 37500; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 37501; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 37502; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo 37503; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 37504; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 37505; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo 37506; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 37507; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 37508; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 37509; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 37510 %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b 37511 ret <8 x bfloat> %op 37512} 37513 37514define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) { 37515; GCN-LABEL: v_vselect_v16bf16: 37516; GCN: ; %bb.0: 37517; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37518; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 37519; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill 37520; GCN-NEXT: s_mov_b64 exec, s[4:5] 37521; GCN-NEXT: s_waitcnt expcnt(0) 37522; GCN-NEXT: v_writelane_b32 v31, s30, 0 37523; GCN-NEXT: v_writelane_b32 v31, s31, 1 37524; GCN-NEXT: v_writelane_b32 v31, s34, 2 37525; GCN-NEXT: v_writelane_b32 v31, s35, 3 37526; GCN-NEXT: v_and_b32_e32 v0, 1, v0 37527; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37528; GCN-NEXT: v_and_b32_e32 v0, 1, v1 37529; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 37530; GCN-NEXT: v_and_b32_e32 v0, 1, v2 37531; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 37532; GCN-NEXT: 
v_and_b32_e32 v0, 1, v3 37533; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 37534; GCN-NEXT: v_and_b32_e32 v0, 1, v4 37535; GCN-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 37536; GCN-NEXT: v_and_b32_e32 v0, 1, v5 37537; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 37538; GCN-NEXT: v_and_b32_e32 v0, 1, v6 37539; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 37540; GCN-NEXT: v_and_b32_e32 v0, 1, v7 37541; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 37542; GCN-NEXT: v_and_b32_e32 v0, 1, v8 37543; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 37544; GCN-NEXT: v_and_b32_e32 v0, 1, v9 37545; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 37546; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 37547; GCN-NEXT: v_and_b32_e32 v1, 1, v10 37548; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1 37549; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 37550; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 37551; GCN-NEXT: v_and_b32_e32 v2, 1, v11 37552; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2 37553; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 37554; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18 37555; GCN-NEXT: v_and_b32_e32 v3, 1, v12 37556; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3 37557; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 37558; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19 37559; GCN-NEXT: v_and_b32_e32 v7, 1, v13 37560; GCN-NEXT: v_and_b32_e32 v8, 1, v14 37561; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7 37562; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 37563; GCN-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v8 37564; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 37565; GCN-NEXT: v_and_b32_e32 v9, 1, v15 37566; GCN-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v9 37567; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 37568; GCN-NEXT: s_waitcnt vmcnt(2) 37569; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 37570; GCN-NEXT: s_waitcnt vmcnt(1) 37571; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 37572; GCN-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[34:35] 37573; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 
s32 offset:56 37574; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 37575; GCN-NEXT: s_waitcnt vmcnt(1) 37576; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 37577; GCN-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[30:31] 37578; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 37579; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v29 37580; GCN-NEXT: s_waitcnt vmcnt(1) 37581; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 37582; GCN-NEXT: v_cndmask_b32_e64 v13, v7, v9, s[28:29] 37583; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 37584; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v28 37585; GCN-NEXT: s_waitcnt vmcnt(1) 37586; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 37587; GCN-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[26:27] 37588; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 37589; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v27 37590; GCN-NEXT: s_waitcnt vmcnt(1) 37591; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 37592; GCN-NEXT: v_cndmask_b32_e64 v11, v7, v9, s[24:25] 37593; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 37594; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26 37595; GCN-NEXT: s_waitcnt vmcnt(1) 37596; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 37597; GCN-NEXT: v_cndmask_b32_e64 v10, v8, v9, s[22:23] 37598; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 37599; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v25 37600; GCN-NEXT: s_waitcnt vmcnt(1) 37601; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 37602; GCN-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[20:21] 37603; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 37604; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v24 37605; GCN-NEXT: s_waitcnt vmcnt(1) 37606; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 37607; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[18:19] 37608; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 37609; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v23 37610; GCN-NEXT: s_waitcnt vmcnt(1) 37611; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 37612; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[16:17] 37613; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 37614; 
GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 37615; GCN-NEXT: s_waitcnt vmcnt(1) 37616; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 37617; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v18, s[14:15] 37618; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 37619; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 37620; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 37621; GCN-NEXT: s_waitcnt vmcnt(1) 37622; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 37623; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13] 37624; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 37625; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 37626; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 37627; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 37628; GCN-NEXT: s_waitcnt vmcnt(1) 37629; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 37630; GCN-NEXT: s_waitcnt vmcnt(0) 37631; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 37632; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11] 37633; GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9] 37634; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] 37635; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] 37636; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 37637; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 37638; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 37639; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 37640; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 37641; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 37642; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 37643; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 37644; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 37645; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 37646; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 37647; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 37648; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 37649; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 37650; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 37651; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 37652; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 37653; GCN-NEXT: v_readlane_b32 s35, v31, 3 37654; GCN-NEXT: v_readlane_b32 
s34, v31, 2 37655; GCN-NEXT: v_readlane_b32 s31, v31, 1 37656; GCN-NEXT: v_readlane_b32 s30, v31, 0 37657; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 37658; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload 37659; GCN-NEXT: s_mov_b64 exec, s[4:5] 37660; GCN-NEXT: s_waitcnt vmcnt(0) 37661; GCN-NEXT: s_setpc_b64 s[30:31] 37662; 37663; GFX7-LABEL: v_vselect_v16bf16: 37664; GFX7: ; %bb.0: 37665; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37666; GFX7-NEXT: v_and_b32_e32 v8, 1, v8 37667; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 37668; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v8 37669; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v7 37670; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 37671; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 37672; GFX7-NEXT: v_and_b32_e32 v15, 1, v15 37673; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v15 37674; GFX7-NEXT: v_and_b32_e32 v14, 1, v14 37675; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v14 37676; GFX7-NEXT: v_and_b32_e32 v13, 1, v13 37677; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v13 37678; GFX7-NEXT: v_and_b32_e32 v12, 1, v12 37679; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v12 37680; GFX7-NEXT: v_and_b32_e32 v11, 1, v11 37681; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 37682; GFX7-NEXT: v_and_b32_e32 v10, 1, v10 37683; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 37684; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 37685; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 37686; GFX7-NEXT: v_and_b32_e32 v9, 1, v9 37687; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v9 37688; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 37689; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 37690; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 37691; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 37692; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 37693; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 37694; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 37695; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 37696; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 37697; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 37698; GFX7-NEXT: s_waitcnt 
vmcnt(1) 37699; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 37700; GFX7-NEXT: s_waitcnt vmcnt(0) 37701; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37702; GFX7-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[12:13] 37703; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 37704; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30 37705; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 37706; GFX7-NEXT: s_waitcnt vmcnt(0) 37707; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37708; GFX7-NEXT: v_cndmask_b32_e64 v14, v8, v7, s[10:11] 37709; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 37710; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v29 37711; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 37712; GFX7-NEXT: s_waitcnt vmcnt(0) 37713; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37714; GFX7-NEXT: v_cndmask_b32_e64 v13, v8, v7, s[8:9] 37715; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 37716; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v28 37717; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 37718; GFX7-NEXT: s_waitcnt vmcnt(0) 37719; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37720; GFX7-NEXT: v_cndmask_b32_e64 v12, v8, v7, s[6:7] 37721; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 37722; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v27 37723; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 37724; GFX7-NEXT: s_waitcnt vmcnt(0) 37725; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37726; GFX7-NEXT: v_cndmask_b32_e64 v11, v8, v7, s[4:5] 37727; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 37728; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v26 37729; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 37730; GFX7-NEXT: s_waitcnt vmcnt(0) 37731; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37732; GFX7-NEXT: v_cndmask_b32_e32 v10, v8, v7, vcc 37733; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 37734; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22 37735; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 37736; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 37737; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v25 37738; GFX7-NEXT: 
v_and_b32_e32 v10, 0xffff0000, v10 37739; GFX7-NEXT: s_waitcnt vmcnt(1) 37740; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 37741; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc 37742; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 37743; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21 37744; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 37745; GFX7-NEXT: s_waitcnt vmcnt(1) 37746; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37747; GFX7-NEXT: v_cndmask_b32_e64 v9, v8, v7, s[18:19] 37748; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 37749; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v24 37750; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 37751; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 37752; GFX7-NEXT: s_waitcnt vmcnt(1) 37753; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 37754; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc 37755; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 37756; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 37757; GFX7-NEXT: s_waitcnt vmcnt(1) 37758; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 37759; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[16:17] 37760; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v23 37761; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 37762; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 37763; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 37764; GFX7-NEXT: s_waitcnt vmcnt(1) 37765; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 37766; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc 37767; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 37768; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37769; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 37770; GFX7-NEXT: s_waitcnt vmcnt(2) 37771; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 37772; GFX7-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[14:15] 37773; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 37774; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 37775; GFX7-NEXT: s_waitcnt vmcnt(1) 37776; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 37777; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc 37778; GFX7-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:4 37779; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37780; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 37781; GFX7-NEXT: s_waitcnt vmcnt(2) 37782; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 37783; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc 37784; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37785; GFX7-NEXT: s_waitcnt vmcnt(1) 37786; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v20 37787; GFX7-NEXT: s_waitcnt vmcnt(0) 37788; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 37789; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc 37790; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37791; GFX7-NEXT: v_cndmask_b32_e32 v0, v18, v16, vcc 37792; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 37793; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 37794; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 37795; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 37796; GFX7-NEXT: s_setpc_b64 s[30:31] 37797; 37798; GFX8-LABEL: v_vselect_v16bf16: 37799; GFX8: ; %bb.0: 37800; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37801; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 37802; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill 37803; GFX8-NEXT: s_mov_b64 exec, s[4:5] 37804; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 37805; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37806; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 37807; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 37808; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 37809; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 37810; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 37811; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 37812; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 37813; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 37814; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 37815; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 37816; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 37817; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 37818; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 37819; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 37820; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 37821; GFX8-NEXT: 
v_cmp_eq_u32_e64 s[18:19], 1, v0 37822; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 37823; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 37824; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 37825; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 37826; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 37827; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 37828; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 37829; GFX8-NEXT: v_writelane_b32 v31, s30, 0 37830; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 37831; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 37832; GFX8-NEXT: v_writelane_b32 v31, s31, 1 37833; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 37834; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 37835; GFX8-NEXT: v_writelane_b32 v31, s34, 2 37836; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 37837; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 37838; GFX8-NEXT: v_writelane_b32 v31, s35, 3 37839; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 37840; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22 37841; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30 37842; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29] 37843; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21 37844; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29 37845; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25] 37846; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20 37847; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28 37848; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21] 37849; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 37850; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23 37851; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24 37852; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27] 37853; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 37854; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23] 37855; GFX8-NEXT: v_cndmask_b32_e64 v9, v28, v20, s[18:19] 37856; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15] 37857; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11] 37858; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7] 37859; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 37860; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 37861; 
GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37862; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37863; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37864; GFX8-NEXT: s_waitcnt vmcnt(0) 37865; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[30:31] 37866; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 37867; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[34:35] 37868; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19 37869; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27 37870; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17] 37871; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18 37872; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26 37873; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13] 37874; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v17 37875; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v25 37876; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[8:9] 37877; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16 37878; GFX8-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[4:5] 37879; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v16, vcc 37880; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 37881; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 37882; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 37883; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 37884; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11 37885; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37886; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37887; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37888; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37889; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 37890; GFX8-NEXT: v_readlane_b32 s35, v31, 3 37891; GFX8-NEXT: 
v_readlane_b32 s34, v31, 2 37892; GFX8-NEXT: v_readlane_b32 s31, v31, 1 37893; GFX8-NEXT: v_readlane_b32 s30, v31, 0 37894; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 37895; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload 37896; GFX8-NEXT: s_mov_b64 exec, s[4:5] 37897; GFX8-NEXT: s_waitcnt vmcnt(0) 37898; GFX8-NEXT: s_setpc_b64 s[30:31] 37899; 37900; GFX9-LABEL: v_vselect_v16bf16: 37901; GFX9: ; %bb.0: 37902; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37903; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 37904; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 37905; GFX9-NEXT: v_and_b32_e32 v13, 1, v13 37906; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc 37907; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 37908; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 37909; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 37910; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 37911; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc 37912; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 37913; GFX9-NEXT: v_and_b32_e32 v10, 1, v11 37914; GFX9-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc 37915; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 37916; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v29 37917; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 37918; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v21, vcc 37919; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 37920; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 37921; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 37922; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 37923; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20 37924; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc 37925; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v28 37926; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 37927; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 37928; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 37929; GFX9-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc 37930; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 37931; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19 37932; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v27 37933; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 37934; 
GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc 37935; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 37936; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 37937; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc 37938; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 37939; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 37940; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 37941; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 37942; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc 37943; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 37944; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 37945; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v6, vcc 37946; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 37947; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 37948; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23 37949; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 37950; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 37951; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 37952; GFX9-NEXT: s_mov_b32 s4, 0x5040100 37953; GFX9-NEXT: s_waitcnt vmcnt(0) 37954; GFX9-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc 37955; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21 37956; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 37957; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc 37958; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 37959; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc 37960; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 37961; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 37962; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 37963; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v6, vcc 37964; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 37965; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc 37966; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16 37967; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 37968; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 37969; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc 37970; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 37971; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 37972; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 37973; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4 37974; GFX9-NEXT: v_perm_b32 v4, v8, v20, s4 37975; GFX9-NEXT: v_perm_b32 v5, v10, v11, s4 37976; GFX9-NEXT: 
v_perm_b32 v6, v13, v12, s4 37977; GFX9-NEXT: v_perm_b32 v7, v7, v14, s4 37978; GFX9-NEXT: s_setpc_b64 s[30:31] 37979; 37980; GFX10-LABEL: v_vselect_v16bf16: 37981; GFX10: ; %bb.0: 37982; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 37983; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 37984; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 37985; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 37986; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 37987; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v22 37988; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v30 37989; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 37990; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 37991; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 37992; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21 37993; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29 37994; GFX10-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc_lo 37995; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 37996; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 37997; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 37998; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20 37999; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28 38000; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo 38001; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 38002; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 38003; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 38004; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 38005; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 38006; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo 38007; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 38008; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17 38009; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v25 38010; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 38011; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 38012; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo 38013; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 38014; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v16 38015; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v24 38016; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 38017; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18 38018; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, 
vcc_lo 38019; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 38020; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v26 38021; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 38022; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v19 38023; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v27 38024; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo 38025; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 38026; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 38027; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v23 38028; GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo 38029; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 38030; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo 38031; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 38032; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo 38033; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 38034; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v51, vcc_lo 38035; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 38036; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo 38037; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 38038; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v30, vcc_lo 38039; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 38040; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 38041; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo 38042; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 38043; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 38044; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 38045; GFX10-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo 38046; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 38047; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 38048; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 38049; GFX10-NEXT: s_waitcnt vmcnt(0) 38050; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31 38051; GFX10-NEXT: v_cndmask_b32_e32 v12, v31, v23, vcc_lo 38052; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 38053; GFX10-NEXT: v_cndmask_b32_e32 v13, v3, v32, vcc_lo 38054; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 38055; GFX10-NEXT: v_perm_b32 v6, v33, v22, 0x5040100 38056; GFX10-NEXT: v_perm_b32 v7, v13, v12, 0x5040100 
38057; GFX10-NEXT: s_setpc_b64 s[30:31] 38058; 38059; GFX11TRUE16-LABEL: v_vselect_v16bf16: 38060; GFX11TRUE16: ; %bb.0: 38061; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 38062; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32 38063; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 38064; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 38065; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 38066; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 38067; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 38068; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 38069; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 38070; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20 38071; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 38072; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 38073; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 38074; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 38075; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 38076; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 38077; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 38078; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 38079; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 38080; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 38081; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 38082; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 38083; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 38084; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 38085; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 38086; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24 38087; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 38088; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 38089; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 38090; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 38091; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 38092; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v28.l, v20.l, s8 38093; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v38.l, v37.l, s7 38094; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 38095; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 38096; 
GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30 38097; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 38098; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 38099; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 38100; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26 38101; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 38102; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25 38103; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 38104; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 38105; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 38106; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 38107; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12 38108; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 38109; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10 38110; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 38111; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 38112; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v27.l, v19.l, s6 38113; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v48.l, v39.l, s5 38114; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v54.l, v53.l, vcc_lo 38115; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v24.l, v16.l, s0 38116; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.h 38117; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l 38118; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v30.l, v22.l, s10 38119; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v33.l, s11 38120; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v29.l, v21.l, s12 38121; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v36.l, v35.l, s9 38122; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v52.l, v51.l, s1 38123; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v25.l, v17.l, s2 38124; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v50.l, v49.l, s3 38125; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h 38126; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l 38127; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h 38128; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l 38129; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v26.l, v18.l, s4 38130; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h 38131; GFX11TRUE16-NEXT: 
v_mov_b16_e32 v9.l, v6.l 38132; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h 38133; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h 38134; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v1.l 38135; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v0.h 38136; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l 38137; GFX11TRUE16-NEXT: v_perm_b32 v0, v7, v8, 0x5040100 38138; GFX11TRUE16-NEXT: v_perm_b32 v1, v5, v9, 0x5040100 38139; GFX11TRUE16-NEXT: v_perm_b32 v5, v14, v15, 0x5040100 38140; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) 38141; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v31 38142; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v31.l, v23.l, s14 38143; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 38144; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, v32.l, s13 38145; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l 38146; GFX11TRUE16-NEXT: v_perm_b32 v2, v6, v4, 0x5040100 38147; GFX11TRUE16-NEXT: v_perm_b32 v4, v12, v13, 0x5040100 38148; GFX11TRUE16-NEXT: v_perm_b32 v6, v16, v17, 0x5040100 38149; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h 38150; GFX11TRUE16-NEXT: v_perm_b32 v3, v10, v11, 0x5040100 38151; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 38152; GFX11TRUE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 38153; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 38154; 38155; GFX11FAKE16-LABEL: v_vselect_v16bf16: 38156; GFX11FAKE16: ; %bb.0: 38157; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 38158; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32 38159; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 38160; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 38161; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12 38162; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13 38163; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 38164; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30 38165; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 38166; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 38167; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 38168; GFX11FAKE16-NEXT: 
v_lshrrev_b32_e32 v50, 16, v26 38169; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10 38170; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11 38171; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 38172; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 38173; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 38174; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 38175; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 38176; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo 38177; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 38178; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 38179; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24 38180; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8 38181; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9 38182; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo 38183; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 38184; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 38185; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20 38186; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 38187; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 38188; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo 38189; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 38190; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5 38191; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25 38192; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 38193; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 38194; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo 38195; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 38196; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 38197; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15 38198; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo 38199; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 38200; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo 38201; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 38202; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7 38203; 
GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 38204; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo 38205; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 38206; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo 38207; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 38208; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo 38209; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 38210; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo 38211; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 38212; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) 38213; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 38214; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo 38215; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 38216; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 38217; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 38218; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo 38219; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 38220; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 38221; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) 38222; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v31 38223; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 38224; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 38225; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 38226; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo 38227; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 38228; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo 38229; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 38230; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 38231; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 38232; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 38233; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 38234 %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b 38235 ret <16 x bfloat> %op 38236} 38237 38238define <32 x bfloat> @v_vselect_v32bf16(<32 x 
i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) { 38239; GCN-LABEL: v_vselect_v32bf16: 38240; GCN: ; %bb.0: 38241; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 38242; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill 38243; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill 38244; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill 38245; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill 38246; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill 38247; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill 38248; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill 38249; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill 38250; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill 38251; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill 38252; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill 38253; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill 38254; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill 38255; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill 38256; GCN-NEXT: v_and_b32_e32 v0, 1, v0 38257; GCN-NEXT: v_and_b32_e32 v1, 1, v1 38258; GCN-NEXT: v_and_b32_e32 v2, 1, v2 38259; GCN-NEXT: v_and_b32_e32 v36, 1, v13 38260; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 38261; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 38262; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 38263; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 38264; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 38265; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:188 38266; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 38267; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 38268; GCN-NEXT: v_and_b32_e32 v53, 1, v26 38269; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 38270; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 38271; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 38272; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 38273; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 38274; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 38275; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 38276; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 38277; GCN-NEXT: v_and_b32_e32 v27, 1, v27 38278; GCN-NEXT: v_and_b32_e32 v28, 1, v28 38279; GCN-NEXT: v_and_b32_e32 v29, 1, v29 38280; GCN-NEXT: v_and_b32_e32 v30, 1, v30 38281; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 38282; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 38283; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 38284; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 38285; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:252 38286; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:248 38287; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:244 38288; GCN-NEXT: s_waitcnt expcnt(6) 38289; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:240 38290; GCN-NEXT: s_waitcnt vmcnt(14) 38291; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v37 38292; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 38293; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36 38294; GCN-NEXT: s_waitcnt vmcnt(5) 38295; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43 38296; GCN-NEXT: s_waitcnt vmcnt(3) 38297; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v44 38298; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30 38299; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5] 38300; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236 38301; GCN-NEXT: 
s_waitcnt expcnt(5) 38302; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:232 38303; GCN-NEXT: s_waitcnt expcnt(4) 38304; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:228 38305; GCN-NEXT: s_waitcnt expcnt(3) 38306; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:224 38307; GCN-NEXT: s_waitcnt expcnt(2) 38308; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:220 38309; GCN-NEXT: s_waitcnt expcnt(1) 38310; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:216 38311; GCN-NEXT: s_waitcnt expcnt(0) 38312; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212 38313; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 38314; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 38315; GCN-NEXT: s_waitcnt vmcnt(10) 38316; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45 38317; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 38318; GCN-NEXT: s_waitcnt vmcnt(9) 38319; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46 38320; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 38321; GCN-NEXT: s_waitcnt vmcnt(8) 38322; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47 38323; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 38324; GCN-NEXT: s_waitcnt vmcnt(7) 38325; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 38326; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29 38327; GCN-NEXT: v_cndmask_b32_e64 v29, v43, v42, s[4:5] 38328; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28 38329; GCN-NEXT: v_cndmask_b32_e64 v28, v44, v41, s[4:5] 38330; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27 38331; GCN-NEXT: v_cndmask_b32_e64 v27, v45, v55, s[4:5] 38332; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v53 38333; GCN-NEXT: v_cndmask_b32_e64 v36, v36, v54, s[4:5] 38334; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 38335; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 38336; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 38337; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 38338; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 38339; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 
s32 offset:140 38340; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 38341; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 38342; GCN-NEXT: v_and_b32_e32 v3, 1, v3 38343; GCN-NEXT: v_and_b32_e32 v4, 1, v4 38344; GCN-NEXT: v_and_b32_e32 v5, 1, v5 38345; GCN-NEXT: v_and_b32_e32 v6, 1, v6 38346; GCN-NEXT: v_and_b32_e32 v18, 1, v18 38347; GCN-NEXT: v_and_b32_e32 v22, 1, v22 38348; GCN-NEXT: v_and_b32_e32 v23, 1, v23 38349; GCN-NEXT: v_and_b32_e32 v24, 1, v24 38350; GCN-NEXT: v_and_b32_e32 v25, 1, v25 38351; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 38352; GCN-NEXT: s_waitcnt vmcnt(14) 38353; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v56 38354; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 38355; GCN-NEXT: s_waitcnt vmcnt(13) 38356; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v57 38357; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 38358; GCN-NEXT: s_waitcnt vmcnt(12) 38359; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v58 38360; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 38361; GCN-NEXT: s_waitcnt vmcnt(11) 38362; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v59 38363; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25 38364; GCN-NEXT: v_cndmask_b32_e64 v25, v46, v52, s[4:5] 38365; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24 38366; GCN-NEXT: v_cndmask_b32_e64 v24, v47, v51, s[4:5] 38367; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23 38368; GCN-NEXT: v_cndmask_b32_e64 v23, v56, v50, s[4:5] 38369; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22 38370; GCN-NEXT: v_cndmask_b32_e64 v22, v57, v49, s[4:5] 38371; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 38372; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196 38373; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 38374; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 38375; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 38376; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 38377; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 38378; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:208 38379; GCN-NEXT: v_and_b32_e32 v19, 1, v19 38380; GCN-NEXT: v_and_b32_e32 v20, 1, v20 38381; GCN-NEXT: v_and_b32_e32 v21, 1, v21 38382; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 38383; GCN-NEXT: s_waitcnt vmcnt(14) 38384; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60 38385; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 38386; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v61 38387; GCN-NEXT: s_waitcnt vmcnt(3) 38388; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 38389; GCN-NEXT: s_waitcnt vmcnt(2) 38390; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 38391; GCN-NEXT: s_waitcnt vmcnt(1) 38392; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 38393; GCN-NEXT: s_waitcnt vmcnt(0) 38394; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 38395; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21 38396; GCN-NEXT: v_cndmask_b32_e64 v21, v58, v48, s[4:5] 38397; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 38398; GCN-NEXT: v_cndmask_b32_e64 v20, v59, v39, s[4:5] 38399; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19 38400; GCN-NEXT: v_cndmask_b32_e64 v19, v57, v56, s[4:5] 38401; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 38402; GCN-NEXT: v_cndmask_b32_e64 v18, v47, v46, s[4:5] 38403; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 38404; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:148 38405; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 38406; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 38407; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 38408; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 38409; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 38410; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 38411; GCN-NEXT: v_and_b32_e32 v7, 1, v7 38412; GCN-NEXT: v_and_b32_e32 v8, 1, v8 38413; GCN-NEXT: v_and_b32_e32 v9, 1, v9 38414; GCN-NEXT: v_and_b32_e32 v10, 1, v10 38415; GCN-NEXT: v_and_b32_e32 v14, 1, v14 38416; GCN-NEXT: v_and_b32_e32 v15, 1, v15 38417; GCN-NEXT: v_and_b32_e32 v16, 1, v16 38418; GCN-NEXT: v_and_b32_e32 v17, 1, v17 38419; GCN-NEXT: 
v_mul_f32_e32 v32, 1.0, v32 38420; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 38421; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 38422; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 38423; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 38424; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 38425; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 38426; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 38427; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17 38428; GCN-NEXT: v_cndmask_b32_e64 v17, v52, v51, s[4:5] 38429; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 38430; GCN-NEXT: v_cndmask_b32_e64 v16, v50, v49, s[4:5] 38431; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15 38432; GCN-NEXT: v_cndmask_b32_e64 v15, v35, v34, s[4:5] 38433; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 38434; GCN-NEXT: v_cndmask_b32_e64 v14, v33, v32, s[4:5] 38435; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 38436; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 38437; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 38438; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 38439; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 38440; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172 38441; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 38442; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 38443; GCN-NEXT: v_and_b32_e32 v11, 1, v11 38444; GCN-NEXT: v_and_b32_e32 v12, 1, v12 38445; GCN-NEXT: v_cndmask_b32_e32 v38, v38, v40, vcc 38446; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256 38447; GCN-NEXT: v_and_b32_e32 v26, 1, v26 38448; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 38449; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 38450; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 38451; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 38452; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 38453; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 38454; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44 38455; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45 38456; GCN-NEXT: s_waitcnt vmcnt(14) 38457; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 
38458; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 38459; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 38460; GCN-NEXT: s_waitcnt vmcnt(13) 38461; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 38462; GCN-NEXT: s_waitcnt vmcnt(12) 38463; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 38464; GCN-NEXT: s_waitcnt vmcnt(11) 38465; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 38466; GCN-NEXT: s_waitcnt vmcnt(10) 38467; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58 38468; GCN-NEXT: s_waitcnt vmcnt(9) 38469; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59 38470; GCN-NEXT: s_waitcnt vmcnt(8) 38471; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 38472; GCN-NEXT: s_waitcnt vmcnt(7) 38473; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 38474; GCN-NEXT: s_waitcnt vmcnt(6) 38475; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 38476; GCN-NEXT: s_waitcnt vmcnt(5) 38477; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 38478; GCN-NEXT: s_waitcnt vmcnt(4) 38479; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 38480; GCN-NEXT: s_waitcnt vmcnt(3) 38481; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 38482; GCN-NEXT: s_waitcnt vmcnt(2) 38483; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 38484; GCN-NEXT: s_waitcnt vmcnt(1) 38485; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 38486; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 38487; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 38488; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 38489; GCN-NEXT: s_waitcnt vmcnt(0) 38490; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 38491; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 38492; GCN-NEXT: v_cndmask_b32_e32 v12, v31, v13, vcc 38493; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 38494; GCN-NEXT: v_cndmask_b32_e32 v11, v52, v51, vcc 38495; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 38496; GCN-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc 38497; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 38498; GCN-NEXT: v_cndmask_b32_e32 v9, v35, v34, vcc 38499; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 38500; GCN-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc 38501; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 38502; GCN-NEXT: v_cndmask_b32_e32 v7, v59, v58, vcc 38503; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, 
v6 38504; GCN-NEXT: v_cndmask_b32_e32 v6, v57, v56, vcc 38505; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 38506; GCN-NEXT: v_cndmask_b32_e32 v5, v47, v46, vcc 38507; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 38508; GCN-NEXT: v_cndmask_b32_e32 v4, v48, v39, vcc 38509; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 38510; GCN-NEXT: v_cndmask_b32_e32 v3, v45, v44, vcc 38511; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 38512; GCN-NEXT: v_cndmask_b32_e32 v2, v43, v42, vcc 38513; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 38514; GCN-NEXT: v_cndmask_b32_e32 v1, v41, v55, vcc 38515; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 38516; GCN-NEXT: v_cndmask_b32_e32 v0, v54, v53, vcc 38517; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26 38518; GCN-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc 38519; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 38520; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 38521; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 38522; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 38523; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 38524; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 38525; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 38526; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 38527; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 38528; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 38529; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 38530; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 38531; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 38532; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 38533; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 38534; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 38535; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 38536; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 38537; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 38538; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 38539; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 38540; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 38541; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 38542; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 38543; GCN-NEXT: 
v_and_b32_e32 v24, 0xffff0000, v24 38544; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 38545; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 38546; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 38547; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 38548; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 38549; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 38550; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 38551; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload 38552; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload 38553; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload 38554; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload 38555; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload 38556; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload 38557; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload 38558; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload 38559; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload 38560; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload 38561; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload 38562; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload 38563; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload 38564; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload 38565; GCN-NEXT: s_waitcnt vmcnt(0) 38566; GCN-NEXT: s_setpc_b64 s[30:31] 38567; 38568; GFX7-LABEL: v_vselect_v32bf16: 38569; GFX7: ; %bb.0: 38570; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 38571; GFX7-NEXT: v_and_b32_e32 v24, 1, v24 38572; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24 38573; GFX7-NEXT: buffer_load_dword 
v24, off, s[0:3], s32 38574; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 38575; GFX7-NEXT: v_and_b32_e32 v25, 1, v25 38576; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v25 38577; GFX7-NEXT: v_and_b32_e32 v30, 1, v30 38578; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v30 38579; GFX7-NEXT: v_and_b32_e32 v29, 1, v29 38580; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v29 38581; GFX7-NEXT: v_and_b32_e32 v28, 1, v28 38582; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v28 38583; GFX7-NEXT: v_and_b32_e32 v27, 1, v27 38584; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v27 38585; GFX7-NEXT: v_and_b32_e32 v26, 1, v26 38586; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v26 38587; GFX7-NEXT: v_and_b32_e32 v23, 1, v23 38588; GFX7-NEXT: v_and_b32_e32 v22, 1, v22 38589; GFX7-NEXT: v_and_b32_e32 v21, 1, v21 38590; GFX7-NEXT: v_and_b32_e32 v20, 1, v20 38591; GFX7-NEXT: v_and_b32_e32 v19, 1, v19 38592; GFX7-NEXT: v_and_b32_e32 v18, 1, v18 38593; GFX7-NEXT: v_and_b32_e32 v17, 1, v17 38594; GFX7-NEXT: v_and_b32_e32 v16, 1, v16 38595; GFX7-NEXT: v_and_b32_e32 v15, 1, v15 38596; GFX7-NEXT: v_and_b32_e32 v14, 1, v14 38597; GFX7-NEXT: v_and_b32_e32 v13, 1, v13 38598; GFX7-NEXT: v_and_b32_e32 v12, 1, v12 38599; GFX7-NEXT: v_and_b32_e32 v11, 1, v11 38600; GFX7-NEXT: v_and_b32_e32 v10, 1, v10 38601; GFX7-NEXT: v_and_b32_e32 v9, 1, v9 38602; GFX7-NEXT: v_and_b32_e32 v8, 1, v8 38603; GFX7-NEXT: v_and_b32_e32 v7, 1, v7 38604; GFX7-NEXT: v_and_b32_e32 v6, 1, v6 38605; GFX7-NEXT: v_and_b32_e32 v5, 1, v5 38606; GFX7-NEXT: v_and_b32_e32 v4, 1, v4 38607; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 38608; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 38609; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 38610; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 38611; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 38612; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 38613; GFX7-NEXT: s_waitcnt vmcnt(3) 38614; GFX7-NEXT: v_and_b32_e32 v24, 1, v24 38615; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24 38616; GFX7-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:124 38617; GFX7-NEXT: s_waitcnt vmcnt(3) 38618; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38619; GFX7-NEXT: s_waitcnt vmcnt(2) 38620; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 38621; GFX7-NEXT: s_waitcnt vmcnt(1) 38622; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 38623; GFX7-NEXT: s_waitcnt vmcnt(0) 38624; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 38625; GFX7-NEXT: v_cndmask_b32_e64 v30, v25, v24, s[12:13] 38626; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:120 38627; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 38628; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 38629; GFX7-NEXT: s_waitcnt vmcnt(1) 38630; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 38631; GFX7-NEXT: s_waitcnt vmcnt(0) 38632; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 38633; GFX7-NEXT: v_cndmask_b32_e64 v29, v25, v24, s[14:15] 38634; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 38635; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:244 38636; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 38637; GFX7-NEXT: s_waitcnt vmcnt(1) 38638; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 38639; GFX7-NEXT: s_waitcnt vmcnt(0) 38640; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 38641; GFX7-NEXT: v_cndmask_b32_e64 v28, v25, v24, s[16:17] 38642; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 38643; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 38644; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 38645; GFX7-NEXT: s_waitcnt vmcnt(1) 38646; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 38647; GFX7-NEXT: s_waitcnt vmcnt(0) 38648; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 38649; GFX7-NEXT: v_cndmask_b32_e64 v27, v25, v24, s[10:11] 38650; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 38651; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:236 38652; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 38653; GFX7-NEXT: s_waitcnt vmcnt(1) 38654; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 38655; GFX7-NEXT: s_waitcnt 
vmcnt(0) 38656; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 38657; GFX7-NEXT: v_cndmask_b32_e64 v26, v25, v24, s[8:9] 38658; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104 38659; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232 38660; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 38661; GFX7-NEXT: s_waitcnt vmcnt(1) 38662; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 38663; GFX7-NEXT: s_waitcnt vmcnt(0) 38664; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 38665; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v24, s[6:7] 38666; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 38667; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 38668; GFX7-NEXT: s_waitcnt vmcnt(0) 38669; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 38670; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v24, s[4:5] 38671; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 38672; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 38673; GFX7-NEXT: s_waitcnt vmcnt(0) 38674; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 38675; GFX7-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc 38676; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 38677; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 38678; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 38679; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 38680; GFX7-NEXT: s_waitcnt vmcnt(1) 38681; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 38682; GFX7-NEXT: s_waitcnt vmcnt(0) 38683; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38684; GFX7-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc 38685; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 38686; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 38687; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 38688; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 38689; GFX7-NEXT: s_waitcnt vmcnt(1) 38690; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 38691; GFX7-NEXT: s_waitcnt vmcnt(0) 38692; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38693; GFX7-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc 38694; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v21 38695; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 38696; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 38697; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 38698; GFX7-NEXT: s_waitcnt vmcnt(1) 38699; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 38700; GFX7-NEXT: s_waitcnt vmcnt(0) 38701; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38702; GFX7-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc 38703; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 38704; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 38705; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 38706; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 38707; GFX7-NEXT: s_waitcnt vmcnt(1) 38708; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 38709; GFX7-NEXT: s_waitcnt vmcnt(0) 38710; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38711; GFX7-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc 38712; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 38713; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 38714; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 38715; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 38716; GFX7-NEXT: s_waitcnt vmcnt(1) 38717; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 38718; GFX7-NEXT: s_waitcnt vmcnt(0) 38719; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38720; GFX7-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc 38721; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 38722; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 38723; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 38724; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 38725; GFX7-NEXT: s_waitcnt vmcnt(1) 38726; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 38727; GFX7-NEXT: s_waitcnt vmcnt(0) 38728; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38729; GFX7-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc 38730; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 38731; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 38732; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 
38733; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 38734; GFX7-NEXT: s_waitcnt vmcnt(1) 38735; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 38736; GFX7-NEXT: s_waitcnt vmcnt(0) 38737; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38738; GFX7-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc 38739; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 38740; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 38741; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196 38742; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 38743; GFX7-NEXT: s_waitcnt vmcnt(1) 38744; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 38745; GFX7-NEXT: s_waitcnt vmcnt(0) 38746; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38747; GFX7-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc 38748; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 38749; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 38750; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 38751; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 38752; GFX7-NEXT: s_waitcnt vmcnt(1) 38753; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 38754; GFX7-NEXT: s_waitcnt vmcnt(0) 38755; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38756; GFX7-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc 38757; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 38758; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 38759; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188 38760; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 38761; GFX7-NEXT: s_waitcnt vmcnt(1) 38762; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 38763; GFX7-NEXT: s_waitcnt vmcnt(0) 38764; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38765; GFX7-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc 38766; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 38767; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 38768; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 38769; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 38770; GFX7-NEXT: s_waitcnt vmcnt(1) 38771; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 38772; GFX7-NEXT: s_waitcnt 
vmcnt(0) 38773; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38774; GFX7-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc 38775; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 38776; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 38777; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:180 38778; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 38779; GFX7-NEXT: s_waitcnt vmcnt(1) 38780; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 38781; GFX7-NEXT: s_waitcnt vmcnt(0) 38782; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38783; GFX7-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc 38784; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 38785; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 38786; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 38787; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 38788; GFX7-NEXT: s_waitcnt vmcnt(1) 38789; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 38790; GFX7-NEXT: s_waitcnt vmcnt(0) 38791; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38792; GFX7-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc 38793; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 38794; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 38795; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 38796; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 38797; GFX7-NEXT: s_waitcnt vmcnt(1) 38798; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 38799; GFX7-NEXT: s_waitcnt vmcnt(0) 38800; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38801; GFX7-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc 38802; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 38803; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 38804; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168 38805; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 38806; GFX7-NEXT: s_waitcnt vmcnt(1) 38807; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 38808; GFX7-NEXT: s_waitcnt vmcnt(0) 38809; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38810; GFX7-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc 38811; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 38812; GFX7-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:36 38813; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 38814; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 38815; GFX7-NEXT: s_waitcnt vmcnt(1) 38816; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 38817; GFX7-NEXT: s_waitcnt vmcnt(0) 38818; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38819; GFX7-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc 38820; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 38821; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 38822; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160 38823; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 38824; GFX7-NEXT: s_waitcnt vmcnt(1) 38825; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 38826; GFX7-NEXT: s_waitcnt vmcnt(0) 38827; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38828; GFX7-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc 38829; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 38830; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 38831; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156 38832; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 38833; GFX7-NEXT: s_waitcnt vmcnt(1) 38834; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 38835; GFX7-NEXT: s_waitcnt vmcnt(0) 38836; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38837; GFX7-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc 38838; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 38839; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 38840; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152 38841; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 38842; GFX7-NEXT: s_waitcnt vmcnt(1) 38843; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 38844; GFX7-NEXT: s_waitcnt vmcnt(0) 38845; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38846; GFX7-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc 38847; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 38848; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 38849; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 38850; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 38851; GFX7-NEXT: s_waitcnt 
vmcnt(1) 38852; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 38853; GFX7-NEXT: s_waitcnt vmcnt(0) 38854; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38855; GFX7-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc 38856; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 38857; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 38858; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 38859; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 38860; GFX7-NEXT: s_waitcnt vmcnt(1) 38861; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 38862; GFX7-NEXT: s_waitcnt vmcnt(0) 38863; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38864; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc 38865; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 38866; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 38867; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 38868; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 38869; GFX7-NEXT: s_waitcnt vmcnt(1) 38870; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 38871; GFX7-NEXT: s_waitcnt vmcnt(0) 38872; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38873; GFX7-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc 38874; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 38875; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 38876; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 38877; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 38878; GFX7-NEXT: s_waitcnt vmcnt(1) 38879; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 38880; GFX7-NEXT: s_waitcnt vmcnt(0) 38881; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38882; GFX7-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc 38883; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 38884; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 38885; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 38886; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 38887; GFX7-NEXT: s_waitcnt vmcnt(1) 38888; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 38889; GFX7-NEXT: s_waitcnt vmcnt(0) 38890; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 38891; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc 38892; 
GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 38893; GFX7-NEXT: s_setpc_b64 s[30:31] 38894; 38895; GFX8-LABEL: v_vselect_v32bf16: 38896; GFX8: ; %bb.0: 38897; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 38898; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 38899; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill 38900; GFX8-NEXT: s_mov_b64 exec, s[4:5] 38901; GFX8-NEXT: v_writelane_b32 v34, s30, 0 38902; GFX8-NEXT: v_writelane_b32 v34, s31, 1 38903; GFX8-NEXT: v_writelane_b32 v34, s34, 2 38904; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 38905; GFX8-NEXT: v_writelane_b32 v34, s35, 3 38906; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 38907; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 38908; GFX8-NEXT: v_writelane_b32 v34, s36, 4 38909; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 38910; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 38911; GFX8-NEXT: v_writelane_b32 v34, s37, 5 38912; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 38913; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 38914; GFX8-NEXT: v_writelane_b32 v34, s38, 6 38915; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 38916; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 38917; GFX8-NEXT: v_writelane_b32 v34, s39, 7 38918; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 38919; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 38920; GFX8-NEXT: v_writelane_b32 v34, s40, 8 38921; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 38922; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 38923; GFX8-NEXT: v_writelane_b32 v34, s41, 9 38924; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 38925; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 38926; GFX8-NEXT: v_writelane_b32 v34, s42, 10 38927; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 38928; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 38929; GFX8-NEXT: v_writelane_b32 v34, s43, 11 38930; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 38931; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 38932; GFX8-NEXT: v_writelane_b32 v34, s44, 12 38933; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 38934; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 38935; GFX8-NEXT: v_writelane_b32 v34, s45, 13 
38936; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 38937; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 38938; GFX8-NEXT: v_writelane_b32 v34, s46, 14 38939; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 38940; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 38941; GFX8-NEXT: v_writelane_b32 v34, s47, 15 38942; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 38943; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 38944; GFX8-NEXT: v_writelane_b32 v34, s48, 16 38945; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 38946; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 38947; GFX8-NEXT: v_writelane_b32 v34, s49, 17 38948; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 38949; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 38950; GFX8-NEXT: v_writelane_b32 v34, s50, 18 38951; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 38952; GFX8-NEXT: v_and_b32_e32 v0, 1, v16 38953; GFX8-NEXT: v_writelane_b32 v34, s51, 19 38954; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 38955; GFX8-NEXT: v_and_b32_e32 v0, 1, v17 38956; GFX8-NEXT: v_writelane_b32 v34, s52, 20 38957; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 38958; GFX8-NEXT: v_and_b32_e32 v0, 1, v18 38959; GFX8-NEXT: v_writelane_b32 v34, s53, 21 38960; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 38961; GFX8-NEXT: v_and_b32_e32 v0, 1, v19 38962; GFX8-NEXT: v_writelane_b32 v34, s54, 22 38963; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 38964; GFX8-NEXT: v_and_b32_e32 v0, 1, v20 38965; GFX8-NEXT: v_writelane_b32 v34, s55, 23 38966; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 38967; GFX8-NEXT: v_and_b32_e32 v0, 1, v21 38968; GFX8-NEXT: v_writelane_b32 v34, s56, 24 38969; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 38970; GFX8-NEXT: v_and_b32_e32 v0, 1, v22 38971; GFX8-NEXT: v_writelane_b32 v34, s57, 25 38972; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 38973; GFX8-NEXT: v_and_b32_e32 v0, 1, v23 38974; GFX8-NEXT: v_writelane_b32 v34, s58, 26 38975; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 38976; GFX8-NEXT: v_and_b32_e32 v0, 1, v24 38977; GFX8-NEXT: v_writelane_b32 v34, s59, 27 38978; GFX8-NEXT: 
v_cmp_eq_u32_e64 s[52:53], 1, v0 38979; GFX8-NEXT: v_and_b32_e32 v0, 1, v25 38980; GFX8-NEXT: v_writelane_b32 v34, s60, 28 38981; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 38982; GFX8-NEXT: v_and_b32_e32 v0, 1, v26 38983; GFX8-NEXT: v_writelane_b32 v34, s61, 29 38984; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 38985; GFX8-NEXT: v_and_b32_e32 v0, 1, v27 38986; GFX8-NEXT: v_writelane_b32 v34, s62, 30 38987; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 38988; GFX8-NEXT: v_and_b32_e32 v0, 1, v28 38989; GFX8-NEXT: v_writelane_b32 v34, s63, 31 38990; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 38991; GFX8-NEXT: v_and_b32_e32 v0, 1, v29 38992; GFX8-NEXT: v_writelane_b32 v34, s64, 32 38993; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 38994; GFX8-NEXT: v_and_b32_e32 v0, 1, v30 38995; GFX8-NEXT: v_writelane_b32 v34, s65, 33 38996; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 38997; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 38998; GFX8-NEXT: v_writelane_b32 v34, s66, 34 38999; GFX8-NEXT: v_writelane_b32 v34, s67, 35 39000; GFX8-NEXT: s_waitcnt vmcnt(0) 39001; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 39002; GFX8-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 39003; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 39004; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 39005; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 39006; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 39007; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 39008; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 39009; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 39010; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 39011; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 39012; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 39013; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 39014; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 39015; GFX8-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:92 39016; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 39017; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 39018; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 39019; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 39020; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 39021; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 39022; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 39023; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 39024; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 39025; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 39026; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 39027; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 39028; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 39029; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 39030; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 39031; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 39032; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 39033; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:128 39034; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 39035; GFX8-NEXT: s_waitcnt vmcnt(1) 39036; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29 39037; GFX8-NEXT: s_waitcnt vmcnt(0) 39038; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v32 39039; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[66:67] 39040; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[64:65] 39041; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v31 39042; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v30 39043; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[62:63] 39044; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[60:61] 39045; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v27 39046; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26 39047; GFX8-NEXT: v_cndmask_b32_e64 v31, 
v33, v31, s[58:59] 39048; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] 39049; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25 39050; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v24 39051; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[54:55] 39052; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] 39053; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23 39054; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22 39055; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[50:51] 39056; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49] 39057; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21 39058; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20 39059; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[46:47] 39060; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] 39061; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19 39062; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18 39063; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[42:43] 39064; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] 39065; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17 39066; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 39067; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[38:39] 39068; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] 39069; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 39070; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14 39071; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[34:35] 39072; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] 39073; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13 39074; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12 39075; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29] 39076; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] 39077; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11 39078; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10 39079; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25] 39080; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] 39081; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9 39082; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8 39083; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21] 39084; GFX8-NEXT: 
v_cndmask_b32_e64 v8, v8, v9, s[18:19] 39085; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 39086; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6 39087; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17] 39088; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] 39089; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 39090; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4 39091; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13] 39092; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] 39093; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 39094; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2 39095; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9] 39096; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] 39097; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1 39098; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0 39099; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5] 39100; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 39101; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 39102; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39103; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 39104; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39105; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 39106; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v9 39107; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39108; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39109; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11 39110; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v13 39111; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v15 39112; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17 39113; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39114; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39115; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD 39116; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39117; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19 39118; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v21 39119; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23 39120; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v25 39121; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27 39122; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31 39123; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32 39124; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v28 39125; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39126; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39127; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39128; GFX8-NEXT: v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39129; GFX8-NEXT: v_or_b32_sdwa v12, v24, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39130; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39131; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39132; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 39133; GFX8-NEXT: v_readlane_b32 s67, v34, 35 39134; GFX8-NEXT: v_readlane_b32 s66, v34, 34 39135; GFX8-NEXT: v_readlane_b32 s65, v34, 33 39136; GFX8-NEXT: v_readlane_b32 s64, v34, 32 39137; GFX8-NEXT: v_readlane_b32 s63, v34, 31 39138; GFX8-NEXT: v_readlane_b32 s62, v34, 30 39139; GFX8-NEXT: v_readlane_b32 s61, v34, 29 39140; GFX8-NEXT: v_readlane_b32 s60, v34, 28 39141; GFX8-NEXT: v_readlane_b32 s59, v34, 27 39142; GFX8-NEXT: v_readlane_b32 s58, v34, 26 39143; GFX8-NEXT: v_readlane_b32 s57, v34, 25 39144; GFX8-NEXT: v_readlane_b32 s56, v34, 24 39145; 
GFX8-NEXT: v_readlane_b32 s55, v34, 23 39146; GFX8-NEXT: v_readlane_b32 s54, v34, 22 39147; GFX8-NEXT: v_readlane_b32 s53, v34, 21 39148; GFX8-NEXT: v_readlane_b32 s52, v34, 20 39149; GFX8-NEXT: v_readlane_b32 s51, v34, 19 39150; GFX8-NEXT: v_readlane_b32 s50, v34, 18 39151; GFX8-NEXT: v_readlane_b32 s49, v34, 17 39152; GFX8-NEXT: v_readlane_b32 s48, v34, 16 39153; GFX8-NEXT: v_readlane_b32 s47, v34, 15 39154; GFX8-NEXT: v_readlane_b32 s46, v34, 14 39155; GFX8-NEXT: v_readlane_b32 s45, v34, 13 39156; GFX8-NEXT: v_readlane_b32 s44, v34, 12 39157; GFX8-NEXT: v_readlane_b32 s43, v34, 11 39158; GFX8-NEXT: v_readlane_b32 s42, v34, 10 39159; GFX8-NEXT: v_readlane_b32 s41, v34, 9 39160; GFX8-NEXT: v_readlane_b32 s40, v34, 8 39161; GFX8-NEXT: v_readlane_b32 s39, v34, 7 39162; GFX8-NEXT: v_readlane_b32 s38, v34, 6 39163; GFX8-NEXT: v_readlane_b32 s37, v34, 5 39164; GFX8-NEXT: v_readlane_b32 s36, v34, 4 39165; GFX8-NEXT: v_readlane_b32 s35, v34, 3 39166; GFX8-NEXT: v_readlane_b32 s34, v34, 2 39167; GFX8-NEXT: v_readlane_b32 s31, v34, 1 39168; GFX8-NEXT: v_readlane_b32 s30, v34, 0 39169; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 39170; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload 39171; GFX8-NEXT: s_mov_b64 exec, s[4:5] 39172; GFX8-NEXT: s_waitcnt vmcnt(0) 39173; GFX8-NEXT: s_setpc_b64 s[30:31] 39174; 39175; GFX9-LABEL: v_vselect_v32bf16: 39176; GFX9: ; %bb.0: 39177; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 39178; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 39179; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill 39180; GFX9-NEXT: s_mov_b64 exec, s[4:5] 39181; GFX9-NEXT: v_writelane_b32 v33, s30, 0 39182; GFX9-NEXT: v_writelane_b32 v33, s31, 1 39183; GFX9-NEXT: v_writelane_b32 v33, s34, 2 39184; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 39185; GFX9-NEXT: v_writelane_b32 v33, s35, 3 39186; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 39187; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 39188; GFX9-NEXT: 
v_writelane_b32 v33, s36, 4 39189; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 39190; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 39191; GFX9-NEXT: v_writelane_b32 v33, s37, 5 39192; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 39193; GFX9-NEXT: v_and_b32_e32 v0, 1, v5 39194; GFX9-NEXT: v_writelane_b32 v33, s38, 6 39195; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 39196; GFX9-NEXT: v_and_b32_e32 v0, 1, v4 39197; GFX9-NEXT: v_writelane_b32 v33, s39, 7 39198; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 39199; GFX9-NEXT: v_and_b32_e32 v0, 1, v7 39200; GFX9-NEXT: v_writelane_b32 v33, s40, 8 39201; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 39202; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 39203; GFX9-NEXT: v_writelane_b32 v33, s41, 9 39204; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 39205; GFX9-NEXT: v_and_b32_e32 v0, 1, v9 39206; GFX9-NEXT: v_writelane_b32 v33, s42, 10 39207; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 39208; GFX9-NEXT: v_and_b32_e32 v0, 1, v8 39209; GFX9-NEXT: v_writelane_b32 v33, s43, 11 39210; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 39211; GFX9-NEXT: v_and_b32_e32 v0, 1, v11 39212; GFX9-NEXT: v_writelane_b32 v33, s44, 12 39213; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 39214; GFX9-NEXT: v_and_b32_e32 v0, 1, v10 39215; GFX9-NEXT: v_writelane_b32 v33, s45, 13 39216; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 39217; GFX9-NEXT: v_and_b32_e32 v0, 1, v13 39218; GFX9-NEXT: v_writelane_b32 v33, s46, 14 39219; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 39220; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 39221; GFX9-NEXT: v_writelane_b32 v33, s47, 15 39222; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 39223; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 39224; GFX9-NEXT: v_writelane_b32 v33, s48, 16 39225; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 39226; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 39227; GFX9-NEXT: v_writelane_b32 v33, s49, 17 39228; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 39229; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 39230; GFX9-NEXT: v_writelane_b32 v33, s50, 18 39231; 
GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 39232; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 39233; GFX9-NEXT: v_writelane_b32 v33, s51, 19 39234; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 39235; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 39236; GFX9-NEXT: v_writelane_b32 v33, s52, 20 39237; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 39238; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 39239; GFX9-NEXT: v_writelane_b32 v33, s53, 21 39240; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 39241; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 39242; GFX9-NEXT: v_writelane_b32 v33, s54, 22 39243; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 39244; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 39245; GFX9-NEXT: v_writelane_b32 v33, s55, 23 39246; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 39247; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 39248; GFX9-NEXT: v_writelane_b32 v33, s56, 24 39249; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 39250; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 39251; GFX9-NEXT: v_writelane_b32 v33, s57, 25 39252; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 39253; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 39254; GFX9-NEXT: v_writelane_b32 v33, s58, 26 39255; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 39256; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 39257; GFX9-NEXT: v_writelane_b32 v33, s59, 27 39258; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 39259; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 39260; GFX9-NEXT: v_writelane_b32 v33, s60, 28 39261; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 39262; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 39263; GFX9-NEXT: v_writelane_b32 v33, s61, 29 39264; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 39265; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 39266; GFX9-NEXT: v_writelane_b32 v33, s62, 30 39267; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 39268; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 39269; GFX9-NEXT: v_writelane_b32 v33, s63, 31 39270; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 39271; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 39272; GFX9-NEXT: v_writelane_b32 v33, s64, 32 39273; GFX9-NEXT: 
v_writelane_b32 v33, s65, 33 39274; GFX9-NEXT: v_writelane_b32 v33, s66, 34 39275; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 39276; GFX9-NEXT: v_writelane_b32 v33, s67, 35 39277; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 39278; GFX9-NEXT: s_waitcnt vmcnt(0) 39279; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 39280; GFX9-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 39281; GFX9-NEXT: v_and_b32_e32 v0, 1, v30 39282; GFX9-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 39283; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 39284; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 39285; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 39286; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 39287; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 39288; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 39289; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 39290; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 39291; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 39292; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 39293; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 39294; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 39295; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 39296; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 39297; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 39298; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 39299; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 39300; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 39301; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 39302; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 39303; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 39304; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 39305; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 39306; 
GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 39307; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 39308; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 39309; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 39310; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 39311; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 39312; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 39313; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 39314; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 39315; GFX9-NEXT: s_waitcnt vmcnt(0) 39316; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[66:67] 39317; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 39318; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 39319; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[64:65] 39320; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[62:63] 39321; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 39322; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 39323; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[60:61] 39324; GFX9-NEXT: v_cndmask_b32_e64 v30, v26, v27, s[58:59] 39325; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 39326; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 39327; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] 39328; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[54:55] 39329; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 39330; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 39331; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] 39332; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[50:51] 39333; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 39334; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 39335; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[48:49] 39336; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[46:47] 39337; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 39338; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 39339; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] 39340; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, 
v19, s[42:43] 39341; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 39342; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 39343; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] 39344; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[38:39] 39345; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 39346; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 39347; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] 39348; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[34:35] 39349; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 39350; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 39351; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] 39352; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] 39353; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 39354; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 39355; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] 39356; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] 39357; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 39358; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 39359; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] 39360; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] 39361; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 39362; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 39363; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] 39364; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] 39365; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 39366; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 39367; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] 39368; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] 39369; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 39370; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 39371; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] 39372; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9] 39373; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 39374; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 39375; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] 39376; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] 39377; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 39378; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 39379; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 39380; GFX9-NEXT: s_mov_b32 s4, 0x5040100 39381; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 39382; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4 39383; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4 39384; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4 39385; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4 39386; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4 39387; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4 39388; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4 39389; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4 39390; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4 39391; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4 39392; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4 39393; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4 39394; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4 39395; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4 39396; GFX9-NEXT: v_perm_b32 v15, v31, v29, s4 39397; GFX9-NEXT: v_readlane_b32 s67, v33, 35 39398; GFX9-NEXT: v_readlane_b32 s66, v33, 34 39399; GFX9-NEXT: v_readlane_b32 s65, v33, 33 39400; GFX9-NEXT: v_readlane_b32 s64, v33, 32 39401; GFX9-NEXT: v_readlane_b32 s63, v33, 31 39402; GFX9-NEXT: v_readlane_b32 s62, v33, 30 39403; GFX9-NEXT: v_readlane_b32 s61, v33, 29 39404; GFX9-NEXT: v_readlane_b32 s60, v33, 28 39405; GFX9-NEXT: v_readlane_b32 s59, v33, 27 39406; GFX9-NEXT: v_readlane_b32 s58, v33, 26 39407; GFX9-NEXT: v_readlane_b32 s57, v33, 25 39408; GFX9-NEXT: v_readlane_b32 s56, v33, 24 39409; GFX9-NEXT: v_readlane_b32 s55, v33, 23 39410; GFX9-NEXT: v_readlane_b32 s54, v33, 22 39411; GFX9-NEXT: v_readlane_b32 s53, v33, 21 39412; GFX9-NEXT: v_readlane_b32 s52, v33, 20 39413; GFX9-NEXT: v_readlane_b32 s51, v33, 19 39414; GFX9-NEXT: v_readlane_b32 s50, v33, 18 39415; GFX9-NEXT: v_readlane_b32 s49, v33, 17 39416; GFX9-NEXT: v_readlane_b32 s48, v33, 16 39417; GFX9-NEXT: v_readlane_b32 s47, v33, 15 39418; GFX9-NEXT: v_readlane_b32 s46, v33, 14 39419; GFX9-NEXT: v_readlane_b32 s45, v33, 13 39420; GFX9-NEXT: v_readlane_b32 s44, v33, 12 39421; GFX9-NEXT: v_readlane_b32 s43, 
v33, 11 39422; GFX9-NEXT: v_readlane_b32 s42, v33, 10 39423; GFX9-NEXT: v_readlane_b32 s41, v33, 9 39424; GFX9-NEXT: v_readlane_b32 s40, v33, 8 39425; GFX9-NEXT: v_readlane_b32 s39, v33, 7 39426; GFX9-NEXT: v_readlane_b32 s38, v33, 6 39427; GFX9-NEXT: v_readlane_b32 s37, v33, 5 39428; GFX9-NEXT: v_readlane_b32 s36, v33, 4 39429; GFX9-NEXT: v_readlane_b32 s35, v33, 3 39430; GFX9-NEXT: v_readlane_b32 s34, v33, 2 39431; GFX9-NEXT: v_readlane_b32 s31, v33, 1 39432; GFX9-NEXT: v_readlane_b32 s30, v33, 0 39433; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 39434; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload 39435; GFX9-NEXT: s_mov_b64 exec, s[4:5] 39436; GFX9-NEXT: s_waitcnt vmcnt(0) 39437; GFX9-NEXT: s_setpc_b64 s[30:31] 39438; 39439; GFX10-LABEL: v_vselect_v32bf16: 39440; GFX10: ; %bb.0: 39441; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 39442; GFX10-NEXT: s_clause 0xa 39443; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 39444; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 39445; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 39446; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 39447; GFX10-NEXT: buffer_load_ushort v35, off, s[0:3], s32 39448; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 39449; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 39450; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 39451; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 39452; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 39453; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 39454; GFX10-NEXT: v_and_b32_e32 v30, 1, v30 39455; GFX10-NEXT: v_and_b32_e32 v18, 1, v18 39456; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 39457; GFX10-NEXT: v_and_b32_e32 v13, 1, v13 39458; GFX10-NEXT: v_and_b32_e32 v19, 1, v19 39459; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 39460; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v18 
39461; GFX10-NEXT: v_and_b32_e32 v28, 1, v28 39462; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v13 39463; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v19 39464; GFX10-NEXT: v_and_b32_e32 v26, 1, v26 39465; GFX10-NEXT: v_and_b32_e32 v24, 1, v24 39466; GFX10-NEXT: v_and_b32_e32 v22, 1, v22 39467; GFX10-NEXT: v_and_b32_e32 v20, 1, v20 39468; GFX10-NEXT: v_and_b32_e32 v21, 1, v21 39469; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 39470; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 39471; GFX10-NEXT: v_and_b32_e32 v17, 1, v17 39472; GFX10-NEXT: v_and_b32_e32 v15, 1, v15 39473; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 39474; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 39475; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 39476; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 39477; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 39478; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 39479; GFX10-NEXT: v_and_b32_e32 v11, 1, v11 39480; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 39481; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 39482; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 39483; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 39484; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 39485; GFX10-NEXT: s_waitcnt vmcnt(10) 39486; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v31 39487; GFX10-NEXT: s_waitcnt vmcnt(9) 39488; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v32 39489; GFX10-NEXT: s_waitcnt vmcnt(8) 39490; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v33 39491; GFX10-NEXT: s_waitcnt vmcnt(7) 39492; GFX10-NEXT: v_cndmask_b32_e64 v18, v34, v33, s6 39493; GFX10-NEXT: s_waitcnt vmcnt(6) 39494; GFX10-NEXT: v_and_b32_e32 v35, 1, v35 39495; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v12 39496; GFX10-NEXT: s_waitcnt vmcnt(4) 39497; GFX10-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc_lo 39498; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 39499; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 39500; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v35 39501; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v34 39502; GFX10-NEXT: v_cndmask_b32_e64 v12, v32, v31, s6 39503; GFX10-NEXT: s_clause 0x6 39504; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 
offset:68 39505; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 39506; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 39507; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 39508; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 39509; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 39510; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 39511; GFX10-NEXT: v_cndmask_b32_e64 v30, v50, v30, s4 39512; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc_lo 39513; GFX10-NEXT: s_clause 0x1 39514; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:124 39515; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 39516; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 39517; GFX10-NEXT: v_and_b32_e32 v28, 1, v29 39518; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s5 39519; GFX10-NEXT: s_waitcnt vmcnt(3) 39520; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v52 39521; GFX10-NEXT: s_waitcnt vmcnt(0) 39522; GFX10-NEXT: v_cndmask_b32_e32 v29, v36, v37, vcc_lo 39523; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 39524; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 39525; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 39526; GFX10-NEXT: v_cndmask_b32_e32 v28, v36, v37, vcc_lo 39527; GFX10-NEXT: s_clause 0x1 39528; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 39529; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 39530; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 39531; GFX10-NEXT: v_and_b32_e32 v26, 1, v27 39532; GFX10-NEXT: s_waitcnt vmcnt(0) 39533; GFX10-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo 39534; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 39535; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 39536; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 39537; GFX10-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo 39538; GFX10-NEXT: s_clause 0x1 39539; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 39540; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 
39541; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 39542; GFX10-NEXT: v_and_b32_e32 v24, 1, v25 39543; GFX10-NEXT: s_waitcnt vmcnt(0) 39544; GFX10-NEXT: v_cndmask_b32_e32 v25, v36, v37, vcc_lo 39545; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 39546; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 39547; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 39548; GFX10-NEXT: v_cndmask_b32_e32 v24, v36, v37, vcc_lo 39549; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 39550; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 39551; GFX10-NEXT: v_and_b32_e32 v22, 1, v23 39552; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49 39553; GFX10-NEXT: s_waitcnt vmcnt(0) 39554; GFX10-NEXT: v_cndmask_b32_e32 v23, v49, v36, vcc_lo 39555; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 39556; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 39557; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v53 39558; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v36, vcc_lo 39559; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 39560; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v48 39561; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v39 39562; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v48, vcc_lo 39563; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 39564; GFX10-NEXT: s_clause 0x1 39565; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 39566; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 39567; GFX10-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc_lo 39568; GFX10-NEXT: s_clause 0x1 39569; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 39570; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 39571; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 39572; GFX10-NEXT: s_waitcnt vmcnt(0) 39573; GFX10-NEXT: v_cndmask_b32_e32 v16, v36, v37, vcc_lo 39574; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 39575; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 39576; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 39577; GFX10-NEXT: v_cndmask_b32_e32 v14, v38, v39, vcc_lo 39578; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 
39579; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39 39580; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38 39581; GFX10-NEXT: v_cndmask_b32_e32 v17, v36, v37, vcc_lo 39582; GFX10-NEXT: s_clause 0x1 39583; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 39584; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 39585; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 39586; GFX10-NEXT: v_cndmask_b32_e32 v15, v38, v39, vcc_lo 39587; GFX10-NEXT: s_clause 0x1 39588; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 39589; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 39590; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 39591; GFX10-NEXT: s_waitcnt vmcnt(2) 39592; GFX10-NEXT: v_cndmask_b32_e32 v10, v36, v37, vcc_lo 39593; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 39594; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37 39595; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36 39596; GFX10-NEXT: s_waitcnt vmcnt(0) 39597; GFX10-NEXT: v_cndmask_b32_e32 v8, v38, v39, vcc_lo 39598; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 39599; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39 39600; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38 39601; GFX10-NEXT: v_cndmask_b32_e32 v6, v53, v48, vcc_lo 39602; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 39603; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v48 39604; GFX10-NEXT: v_cndmask_b32_e32 v4, v34, v52, vcc_lo 39605; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 39606; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v34 39607; GFX10-NEXT: v_cndmask_b32_e32 v2, v32, v33, vcc_lo 39608; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 39609; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v33 39610; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v32 39611; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v31, vcc_lo 39612; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 39613; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31 39614; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19 39615; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v37, vcc_lo 39616; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 
39617; GFX10-NEXT: v_cndmask_b32_e32 v7, v49, v48, vcc_lo 39618; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 39619; GFX10-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo 39620; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 39621; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v31, vcc_lo 39622; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 39623; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 39624; GFX10-NEXT: v_cndmask_b32_e32 v5, v34, v50, vcc_lo 39625; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 39626; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 39627; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 39628; GFX10-NEXT: v_perm_b32 v6, v30, v12, 0x5040100 39629; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 39630; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v39, vcc_lo 39631; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 39632; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 39633; GFX10-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 39634; GFX10-NEXT: v_perm_b32 v11, v22, v23, 0x5040100 39635; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 39636; GFX10-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 39637; GFX10-NEXT: v_perm_b32 v9, v13, v18, 0x5040100 39638; GFX10-NEXT: v_perm_b32 v12, v24, v25, 0x5040100 39639; GFX10-NEXT: v_perm_b32 v13, v26, v27, 0x5040100 39640; GFX10-NEXT: v_perm_b32 v14, v28, v29, 0x5040100 39641; GFX10-NEXT: v_perm_b32 v15, v35, v54, 0x5040100 39642; GFX10-NEXT: s_setpc_b64 s[30:31] 39643; 39644; GFX11TRUE16-LABEL: v_vselect_v32bf16: 39645; GFX11TRUE16: ; %bb.0: 39646; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 39647; GFX11TRUE16-NEXT: s_clause 0x1f 39648; GFX11TRUE16-NEXT: scratch_load_u16 v31, off, s32 39649; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:128 39650; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:64 39651; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124 39652; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:60 39653; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:120 39654; GFX11TRUE16-NEXT: 
scratch_load_b32 v37, off, s32 offset:56 39655; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:116 39656; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:52 39657; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:112 39658; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:48 39659; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:108 39660; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:44 39661; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:104 39662; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:40 39663; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:100 39664; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:36 39665; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:96 39666; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:32 39667; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:92 39668; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:28 39669; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:88 39670; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:24 39671; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:84 39672; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:20 39673; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:80 39674; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:16 39675; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76 39676; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:12 39677; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:72 39678; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:8 39679; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:68 39680; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4 39681; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 39682; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 39683; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22 39684; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24 39685; GFX11TRUE16-NEXT: v_and_b32_e32 
v26, 1, v26 39686; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28 39687; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30 39688; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 39689; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 39690; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 39691; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 39692; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 39693; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 39694; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 39695; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 39696; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 39697; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 39698; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 39699; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 39700; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 39701; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17 39702; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16 39703; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19 39704; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18 39705; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21 39706; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20 39707; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23 39708; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25 39709; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27 39710; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29 39711; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 39712; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 39713; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22 39714; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24 39715; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30 39716; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v26 39717; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v28 39718; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 39719; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 39720; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 39721; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 39722; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 39723; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 39724; GFX11TRUE16-NEXT: 
v_cmp_eq_u32_e64 s5, 1, v7 39725; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 39726; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 39727; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10 39728; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 39729; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12 39730; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 39731; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 39732; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17 39733; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16 39734; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19 39735; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18 39736; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21 39737; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20 39738; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23 39739; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25 39740; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27 39741; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29 39742; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 39743; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32) 39744; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v31 39745; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31) 39746; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32 39747; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30) 39748; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v33.l, s26 39749; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v33 39750; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28) 39751; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v35.l, s29 39752; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v35 39753; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v34 39754; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26) 39755; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v36.l, v37.l, s27 39756; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v37 39757; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v36 39758; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24) 39759; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v38.l, v39.l, s24 39760; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v39 39761; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 
v14, 16, v38 39762; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22) 39763; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v48.l, v49.l, s22 39764; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v49 39765; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v48 39766; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18) 39767; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v53 39768; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v52 39769; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14) 39770; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v65 39771; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v64 39772; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v50.l, v51.l, s20 39773; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11) 39774; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v68 39775; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10) 39776; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v69 39777; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9) 39778; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v70 39779; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8) 39780; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v71 39781; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7) 39782; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v80 39783; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6) 39784; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v81 39785; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5) 39786; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v82 39787; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4) 39788; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v83 39789; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3) 39790; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v84 39791; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2) 39792; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v85 39793; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1) 39794; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v86 39795; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) 39796; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v86.l, v87.l, s0 39797; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v87 39798; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v8 39799; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v51 39800; 
GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v50 39801; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v52.l, v53.l, s18 39802; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v54.l, v55.l, s16 39803; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v55 39804; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v54 39805; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v64.l, v65.l, s14 39806; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v66.l, v67.l, s12 39807; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v67 39808; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v66 39809; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v70.l, v71.l, s8 39810; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v82.l, v83.l, s4 39811; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v10.l, v9.l, s28 39812; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v12.l, v11.l, s25 39813; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v13.l, s23 39814; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v18.l, v15.l, s21 39815; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v22.l, v21.l, s17 39816; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s13 39817; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v30.l, v29.l, s9 39818; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v32.l, v31.l, s7 39819; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v34.l, v33.l, s5 39820; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v36.l, v35.l, s3 39821; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v38.l, v37.l, s1 39822; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v48.l, v39.l, vcc_lo 39823; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, s0 39824; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v68.l, v69.l, s10 39825; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v80.l, v81.l, s6 39826; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v84.l, v85.l, s2 39827; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v20.l, v19.l, s19 39828; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v24.l, v23.l, s15 39829; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v28.l, v27.l, s11 39830; GFX11TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.h 39831; GFX11TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.h 39832; GFX11TRUE16-NEXT: v_mov_b16_e32 v20.l, 
v5.h 39833; GFX11TRUE16-NEXT: v_mov_b16_e32 v21.l, v4.h 39834; GFX11TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l 39835; GFX11TRUE16-NEXT: v_mov_b16_e32 v23.l, v3.h 39836; GFX11TRUE16-NEXT: v_mov_b16_e32 v24.l, v3.l 39837; GFX11TRUE16-NEXT: v_mov_b16_e32 v25.l, v2.h 39838; GFX11TRUE16-NEXT: v_mov_b16_e32 v26.l, v2.l 39839; GFX11TRUE16-NEXT: v_mov_b16_e32 v27.l, v1.h 39840; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v1.l 39841; GFX11TRUE16-NEXT: v_mov_b16_e32 v29.l, v0.h 39842; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.l 39843; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v15.l 39844; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v14.h 39845; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.l 39846; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h 39847; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.l 39848; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.h 39849; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.h 39850; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.h 39851; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.h 39852; GFX11TRUE16-NEXT: v_mov_b16_e32 v31.l, v9.l 39853; GFX11TRUE16-NEXT: v_mov_b16_e32 v32.l, v8.h 39854; GFX11TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.l 39855; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h 39856; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v18, 0x5040100 39857; GFX11TRUE16-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 39858; GFX11TRUE16-NEXT: v_perm_b32 v2, v2, v19, 0x5040100 39859; GFX11TRUE16-NEXT: v_perm_b32 v3, v3, v6, 0x5040100 39860; GFX11TRUE16-NEXT: v_perm_b32 v4, v4, v20, 0x5040100 39861; GFX11TRUE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 39862; GFX11TRUE16-NEXT: v_perm_b32 v6, v12, v21, 0x5040100 39863; GFX11TRUE16-NEXT: v_perm_b32 v7, v14, v22, 0x5040100 39864; GFX11TRUE16-NEXT: v_perm_b32 v8, v11, v23, 0x5040100 39865; GFX11TRUE16-NEXT: v_perm_b32 v9, v16, v24, 0x5040100 39866; GFX11TRUE16-NEXT: v_perm_b32 v10, v10, v25, 0x5040100 39867; GFX11TRUE16-NEXT: v_perm_b32 v11, v17, v26, 0x5040100 39868; GFX11TRUE16-NEXT: v_perm_b32 v12, v31, v27, 0x5040100 39869; GFX11TRUE16-NEXT: v_perm_b32 v13, 
v32, v28, 0x5040100 39870; GFX11TRUE16-NEXT: v_perm_b32 v14, v33, v29, 0x5040100 39871; GFX11TRUE16-NEXT: v_perm_b32 v15, v15, v30, 0x5040100 39872; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 39873; 39874; GFX11FAKE16-LABEL: v_vselect_v32bf16: 39875; GFX11FAKE16: ; %bb.0: 39876; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 39877; GFX11FAKE16-NEXT: s_clause 0x1f 39878; GFX11FAKE16-NEXT: scratch_load_u16 v31, off, s32 39879; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:128 39880; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:64 39881; GFX11FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:124 39882; GFX11FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:60 39883; GFX11FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:120 39884; GFX11FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:56 39885; GFX11FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:116 39886; GFX11FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:52 39887; GFX11FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:112 39888; GFX11FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:48 39889; GFX11FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:108 39890; GFX11FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:44 39891; GFX11FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:104 39892; GFX11FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:40 39893; GFX11FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:100 39894; GFX11FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:36 39895; GFX11FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:96 39896; GFX11FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:32 39897; GFX11FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:92 39898; GFX11FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:28 39899; GFX11FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:88 39900; GFX11FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:24 39901; GFX11FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:84 39902; GFX11FAKE16-NEXT: scratch_load_b32 
v71, off, s32 offset:20 39903; GFX11FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:80 39904; GFX11FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:16 39905; GFX11FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:76 39906; GFX11FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:12 39907; GFX11FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:72 39908; GFX11FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:8 39909; GFX11FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:68 39910; GFX11FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:4 39911; GFX11FAKE16-NEXT: v_and_b32_e32 v30, 1, v30 39912; GFX11FAKE16-NEXT: v_and_b32_e32 v28, 1, v28 39913; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 1, v26 39914; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v24 39915; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v22 39916; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30 39917; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 39918; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 1, v20 39919; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v18 39920; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 1, v16 39921; GFX11FAKE16-NEXT: s_waitcnt vmcnt(30) 39922; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo 39923; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 39924; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 39925; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 39926; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 39927; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 39928; GFX11FAKE16-NEXT: s_waitcnt vmcnt(28) 39929; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc_lo 39930; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 39931; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7 39932; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 39933; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 39934; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 39935; GFX11FAKE16-NEXT: s_waitcnt vmcnt(26) 39936; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo 39937; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 39938; 
GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5 39939; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 39940; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 39941; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 39942; GFX11FAKE16-NEXT: s_waitcnt vmcnt(24) 39943; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc_lo 39944; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 39945; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11 39946; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 39947; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 39948; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 39949; GFX11FAKE16-NEXT: s_waitcnt vmcnt(22) 39950; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc_lo 39951; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 39952; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9 39953; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49 39954; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 39955; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8 39956; GFX11FAKE16-NEXT: s_waitcnt vmcnt(20) 39957; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v50, v51, vcc_lo 39958; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 39959; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15 39960; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51 39961; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50 39962; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10 39963; GFX11FAKE16-NEXT: s_waitcnt vmcnt(18) 39964; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v52, v53, vcc_lo 39965; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 39966; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13 39967; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53 39968; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52 39969; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12 39970; GFX11FAKE16-NEXT: s_waitcnt vmcnt(16) 39971; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v54, v55, vcc_lo 39972; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55 39973; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54 39974; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14 
39975; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 39976; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 39977; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14) 39978; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19 39979; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 39980; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v17 39981; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65 39982; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64 39983; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12) 39984; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v66, v67, vcc_lo 39985; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 39986; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23 39987; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67 39988; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66 39989; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10) 39990; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v68, v69, vcc_lo 39991; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 39992; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21 39993; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69 39994; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68 39995; GFX11FAKE16-NEXT: s_waitcnt vmcnt(8) 39996; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v70, v71, vcc_lo 39997; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 39998; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27 39999; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71 40000; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70 40001; GFX11FAKE16-NEXT: s_waitcnt vmcnt(6) 40002; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v80, v81, vcc_lo 40003; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 40004; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25 40005; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81 40006; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80 40007; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4) 40008; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v82, v83, vcc_lo 40009; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 40010; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 1, v31 
40011; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83 40012; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82 40013; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2) 40014; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v84, v85, vcc_lo 40015; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 40016; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29 40017; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85 40018; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84 40019; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) 40020; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v86, v87, vcc_lo 40021; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31 40022; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87 40023; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86 40024; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo 40025; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 40026; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo 40027; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 40028; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo 40029; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 40030; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo 40031; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 40032; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo 40033; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 40034; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo 40035; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 40036; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo 40037; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 40038; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo 40039; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 40040; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo 40041; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 40042; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo 40043; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 40044; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, 
vcc_lo 40045; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 40046; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo 40047; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 40048; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v84, v85, vcc_lo 40049; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 40050; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v86, v87, vcc_lo 40051; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 40052; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 40053; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 40054; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo 40055; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 40056; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 40057; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 40058; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 40059; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 40060; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo 40061; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 40062; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 40063; GFX11FAKE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 40064; GFX11FAKE16-NEXT: v_perm_b32 v11, v23, v22, 0x5040100 40065; GFX11FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 40066; GFX11FAKE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 40067; GFX11FAKE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 40068; GFX11FAKE16-NEXT: v_perm_b32 v12, v25, v24, 0x5040100 40069; GFX11FAKE16-NEXT: v_perm_b32 v13, v27, v26, 0x5040100 40070; GFX11FAKE16-NEXT: v_perm_b32 v14, v29, v28, 0x5040100 40071; GFX11FAKE16-NEXT: v_perm_b32 v15, v31, v30, 0x5040100 40072; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 40073 %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b 40074 ret <32 x bfloat> %op 40075} 40076 40077declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat) 40078declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) 40079declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x 
bfloat>) 40080declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>) 40081
; v_fma_bf16 - scalar bfloat fused multiply-add.
; The IR body calls @llvm.fma.bf16 on (%a, %b, %c) and returns the result.
; In the expected output for every run line the arithmetic itself is an f32
; FMA (v_fma_f32 or v_fmac_f32); no bf16 FMA instruction appears.
;   * GCN and GFX7 checks: operands and the result are masked with 0xffff0000.
;   * GFX8 and newer checks: operands are shifted left by 16 before the FMA and
;     the f32 result goes through a bfe / add 0x7fff / or 0x400000 / cmp_u /
;     cndmask sequence before being shifted right by 16 (presumably
;     round-to-nearest-even with NaN quieting - TODO confirm).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
40082define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { 40083; GCN-LABEL: v_fma_bf16: 40084; GCN: ; %bb.0: 40085; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40086; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 40087; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 40088; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 40089; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40090; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40091; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40092; GCN-NEXT: v_fma_f32 v0, v0, v1, v2 40093; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40094; GCN-NEXT: s_setpc_b64 s[30:31] 40095; 40096; GFX7-LABEL: v_fma_bf16: 40097; GFX7: ; %bb.0: 40098; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40099; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 40100; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 40101; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 40102; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40103; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40104; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40105; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2 40106; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40107; GFX7-NEXT: s_setpc_b64 s[30:31] 40108; 40109; GFX8-LABEL: v_fma_bf16: 40110; GFX8: ; %bb.0: 40111; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40112; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 40113; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40114; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 40115; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 40116; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 40117; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 40118; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 40119; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 40120; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40121; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 40122; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40123; GFX8-NEXT: s_setpc_b64 s[30:31] 40124; 40125; GFX9-LABEL: v_fma_bf16: 40126; GFX9: ; %bb.0: 40127; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 40128; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 40129; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40130; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 40131; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 40132; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 40133; GFX9-NEXT: s_movk_i32 s4, 0x7fff 40134; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 40135; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 40136; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40137; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 40138; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40139; GFX9-NEXT: s_setpc_b64 s[30:31] 40140; 40141; GFX10-LABEL: v_fma_bf16: 40142; GFX10: ; %bb.0: 40143; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40144; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 40145; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40146; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 40147; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 40148; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1 40149; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2 40150; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 40151; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff 40152; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 40153; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40154; GFX10-NEXT: s_setpc_b64 s[30:31] 40155; 40156; GFX11-LABEL: v_fma_bf16: 40157; GFX11: ; %bb.0: 40158; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40159; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 40160; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40161; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 40162; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 40163; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1 40164; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 40165; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v2 40166; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 40167; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 40168; GFX11-NEXT: v_add3_u32 v0, v0, v2, 0x7fff 40169; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 40170; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
40171; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40172; GFX11-NEXT: s_setpc_b64 s[30:31] 40173 %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) 40174 ret bfloat %op 40175} 40176
; v_fma_v2bf16 - two-element bfloat vector fused multiply-add.
; The IR body calls @llvm.fma.v2bf16 on (%a, %b, %c) and returns the result.
; The expected output for every run line contains two f32 FMAs, one per
; element (v_fma_f32 or v_fmac_f32; a single v_dual_fmac_f32 pair on GFX11).
;   * GCN and GFX7 checks: the two results end up in v0 and v1, each masked
;     with 0xffff0000.
;   * GFX8 and newer checks: both results are combined into v0 by a final
;     v_alignbit_b32 (GFX8) or v_perm_b32 with selector 0x7060302 (GFX9+).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
40177define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { 40178; GCN-LABEL: v_fma_v2bf16: 40179; GCN: ; %bb.0: 40180; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40181; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 40182; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 40183; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 40184; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 40185; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 40186; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 40187; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40188; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40189; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40190; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40191; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40192; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40193; GCN-NEXT: v_fma_f32 v1, v1, v3, v5 40194; GCN-NEXT: v_fma_f32 v0, v0, v2, v4 40195; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40196; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40197; GCN-NEXT: s_setpc_b64 s[30:31] 40198; 40199; GFX7-LABEL: v_fma_v2bf16: 40200; GFX7: ; %bb.0: 40201; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40202; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 40203; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 40204; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 40205; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 40206; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 40207; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 40208; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40209; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40210; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40211; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5 40212; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 40213; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40214; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40215; GFX7-NEXT: v_fma_f32 v0, v0, v2, v3 40216; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v0 40217; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40218; GFX7-NEXT: s_setpc_b64 s[30:31] 40219; 40220; GFX8-LABEL: v_fma_v2bf16: 40221; GFX8: ; %bb.0: 40222; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40223; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 40224; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 40225; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 40226; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3 40227; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 40228; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 40229; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40230; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40231; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40232; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 40233; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 40234; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 40235; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 40236; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 40237; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 40238; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 40239; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 40240; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 40241; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40242; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 40243; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40244; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 40245; GFX8-NEXT: s_setpc_b64 s[30:31] 40246; 40247; GFX9-LABEL: v_fma_v2bf16: 40248; GFX9: ; %bb.0: 40249; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40250; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 40251; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 40252; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 40253; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3 40254; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40255; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40256; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40257; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 40258; GFX9-NEXT: s_movk_i32 s4, 0x7fff 40259; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 40260; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 40261; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 40262; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v3, v3 40263; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 40264; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 40265; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 40266; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 40267; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40268; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 40269; GFX9-NEXT: s_mov_b32 s4, 0x7060302 40270; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 40271; GFX9-NEXT: s_setpc_b64 s[30:31] 40272; 40273; GFX10-LABEL: v_fma_v2bf16: 40274; GFX10: ; %bb.0: 40275; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40276; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 40277; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 40278; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 40279; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40280; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40281; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40282; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4 40283; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 40284; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1 40285; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3 40286; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 40287; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1 40288; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2 40289; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff 40290; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 40291; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo 40292; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 40293; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo 40294; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 40295; GFX10-NEXT: s_setpc_b64 s[30:31] 40296; 40297; GFX11-LABEL: v_fma_v2bf16: 40298; GFX11: ; %bb.0: 40299; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40300; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 40301; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 40302; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40303; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40304; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 40305; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40306; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 40307; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4 40308; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1 40309; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 40310; GFX11-NEXT: v_bfe_u32 v0, v3, 16, 1 40311; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 40312; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 40313; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 40314; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff 40315; GFX11-NEXT: v_add3_u32 v0, v0, v3, 0x7fff 40316; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) 40317; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo 40318; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 40319; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo 40320; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 40321; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 40322; GFX11-NEXT: s_setpc_b64 s[30:31] 40323 %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) 40324 ret <2 x bfloat> %op 40325} 40326 40327define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) { 40328; GCN-LABEL: v_fma_v3bf16: 40329; GCN: ; %bb.0: 40330; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40331; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 40332; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 40333; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 40334; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 40335; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 40336; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 40337; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 40338; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 40339; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 40340; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 40341; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40342; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40343; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 40344; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40345; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40346; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 40347; 
GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40348; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40349; GCN-NEXT: v_fma_f32 v2, v2, v5, v8 40350; GCN-NEXT: v_fma_f32 v1, v1, v4, v7 40351; GCN-NEXT: v_fma_f32 v0, v0, v3, v6 40352; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40353; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40354; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40355; GCN-NEXT: s_setpc_b64 s[30:31] 40356; 40357; GFX7-LABEL: v_fma_v3bf16: 40358; GFX7: ; %bb.0: 40359; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40360; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 40361; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 40362; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 40363; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 40364; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 40365; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 40366; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 40367; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40368; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40369; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 40370; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 40371; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 40372; GFX7-NEXT: v_fma_f32 v2, v2, v5, v8 40373; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 40374; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40375; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40376; GFX7-NEXT: v_fma_f32 v1, v1, v4, v5 40377; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 40378; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40379; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40380; GFX7-NEXT: v_fma_f32 v0, v0, v3, v4 40381; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40382; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40383; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40384; GFX7-NEXT: s_setpc_b64 s[30:31] 40385; 40386; GFX8-LABEL: v_fma_v3bf16: 40387; GFX8: ; %bb.0: 40388; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40389; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 40390; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 40391; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40392; GFX8-NEXT: v_fma_f32 v1, v1, 
v3, v5 40393; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 40394; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 40395; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 40396; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 40397; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 40398; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 40399; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 40400; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 40401; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0 40402; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3 40403; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 40404; GFX8-NEXT: s_movk_i32 s4, 0x7fff 40405; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 40406; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40407; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40408; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40409; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 40410; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 40411; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 40412; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 40413; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 40414; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 40415; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 40416; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 40417; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 40418; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40419; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 40420; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40421; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 40422; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 40423; GFX8-NEXT: s_setpc_b64 s[30:31] 40424; 40425; GFX9-LABEL: v_fma_v3bf16: 40426; GFX9: ; %bb.0: 40427; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40428; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 40429; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 40430; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40431; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 40432; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 40433; GFX9-NEXT: s_movk_i32 s4, 0x7fff 40434; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 40435; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 40436; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 40437; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 40438; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 40439; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 40440; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0 40441; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3 40442; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40443; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40444; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40445; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 40446; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 40447; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 40448; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 40449; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 40450; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 40451; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 40452; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 40453; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 40454; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40455; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 40456; GFX9-NEXT: s_mov_b32 s4, 0x7060302 40457; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 40458; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 40459; GFX9-NEXT: s_setpc_b64 s[30:31] 40460; 40461; GFX10-LABEL: v_fma_v3bf16: 40462; GFX10: ; %bb.0: 40463; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40464; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 40465; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 40466; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0 40467; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 40468; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 40469; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40470; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40471; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40472; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40473; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 40474; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3 40475; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 40476; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1 40477; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6 40478; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 40479; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1 40480; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1 40481; 
GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff 40482; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4 40483; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 40484; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff 40485; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 40486; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo 40487; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 40488; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 40489; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 40490; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo 40491; GFX10-NEXT: v_perm_b32 v0, v2, v1, 0x7060302 40492; GFX10-NEXT: v_alignbit_b32 v1, s4, v3, 16 40493; GFX10-NEXT: s_setpc_b64 s[30:31] 40494; 40495; GFX11TRUE16-LABEL: v_fma_v3bf16: 40496; GFX11TRUE16: ; %bb.0: 40497; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40498; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 40499; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 40500; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 40501; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40502; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40503; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40504; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 40505; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5 40506; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 40507; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v0, v2 40508; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 40509; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 40510; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 40511; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 40512; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff 40513; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40514; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 40515; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3 40516; GFX11TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1 40517; GFX11TRUE16-NEXT: 
v_or_b32_e32 v3, 0x400000, v6 40518; GFX11TRUE16-NEXT: v_bfe_u32 v0, v5, 16, 1 40519; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 40520; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff 40521; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 40522; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff 40523; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) 40524; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo 40525; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 40526; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 40527; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 40528; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo 40529; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302 40530; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 40531; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v3, 16 40532; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 40533; 40534; GFX11FAKE16-LABEL: v_fma_v3bf16: 40535; GFX11FAKE16: ; %bb.0: 40536; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40537; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 40538; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 40539; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 40540; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40541; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40542; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40543; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 40544; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5 40545; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 40546; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2 40547; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 40548; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 40549; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 40550; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 40551; GFX11FAKE16-NEXT: 
v_add3_u32 v2, v2, v4, 0x7fff 40552; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40553; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 40554; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3 40555; GFX11FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1 40556; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 40557; GFX11FAKE16-NEXT: v_bfe_u32 v0, v5, 16, 1 40558; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 40559; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff 40560; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 40561; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff 40562; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) 40563; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo 40564; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 40565; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo 40566; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 40567; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo 40568; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302 40569; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 40570; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16 40571; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 40572 %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) 40573 ret <3 x bfloat> %op 40574} 40575 40576define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { 40577; GCN-LABEL: v_fma_v4bf16: 40578; GCN: ; %bb.0: 40579; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40580; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 40581; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 40582; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 40583; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 40584; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 40585; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 40586; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 40587; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 40588; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 40589; 
GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 40590; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 40591; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 40592; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 40593; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 40594; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40595; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 40596; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 40597; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40598; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 40599; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40600; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40601; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 40602; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40603; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40604; GCN-NEXT: v_fma_f32 v3, v3, v7, v11 40605; GCN-NEXT: v_fma_f32 v2, v2, v6, v10 40606; GCN-NEXT: v_fma_f32 v1, v1, v5, v9 40607; GCN-NEXT: v_fma_f32 v0, v0, v4, v8 40608; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40609; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40610; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40611; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40612; GCN-NEXT: s_setpc_b64 s[30:31] 40613; 40614; GFX7-LABEL: v_fma_v4bf16: 40615; GFX7: ; %bb.0: 40616; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40617; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 40618; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 40619; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 40620; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 40621; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 40622; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 40623; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 40624; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 40625; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40626; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 40627; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 40628; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 40629; GFX7-NEXT: v_fma_f32 v3, v3, v7, v11 40630; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 40631; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 40632; GFX7-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 40633; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 40634; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 40635; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 40636; GFX7-NEXT: v_fma_f32 v2, v2, v6, v7 40637; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 40638; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40639; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40640; GFX7-NEXT: v_fma_f32 v1, v1, v5, v6 40641; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 40642; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40643; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40644; GFX7-NEXT: v_fma_f32 v0, v0, v4, v5 40645; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40646; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40647; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40648; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40649; GFX7-NEXT: s_setpc_b64 s[30:31] 40650; 40651; GFX8-LABEL: v_fma_v4bf16: 40652; GFX8: ; %bb.0: 40653; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40654; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5 40655; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3 40656; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1 40657; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6 40658; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 40659; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 40660; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40661; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40662; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40663; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 40664; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 40665; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6 40666; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 40667; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 40668; GFX8-NEXT: s_movk_i32 s4, 0x7fff 40669; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc 40670; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 40671; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 40672; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 40673; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 40674; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 40675; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 40676; 
GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 40677; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0 40678; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3 40679; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 40680; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 40681; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40682; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40683; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40684; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 40685; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 40686; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 40687; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 40688; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 40689; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc 40690; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 40691; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 40692; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 40693; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40694; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 40695; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 40696; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40697; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 40698; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 40699; GFX8-NEXT: s_setpc_b64 s[30:31] 40700; 40701; GFX9-LABEL: v_fma_v4bf16: 40702; GFX9: ; %bb.0: 40703; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40704; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 40705; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3 40706; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 40707; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 40708; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40709; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40710; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40711; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 40712; GFX9-NEXT: s_movk_i32 s4, 0x7fff 40713; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 40714; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 40715; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 40716; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 40717; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 40718; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc 40719; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 40720; 
GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 40721; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 40722; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 40723; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 40724; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 40725; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 40726; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3 40727; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40728; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40729; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40730; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 40731; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 40732; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 40733; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 40734; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 40735; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 40736; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc 40737; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 40738; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 40739; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40740; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 40741; GFX9-NEXT: s_mov_b32 s4, 0x7060302 40742; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 40743; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 40744; GFX9-NEXT: s_setpc_b64 s[30:31] 40745; 40746; GFX10-LABEL: v_fma_v4bf16: 40747; GFX10: ; %bb.0: 40748; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40749; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 40750; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3 40751; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1 40752; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40753; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40754; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40755; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 40756; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7 40757; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4 40758; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2 40759; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40760; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40761; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40762; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 40763; GFX10-NEXT: v_fmac_f32_e32 
v5, v1, v3 40764; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8 40765; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6 40766; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2 40767; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff 40768; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1 40769; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1 40770; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 40771; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1 40772; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 40773; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo 40774; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff 40775; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff 40776; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v7 40777; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 40778; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff 40779; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 40780; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo 40781; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 40782; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo 40783; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 40784; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo 40785; GFX10-NEXT: v_perm_b32 v0, v3, v2, 0x7060302 40786; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 40787; GFX10-NEXT: s_setpc_b64 s[30:31] 40788; 40789; GFX11-LABEL: v_fma_v4bf16: 40790; GFX11: ; %bb.0: 40791; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40792; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1 40793; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40794; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 40795; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40796; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v3 40797; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40798; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5 40799; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40800; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 40801; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3 40802; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4 40803; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40804; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) 40805; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 40806; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v6 40807; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 40808; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 40809; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40810; GFX11-NEXT: v_fmac_f32_e32 v4, v0, v2 40811; GFX11-NEXT: v_add3_u32 v0, v10, v6, 0x7fff 40812; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 40813; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) 40814; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo 40815; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v8 40816; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 40817; GFX11-NEXT: v_add3_u32 v0, v2, v5, 0x7fff 40818; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 40819; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) 40820; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1 40821; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 40822; GFX11-NEXT: v_add3_u32 v6, v8, v4, 0x7fff 40823; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 40824; GFX11-NEXT: v_add3_u32 v2, v3, v7, 0x7fff 40825; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7 40826; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) 40827; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo 40828; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 40829; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo 40830; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 40831; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo 40832; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302 40833; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 40834; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 40835; GFX11-NEXT: s_setpc_b64 s[30:31] 40836 %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) 40837 ret <4 x bfloat> %op 40838} 40839 40840declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat) 40841declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, 
<2 x bfloat>) 40842declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>) 40843declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>) 40844 40845define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { 40846; GCN-LABEL: v_fmuladd_bf16: 40847; GCN: ; %bb.0: 40848; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40849; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 40850; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 40851; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 40852; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40853; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40854; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 40855; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40856; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 40857; GCN-NEXT: v_add_f32_e32 v0, v0, v1 40858; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40859; GCN-NEXT: s_setpc_b64 s[30:31] 40860; 40861; GFX7-LABEL: v_fmuladd_bf16: 40862; GFX7: ; %bb.0: 40863; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40864; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 40865; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 40866; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40867; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40868; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 40869; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 40870; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40871; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 40872; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 40873; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40874; GFX7-NEXT: s_setpc_b64 s[30:31] 40875; 40876; GFX8-LABEL: v_fmuladd_bf16: 40877; GFX8: ; %bb.0: 40878; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40879; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40880; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 40881; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 40882; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 40883; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 40884; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 40885; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 40886; GFX8-NEXT: v_cmp_u_f32_e32 
vcc, v0, v0 40887; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 40888; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40889; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 40890; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 40891; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 40892; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 40893; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 40894; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 40895; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40896; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 40897; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40898; GFX8-NEXT: s_setpc_b64 s[30:31] 40899; 40900; GFX9-LABEL: v_fmuladd_bf16: 40901; GFX9: ; %bb.0: 40902; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40903; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40904; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 40905; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 40906; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 40907; GFX9-NEXT: s_movk_i32 s4, 0x7fff 40908; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 40909; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 40910; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40911; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc 40912; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40913; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 40914; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 40915; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 40916; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 40917; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 40918; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 40919; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 40920; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40921; GFX9-NEXT: s_setpc_b64 s[30:31] 40922; 40923; GFX10-LABEL: v_fmuladd_bf16: 40924; GFX10: ; %bb.0: 40925; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40926; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40927; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 40928; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 40929; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 40930; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 40931; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 40932; GFX10-NEXT: 
v_add3_u32 v1, v1, v0, 0x7fff 40933; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo 40934; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 40935; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40936; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 40937; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 40938; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 40939; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 40940; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 40941; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 40942; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40943; GFX10-NEXT: s_setpc_b64 s[30:31] 40944; 40945; GFX11-LABEL: v_fmuladd_bf16: 40946; GFX11: ; %bb.0: 40947; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40948; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 40949; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 40950; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 40951; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 40952; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 40953; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 40954; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 40955; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 40956; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 40957; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2 40958; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 40959; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40960; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 40961; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 40962; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 40963; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 40964; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 40965; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff 40966; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 40967; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo 40968; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 40969; GFX11-NEXT: s_setpc_b64 s[30:31] 40970 %op = 
call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c) 40971 ret bfloat %op 40972} 40973 40974define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { 40975; GCN-LABEL: v_fmuladd_v2bf16: 40976; GCN: ; %bb.0: 40977; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40978; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 40979; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 40980; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 40981; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 40982; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 40983; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 40984; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 40985; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40986; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 40987; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 40988; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40989; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 40990; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 40991; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 40992; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40993; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40994; GCN-NEXT: v_add_f32_e32 v1, v1, v5 40995; GCN-NEXT: v_add_f32_e32 v0, v0, v4 40996; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 40997; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 40998; GCN-NEXT: s_setpc_b64 s[30:31] 40999; 41000; GFX7-LABEL: v_fmuladd_v2bf16: 41001; GFX7: ; %bb.0: 41002; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41003; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 41004; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 41005; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 41006; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 41007; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41008; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41009; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41010; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41011; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 41012; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 41013; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 41014; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 41015; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, 
v1 41016; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 41017; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41018; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 41019; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 41020; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 41021; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41022; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41023; GFX7-NEXT: s_setpc_b64 s[30:31] 41024; 41025; GFX8-LABEL: v_fmuladd_v2bf16: 41026; GFX8: ; %bb.0: 41027; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41028; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 41029; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 41030; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 41031; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 41032; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 41033; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 41034; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 41035; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41036; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 41037; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41038; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 41039; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 41040; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 41041; GFX8-NEXT: s_movk_i32 s4, 0x7fff 41042; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 41043; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41044; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41045; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 41046; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 41047; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 41048; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41049; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 41050; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 41051; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 41052; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1 41053; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 41054; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41055; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc 41056; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41057; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 41058; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 41059; GFX8-NEXT: v_bfe_u32 
v1, v0, 16, 1 41060; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 41061; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 41062; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 41063; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41064; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 41065; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 41066; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 41067; GFX8-NEXT: s_setpc_b64 s[30:31] 41068; 41069; GFX9-LABEL: v_fmuladd_v2bf16: 41070; GFX9: ; %bb.0: 41071; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41072; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 41073; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 41074; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 41075; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 41076; GFX9-NEXT: s_movk_i32 s4, 0x7fff 41077; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 41078; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 41079; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41080; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 41081; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41082; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 41083; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 41084; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41085; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41086; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 41087; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 41088; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 41089; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 41090; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41091; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 41092; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 41093; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 41094; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 41095; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41096; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc 41097; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41098; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 41099; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 41100; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 41101; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 41102; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 41103; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, 
v0 41104; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 41105; GFX9-NEXT: s_mov_b32 s4, 0x7060302 41106; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 41107; GFX9-NEXT: s_setpc_b64 s[30:31] 41108; 41109; GFX10-LABEL: v_fmuladd_v2bf16: 41110; GFX10: ; %bb.0: 41111; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41112; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 41113; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 41114; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41115; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41116; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 41117; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 41118; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1 41119; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 41120; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 41121; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 41122; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 41123; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff 41124; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 41125; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff 41126; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41127; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo 41128; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41129; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41130; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo 41131; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 41132; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41133; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 41134; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 41135; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 41136; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41137; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 41138; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 41139; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 41140; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 41141; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo 41142; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41143; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 41144; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 41145; GFX10-NEXT: s_setpc_b64 s[30:31] 41146; 41147; GFX11-LABEL: 
v_fmuladd_v2bf16: 41148; GFX11: ; %bb.0: 41149; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41150; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 41151; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 41152; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41153; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 41154; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 41155; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 41156; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) 41157; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1 41158; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 41159; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 41160; GFX11-NEXT: v_add3_u32 v1, v1, v3, 0x7fff 41161; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 41162; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 41163; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 41164; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) 41165; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo 41166; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41167; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff 41168; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 41169; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41170; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41171; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3 41172; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 41173; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41174; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 41175; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41176; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 41177; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 41178; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 41179; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 41180; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 41181; GFX11-NEXT: v_add3_u32 v2, 
v2, v1, 0x7fff 41182; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 41183; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff 41184; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 41185; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo 41186; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41187; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo 41188; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 41189; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 41190; GFX11-NEXT: s_setpc_b64 s[30:31] 41191 %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) 41192 ret <2 x bfloat> %op 41193} 41194; Codegen test for the llvm.fmuladd intrinsic on a 3-element bfloat vector - the IR body only forwards %a, %b, %c to @llvm.fmuladd.v3bf16 and returns the result. The per-target expectations that follow are autogenerated by utils/update_llc_test_checks.py (see the file header) and should be regenerated rather than edited by hand. The GFX11TRUE16 and GFX11FAKE16 expectations are kept separate for this function and appear to differ only in the second operand of the final v_alignbit_b32 (v0 versus s0). 41195define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) { 41196; GCN-LABEL: v_fmuladd_v3bf16: 41197; GCN: ; %bb.0: 41198; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41199; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 41200; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 41201; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 41202; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 41203; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 41204; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 41205; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 41206; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 41207; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 41208; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 41209; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41210; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 41211; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 41212; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41213; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 41214; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41215; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41216; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 41217; GCN-NEXT: v_mul_f32_e32 v2, v2, v5 41218; GCN-NEXT: v_mul_f32_e32 v1, v1, v4 41219; GCN-NEXT: v_mul_f32_e32 v0, v0, v3 41220; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41221; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41222; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41223; GCN-NEXT: 
v_add_f32_e32 v2, v2, v8 41224; GCN-NEXT: v_add_f32_e32 v1, v1, v7 41225; GCN-NEXT: v_add_f32_e32 v0, v0, v6 41226; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41227; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41228; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41229; GCN-NEXT: s_setpc_b64 s[30:31] 41230; 41231; GFX7-LABEL: v_fmuladd_v3bf16: 41232; GFX7: ; %bb.0: 41233; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41234; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 41235; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 41236; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 41237; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 41238; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 41239; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 41240; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 41241; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41242; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 41243; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41244; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41245; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41246; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 41247; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 41248; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 41249; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5 41250; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4 41251; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 41252; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41253; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 41254; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41255; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 41256; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41257; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 41258; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 41259; GFX7-NEXT: v_add_f32_e32 v1, v1, v4 41260; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 41261; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41262; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41263; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41264; GFX7-NEXT: s_setpc_b64 s[30:31] 41265; 41266; GFX8-LABEL: v_fmuladd_v3bf16: 41267; GFX8: ; %bb.0: 41268; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 41269; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 41270; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 41271; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 41272; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 41273; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 41274; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 41275; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 41276; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 41277; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc 41278; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41279; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 41280; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 41281; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 41282; GFX8-NEXT: s_movk_i32 s4, 0x7fff 41283; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 41284; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 41285; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 41286; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 41287; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 41288; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 41289; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 41290; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 41291; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 41292; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 41293; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 41294; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 41295; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41296; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 41297; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41298; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 41299; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 41300; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 41301; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 41302; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41303; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41304; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 41305; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 41306; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 41307; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41308; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 41309; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 41310; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 41311; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, s4, v2 41312; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 41313; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41314; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 41315; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41316; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 41317; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 41318; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 41319; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 41320; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 41321; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 41322; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41323; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 41324; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 41325; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 41326; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 41327; GFX8-NEXT: s_setpc_b64 s[30:31] 41328; 41329; GFX9-LABEL: v_fmuladd_v3bf16: 41330; GFX9: ; %bb.0: 41331; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41332; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 41333; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 41334; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 41335; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 41336; GFX9-NEXT: s_movk_i32 s4, 0x7fff 41337; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 41338; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 41339; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 41340; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc 41341; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41342; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 41343; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 41344; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 41345; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 41346; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 41347; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 41348; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 41349; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 41350; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 41351; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 41352; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 41353; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 41354; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 41355; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v3, v3 41356; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 41357; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41358; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 41359; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 41360; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41361; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41362; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 41363; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 41364; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 41365; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 41366; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41367; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 41368; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 41369; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 41370; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 41371; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41372; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 41373; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41374; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 41375; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 41376; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 41377; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 41378; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 41379; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41380; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 41381; GFX9-NEXT: s_mov_b32 s4, 0x7060302 41382; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 41383; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 41384; GFX9-NEXT: s_setpc_b64 s[30:31] 41385; 41386; GFX10-LABEL: v_fmuladd_v3bf16: 41387; GFX10: ; %bb.0: 41388; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41389; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 41390; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 41391; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 41392; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 41393; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41394; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41395; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 41396; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6 41397; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 41398; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 41399; GFX10-NEXT: 
v_or_b32_e32 v6, 0x400000, v1 41400; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 41401; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41402; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 41403; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 41404; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 41405; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 41406; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v0 41407; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff 41408; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo 41409; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 41410; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5 41411; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 41412; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 41413; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41414; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo 41415; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41416; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 41417; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41418; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo 41419; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 41420; GFX10-NEXT: v_add_f32_e32 v2, v2, v5 41421; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41422; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 41423; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 41424; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 41425; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 41426; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 41427; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 41428; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 41429; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 41430; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 41431; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo 41432; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 41433; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41434; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 41435; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41436; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 41437; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 41438; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 41439; GFX10-NEXT: s_setpc_b64 s[30:31] 
41440; 41441; GFX11TRUE16-LABEL: v_fmuladd_v3bf16: 41442; GFX11TRUE16: ; %bb.0: 41443; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41444; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 41445; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 41446; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41447; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41448; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 41449; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3 41450; GFX11TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1 41451; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 41452; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 41453; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff 41454; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 41455; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, v1, v3 41456; GFX11TRUE16-NEXT: v_mul_f32_e32 v3, v7, v6 41457; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 41458; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 41459; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 41460; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 41461; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41462; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 41463; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 41464; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 41465; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 41466; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo 41467; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 41468; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5 41469; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 41470; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 41471; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo 41472; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41473; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) 41474; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41475; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo 41476; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 41477; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 41478; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41479; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41480; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 41481; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) 41482; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3 41483; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 41484; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 41485; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 41486; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 41487; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 41488; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 41489; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 41490; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 41491; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 41492; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 41493; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo 41494; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41495; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 41496; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 41497; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41498; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 41499; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 41500; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 41501; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16 41502; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] 41503; 41504; GFX11FAKE16-LABEL: v_fmuladd_v3bf16: 41505; GFX11FAKE16: ; %bb.0: 41506; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41507; 
GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 41508; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 41509; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41510; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41511; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 41512; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3 41513; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1 41514; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 41515; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 41516; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff 41517; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 41518; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, v1, v3 41519; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v7, v6 41520; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 41521; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 41522; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 41523; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 41524; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41525; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 41526; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 41527; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 41528; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 41529; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo 41530; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 41531; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5 41532; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 41533; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 41534; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo 41535; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41536; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 41537; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41538; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo 
41539; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v5 41540; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 41541; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41542; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41543; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 41544; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) 41545; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3 41546; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 41547; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 41548; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 41549; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 41550; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 41551; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff 41552; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 41553; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 41554; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 41555; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff 41556; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo 41557; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41558; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) 41559; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo 41560; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41561; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 41562; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo 41563; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 41564; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 41565; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] 41566 %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) 41567 ret <3 x bfloat> %op 41568} 41569 41570define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { 41571; GCN-LABEL: v_fmuladd_v4bf16: 41572; GCN: ; %bb.0: 41573; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41574; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 41575; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 41576; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 41577; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 41578; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 41579; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 41580; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 41581; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 41582; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 41583; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 41584; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 41585; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 41586; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 41587; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41588; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 41589; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 41590; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41591; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 41592; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 41593; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41594; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 41595; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 41596; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41597; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 41598; GCN-NEXT: v_mul_f32_e32 v3, v3, v7 41599; GCN-NEXT: v_mul_f32_e32 v2, v2, v6 41600; GCN-NEXT: v_mul_f32_e32 v1, v1, v5 41601; GCN-NEXT: v_mul_f32_e32 v0, v0, v4 41602; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41603; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41604; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41605; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41606; GCN-NEXT: v_add_f32_e32 v3, v3, v11 41607; GCN-NEXT: v_add_f32_e32 v2, v2, v10 41608; GCN-NEXT: v_add_f32_e32 v1, v1, v9 41609; GCN-NEXT: v_add_f32_e32 v0, v0, v8 41610; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41611; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41612; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41613; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41614; GCN-NEXT: s_setpc_b64 s[30:31] 41615; 41616; GFX7-LABEL: v_fmuladd_v4bf16: 41617; GFX7: ; 
%bb.0: 41618; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41619; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 41620; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 41621; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 41622; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 41623; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 41624; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 41625; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 41626; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 41627; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 41628; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41629; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 41630; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41631; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 41632; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41633; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 41634; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41635; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 41636; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 41637; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 41638; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 41639; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7 41640; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6 41641; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5 41642; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4 41643; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41644; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 41645; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41646; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 41647; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41648; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v9 41649; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41650; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 41651; GFX7-NEXT: v_add_f32_e32 v3, v3, v7 41652; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 41653; GFX7-NEXT: v_add_f32_e32 v1, v1, v5 41654; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 41655; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41656; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41657; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41658; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41659; GFX7-NEXT: s_setpc_b64 
s[30:31] 41660; 41661; GFX8-LABEL: v_fmuladd_v4bf16: 41662; GFX8: ; %bb.0: 41663; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41664; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 41665; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1 41666; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6 41667; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 41668; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 41669; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 41670; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6 41671; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 41672; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc 41673; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 41674; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5 41675; GFX8-NEXT: v_add_f32_e32 v6, v6, v7 41676; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 41677; GFX8-NEXT: s_movk_i32 s4, 0x7fff 41678; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 41679; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41680; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41681; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 41682; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 41683; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6 41684; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 41685; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 41686; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc 41687; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 41688; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 41689; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1 41690; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 41691; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc 41692; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41693; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 41694; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 41695; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 41696; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 41697; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 41698; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 41699; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 41700; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 41701; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 41702; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 41703; GFX8-NEXT: 
v_mul_f32_e32 v3, v5, v3 41704; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 41705; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 41706; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 41707; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 41708; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41709; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc 41710; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41711; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 41712; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 41713; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 41714; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 41715; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41716; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41717; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 41718; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 41719; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 41720; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41721; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 41722; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc 41723; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 41724; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2 41725; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 41726; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41727; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 41728; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41729; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 41730; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 41731; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 41732; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 41733; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 41734; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 41735; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41736; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 41737; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 41738; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 41739; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 41740; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 41741; GFX8-NEXT: s_setpc_b64 s[30:31] 41742; 41743; GFX9-LABEL: v_fmuladd_v4bf16: 41744; GFX9: ; %bb.0: 41745; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41746; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3 
41747; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 41748; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 41749; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 41750; GFX9-NEXT: s_movk_i32 s4, 0x7fff 41751; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 41752; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 41753; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 41754; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc 41755; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 41756; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5 41757; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 41758; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41759; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41760; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 41761; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 41762; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 41763; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 41764; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 41765; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 41766; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc 41767; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 41768; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 41769; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 41770; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc 41771; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41772; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 41773; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 41774; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 41775; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 41776; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 41777; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 41778; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc 41779; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 41780; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 41781; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 41782; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 41783; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 41784; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 41785; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41786; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc 41787; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41788; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 41789; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 
41790; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41791; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41792; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 41793; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 41794; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 41795; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 41796; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 41797; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 41798; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc 41799; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 41800; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 41801; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41802; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc 41803; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41804; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 41805; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 41806; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 41807; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 41808; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 41809; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 41810; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 41811; GFX9-NEXT: s_mov_b32 s4, 0x7060302 41812; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 41813; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 41814; GFX9-NEXT: s_setpc_b64 s[30:31] 41815; 41816; GFX10-LABEL: v_fmuladd_v4bf16: 41817; GFX10: ; %bb.0: 41818; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41819; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 41820; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 41821; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41822; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41823; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0 41824; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41825; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6 41826; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2 41827; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41828; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 41829; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 41830; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 41831; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6 41832; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7 41833; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 
41834; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 41835; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff 41836; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 41837; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 41838; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1 41839; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 41840; GFX10-NEXT: v_bfe_u32 v11, v0, 16, 1 41841; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc_lo 41842; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41843; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v7 41844; GFX10-NEXT: v_add3_u32 v9, v9, v7, 0x7fff 41845; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0 41846; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41847; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo 41848; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 41849; GFX10-NEXT: v_add3_u32 v11, v11, v0, 0x7fff 41850; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 41851; GFX10-NEXT: v_add_f32_e32 v3, v3, v8 41852; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41853; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo 41854; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41855; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 41856; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 41857; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 41858; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 41859; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo 41860; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 41861; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3 41862; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 41863; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 41864; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41865; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1 41866; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 41867; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 41868; GFX10-NEXT: v_add3_u32 v4, v7, v3, 0x7fff 41869; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 41870; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 41871; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo 41872; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff 41873; GFX10-NEXT: v_add3_u32 v5, v7, v2, 0x7fff 41874; GFX10-NEXT: v_or_b32_e32 
v6, 0x400000, v2 41875; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 41876; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 41877; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 41878; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 41879; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41880; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 41881; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41882; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 41883; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo 41884; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 41885; GFX10-NEXT: s_setpc_b64 s[30:31] 41886; 41887; GFX11-LABEL: v_fmuladd_v4bf16: 41888; GFX11: ; %bb.0: 41889; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41890; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 41891; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41892; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 41893; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 41894; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 41895; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3 41896; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41897; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 41898; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5 41899; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2 41900; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v2, 0xffff0000, v2 41901; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 41902; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 41903; GFX11-NEXT: v_mul_f32_e32 v7, v9, v7 41904; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v6 41905; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 41906; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) 41907; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff 41908; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 41909; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 41910; GFX11-NEXT: v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2 41911; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 41912; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41913; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v7 41914; GFX11-NEXT: v_add3_u32 v9, v9, v7, 0x7fff 41915; GFX11-NEXT: v_bfe_u32 v11, v0, 16, 1 41916; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff 41917; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 41918; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 41919; GFX11-NEXT: v_add3_u32 v11, v11, v0, 0x7fff 41920; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4 41921; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 41922; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 41923; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 41924; GFX11-NEXT: v_dual_cndmask_b32 v2, v9, v10 :: v_dual_and_b32 v1, 0xffff0000, v1 41925; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41926; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2 41927; GFX11-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo 41928; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 41929; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 41930; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 41931; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 41932; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 41933; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 41934; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 41935; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 41936; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 41937; GFX11-NEXT: v_add_f32_e32 v3, v3, v8 41938; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 41939; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) 41940; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 41941; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 41942; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 41943; GFX11-NEXT: v_add3_u32 v4, v7, v3, 0x7fff 41944; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 41945; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(SKIP_1) | instid1(VALU_DEP_3) 41946; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo 41947; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff 41948; GFX11-NEXT: v_add3_u32 v5, v7, v2, 0x7fff 41949; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 41950; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 41951; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff 41952; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 41953; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 41954; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 41955; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 41956; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo 41957; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 41958; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 41959; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 41960; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo 41961; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302 41962; GFX11-NEXT: s_setpc_b64 s[30:31] 41963 %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) 41964 ret <4 x bfloat> %op 41965} 41966