; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s

; Intrinsics used by the tests below. Attribute groups #0 through #3 are
; referenced here but defined outside this excerpt.
declare half @llvm.fabs.f16(half) #0
declare half @llvm.canonicalize.f16(half) #0
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0
declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0
declare <6 x half> @llvm.canonicalize.v6f16(<6 x half>) #0
declare <8 x half> @llvm.canonicalize.v8f16(<8 x half>) #0
declare <12 x half> @llvm.canonicalize.v12f16(<12 x half>) #0
declare <16 x half> @llvm.canonicalize.v16f16(<16 x half>) #0
declare <32 x half> @llvm.canonicalize.v32f16(<32 x half>) #0
declare <64 x half> @llvm.canonicalize.v64f16(<64 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0

; canonicalize of an undef half. Every run line expects no canonicalize
; instruction at all, only a store of the immediate 0.
define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_undef_value_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_undef_value_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_fold_canonicalize_undef_value_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_undef_value_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %canonicalized = call half @llvm.canonicalize.f16(half undef)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize of a value loaded from memory. tonga, gfx900 and gfx1100 expect
; v_max_f16 of the value with itself; kaveri expects a conversion to f32 and
; back. Note the result is stored through an undef pointer, not %out.
define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_var_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_load_ushort v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_var_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v0, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX9-NEXT:    global_store_short v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: v_test_canonicalize_var_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: v_test_canonicalize_var_f16:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    global_load_u16 v0, v0, s[0:1]
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_canonicalize_var_f16:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[0:1]
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
; GFX11-FAKE16-NEXT:    s_endpgm
  %val = load half, ptr addrspace(1) %out
  %canonicalized = call half @llvm.canonicalize.f16(half %val)
  store half %canonicalized, ptr addrspace(1) undef
  ret void
}

; canonicalize of a kernel argument (i16 bitcast to half), so the input is
; expected in an SGPR. The gfx1100 real-true16 run expects an extra v_mov_b16
; copy into a VGPR before the v_max_f16.
define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
; VI-LABEL: s_test_canonicalize_var_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_max_f16_e64 v2, s2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_test_canonicalize_var_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_max_f16_e64 v1, s2, s2
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: s_test_canonicalize_var_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[4:5], 0xb
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: s_test_canonicalize_var_f16:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_clause 0x1
; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: s_test_canonicalize_var_f16:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_clause 0x1
; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, s2, s2
; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT:    s_endpgm
  %val = bitcast i16 %val.arg to half
  %canonicalized = call half @llvm.canonicalize.f16(half %val)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize of a <2 x half> assembled from two scalar arguments. gfx900 and
; gfx1100 expect the halves to be packed with v_perm_b32 and canonicalized by a
; single v_pk_max_f16; tonga expects two v_max_f16 plus v_or_b32.
define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_max_f16_e32 v0, v0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_build_vector_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; CI-LABEL: v_test_canonicalize_build_vector_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %ins0 = insertelement <2 x half> undef, half %lo, i32 0
  %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
  ret <2 x half> %canonicalized
}

; canonicalize(fabs(x)). The fabs is expected to fold into the canonicalizing
; instruction as an |x| source modifier on every run line.
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fabs_var_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_load_ushort v2, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e64 v2, |v2|, |v2|
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: v_test_canonicalize_fabs_var_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, |v0.l|, |v0.l|
; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT:    s_endpgm
  %val = load half, ptr addrspace(1) %out
  %val.fabs = call half @llvm.fabs.f16(half %val)
  %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(fneg(fabs(x))). Both operations are expected to fold into a
; -|x| source modifier on every run line.
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_load_ushort v2, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e64 v2, -|v2|, -|v2|
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT:    s_endpgm
  %val = load half, ptr addrspace(1) %out
  %val.fabs = call half @llvm.fabs.f16(half %val)
  %val.fabs.fneg = fneg half %val.fabs
  %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(fneg(x)). The fneg is expected to fold into a -x source
; modifier on every run line.
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_var_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_load_ushort v2, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_max_f16_e64 v2, -v2, -v2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_var_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -v1, -v1
; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT:    s_endpgm
  %val = load half, ptr addrspace(1) %out
  %val.fneg = fneg half %val
  %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Same IR as v_test_canonicalize_fneg_var_f16 but with attribute group #2
; (defined outside this excerpt; the function name suggests a no-denormals
; mode, to be confirmed against the attribute definition). The only expected
; difference is on tonga, which uses v_mul_f16 by -1.0 instead of v_max_f16.
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 {
; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_load_ushort v2, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_e32 v2, -1.0, v2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -v1, -v1
; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT:    s_endpgm
  %val = load half, ptr addrspace(1) %out
  %val.fneg = fneg half %val
  %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Same IR as v_test_canonicalize_fneg_fabs_var_f16 but with attribute group #2
; (defined outside this excerpt). tonga is expected to use v_mul_f16 by -1.0
; with an |x| modifier; the other run lines keep the -|x| v_max / cvt forms.
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 {
; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_load_ushort v2, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v2, -1.0, |v2|
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT:    s_endpgm
;
; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX11-FAKE16:       ; %bb.0:
; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT:    s_endpgm
  %val = load half, ptr addrspace(1) %out
  %val.fabs = call half @llvm.fabs.f16(half %val)
  %val.fabs.fneg = fneg half %val.fabs
  %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Constant +0.0 is expected to fold to a store of the immediate 0.
define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p0_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p0_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p0_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_p0_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Constant -0.0 is expected to fold to its bit pattern 0x8000. kaveri
; materializes 0x8000; the other run lines show it as 0xffff8000.
define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n0_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0xffff8000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n0_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n0_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32 v0, 0x8000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_n0_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Constant 1.0 is expected to fold to a store of the immediate 0x3c00.
define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p1_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p1_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3c00
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p1_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32 v0, 0x3c00
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_p1_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Constant -1.0 is expected to fold to its bit pattern 0xbc00. kaveri
; materializes 0xbc00; the other run lines show it as 0xffffbc00.
define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n1_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0xffffbc00
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n1_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffbc00
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n1_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32 v0, 0xbc00
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_n1_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Constant 16.0 is expected to fold to a store of the immediate 0x4c00.
define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_literal_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x4c00
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_literal_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4c00
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_fold_canonicalize_literal_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32 v0, 0x4c00
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_literal_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Constant 0xH03FF with attribute group #1. Every run line expects the value
; to be stored unchanged as the immediate 0x3ff.
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Constant 0xH03FF with attribute group #3 (defined outside this excerpt).
; Expected output matches the #1 variant above, a store of 0x3ff.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

; Constant 0xH83FF with attribute group #1. Expected to be stored unchanged;
; kaveri materializes 0x83ff, the other run lines show it as 0xffff83ff.
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0xffff83ff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff83ff
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32 v0, 0x83ff
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
  store half %canonicalized, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0xffff83ff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff83ff
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_mov_b32_e32
v0, 0x83ff 918; CI-NEXT: s_waitcnt lgkmcnt(0) 919; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 920; CI-NEXT: s_endpgm 921; 922; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16: 923; GFX11: ; %bb.0: 924; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 925; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff 926; GFX11-NEXT: s_waitcnt lgkmcnt(0) 927; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 928; GFX11-NEXT: s_endpgm 929 %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) 930 store half %canonicalized, ptr addrspace(1) %out 931 ret void 932} 933 934define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 { 935; VI-LABEL: test_fold_canonicalize_qnan_f16: 936; VI: ; %bb.0: 937; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 938; VI-NEXT: v_mov_b32_e32 v2, 0x7c00 939; VI-NEXT: s_waitcnt lgkmcnt(0) 940; VI-NEXT: v_mov_b32_e32 v0, s0 941; VI-NEXT: v_mov_b32_e32 v1, s1 942; VI-NEXT: flat_store_short v[0:1], v2 943; VI-NEXT: s_endpgm 944; 945; GFX9-LABEL: test_fold_canonicalize_qnan_f16: 946; GFX9: ; %bb.0: 947; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 948; GFX9-NEXT: v_mov_b32_e32 v0, 0 949; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c00 950; GFX9-NEXT: s_waitcnt lgkmcnt(0) 951; GFX9-NEXT: global_store_short v0, v1, s[0:1] 952; GFX9-NEXT: s_endpgm 953; 954; CI-LABEL: test_fold_canonicalize_qnan_f16: 955; CI: ; %bb.0: 956; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 957; CI-NEXT: s_mov_b32 s3, 0xf000 958; CI-NEXT: s_mov_b32 s2, -1 959; CI-NEXT: v_mov_b32_e32 v0, 0x7c00 960; CI-NEXT: s_waitcnt lgkmcnt(0) 961; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 962; CI-NEXT: s_endpgm 963; 964; GFX11-LABEL: test_fold_canonicalize_qnan_f16: 965; GFX11: ; %bb.0: 966; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 967; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 968; GFX11-NEXT: s_waitcnt lgkmcnt(0) 969; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 970; GFX11-NEXT: s_endpgm 971 %canonicalized = call half 
@llvm.canonicalize.f16(half 0xH7C00) 972 store half %canonicalized, ptr addrspace(1) %out 973 ret void 974} 975 976define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 { 977; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: 978; VI: ; %bb.0: 979; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 980; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 981; VI-NEXT: s_waitcnt lgkmcnt(0) 982; VI-NEXT: v_mov_b32_e32 v0, s0 983; VI-NEXT: v_mov_b32_e32 v1, s1 984; VI-NEXT: flat_store_short v[0:1], v2 985; VI-NEXT: s_endpgm 986; 987; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: 988; GFX9: ; %bb.0: 989; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 990; GFX9-NEXT: v_mov_b32_e32 v0, 0 991; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 992; GFX9-NEXT: s_waitcnt lgkmcnt(0) 993; GFX9-NEXT: global_store_short v0, v1, s[0:1] 994; GFX9-NEXT: s_endpgm 995; 996; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: 997; CI: ; %bb.0: 998; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 999; CI-NEXT: s_mov_b32 s3, 0xf000 1000; CI-NEXT: s_mov_b32 s2, -1 1001; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 1002; CI-NEXT: s_waitcnt lgkmcnt(0) 1003; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 1004; CI-NEXT: s_endpgm 1005; 1006; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: 1007; GFX11: ; %bb.0: 1008; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1009; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 1010; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1012; GFX11-NEXT: s_endpgm 1013 %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) 1014 store half %canonicalized, ptr addrspace(1) %out 1015 ret void 1016} 1017 1018define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 { 1019; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: 1020; VI: ; %bb.0: 1021; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1022; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 1023; 
VI-NEXT: s_waitcnt lgkmcnt(0) 1024; VI-NEXT: v_mov_b32_e32 v0, s0 1025; VI-NEXT: v_mov_b32_e32 v1, s1 1026; VI-NEXT: flat_store_short v[0:1], v2 1027; VI-NEXT: s_endpgm 1028; 1029; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: 1030; GFX9: ; %bb.0: 1031; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1032; GFX9-NEXT: v_mov_b32_e32 v0, 0 1033; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 1034; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1035; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1036; GFX9-NEXT: s_endpgm 1037; 1038; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: 1039; CI: ; %bb.0: 1040; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1041; CI-NEXT: s_mov_b32 s3, 0xf000 1042; CI-NEXT: s_mov_b32 s2, -1 1043; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 1044; CI-NEXT: s_waitcnt lgkmcnt(0) 1045; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 1046; CI-NEXT: s_endpgm 1047; 1048; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: 1049; GFX11: ; %bb.0: 1050; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1051; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 1052; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1054; GFX11-NEXT: s_endpgm 1055 %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) 1056 store half %canonicalized, ptr addrspace(1) %out 1057 ret void 1058} 1059 1060define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 { 1061; VI-LABEL: test_fold_canonicalize_snan0_value_f16: 1062; VI: ; %bb.0: 1063; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1064; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 1065; VI-NEXT: s_waitcnt lgkmcnt(0) 1066; VI-NEXT: v_mov_b32_e32 v0, s0 1067; VI-NEXT: v_mov_b32_e32 v1, s1 1068; VI-NEXT: flat_store_short v[0:1], v2 1069; VI-NEXT: s_endpgm 1070; 1071; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16: 1072; GFX9: ; %bb.0: 1073; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1074; GFX9-NEXT: v_mov_b32_e32 v0, 0 1075; GFX9-NEXT: v_mov_b32_e32 v1, 
0x7e00 1076; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1077; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1078; GFX9-NEXT: s_endpgm 1079; 1080; CI-LABEL: test_fold_canonicalize_snan0_value_f16: 1081; CI: ; %bb.0: 1082; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1083; CI-NEXT: s_mov_b32 s3, 0xf000 1084; CI-NEXT: s_mov_b32 s2, -1 1085; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 1086; CI-NEXT: s_waitcnt lgkmcnt(0) 1087; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 1088; CI-NEXT: s_endpgm 1089; 1090; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16: 1091; GFX11: ; %bb.0: 1092; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1093; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 1094; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1095; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1096; GFX11-NEXT: s_endpgm 1097 %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) 1098 store half %canonicalized, ptr addrspace(1) %out 1099 ret void 1100} 1101 1102define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 { 1103; VI-LABEL: test_fold_canonicalize_snan1_value_f16: 1104; VI: ; %bb.0: 1105; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1106; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 1107; VI-NEXT: s_waitcnt lgkmcnt(0) 1108; VI-NEXT: v_mov_b32_e32 v0, s0 1109; VI-NEXT: v_mov_b32_e32 v1, s1 1110; VI-NEXT: flat_store_short v[0:1], v2 1111; VI-NEXT: s_endpgm 1112; 1113; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16: 1114; GFX9: ; %bb.0: 1115; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1116; GFX9-NEXT: v_mov_b32_e32 v0, 0 1117; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 1118; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1119; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1120; GFX9-NEXT: s_endpgm 1121; 1122; CI-LABEL: test_fold_canonicalize_snan1_value_f16: 1123; CI: ; %bb.0: 1124; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1125; CI-NEXT: s_mov_b32 s3, 0xf000 1126; CI-NEXT: s_mov_b32 s2, -1 1127; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 1128; CI-NEXT: s_waitcnt lgkmcnt(0) 1129; 
CI-NEXT: buffer_store_short v0, off, s[0:3], 0 1130; CI-NEXT: s_endpgm 1131; 1132; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16: 1133; GFX11: ; %bb.0: 1134; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1135; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 1136; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1137; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1138; GFX11-NEXT: s_endpgm 1139 %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) 1140 store half %canonicalized, ptr addrspace(1) %out 1141 ret void 1142} 1143 1144define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 { 1145; VI-LABEL: test_fold_canonicalize_snan2_value_f16: 1146; VI: ; %bb.0: 1147; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1148; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 1149; VI-NEXT: s_waitcnt lgkmcnt(0) 1150; VI-NEXT: v_mov_b32_e32 v0, s0 1151; VI-NEXT: v_mov_b32_e32 v1, s1 1152; VI-NEXT: flat_store_short v[0:1], v2 1153; VI-NEXT: s_endpgm 1154; 1155; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16: 1156; GFX9: ; %bb.0: 1157; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1158; GFX9-NEXT: v_mov_b32_e32 v0, 0 1159; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 1160; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1161; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1162; GFX9-NEXT: s_endpgm 1163; 1164; CI-LABEL: test_fold_canonicalize_snan2_value_f16: 1165; CI: ; %bb.0: 1166; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1167; CI-NEXT: s_mov_b32 s3, 0xf000 1168; CI-NEXT: s_mov_b32 s2, -1 1169; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 1170; CI-NEXT: s_waitcnt lgkmcnt(0) 1171; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 1172; CI-NEXT: s_endpgm 1173; 1174; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16: 1175; GFX11: ; %bb.0: 1176; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1177; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 1178; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1179; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1180; GFX11-NEXT: s_endpgm 1181 
%canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) 1182 store half %canonicalized, ptr addrspace(1) %out 1183 ret void 1184} 1185 1186define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 { 1187; VI-LABEL: test_fold_canonicalize_snan3_value_f16: 1188; VI: ; %bb.0: 1189; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1190; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 1191; VI-NEXT: s_waitcnt lgkmcnt(0) 1192; VI-NEXT: v_mov_b32_e32 v0, s0 1193; VI-NEXT: v_mov_b32_e32 v1, s1 1194; VI-NEXT: flat_store_short v[0:1], v2 1195; VI-NEXT: s_endpgm 1196; 1197; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16: 1198; GFX9: ; %bb.0: 1199; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1200; GFX9-NEXT: v_mov_b32_e32 v0, 0 1201; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 1202; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1204; GFX9-NEXT: s_endpgm 1205; 1206; CI-LABEL: test_fold_canonicalize_snan3_value_f16: 1207; CI: ; %bb.0: 1208; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1209; CI-NEXT: s_mov_b32 s3, 0xf000 1210; CI-NEXT: s_mov_b32 s2, -1 1211; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 1212; CI-NEXT: s_waitcnt lgkmcnt(0) 1213; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 1214; CI-NEXT: s_endpgm 1215; 1216; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16: 1217; GFX11: ; %bb.0: 1218; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1219; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 1220; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1221; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1222; GFX11-NEXT: s_endpgm 1223 %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) 1224 store half %canonicalized, ptr addrspace(1) %out 1225 ret void 1226} 1227 1228define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 { 1229; VI-LABEL: v_test_canonicalize_var_v2f16: 1230; VI: ; %bb.0: 1231; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1232; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1233; VI-NEXT: 
s_waitcnt lgkmcnt(0) 1234; VI-NEXT: v_mov_b32_e32 v1, s1 1235; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1236; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1237; VI-NEXT: flat_load_dword v0, v[0:1] 1238; VI-NEXT: s_waitcnt vmcnt(0) 1239; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1240; VI-NEXT: v_max_f16_e32 v0, v0, v0 1241; VI-NEXT: v_or_b32_e32 v2, v0, v1 1242; VI-NEXT: v_mov_b32_e32 v0, s0 1243; VI-NEXT: v_mov_b32_e32 v1, s1 1244; VI-NEXT: flat_store_dword v[0:1], v2 1245; VI-NEXT: s_endpgm 1246; 1247; GFX9-LABEL: v_test_canonicalize_var_v2f16: 1248; GFX9: ; %bb.0: 1249; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1250; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1251; GFX9-NEXT: v_mov_b32_e32 v1, 0 1252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1253; GFX9-NEXT: global_load_dword v0, v0, s[0:1] 1254; GFX9-NEXT: s_waitcnt vmcnt(0) 1255; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 1256; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 1257; GFX9-NEXT: s_endpgm 1258; 1259; CI-LABEL: v_test_canonicalize_var_v2f16: 1260; CI: ; %bb.0: 1261; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1262; CI-NEXT: s_mov_b32 s3, 0xf000 1263; CI-NEXT: s_mov_b32 s6, 0 1264; CI-NEXT: s_mov_b32 s7, s3 1265; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1266; CI-NEXT: s_waitcnt lgkmcnt(0) 1267; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 1268; CI-NEXT: v_mov_b32_e32 v1, 0 1269; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1270; CI-NEXT: s_mov_b32 s2, -1 1271; CI-NEXT: s_waitcnt vmcnt(0) 1272; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1273; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1274; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1275; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1276; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1277; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1278; CI-NEXT: v_or_b32_e32 v0, v0, v1 1279; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1280; CI-NEXT: s_endpgm 1281; 1282; GFX11-LABEL: v_test_canonicalize_var_v2f16: 1283; GFX11: ; %bb.0: 1284; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 
0x24 1285; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1286; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1287; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1288; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1289; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] 1290; GFX11-NEXT: s_waitcnt vmcnt(0) 1291; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 1292; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1293; GFX11-NEXT: s_endpgm 1294 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1295 %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1296 %val = load <2 x half>, ptr addrspace(1) %gep 1297 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) 1298 store <2 x half> %canonicalized, ptr addrspace(1) %out 1299 ret void 1300} 1301 1302define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 { 1303; VI-LABEL: v_test_canonicalize_fabs_var_v2f16: 1304; VI: ; %bb.0: 1305; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1306; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1307; VI-NEXT: s_waitcnt lgkmcnt(0) 1308; VI-NEXT: v_mov_b32_e32 v1, s1 1309; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1310; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1311; VI-NEXT: flat_load_dword v0, v[0:1] 1312; VI-NEXT: s_waitcnt vmcnt(0) 1313; VI-NEXT: v_max_f16_sdwa v1, |v0|, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1314; VI-NEXT: v_max_f16_e64 v0, |v0|, |v0| 1315; VI-NEXT: v_or_b32_e32 v2, v0, v1 1316; VI-NEXT: v_mov_b32_e32 v0, s0 1317; VI-NEXT: v_mov_b32_e32 v1, s1 1318; VI-NEXT: flat_store_dword v[0:1], v2 1319; VI-NEXT: s_endpgm 1320; 1321; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16: 1322; GFX9: ; %bb.0: 1323; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1324; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1325; GFX9-NEXT: v_mov_b32_e32 v1, 0 1326; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1327; GFX9-NEXT: global_load_dword v0, v0, s[0:1] 1328; GFX9-NEXT: s_waitcnt vmcnt(0) 1329; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 1330; GFX9-NEXT: 
v_pk_max_f16 v0, v0, v0 1331; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 1332; GFX9-NEXT: s_endpgm 1333; 1334; CI-LABEL: v_test_canonicalize_fabs_var_v2f16: 1335; CI: ; %bb.0: 1336; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1337; CI-NEXT: s_mov_b32 s3, 0xf000 1338; CI-NEXT: s_mov_b32 s6, 0 1339; CI-NEXT: s_mov_b32 s7, s3 1340; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1341; CI-NEXT: s_waitcnt lgkmcnt(0) 1342; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 1343; CI-NEXT: v_mov_b32_e32 v1, 0 1344; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1345; CI-NEXT: s_mov_b32 s2, -1 1346; CI-NEXT: s_waitcnt vmcnt(0) 1347; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1348; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| 1349; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| 1350; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1351; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1352; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1353; CI-NEXT: v_or_b32_e32 v0, v0, v1 1354; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1355; CI-NEXT: s_endpgm 1356; 1357; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16: 1358; GFX11: ; %bb.0: 1359; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1360; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1361; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 1362; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1363; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] 1365; GFX11-NEXT: s_waitcnt vmcnt(0) 1366; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 1367; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 1368; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1369; GFX11-NEXT: s_endpgm 1370 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1371 %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1372 %val = load <2 x half>, ptr addrspace(1) %gep 1373 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) 1374 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs) 1375 store <2 x half> %canonicalized, ptr addrspace(1) %out 
1376 ret void 1377} 1378 1379define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 { 1380; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: 1381; VI: ; %bb.0: 1382; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1383; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1384; VI-NEXT: s_waitcnt lgkmcnt(0) 1385; VI-NEXT: v_mov_b32_e32 v1, s1 1386; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1387; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1388; VI-NEXT: flat_load_dword v0, v[0:1] 1389; VI-NEXT: s_waitcnt vmcnt(0) 1390; VI-NEXT: v_max_f16_sdwa v1, -|v0|, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1391; VI-NEXT: v_max_f16_e64 v0, -|v0|, -|v0| 1392; VI-NEXT: v_or_b32_e32 v2, v0, v1 1393; VI-NEXT: v_mov_b32_e32 v0, s0 1394; VI-NEXT: v_mov_b32_e32 v1, s1 1395; VI-NEXT: flat_store_dword v[0:1], v2 1396; VI-NEXT: s_endpgm 1397; 1398; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: 1399; GFX9: ; %bb.0: 1400; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1401; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1402; GFX9-NEXT: v_mov_b32_e32 v1, 0 1403; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1404; GFX9-NEXT: global_load_dword v0, v0, s[0:1] 1405; GFX9-NEXT: s_waitcnt vmcnt(0) 1406; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 1407; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] 1408; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 1409; GFX9-NEXT: s_endpgm 1410; 1411; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: 1412; CI: ; %bb.0: 1413; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1414; CI-NEXT: s_mov_b32 s3, 0xf000 1415; CI-NEXT: s_mov_b32 s6, 0 1416; CI-NEXT: s_mov_b32 s7, s3 1417; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1418; CI-NEXT: s_waitcnt lgkmcnt(0) 1419; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 1420; CI-NEXT: v_mov_b32_e32 v1, 0 1421; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1422; CI-NEXT: s_mov_b32 s2, -1 1423; CI-NEXT: s_waitcnt vmcnt(0) 1424; CI-NEXT: v_or_b32_e32 v0, 0x80008000, v0 1425; CI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v0 1426; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1427; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1428; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1429; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1430; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1431; CI-NEXT: v_or_b32_e32 v0, v0, v1 1432; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1433; CI-NEXT: s_endpgm 1434; 1435; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: 1436; GFX11: ; %bb.0: 1437; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1438; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1439; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 1440; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1441; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1442; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] 1443; GFX11-NEXT: s_waitcnt vmcnt(0) 1444; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 1445; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] 1446; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1447; GFX11-NEXT: s_endpgm 1448 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1449 %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1450 %val = load <2 x half>, ptr addrspace(1) %gep 1451 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) 1452 %val.fabs.fneg = fneg <2 x half> %val.fabs 1453 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg) 1454 store <2 x half> %canonicalized, ptr addrspace(1) %out 1455 ret void 1456} 1457 1458define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 { 1459; VI-LABEL: v_test_canonicalize_fneg_var_v2f16: 1460; VI: ; %bb.0: 1461; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1462; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1463; VI-NEXT: s_waitcnt lgkmcnt(0) 1464; VI-NEXT: v_mov_b32_e32 v1, s1 1465; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1466; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1467; VI-NEXT: flat_load_dword v0, v[0:1] 1468; VI-NEXT: s_waitcnt vmcnt(0) 1469; VI-NEXT: 
v_max_f16_sdwa v1, -v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1470; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 1471; VI-NEXT: v_or_b32_e32 v2, v0, v1 1472; VI-NEXT: v_mov_b32_e32 v0, s0 1473; VI-NEXT: v_mov_b32_e32 v1, s1 1474; VI-NEXT: flat_store_dword v[0:1], v2 1475; VI-NEXT: s_endpgm 1476; 1477; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16: 1478; GFX9: ; %bb.0: 1479; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1480; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1481; GFX9-NEXT: v_mov_b32_e32 v1, 0 1482; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1483; GFX9-NEXT: global_load_dword v0, v0, s[0:1] 1484; GFX9-NEXT: s_waitcnt vmcnt(0) 1485; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] 1486; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 1487; GFX9-NEXT: s_endpgm 1488; 1489; CI-LABEL: v_test_canonicalize_fneg_var_v2f16: 1490; CI: ; %bb.0: 1491; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1492; CI-NEXT: s_mov_b32 s3, 0xf000 1493; CI-NEXT: s_mov_b32 s6, 0 1494; CI-NEXT: s_mov_b32 s7, s3 1495; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1496; CI-NEXT: s_waitcnt lgkmcnt(0) 1497; CI-NEXT: s_mov_b64 s[4:5], s[0:1] 1498; CI-NEXT: v_mov_b32_e32 v1, 0 1499; CI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1500; CI-NEXT: s_mov_b32 s2, -1 1501; CI-NEXT: s_waitcnt vmcnt(0) 1502; CI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 1503; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1504; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1505; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1506; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1507; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1508; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1509; CI-NEXT: v_or_b32_e32 v0, v0, v1 1510; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1511; CI-NEXT: s_endpgm 1512; 1513; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16: 1514; GFX11: ; %bb.0: 1515; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1516; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1517; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1518; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 1519; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1520; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] 1521; GFX11-NEXT: s_waitcnt vmcnt(0) 1522; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] 1523; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1524; GFX11-NEXT: s_endpgm 1525 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1526 %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid 1527 %val = load <2 x half>, ptr addrspace(1) %gep 1528 %fneg.val = fneg <2 x half> %val 1529 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val) 1530 store <2 x half> %canonicalized, ptr addrspace(1) %out 1531 ret void 1532} 1533 1534define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { 1535; VI-LABEL: s_test_canonicalize_var_v2f16: 1536; VI: ; %bb.0: 1537; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 1538; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1539; VI-NEXT: s_waitcnt lgkmcnt(0) 1540; VI-NEXT: s_lshr_b32 s3, s2, 16 1541; VI-NEXT: v_mov_b32_e32 v1, s3 1542; VI-NEXT: v_max_f16_e64 v0, s2, s2 1543; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1544; VI-NEXT: v_or_b32_e32 v2, v0, v1 1545; VI-NEXT: v_mov_b32_e32 v0, s0 1546; VI-NEXT: v_mov_b32_e32 v1, s1 1547; VI-NEXT: flat_store_dword v[0:1], v2 1548; VI-NEXT: s_endpgm 1549; 1550; GFX9-LABEL: s_test_canonicalize_var_v2f16: 1551; GFX9: ; %bb.0: 1552; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 1553; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1554; GFX9-NEXT: v_mov_b32_e32 v0, 0 1555; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1556; GFX9-NEXT: v_pk_max_f16 v1, s2, s2 1557; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1558; GFX9-NEXT: s_endpgm 1559; 1560; CI-LABEL: s_test_canonicalize_var_v2f16: 1561; CI: ; %bb.0: 1562; CI-NEXT: s_load_dword s0, s[4:5], 0xb 1563; CI-NEXT: s_mov_b32 s3, 0xf000 1564; CI-NEXT: s_mov_b32 s2, -1 1565; CI-NEXT: s_waitcnt lgkmcnt(0) 1566; CI-NEXT: s_lshr_b32 
s1, s0, 16 1567; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 1568; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 1569; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1570; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1571; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1572; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1573; CI-NEXT: v_or_b32_e32 v0, v1, v0 1574; CI-NEXT: s_waitcnt lgkmcnt(0) 1575; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1576; CI-NEXT: s_endpgm 1577; 1578; GFX11-LABEL: s_test_canonicalize_var_v2f16: 1579; GFX11: ; %bb.0: 1580; GFX11-NEXT: s_clause 0x1 1581; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 1582; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1583; GFX11-NEXT: v_mov_b32_e32 v0, 0 1584; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1585; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 1586; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1587; GFX11-NEXT: s_endpgm 1588 %val = bitcast i32 %val.arg to <2 x half> 1589 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) 1590 store <2 x half> %canonicalized, ptr addrspace(1) %out 1591 ret void 1592} 1593 1594define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 { 1595; VI-LABEL: test_fold_canonicalize_p0_v2f16: 1596; VI: ; %bb.0: 1597; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1598; VI-NEXT: v_mov_b32_e32 v2, 0 1599; VI-NEXT: s_waitcnt lgkmcnt(0) 1600; VI-NEXT: v_mov_b32_e32 v0, s0 1601; VI-NEXT: v_mov_b32_e32 v1, s1 1602; VI-NEXT: flat_store_dword v[0:1], v2 1603; VI-NEXT: s_endpgm 1604; 1605; GFX9-LABEL: test_fold_canonicalize_p0_v2f16: 1606; GFX9: ; %bb.0: 1607; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1608; GFX9-NEXT: v_mov_b32_e32 v0, 0 1609; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1610; GFX9-NEXT: global_store_dword v0, v0, s[0:1] 1611; GFX9-NEXT: s_endpgm 1612; 1613; CI-LABEL: test_fold_canonicalize_p0_v2f16: 1614; CI: ; %bb.0: 1615; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1616; CI-NEXT: s_mov_b32 s3, 0xf000 1617; CI-NEXT: s_mov_b32 s2, -1 1618; CI-NEXT: v_mov_b32_e32 v0, 0 1619; CI-NEXT: s_waitcnt lgkmcnt(0) 
1620; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1621; CI-NEXT: s_endpgm 1622; 1623; GFX11-LABEL: test_fold_canonicalize_p0_v2f16: 1624; GFX11: ; %bb.0: 1625; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1626; GFX11-NEXT: v_mov_b32_e32 v0, 0 1627; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1628; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] 1629; GFX11-NEXT: s_endpgm 1630 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) 1631 store <2 x half> %canonicalized, ptr addrspace(1) %out 1632 ret void 1633} 1634 1635define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 { 1636; VI-LABEL: test_fold_canonicalize_n0_v2f16: 1637; VI: ; %bb.0: 1638; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1639; VI-NEXT: v_mov_b32_e32 v2, 0x80008000 1640; VI-NEXT: s_waitcnt lgkmcnt(0) 1641; VI-NEXT: v_mov_b32_e32 v0, s0 1642; VI-NEXT: v_mov_b32_e32 v1, s1 1643; VI-NEXT: flat_store_dword v[0:1], v2 1644; VI-NEXT: s_endpgm 1645; 1646; GFX9-LABEL: test_fold_canonicalize_n0_v2f16: 1647; GFX9: ; %bb.0: 1648; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1649; GFX9-NEXT: v_mov_b32_e32 v0, 0 1650; GFX9-NEXT: v_mov_b32_e32 v1, 0x80008000 1651; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1652; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1653; GFX9-NEXT: s_endpgm 1654; 1655; CI-LABEL: test_fold_canonicalize_n0_v2f16: 1656; CI: ; %bb.0: 1657; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1658; CI-NEXT: s_mov_b32 s3, 0xf000 1659; CI-NEXT: s_mov_b32 s2, -1 1660; CI-NEXT: v_mov_b32_e32 v0, 0x80008000 1661; CI-NEXT: s_waitcnt lgkmcnt(0) 1662; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1663; CI-NEXT: s_endpgm 1664; 1665; GFX11-LABEL: test_fold_canonicalize_n0_v2f16: 1666; GFX11: ; %bb.0: 1667; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1668; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 1669; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1670; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1671; GFX11-NEXT: s_endpgm 1672 %canonicalized = call <2 x half> 
; NOTE(review): @test_fold_canonicalize_p1_v2f16 canonicalizes the constant splat <1.0, 1.0>. The expected code
; only materializes 0x3c003c00 (half 1.0 is 0x3c00) and stores it; no arithmetic instruction is expected.
; @test_fold_canonicalize_n1_v2f16, which starts near the end of this line, is the same test for -1.0.
@llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>) 1673 store <2 x half> %canonicalized, ptr addrspace(1) %out 1674 ret void 1675} 1676 1677define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 { 1678; VI-LABEL: test_fold_canonicalize_p1_v2f16: 1679; VI: ; %bb.0: 1680; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1681; VI-NEXT: v_mov_b32_e32 v2, 0x3c003c00 1682; VI-NEXT: s_waitcnt lgkmcnt(0) 1683; VI-NEXT: v_mov_b32_e32 v0, s0 1684; VI-NEXT: v_mov_b32_e32 v1, s1 1685; VI-NEXT: flat_store_dword v[0:1], v2 1686; VI-NEXT: s_endpgm 1687; 1688; GFX9-LABEL: test_fold_canonicalize_p1_v2f16: 1689; GFX9: ; %bb.0: 1690; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1691; GFX9-NEXT: v_mov_b32_e32 v0, 0 1692; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c003c00 1693; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1694; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1695; GFX9-NEXT: s_endpgm 1696; 1697; CI-LABEL: test_fold_canonicalize_p1_v2f16: 1698; CI: ; %bb.0: 1699; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1700; CI-NEXT: s_mov_b32 s3, 0xf000 1701; CI-NEXT: s_mov_b32 s2, -1 1702; CI-NEXT: v_mov_b32_e32 v0, 0x3c003c00 1703; CI-NEXT: s_waitcnt lgkmcnt(0) 1704; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1705; CI-NEXT: s_endpgm 1706; 1707; GFX11-LABEL: test_fold_canonicalize_p1_v2f16: 1708; GFX11: ; %bb.0: 1709; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1710; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00 1711; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1712; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1713; GFX11-NEXT: s_endpgm 1714 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>) 1715 store <2 x half> %canonicalized, ptr addrspace(1) %out 1716 ret void 1717} 1718 1719define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 { 1720; VI-LABEL: test_fold_canonicalize_n1_v2f16: 1721; VI: ; %bb.0: 1722; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1723; VI-NEXT: v_mov_b32_e32 v2, 
; NOTE(review): checks for @test_fold_canonicalize_n1_v2f16 - the constant splat <-1.0, -1.0> is expected to
; fold to the packed immediate 0xbc00bc00 (0xbc00 per half) followed by a single dword store.
; @test_fold_canonicalize_literal_v2f16, defined later on this line, repeats the pattern with 16.0.
0xbc00bc00 1724; VI-NEXT: s_waitcnt lgkmcnt(0) 1725; VI-NEXT: v_mov_b32_e32 v0, s0 1726; VI-NEXT: v_mov_b32_e32 v1, s1 1727; VI-NEXT: flat_store_dword v[0:1], v2 1728; VI-NEXT: s_endpgm 1729; 1730; GFX9-LABEL: test_fold_canonicalize_n1_v2f16: 1731; GFX9: ; %bb.0: 1732; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1733; GFX9-NEXT: v_mov_b32_e32 v0, 0 1734; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00bc00 1735; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1736; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1737; GFX9-NEXT: s_endpgm 1738; 1739; CI-LABEL: test_fold_canonicalize_n1_v2f16: 1740; CI: ; %bb.0: 1741; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1742; CI-NEXT: s_mov_b32 s3, 0xf000 1743; CI-NEXT: s_mov_b32 s2, -1 1744; CI-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 1745; CI-NEXT: s_waitcnt lgkmcnt(0) 1746; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1747; CI-NEXT: s_endpgm 1748; 1749; GFX11-LABEL: test_fold_canonicalize_n1_v2f16: 1750; GFX11: ; %bb.0: 1751; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1752; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00 1753; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1754; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1755; GFX11-NEXT: s_endpgm 1756 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>) 1757 store <2 x half> %canonicalized, ptr addrspace(1) %out 1758 ret void 1759} 1760 1761define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 { 1762; VI-LABEL: test_fold_canonicalize_literal_v2f16: 1763; VI: ; %bb.0: 1764; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1765; VI-NEXT: v_mov_b32_e32 v2, 0x4c004c00 1766; VI-NEXT: s_waitcnt lgkmcnt(0) 1767; VI-NEXT: v_mov_b32_e32 v0, s0 1768; VI-NEXT: v_mov_b32_e32 v1, s1 1769; VI-NEXT: flat_store_dword v[0:1], v2 1770; VI-NEXT: s_endpgm 1771; 1772; GFX9-LABEL: test_fold_canonicalize_literal_v2f16: 1773; GFX9: ; %bb.0: 1774; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1775; GFX9-NEXT: v_mov_b32_e32 v0, 0 1776; GFX9-NEXT: 
; NOTE(review): remaining checks for @test_fold_canonicalize_literal_v2f16 - the constant splat <16.0, 16.0> is
; expected to fold to the packed immediate 0x4c004c00 (half 16.0 is 0x4c00) plus one dword store.
; @test_no_denormals_fold_canonicalize_denormal0_v2f16 starts afterwards; its operand appears further down.
v_mov_b32_e32 v1, 0x4c004c00 1777; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1778; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1779; GFX9-NEXT: s_endpgm 1780; 1781; CI-LABEL: test_fold_canonicalize_literal_v2f16: 1782; CI: ; %bb.0: 1783; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1784; CI-NEXT: s_mov_b32 s3, 0xf000 1785; CI-NEXT: s_mov_b32 s2, -1 1786; CI-NEXT: v_mov_b32_e32 v0, 0x4c004c00 1787; CI-NEXT: s_waitcnt lgkmcnt(0) 1788; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1789; CI-NEXT: s_endpgm 1790; 1791; GFX11-LABEL: test_fold_canonicalize_literal_v2f16: 1792; GFX11: ; %bb.0: 1793; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1794; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00 1795; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1796; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1797; GFX11-NEXT: s_endpgm 1798 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>) 1799 store <2 x half> %canonicalized, ptr addrspace(1) %out 1800 ret void 1801} 1802 1803define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 { 1804; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: 1805; VI: ; %bb.0: 1806; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1807; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff 1808; VI-NEXT: s_waitcnt lgkmcnt(0) 1809; VI-NEXT: v_mov_b32_e32 v0, s0 1810; VI-NEXT: v_mov_b32_e32 v1, s1 1811; VI-NEXT: flat_store_dword v[0:1], v2 1812; VI-NEXT: s_endpgm 1813; 1814; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: 1815; GFX9: ; %bb.0: 1816; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1817; GFX9-NEXT: v_mov_b32_e32 v0, 0 1818; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff 1819; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1820; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1821; GFX9-NEXT: s_endpgm 1822; 1823; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: 1824; CI: ; %bb.0: 1825; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1826; CI-NEXT: s_mov_b32 s3, 0xf000 
; NOTE(review): @test_no_denormals_fold_canonicalize_denormal0_v2f16 (attribute group #1) and
; @test_denormals_fold_canonicalize_denormal0_v2f16 (attribute group #3) both pass the splat 0xH03FF, the largest
; positive f16 subnormal, and both expect it back unchanged as the packed immediate 0x3ff03ff.
; The definitions of #1 and #3 are outside this chunk; presumably they select the denormal mode - TODO confirm
; that identical expectations for the two variants are intended.
1827; CI-NEXT: s_mov_b32 s2, -1 1828; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff 1829; CI-NEXT: s_waitcnt lgkmcnt(0) 1830; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1831; CI-NEXT: s_endpgm 1832; 1833; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: 1834; GFX11: ; %bb.0: 1835; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1836; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff 1837; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1838; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1839; GFX11-NEXT: s_endpgm 1840 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) 1841 store <2 x half> %canonicalized, ptr addrspace(1) %out 1842 ret void 1843} 1844 1845define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 { 1846; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: 1847; VI: ; %bb.0: 1848; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1849; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff 1850; VI-NEXT: s_waitcnt lgkmcnt(0) 1851; VI-NEXT: v_mov_b32_e32 v0, s0 1852; VI-NEXT: v_mov_b32_e32 v1, s1 1853; VI-NEXT: flat_store_dword v[0:1], v2 1854; VI-NEXT: s_endpgm 1855; 1856; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: 1857; GFX9: ; %bb.0: 1858; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1859; GFX9-NEXT: v_mov_b32_e32 v0, 0 1860; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff 1861; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1862; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1863; GFX9-NEXT: s_endpgm 1864; 1865; CI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: 1866; CI: ; %bb.0: 1867; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1868; CI-NEXT: s_mov_b32 s3, 0xf000 1869; CI-NEXT: s_mov_b32 s2, -1 1870; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff 1871; CI-NEXT: s_waitcnt lgkmcnt(0) 1872; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1873; CI-NEXT: s_endpgm 1874; 1875; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: 1876; GFX11: ; %bb.0: 1877; 
; NOTE(review): after the GFX11 tail of @test_denormals_fold_canonicalize_denormal0_v2f16, this line holds
; @test_no_denormals_fold_canonicalize_denormal1_v2f16 (attribute group #1). Its checks expect the packed
; immediate 0x83ff83ff, i.e. 0x83ff (sign bit set, largest-magnitude f16 subnormal) in both halves, stored as-is.
GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1878; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff 1879; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1880; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1881; GFX11-NEXT: s_endpgm 1882 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) 1883 store <2 x half> %canonicalized, ptr addrspace(1) %out 1884 ret void 1885} 1886 1887define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 { 1888; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: 1889; VI: ; %bb.0: 1890; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1891; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff 1892; VI-NEXT: s_waitcnt lgkmcnt(0) 1893; VI-NEXT: v_mov_b32_e32 v0, s0 1894; VI-NEXT: v_mov_b32_e32 v1, s1 1895; VI-NEXT: flat_store_dword v[0:1], v2 1896; VI-NEXT: s_endpgm 1897; 1898; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: 1899; GFX9: ; %bb.0: 1900; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1901; GFX9-NEXT: v_mov_b32_e32 v0, 0 1902; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff 1903; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1904; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1905; GFX9-NEXT: s_endpgm 1906; 1907; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: 1908; CI: ; %bb.0: 1909; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1910; CI-NEXT: s_mov_b32 s3, 0xf000 1911; CI-NEXT: s_mov_b32 s2, -1 1912; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff 1913; CI-NEXT: s_waitcnt lgkmcnt(0) 1914; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1915; CI-NEXT: s_endpgm 1916; 1917; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: 1918; GFX11: ; %bb.0: 1919; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1920; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff 1921; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1922; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1923; GFX11-NEXT: s_endpgm 1924 %canonicalized = call <2 x half> 
; NOTE(review): @test_denormals_fold_canonicalize_denormal1_v2f16 (attribute group #3) passes the same splat
; 0xH83FF as the #1 variant that ends at the start of this line, and expects the same packed immediate
; 0x83ff83ff. As with the denormal0 pair, the visible expectations do not differ between #1 and #3.
; @test_fold_canonicalize_qnan_v2f16 begins at the end of this line.
@llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) 1925 store <2 x half> %canonicalized, ptr addrspace(1) %out 1926 ret void 1927} 1928 1929define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 { 1930; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: 1931; VI: ; %bb.0: 1932; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1933; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff 1934; VI-NEXT: s_waitcnt lgkmcnt(0) 1935; VI-NEXT: v_mov_b32_e32 v0, s0 1936; VI-NEXT: v_mov_b32_e32 v1, s1 1937; VI-NEXT: flat_store_dword v[0:1], v2 1938; VI-NEXT: s_endpgm 1939; 1940; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: 1941; GFX9: ; %bb.0: 1942; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1943; GFX9-NEXT: v_mov_b32_e32 v0, 0 1944; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff 1945; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1946; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1947; GFX9-NEXT: s_endpgm 1948; 1949; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: 1950; CI: ; %bb.0: 1951; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1952; CI-NEXT: s_mov_b32 s3, 0xf000 1953; CI-NEXT: s_mov_b32 s2, -1 1954; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff 1955; CI-NEXT: s_waitcnt lgkmcnt(0) 1956; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1957; CI-NEXT: s_endpgm 1958; 1959; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: 1960; GFX11: ; %bb.0: 1961; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1962; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff 1963; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1964; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1965; GFX11-NEXT: s_endpgm 1966 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) 1967 store <2 x half> %canonicalized, ptr addrspace(1) %out 1968 ret void 1969} 1970 1971define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 { 1972; VI-LABEL: test_fold_canonicalize_qnan_v2f16: 1973; 
; NOTE(review): despite its name, @test_fold_canonicalize_qnan_v2f16 passes the splat 0xH7C00, which in IEEE-754
; binary16 is +infinity (exponent all ones, zero mantissa), not a quiet NaN. The expected packed immediate
; 0x7c007c00 is that same bit pattern, so what this test pins down is the folding of canonicalize(+inf).
; The function name is part of the generated labels, so it is documented here rather than renamed.
VI: ; %bb.0: 1974; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1975; VI-NEXT: v_mov_b32_e32 v2, 0x7c007c00 1976; VI-NEXT: s_waitcnt lgkmcnt(0) 1977; VI-NEXT: v_mov_b32_e32 v0, s0 1978; VI-NEXT: v_mov_b32_e32 v1, s1 1979; VI-NEXT: flat_store_dword v[0:1], v2 1980; VI-NEXT: s_endpgm 1981; 1982; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16: 1983; GFX9: ; %bb.0: 1984; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1985; GFX9-NEXT: v_mov_b32_e32 v0, 0 1986; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c007c00 1987; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1988; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1989; GFX9-NEXT: s_endpgm 1990; 1991; CI-LABEL: test_fold_canonicalize_qnan_v2f16: 1992; CI: ; %bb.0: 1993; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1994; CI-NEXT: s_mov_b32 s3, 0xf000 1995; CI-NEXT: s_mov_b32 s2, -1 1996; CI-NEXT: v_mov_b32_e32 v0, 0x7c007c00 1997; CI-NEXT: s_waitcnt lgkmcnt(0) 1998; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1999; CI-NEXT: s_endpgm 2000; 2001; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16: 2002; GFX11: ; %bb.0: 2003; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2004; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 2005; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2006; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2007; GFX11-NEXT: s_endpgm 2008 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>) 2009 store <2 x half> %canonicalized, ptr addrspace(1) %out 2010 ret void 2011} 2012 2013define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 { 2014; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: 2015; VI: ; %bb.0: 2016; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2017; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 2018; VI-NEXT: s_waitcnt lgkmcnt(0) 2019; VI-NEXT: v_mov_b32_e32 v0, s0 2020; VI-NEXT: v_mov_b32_e32 v1, s1 2021; VI-NEXT: flat_store_dword v[0:1], v2 2022; VI-NEXT: s_endpgm 2023; 2024; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: 
; NOTE(review): @test_fold_canonicalize_qnan_value_neg1_v2f16 passes i32 -1 bitcast to <2 x half>, so each
; 16-bit lane is 0xffff (a NaN encoding with the sign bit set). Every prefix family expects the packed
; immediate 0x7e007e00, i.e. each lane is replaced by the positive quiet-NaN pattern 0x7e00.
; @test_fold_canonicalize_qnan_value_neg2_v2f16 starts afterwards.
2025; GFX9: ; %bb.0: 2026; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2027; GFX9-NEXT: v_mov_b32_e32 v0, 0 2028; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2029; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2030; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2031; GFX9-NEXT: s_endpgm 2032; 2033; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: 2034; CI: ; %bb.0: 2035; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2036; CI-NEXT: s_mov_b32 s3, 0xf000 2037; CI-NEXT: s_mov_b32 s2, -1 2038; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 2039; CI-NEXT: s_waitcnt lgkmcnt(0) 2040; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2041; CI-NEXT: s_endpgm 2042; 2043; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: 2044; GFX11: ; %bb.0: 2045; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2046; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 2047; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2048; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2049; GFX11-NEXT: s_endpgm 2050 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>)) 2051 store <2 x half> %canonicalized, ptr addrspace(1) %out 2052 ret void 2053} 2054 2055define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 { 2056; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: 2057; VI: ; %bb.0: 2058; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2059; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 2060; VI-NEXT: s_waitcnt lgkmcnt(0) 2061; VI-NEXT: v_mov_b32_e32 v0, s0 2062; VI-NEXT: v_mov_b32_e32 v1, s1 2063; VI-NEXT: flat_store_dword v[0:1], v2 2064; VI-NEXT: s_endpgm 2065; 2066; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: 2067; GFX9: ; %bb.0: 2068; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2069; GFX9-NEXT: v_mov_b32_e32 v0, 0 2070; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2071; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2072; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2073; GFX9-NEXT: s_endpgm 2074; 2075; CI-LABEL: 
; NOTE(review): @test_fold_canonicalize_qnan_value_neg2_v2f16 builds each lane from i16 -2 (0xfffe), another
; sign-set NaN encoding, and expects the same packed immediate 0x7e007e00 as the neg1 test.
; @test_fold_canonicalize_snan0_value_v2f16 starts afterwards; its operand appears further down.
test_fold_canonicalize_qnan_value_neg2_v2f16: 2076; CI: ; %bb.0: 2077; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2078; CI-NEXT: s_mov_b32 s3, 0xf000 2079; CI-NEXT: s_mov_b32 s2, -1 2080; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 2081; CI-NEXT: s_waitcnt lgkmcnt(0) 2082; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2083; CI-NEXT: s_endpgm 2084; 2085; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: 2086; GFX11: ; %bb.0: 2087; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2088; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 2089; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2090; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2091; GFX11-NEXT: s_endpgm 2092 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>) 2093 store <2 x half> %canonicalized, ptr addrspace(1) %out 2094 ret void 2095} 2096 2097define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 { 2098; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16: 2099; VI: ; %bb.0: 2100; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2101; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 2102; VI-NEXT: s_waitcnt lgkmcnt(0) 2103; VI-NEXT: v_mov_b32_e32 v0, s0 2104; VI-NEXT: v_mov_b32_e32 v1, s1 2105; VI-NEXT: flat_store_dword v[0:1], v2 2106; VI-NEXT: s_endpgm 2107; 2108; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16: 2109; GFX9: ; %bb.0: 2110; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2111; GFX9-NEXT: v_mov_b32_e32 v0, 0 2112; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2113; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2114; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2115; GFX9-NEXT: s_endpgm 2116; 2117; CI-LABEL: test_fold_canonicalize_snan0_value_v2f16: 2118; CI: ; %bb.0: 2119; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2120; CI-NEXT: s_mov_b32 s3, 0xf000 2121; CI-NEXT: s_mov_b32 s2, -1 2122; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 2123; CI-NEXT: s_waitcnt lgkmcnt(0) 2124; CI-NEXT: buffer_store_dword v0, 
; NOTE(review): @test_fold_canonicalize_snan0_value_v2f16 passes the splat 0xH7C01 (exponent all ones, quiet bit
; clear, non-zero payload: a signaling NaN) and expects it folded to the packed quiet-NaN immediate 0x7e007e00.
; @test_fold_canonicalize_snan1_value_v2f16, which starts afterwards, expects the same immediate.
off, s[0:3], 0 2125; CI-NEXT: s_endpgm 2126; 2127; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16: 2128; GFX11: ; %bb.0: 2129; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2130; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 2131; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2132; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2133; GFX11-NEXT: s_endpgm 2134 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>) 2135 store <2 x half> %canonicalized, ptr addrspace(1) %out 2136 ret void 2137} 2138 2139define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 { 2140; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16: 2141; VI: ; %bb.0: 2142; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2143; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 2144; VI-NEXT: s_waitcnt lgkmcnt(0) 2145; VI-NEXT: v_mov_b32_e32 v0, s0 2146; VI-NEXT: v_mov_b32_e32 v1, s1 2147; VI-NEXT: flat_store_dword v[0:1], v2 2148; VI-NEXT: s_endpgm 2149; 2150; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16: 2151; GFX9: ; %bb.0: 2152; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2153; GFX9-NEXT: v_mov_b32_e32 v0, 0 2154; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2155; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2156; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2157; GFX9-NEXT: s_endpgm 2158; 2159; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16: 2160; CI: ; %bb.0: 2161; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2162; CI-NEXT: s_mov_b32 s3, 0xf000 2163; CI-NEXT: s_mov_b32 s2, -1 2164; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 2165; CI-NEXT: s_waitcnt lgkmcnt(0) 2166; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2167; CI-NEXT: s_endpgm 2168; 2169; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16: 2170; GFX11: ; %bb.0: 2171; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2172; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 2173; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2174; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 
; NOTE(review): this line closes @test_fold_canonicalize_snan1_value_v2f16 (operand 0xH7DFF) and contains all of
; @test_fold_canonicalize_snan2_value_v2f16 (operand 0xHFDFF, the same payload with the sign bit set).
; Both expect the packed immediate 0x7e007e00, so neither the sign nor the payload of the input NaN shows up
; in the expected constant. @test_fold_canonicalize_snan3_value_v2f16 begins at the end of the line.
2175; GFX11-NEXT: s_endpgm 2176 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>) 2177 store <2 x half> %canonicalized, ptr addrspace(1) %out 2178 ret void 2179} 2180 2181define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 { 2182; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16: 2183; VI: ; %bb.0: 2184; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2185; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 2186; VI-NEXT: s_waitcnt lgkmcnt(0) 2187; VI-NEXT: v_mov_b32_e32 v0, s0 2188; VI-NEXT: v_mov_b32_e32 v1, s1 2189; VI-NEXT: flat_store_dword v[0:1], v2 2190; VI-NEXT: s_endpgm 2191; 2192; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16: 2193; GFX9: ; %bb.0: 2194; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2195; GFX9-NEXT: v_mov_b32_e32 v0, 0 2196; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2197; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2198; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2199; GFX9-NEXT: s_endpgm 2200; 2201; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16: 2202; CI: ; %bb.0: 2203; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2204; CI-NEXT: s_mov_b32 s3, 0xf000 2205; CI-NEXT: s_mov_b32 s2, -1 2206; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 2207; CI-NEXT: s_waitcnt lgkmcnt(0) 2208; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2209; CI-NEXT: s_endpgm 2210; 2211; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16: 2212; GFX11: ; %bb.0: 2213; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2214; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 2215; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2216; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2217; GFX11-NEXT: s_endpgm 2218 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>) 2219 store <2 x half> %canonicalized, ptr addrspace(1) %out 2220 ret void 2221} 2222 2223define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 { 2224; VI-LABEL: 
; NOTE(review): @test_fold_canonicalize_snan3_value_v2f16 passes the splat 0xHFC01 (sign-set signaling NaN) and
; again expects the packed immediate 0x7e007e00.
; @test_canonicalize_var_v3f16 (v_ prefix), starting afterwards, is a non-kernel function on a runtime value. Its
; VI checks canonicalize each element with v_max_f16 of the value against itself (an SDWA form for the high
; half, recombined with v_or_b32) instead of folding to a constant.
test_fold_canonicalize_snan3_value_v2f16: 2225; VI: ; %bb.0: 2226; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2227; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 2228; VI-NEXT: s_waitcnt lgkmcnt(0) 2229; VI-NEXT: v_mov_b32_e32 v0, s0 2230; VI-NEXT: v_mov_b32_e32 v1, s1 2231; VI-NEXT: flat_store_dword v[0:1], v2 2232; VI-NEXT: s_endpgm 2233; 2234; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16: 2235; GFX9: ; %bb.0: 2236; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2237; GFX9-NEXT: v_mov_b32_e32 v0, 0 2238; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2239; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2240; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2241; GFX9-NEXT: s_endpgm 2242; 2243; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16: 2244; CI: ; %bb.0: 2245; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2246; CI-NEXT: s_mov_b32 s3, 0xf000 2247; CI-NEXT: s_mov_b32 s2, -1 2248; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 2249; CI-NEXT: s_waitcnt lgkmcnt(0) 2250; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2251; CI-NEXT: s_endpgm 2252; 2253; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16: 2254; GFX11: ; %bb.0: 2255; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2256; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 2257; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2258; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2259; GFX11-NEXT: s_endpgm 2260 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>) 2261 store <2 x half> %canonicalized, ptr addrspace(1) %out 2262 ret void 2263} 2264 2265define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 { 2266; VI-LABEL: v_test_canonicalize_var_v3f16: 2267; VI: ; %bb.0: 2268; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2269; VI-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2270; VI-NEXT: v_max_f16_e32 v0, v0, v0 2271; VI-NEXT: v_max_f16_e32 v1, v1, v1 2272; VI-NEXT: v_or_b32_e32 v0, v0, v2 2273; VI-NEXT: s_setpc_b64 
; NOTE(review): remaining checks for @v_test_canonicalize_var_v3f16. GFX9 and GFX11 expect two v_pk_max_f16
; instructions (each register against itself). The CI checks contain no max at all: v0..v2 each go through
; v_cvt_f16_f32 followed by v_cvt_f32_f16.
; @v_test_canonicalize_var_v4f16 starts afterwards; its VI checks use two SDWA v_max_f16 plus two plain ones.
s[30:31] 2274; 2275; GFX9-LABEL: v_test_canonicalize_var_v3f16: 2276; GFX9: ; %bb.0: 2277; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2278; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 2279; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 2280; GFX9-NEXT: s_setpc_b64 s[30:31] 2281; 2282; CI-LABEL: v_test_canonicalize_var_v3f16: 2283; CI: ; %bb.0: 2284; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2285; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2286; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2287; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2288; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2289; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 2290; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 2291; CI-NEXT: s_setpc_b64 s[30:31] 2292; 2293; GFX11-LABEL: v_test_canonicalize_var_v3f16: 2294; GFX11: ; %bb.0: 2295; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2296; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 2297; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 2298; GFX11-NEXT: s_setpc_b64 s[30:31] 2299 %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val) 2300 ret <3 x half> %canonicalized 2301} 2302 2303define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { 2304; VI-LABEL: v_test_canonicalize_var_v4f16: 2305; VI: ; %bb.0: 2306; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2307; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2308; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2309; VI-NEXT: v_max_f16_e32 v1, v1, v1 2310; VI-NEXT: v_max_f16_e32 v0, v0, v0 2311; VI-NEXT: v_or_b32_e32 v0, v0, v3 2312; VI-NEXT: v_or_b32_e32 v1, v1, v2 2313; VI-NEXT: s_setpc_b64 s[30:31] 2314; 2315; GFX9-LABEL: v_test_canonicalize_var_v4f16: 2316; GFX9: ; %bb.0: 2317; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2318; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 2319; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 2320; GFX9-NEXT: s_setpc_b64 s[30:31] 2321; 2322; CI-LABEL: v_test_canonicalize_var_v4f16: 2323; CI: ; %bb.0: 2324; CI-NEXT: 
; NOTE(review): CI and GFX11 checks for @v_test_canonicalize_var_v4f16. CI again expects only v_cvt_f16_f32 /
; v_cvt_f32_f16 pairs (here on v0..v3); GFX11 expects two v_pk_max_f16.
; @s_test_canonicalize_undef_v2f16 starts afterwards: every visible prefix family expects a plain store of the
; constant 0. Its operand (on the following line) is <2 x half> undef.
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2325; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2326; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2327; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2328; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 2329; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2330; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 2331; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 2332; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 2333; CI-NEXT: s_setpc_b64 s[30:31] 2334; 2335; GFX11-LABEL: v_test_canonicalize_var_v4f16: 2336; GFX11: ; %bb.0: 2337; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2338; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 2339; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 2340; GFX11-NEXT: s_setpc_b64 s[30:31] 2341 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val) 2342 ret <4 x half> %canonicalized 2343} 2344 2345define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 { 2346; VI-LABEL: s_test_canonicalize_undef_v2f16: 2347; VI: ; %bb.0: 2348; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2349; VI-NEXT: v_mov_b32_e32 v2, 0 2350; VI-NEXT: s_waitcnt lgkmcnt(0) 2351; VI-NEXT: v_mov_b32_e32 v0, s0 2352; VI-NEXT: v_mov_b32_e32 v1, s1 2353; VI-NEXT: flat_store_dword v[0:1], v2 2354; VI-NEXT: s_endpgm 2355; 2356; GFX9-LABEL: s_test_canonicalize_undef_v2f16: 2357; GFX9: ; %bb.0: 2358; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2359; GFX9-NEXT: v_mov_b32_e32 v0, 0 2360; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2361; GFX9-NEXT: global_store_dword v0, v0, s[0:1] 2362; GFX9-NEXT: s_endpgm 2363; 2364; CI-LABEL: s_test_canonicalize_undef_v2f16: 2365; CI: ; %bb.0: 2366; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2367; CI-NEXT: s_mov_b32 s3, 0xf000 2368; CI-NEXT: s_mov_b32 s2, -1 2369; CI-NEXT: v_mov_b32_e32 v0, 0 2370; CI-NEXT: s_waitcnt lgkmcnt(0) 2371; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2372; CI-NEXT: s_endpgm 2373; 2374; GFX11-LABEL: s_test_canonicalize_undef_v2f16: 2375; GFX11: ; %bb.0: 2376; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2377; GFX11-NEXT: v_mov_b32_e32 v0, 0 2378; 
; NOTE(review): after the tail of @s_test_canonicalize_undef_v2f16 (operand <2 x half> undef, 0 stored), this
; line holds @v_test_canonicalize_reg_undef_v2f16: %val is inserted at index 0 of an undef vector.
; Expected code: VI a single v_max_f16_e32; GFX9 and both GFX11 variants v_max_f16 followed by v_pack_b32_f16
; with 0 as the high half; the CI checks (one value per register) put the constant 0x7fc00000 in v1.
GFX11-NEXT: s_waitcnt lgkmcnt(0) 2379; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] 2380; GFX11-NEXT: s_endpgm 2381 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) 2382 store <2 x half> %canonicalized, ptr addrspace(1) %out 2383 ret void 2384} 2385 2386define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { 2387; VI-LABEL: v_test_canonicalize_reg_undef_v2f16: 2388; VI: ; %bb.0: 2389; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2390; VI-NEXT: v_max_f16_e32 v0, v0, v0 2391; VI-NEXT: s_setpc_b64 s[30:31] 2392; 2393; GFX9-LABEL: v_test_canonicalize_reg_undef_v2f16: 2394; GFX9: ; %bb.0: 2395; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2396; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 2397; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0 2398; GFX9-NEXT: s_setpc_b64 s[30:31] 2399; 2400; CI-LABEL: v_test_canonicalize_reg_undef_v2f16: 2401; CI: ; %bb.0: 2402; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2403; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2404; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 2405; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2406; CI-NEXT: s_setpc_b64 s[30:31] 2407; 2408; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_v2f16: 2409; GFX11-TRUE16: ; %bb.0: 2410; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2411; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l 2412; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 2413; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0 2414; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 2415; 2416; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_v2f16: 2417; GFX11-FAKE16: ; %bb.0: 2418; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2419; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 2420; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 2421; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0 2422; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 2423 %vec = insertelement <2 x half> undef, half %val, i32 0 2424 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) 2425 ret 
; NOTE(review): @v_test_canonicalize_undef_reg_v2f16 mirrors the reg_undef test with %val inserted at index 1.
; Expected code: VI and GFX9 a single v_max_f16_sdwa writing WORD_1; both GFX11 variants v_max_f16 followed by
; v_lshlrev_b32 by 16; the CI checks put the converted value in v1 and the constant 0x7fc00000 in v0.
; @v_test_canonicalize_undef_lo_imm_hi_v2f16 begins at the end of this line.
<2 x half> %canonicalized 2426} 2427 2428define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { 2429; VI-LABEL: v_test_canonicalize_undef_reg_v2f16: 2430; VI: ; %bb.0: 2431; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2432; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2433; VI-NEXT: s_setpc_b64 s[30:31] 2434; 2435; GFX9-LABEL: v_test_canonicalize_undef_reg_v2f16: 2436; GFX9: ; %bb.0: 2437; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2438; GFX9-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2439; GFX9-NEXT: s_setpc_b64 s[30:31] 2440; 2441; CI-LABEL: v_test_canonicalize_undef_reg_v2f16: 2442; CI: ; %bb.0: 2443; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2444; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2445; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 2446; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 2447; CI-NEXT: s_setpc_b64 s[30:31] 2448; 2449; GFX11-TRUE16-LABEL: v_test_canonicalize_undef_reg_v2f16: 2450; GFX11-TRUE16: ; %bb.0: 2451; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2452; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l 2453; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 2454; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2455; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 2456; 2457; GFX11-FAKE16-LABEL: v_test_canonicalize_undef_reg_v2f16: 2458; GFX11-FAKE16: ; %bb.0: 2459; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2460; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 2461; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 2462; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2463; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 2464 %vec = insertelement <2 x half> undef, half %val, i32 1 2465 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) 2466 ret <2 x half> %canonicalized 2467} 2468 2469define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 { 2470; VI-LABEL: 
; NOTE(review): @v_test_canonicalize_undef_lo_imm_hi_v2f16 puts the constant 1.0 at index 1 of an undef vector.
; VI, GFX9 and GFX11 expect v_bfrev_b32 of 60; bit-reversing 60 gives 0x3c000000, i.e. 0x3c00 (half 1.0) in
; the high half and 0 in the low half. The CI checks expect v0 = 0 and v1 = 1.0.
; @v_test_canonicalize_imm_lo_undef_hi_v2f16 (1.0 at index 0) follows: 0x3c00 on VI/GFX9, and 1.0 / 0 in
; v0 / v1 for the CI checks.
v_test_canonicalize_undef_lo_imm_hi_v2f16: 2471; VI: ; %bb.0: 2472; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2473; VI-NEXT: v_bfrev_b32_e32 v0, 60 2474; VI-NEXT: s_setpc_b64 s[30:31] 2475; 2476; GFX9-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16: 2477; GFX9: ; %bb.0: 2478; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2479; GFX9-NEXT: v_bfrev_b32_e32 v0, 60 2480; GFX9-NEXT: s_setpc_b64 s[30:31] 2481; 2482; CI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16: 2483; CI: ; %bb.0: 2484; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2485; CI-NEXT: v_mov_b32_e32 v0, 0 2486; CI-NEXT: v_mov_b32_e32 v1, 1.0 2487; CI-NEXT: s_setpc_b64 s[30:31] 2488; 2489; GFX11-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16: 2490; GFX11: ; %bb.0: 2491; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2492; GFX11-NEXT: v_bfrev_b32_e32 v0, 60 2493; GFX11-NEXT: s_setpc_b64 s[30:31] 2494 %vec = insertelement <2 x half> undef, half 1.0, i32 1 2495 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) 2496 ret <2 x half> %canonicalized 2497} 2498 2499define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 { 2500; VI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16: 2501; VI: ; %bb.0: 2502; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2503; VI-NEXT: v_mov_b32_e32 v0, 0x3c00 2504; VI-NEXT: s_setpc_b64 s[30:31] 2505; 2506; GFX9-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16: 2507; GFX9: ; %bb.0: 2508; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2509; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 2510; GFX9-NEXT: s_setpc_b64 s[30:31] 2511; 2512; CI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16: 2513; CI: ; %bb.0: 2514; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2515; CI-NEXT: v_mov_b32_e32 v0, 1.0 2516; CI-NEXT: v_mov_b32_e32 v1, 0 2517; CI-NEXT: s_setpc_b64 s[30:31] 2518; 2519; GFX11-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16: 2520; GFX11: ; %bb.0: 2521; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2522; GFX11-NEXT: 
v_mov_b32_e32 v0, 0x3c00 2523; GFX11-NEXT: s_setpc_b64 s[30:31] 2524 %vec = insertelement <2 x half> undef, half 1.0, i32 0 2525 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) 2526 ret <2 x half> %canonicalized 2527} 2528 2529define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 { 2530; VI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16: 2531; VI: ; %bb.0: 2532; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2533; VI-NEXT: v_bfrev_b32_e32 v0, 50 2534; VI-NEXT: s_setpc_b64 s[30:31] 2535; 2536; GFX9-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16: 2537; GFX9: ; %bb.0: 2538; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2539; GFX9-NEXT: v_bfrev_b32_e32 v0, 50 2540; GFX9-NEXT: s_setpc_b64 s[30:31] 2541; 2542; CI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16: 2543; CI: ; %bb.0: 2544; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2545; CI-NEXT: v_mov_b32_e32 v0, 0 2546; CI-NEXT: v_mov_b32_e32 v1, 0x41800000 2547; CI-NEXT: s_setpc_b64 s[30:31] 2548; 2549; GFX11-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16: 2550; GFX11: ; %bb.0: 2551; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2552; GFX11-NEXT: v_bfrev_b32_e32 v0, 50 2553; GFX11-NEXT: s_setpc_b64 s[30:31] 2554 %vec = insertelement <2 x half> undef, half 16.0, i32 1 2555 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) 2556 ret <2 x half> %canonicalized 2557} 2558 2559define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 { 2560; VI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16: 2561; VI: ; %bb.0: 2562; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2563; VI-NEXT: v_mov_b32_e32 v0, 0x4c00 2564; VI-NEXT: s_setpc_b64 s[30:31] 2565; 2566; GFX9-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16: 2567; GFX9: ; %bb.0: 2568; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2569; GFX9-NEXT: v_mov_b32_e32 v0, 0x4c00 2570; GFX9-NEXT: s_setpc_b64 s[30:31] 2571; 2572; CI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16: 2573; CI: ; %bb.0: 2574; 
CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2575; CI-NEXT: v_mov_b32_e32 v0, 0x41800000 2576; CI-NEXT: v_mov_b32_e32 v1, 0 2577; CI-NEXT: s_setpc_b64 s[30:31] 2578; 2579; GFX11-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16: 2580; GFX11: ; %bb.0: 2581; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2582; GFX11-NEXT: v_mov_b32_e32 v0, 0x4c00 2583; GFX11-NEXT: s_setpc_b64 s[30:31] 2584 %vec = insertelement <2 x half> undef, half 16.0, i32 0 2585 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) 2586 ret <2 x half> %canonicalized 2587} 2588 2589define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 { 2590; VI-LABEL: v_test_canonicalize_reg_k_v2f16: 2591; VI: ; %bb.0: 2592; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2593; VI-NEXT: v_max_f16_e32 v0, v0, v0 2594; VI-NEXT: v_or_b32_e32 v0, 2.0, v0 2595; VI-NEXT: s_setpc_b64 s[30:31] 2596; 2597; GFX9-LABEL: v_test_canonicalize_reg_k_v2f16: 2598; GFX9: ; %bb.0: 2599; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2600; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 2601; GFX9-NEXT: v_pack_b32_f16 v0, v0, 2.0 2602; GFX9-NEXT: s_setpc_b64 s[30:31] 2603; 2604; CI-LABEL: v_test_canonicalize_reg_k_v2f16: 2605; CI: ; %bb.0: 2606; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2607; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2608; CI-NEXT: v_mov_b32_e32 v1, 2.0 2609; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2610; CI-NEXT: s_setpc_b64 s[30:31] 2611; 2612; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_k_v2f16: 2613; GFX11-TRUE16: ; %bb.0: 2614; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2615; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l 2616; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 2617; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 2.0 2618; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 2619; 2620; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_k_v2f16: 2621; GFX11-FAKE16: ; %bb.0: 2622; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2623; GFX11-FAKE16-NEXT: v_max_f16_e32 
v0, v0, v0 2624; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 2625; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 2.0 2626; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 2627 %vec0 = insertelement <2 x half> undef, half %val, i32 0 2628 %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1 2629 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) 2630 ret <2 x half> %canonicalized 2631} 2632 2633define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { 2634; VI-LABEL: v_test_canonicalize_k_reg_v2f16: 2635; VI: ; %bb.0: 2636; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2637; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2638; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0 2639; VI-NEXT: s_setpc_b64 s[30:31] 2640; 2641; GFX9-LABEL: v_test_canonicalize_k_reg_v2f16: 2642; GFX9: ; %bb.0: 2643; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2644; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 2645; GFX9-NEXT: v_pack_b32_f16 v0, 2.0, v0 2646; GFX9-NEXT: s_setpc_b64 s[30:31] 2647; 2648; CI-LABEL: v_test_canonicalize_k_reg_v2f16: 2649; CI: ; %bb.0: 2650; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2651; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2652; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 2653; CI-NEXT: v_mov_b32_e32 v0, 2.0 2654; CI-NEXT: s_setpc_b64 s[30:31] 2655; 2656; GFX11-TRUE16-LABEL: v_test_canonicalize_k_reg_v2f16: 2657; GFX11-TRUE16: ; %bb.0: 2658; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2659; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l 2660; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 2661; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, 2.0, v0.l 2662; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 2663; 2664; GFX11-FAKE16-LABEL: v_test_canonicalize_k_reg_v2f16: 2665; GFX11-FAKE16: ; %bb.0: 2666; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2667; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 2668; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 2669; 
GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, 2.0, v0 2670; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 2671 %vec0 = insertelement <2 x half> undef, half 2.0, i32 0 2672 %vec1 = insertelement <2 x half> %vec0, half %val, i32 1 2673 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) 2674 ret <2 x half> %canonicalized 2675} 2676 2677define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 { 2678; VI-LABEL: s_test_canonicalize_undef_v4f16: 2679; VI: ; %bb.0: 2680; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2681; VI-NEXT: v_mov_b32_e32 v0, 0 2682; VI-NEXT: v_mov_b32_e32 v1, v0 2683; VI-NEXT: s_waitcnt lgkmcnt(0) 2684; VI-NEXT: v_mov_b32_e32 v3, s1 2685; VI-NEXT: v_mov_b32_e32 v2, s0 2686; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2687; VI-NEXT: s_endpgm 2688; 2689; GFX9-LABEL: s_test_canonicalize_undef_v4f16: 2690; GFX9: ; %bb.0: 2691; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2692; GFX9-NEXT: v_mov_b32_e32 v0, 0 2693; GFX9-NEXT: v_mov_b32_e32 v1, v0 2694; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2695; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 2696; GFX9-NEXT: s_endpgm 2697; 2698; CI-LABEL: s_test_canonicalize_undef_v4f16: 2699; CI: ; %bb.0: 2700; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2701; CI-NEXT: v_mov_b32_e32 v0, 0 2702; CI-NEXT: s_mov_b32 s3, 0xf000 2703; CI-NEXT: s_mov_b32 s2, -1 2704; CI-NEXT: v_mov_b32_e32 v1, v0 2705; CI-NEXT: s_waitcnt lgkmcnt(0) 2706; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2707; CI-NEXT: s_endpgm 2708; 2709; GFX11-LABEL: s_test_canonicalize_undef_v4f16: 2710; GFX11: ; %bb.0: 2711; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2712; GFX11-NEXT: v_mov_b32_e32 v0, 0 2713; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2714; GFX11-NEXT: v_mov_b32_e32 v1, v0 2715; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2716; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 2717; GFX11-NEXT: s_endpgm 2718 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) 2719 store <4 x half> 
%canonicalized, ptr addrspace(1) %out 2720 ret void 2721} 2722 2723define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 { 2724; VI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: 2725; VI: ; %bb.0: 2726; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2727; VI-NEXT: v_max_f16_e32 v0, v0, v0 2728; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 2729; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2730; VI-NEXT: s_setpc_b64 s[30:31] 2731; 2732; GFX9-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: 2733; GFX9: ; %bb.0: 2734; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2735; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 2736; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0 2737; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2738; GFX9-NEXT: s_setpc_b64 s[30:31] 2739; 2740; CI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: 2741; CI: ; %bb.0: 2742; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2743; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2744; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 2745; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 2746; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 2747; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2748; CI-NEXT: s_setpc_b64 s[30:31] 2749; 2750; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: 2751; GFX11-TRUE16: ; %bb.0: 2752; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2753; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l 2754; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2755; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 2756; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0 2757; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 2758; 2759; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: 2760; GFX11-FAKE16: ; %bb.0: 2761; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2762; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 2763; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2764; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 2765; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0 2766; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 2767 %vec = insertelement <4 x half> undef, half %val, i32 0 2768 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec) 2769 ret <4 x half> %canonicalized 2770} 2771 2772define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 { 2773; VI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: 2774; VI: ; %bb.0: 2775; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2776; VI-NEXT: v_max_f16_e32 v0, v0, v0 2777; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2778; VI-NEXT: v_or_b32_e32 v0, v0, v1 2779; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2780; VI-NEXT: s_setpc_b64 s[30:31] 2781; 2782; GFX9-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: 2783; GFX9: ; %bb.0: 2784; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2785; GFX9-NEXT: s_mov_b32 s4, 0x5040100 2786; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 2787; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 2788; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2789; GFX9-NEXT: s_setpc_b64 s[30:31] 2790; 2791; CI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: 2792; CI: ; %bb.0: 2793; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2794; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2795; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2796; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 2797; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 2798; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2799; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 2800; CI-NEXT: s_setpc_b64 s[30:31] 2801; 2802; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: 2803; GFX11: ; %bb.0: 2804; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2805; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 2806; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00 2807; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2808; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 2809; GFX11-NEXT: s_setpc_b64 s[30:31] 2810 %vec0 = insertelement <4 x half> undef, half %val0, i32 0 2811 %vec1 = insertelement <4 x half> 
%vec0, half %val1, i32 1 2812 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1) 2813 ret <4 x half> %canonicalized 2814} 2815 2816define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 { 2817; VI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: 2818; VI: ; %bb.0: 2819; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2820; VI-NEXT: v_max_f16_e32 v1, v1, v1 2821; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2822; VI-NEXT: v_max_f16_e32 v0, v0, v0 2823; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 2824; VI-NEXT: v_or_b32_e32 v1, v1, v2 2825; VI-NEXT: s_setpc_b64 s[30:31] 2826; 2827; GFX9-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: 2828; GFX9: ; %bb.0: 2829; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2830; GFX9-NEXT: s_mov_b32 s4, 0x5040100 2831; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 2832; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 2833; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 2834; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0 2835; GFX9-NEXT: s_setpc_b64 s[30:31] 2836; 2837; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: 2838; CI: ; %bb.0: 2839; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2840; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2841; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2842; CI-NEXT: v_cvt_f16_f32_e32 v3, v2 2843; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2844; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 2845; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 2846; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 2847; CI-NEXT: s_setpc_b64 s[30:31] 2848; 2849; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: 2850; GFX11-TRUE16: ; %bb.0: 2851; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2852; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l 2853; GFX11-TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 2854; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2855; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0 2856; 
GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 2857; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 2858; 2859; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: 2860; GFX11-FAKE16: ; %bb.0: 2861; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2862; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 2863; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 2864; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2865; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0 2866; GFX11-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 2867; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 2868 %vec0 = insertelement <4 x half> undef, half %val0, i32 0 2869 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2 2870 %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3 2871 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2) 2872 ret <4 x half> %canonicalized 2873} 2874 2875define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { 2876; VI-LABEL: v_test_canonicalize_var_v6f16: 2877; VI: ; %bb.0: 2878; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2879; VI-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2880; VI-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2881; VI-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2882; VI-NEXT: v_max_f16_e32 v2, v2, v2 2883; VI-NEXT: v_max_f16_e32 v1, v1, v1 2884; VI-NEXT: v_max_f16_e32 v0, v0, v0 2885; VI-NEXT: v_or_b32_e32 v0, v0, v5 2886; VI-NEXT: v_or_b32_e32 v1, v1, v4 2887; VI-NEXT: v_or_b32_e32 v2, v2, v3 2888; VI-NEXT: s_setpc_b64 s[30:31] 2889; 2890; GFX9-LABEL: v_test_canonicalize_var_v6f16: 2891; GFX9: ; %bb.0: 2892; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2893; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 2894; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 2895; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 2896; GFX9-NEXT: 
s_setpc_b64 s[30:31] 2897; 2898; CI-LABEL: v_test_canonicalize_var_v6f16: 2899; CI: ; %bb.0: 2900; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2901; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 2902; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 2903; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2904; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2905; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2906; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 2907; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 2908; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 2909; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2910; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 2911; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 2912; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 2913; CI-NEXT: s_setpc_b64 s[30:31] 2914; 2915; GFX11-LABEL: v_test_canonicalize_var_v6f16: 2916; GFX11: ; %bb.0: 2917; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2918; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 2919; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 2920; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 2921; GFX11-NEXT: s_setpc_b64 s[30:31] 2922 %canonicalized = call <6 x half> @llvm.canonicalize.v6f16(<6 x half> %val) 2923 ret <6 x half> %canonicalized 2924} 2925 2926define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { 2927; VI-LABEL: v_test_canonicalize_var_v8f16: 2928; VI: ; %bb.0: 2929; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2930; VI-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2931; VI-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2932; VI-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2933; VI-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2934; VI-NEXT: v_max_f16_e32 v3, v3, v3 2935; VI-NEXT: v_max_f16_e32 v2, v2, v2 2936; VI-NEXT: v_max_f16_e32 v1, v1, v1 2937; VI-NEXT: v_max_f16_e32 v0, v0, v0 2938; VI-NEXT: v_or_b32_e32 v0, v0, v7 2939; VI-NEXT: v_or_b32_e32 v1, v1, v6 2940; VI-NEXT: v_or_b32_e32 v2, 
v2, v5 2941; VI-NEXT: v_or_b32_e32 v3, v3, v4 2942; VI-NEXT: s_setpc_b64 s[30:31] 2943; 2944; GFX9-LABEL: v_test_canonicalize_var_v8f16: 2945; GFX9: ; %bb.0: 2946; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2947; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 2948; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 2949; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 2950; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 2951; GFX9-NEXT: s_setpc_b64 s[30:31] 2952; 2953; CI-LABEL: v_test_canonicalize_var_v8f16: 2954; CI: ; %bb.0: 2955; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2956; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 2957; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 2958; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 2959; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 2960; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2961; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2962; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2963; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 2964; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 2965; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 2966; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 2967; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 2968; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2969; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 2970; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 2971; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 2972; CI-NEXT: s_setpc_b64 s[30:31] 2973; 2974; GFX11-LABEL: v_test_canonicalize_var_v8f16: 2975; GFX11: ; %bb.0: 2976; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2977; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 2978; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 2979; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 2980; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 2981; GFX11-NEXT: s_setpc_b64 s[30:31] 2982 %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %val) 2983 ret <8 x half> %canonicalized 2984} 2985 2986define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { 2987; VI-LABEL: v_test_canonicalize_var_v12f16: 2988; VI: ; %bb.0: 2989; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2990; VI-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 
2991; VI-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2992; VI-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2993; VI-NEXT: v_max_f16_sdwa v9, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2994; VI-NEXT: v_max_f16_sdwa v10, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2995; VI-NEXT: v_max_f16_sdwa v11, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2996; VI-NEXT: v_max_f16_e32 v5, v5, v5 2997; VI-NEXT: v_max_f16_e32 v4, v4, v4 2998; VI-NEXT: v_max_f16_e32 v3, v3, v3 2999; VI-NEXT: v_max_f16_e32 v2, v2, v2 3000; VI-NEXT: v_max_f16_e32 v1, v1, v1 3001; VI-NEXT: v_max_f16_e32 v0, v0, v0 3002; VI-NEXT: v_or_b32_e32 v0, v0, v11 3003; VI-NEXT: v_or_b32_e32 v1, v1, v10 3004; VI-NEXT: v_or_b32_e32 v2, v2, v9 3005; VI-NEXT: v_or_b32_e32 v3, v3, v8 3006; VI-NEXT: v_or_b32_e32 v4, v4, v7 3007; VI-NEXT: v_or_b32_e32 v5, v5, v6 3008; VI-NEXT: s_setpc_b64 s[30:31] 3009; 3010; GFX9-LABEL: v_test_canonicalize_var_v12f16: 3011; GFX9: ; %bb.0: 3012; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3013; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 3014; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 3015; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 3016; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 3017; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 3018; GFX9-NEXT: v_pk_max_f16 v5, v5, v5 3019; GFX9-NEXT: s_setpc_b64 s[30:31] 3020; 3021; CI-LABEL: v_test_canonicalize_var_v12f16: 3022; CI: ; %bb.0: 3023; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3024; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 3025; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 3026; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 3027; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 3028; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 3029; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 3030; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 3031; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 3032; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 3033; CI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 3034; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 3035; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 3036; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 3037; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 3038; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 3039; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 3040; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 3041; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 3042; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 3043; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 3044; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 3045; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 3046; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 3047; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 3048; CI-NEXT: s_setpc_b64 s[30:31] 3049; 3050; GFX11-LABEL: v_test_canonicalize_var_v12f16: 3051; GFX11: ; %bb.0: 3052; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3053; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 3054; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 3055; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 3056; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 3057; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 3058; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 3059; GFX11-NEXT: s_setpc_b64 s[30:31] 3060 %canonicalized = call <12 x half> @llvm.canonicalize.v12f16(<12 x half> %val) 3061 ret <12 x half> %canonicalized 3062} 3063 3064define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { 3065; VI-LABEL: v_test_canonicalize_var_v16f16: 3066; VI: ; %bb.0: 3067; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3068; VI-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3069; VI-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3070; VI-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3071; VI-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3072; VI-NEXT: v_max_f16_sdwa v12, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3073; VI-NEXT: v_max_f16_sdwa v13, v2, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3074; VI-NEXT: v_max_f16_sdwa v14, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3075; VI-NEXT: v_max_f16_sdwa v15, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3076; VI-NEXT: v_max_f16_e32 v7, v7, v7 3077; VI-NEXT: v_max_f16_e32 v6, v6, v6 3078; VI-NEXT: v_max_f16_e32 v5, v5, v5 3079; VI-NEXT: v_max_f16_e32 v4, v4, v4 3080; VI-NEXT: v_max_f16_e32 v3, v3, v3 3081; VI-NEXT: v_max_f16_e32 v2, v2, v2 3082; VI-NEXT: v_max_f16_e32 v1, v1, v1 3083; VI-NEXT: v_max_f16_e32 v0, v0, v0 3084; VI-NEXT: v_or_b32_e32 v0, v0, v15 3085; VI-NEXT: v_or_b32_e32 v1, v1, v14 3086; VI-NEXT: v_or_b32_e32 v2, v2, v13 3087; VI-NEXT: v_or_b32_e32 v3, v3, v12 3088; VI-NEXT: v_or_b32_e32 v4, v4, v11 3089; VI-NEXT: v_or_b32_e32 v5, v5, v10 3090; VI-NEXT: v_or_b32_e32 v6, v6, v9 3091; VI-NEXT: v_or_b32_e32 v7, v7, v8 3092; VI-NEXT: s_setpc_b64 s[30:31] 3093; 3094; GFX9-LABEL: v_test_canonicalize_var_v16f16: 3095; GFX9: ; %bb.0: 3096; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3097; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 3098; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 3099; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 3100; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 3101; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 3102; GFX9-NEXT: v_pk_max_f16 v5, v5, v5 3103; GFX9-NEXT: v_pk_max_f16 v6, v6, v6 3104; GFX9-NEXT: v_pk_max_f16 v7, v7, v7 3105; GFX9-NEXT: s_setpc_b64 s[30:31] 3106; 3107; CI-LABEL: v_test_canonicalize_var_v16f16: 3108; CI: ; %bb.0: 3109; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3110; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 3111; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 3112; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 3113; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 3114; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 3115; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 3116; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 3117; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 3118; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 3119; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 3120; 
CI-NEXT: v_cvt_f16_f32_e32 v5, v5 3121; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 3122; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 3123; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 3124; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 3125; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 3126; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 3127; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 3128; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 3129; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 3130; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 3131; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 3132; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 3133; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 3134; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 3135; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 3136; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 3137; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 3138; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 3139; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 3140; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 3141; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 3142; CI-NEXT: s_setpc_b64 s[30:31] 3143; 3144; GFX11-LABEL: v_test_canonicalize_var_v16f16: 3145; GFX11: ; %bb.0: 3146; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3147; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 3148; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 3149; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 3150; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 3151; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 3152; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 3153; GFX11-NEXT: v_pk_max_f16 v6, v6, v6 3154; GFX11-NEXT: v_pk_max_f16 v7, v7, v7 3155; GFX11-NEXT: s_setpc_b64 s[30:31] 3156 %canonicalized = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %val) 3157 ret <16 x half> %canonicalized 3158} 3159 3160define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { 3161; VI-LABEL: v_test_canonicalize_var_v32f16: 3162; VI: ; %bb.0: 3163; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3164; VI-NEXT: v_max_f16_sdwa v19, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3165; VI-NEXT: v_max_f16_e32 v0, v0, v0 3166; VI-NEXT: v_or_b32_e32 v0, v0, v19 3167; VI-NEXT: v_max_f16_sdwa v19, v1, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3168; VI-NEXT: v_max_f16_e32 v1, v1, v1 3169; VI-NEXT: v_or_b32_e32 v1, v1, v19 3170; VI-NEXT: v_max_f16_sdwa v19, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3171; VI-NEXT: v_max_f16_e32 v2, v2, v2 3172; VI-NEXT: v_or_b32_e32 v2, v2, v19 3173; VI-NEXT: v_max_f16_sdwa v19, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3174; VI-NEXT: v_max_f16_e32 v3, v3, v3 3175; VI-NEXT: v_or_b32_e32 v3, v3, v19 3176; VI-NEXT: v_max_f16_sdwa v19, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3177; VI-NEXT: v_max_f16_e32 v4, v4, v4 3178; VI-NEXT: v_or_b32_e32 v4, v4, v19 3179; VI-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3180; VI-NEXT: v_max_f16_e32 v5, v5, v5 3181; VI-NEXT: v_or_b32_e32 v5, v5, v19 3182; VI-NEXT: v_max_f16_sdwa v19, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3183; VI-NEXT: v_max_f16_e32 v6, v6, v6 3184; VI-NEXT: v_or_b32_e32 v6, v6, v19 3185; VI-NEXT: v_max_f16_sdwa v19, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3186; VI-NEXT: v_max_f16_e32 v7, v7, v7 3187; VI-NEXT: v_or_b32_e32 v7, v7, v19 3188; VI-NEXT: v_max_f16_sdwa v19, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3189; VI-NEXT: v_max_f16_e32 v8, v8, v8 3190; VI-NEXT: v_or_b32_e32 v8, v8, v19 3191; VI-NEXT: v_max_f16_sdwa v19, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3192; VI-NEXT: v_max_f16_e32 v9, v9, v9 3193; VI-NEXT: v_or_b32_e32 v9, v9, v19 3194; VI-NEXT: v_max_f16_sdwa v19, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3195; VI-NEXT: v_max_f16_e32 v10, v10, v10 3196; VI-NEXT: v_or_b32_e32 v10, v10, v19 3197; VI-NEXT: v_max_f16_sdwa v19, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3198; 
VI-NEXT: v_max_f16_e32 v11, v11, v11 3199; VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3200; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3201; VI-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3202; VI-NEXT: v_or_b32_e32 v11, v11, v19 3203; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3204; VI-NEXT: v_max_f16_e32 v15, v15, v15 3205; VI-NEXT: v_max_f16_e32 v14, v14, v14 3206; VI-NEXT: v_max_f16_e32 v13, v13, v13 3207; VI-NEXT: v_max_f16_e32 v12, v12, v12 3208; VI-NEXT: v_or_b32_e32 v12, v12, v19 3209; VI-NEXT: v_or_b32_e32 v13, v13, v18 3210; VI-NEXT: v_or_b32_e32 v14, v14, v17 3211; VI-NEXT: v_or_b32_e32 v15, v15, v16 3212; VI-NEXT: s_setpc_b64 s[30:31] 3213; 3214; GFX9-LABEL: v_test_canonicalize_var_v32f16: 3215; GFX9: ; %bb.0: 3216; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3217; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 3218; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 3219; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 3220; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 3221; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 3222; GFX9-NEXT: v_pk_max_f16 v5, v5, v5 3223; GFX9-NEXT: v_pk_max_f16 v6, v6, v6 3224; GFX9-NEXT: v_pk_max_f16 v7, v7, v7 3225; GFX9-NEXT: v_pk_max_f16 v8, v8, v8 3226; GFX9-NEXT: v_pk_max_f16 v9, v9, v9 3227; GFX9-NEXT: v_pk_max_f16 v10, v10, v10 3228; GFX9-NEXT: v_pk_max_f16 v11, v11, v11 3229; GFX9-NEXT: v_pk_max_f16 v12, v12, v12 3230; GFX9-NEXT: v_pk_max_f16 v13, v13, v13 3231; GFX9-NEXT: v_pk_max_f16 v14, v14, v14 3232; GFX9-NEXT: v_pk_max_f16 v15, v15, v15 3233; GFX9-NEXT: s_setpc_b64 s[30:31] 3234; 3235; CI-LABEL: v_test_canonicalize_var_v32f16: 3236; CI: ; %bb.0: 3237; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3238; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 3239; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 3240; CI-NEXT: 
v_cvt_f16_f32_e32 v29, v29 3241; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 3242; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 3243; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 3244; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 3245; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 3246; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 3247; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 3248; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 3249; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 3250; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 3251; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 3252; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 3253; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 3254; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 3255; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 3256; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 3257; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 3258; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 3259; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 3260; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 3261; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 3262; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 3263; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 3264; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 3265; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 3266; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 3267; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 3268; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 3269; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 3270; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 3271; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 3272; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 3273; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 3274; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 3275; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 3276; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 3277; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 3278; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 3279; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 3280; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 3281; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 3282; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 3283; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 3284; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 3285; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 3286; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 3287; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 3288; CI-NEXT: 
v_cvt_f32_f16_e32 v12, v12 3289; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 3290; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 3291; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 3292; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 3293; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 3294; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 3295; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 3296; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 3297; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 3298; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 3299; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 3300; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 3301; CI-NEXT: s_waitcnt vmcnt(0) 3302; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3303; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3304; CI-NEXT: s_setpc_b64 s[30:31] 3305; 3306; GFX11-LABEL: v_test_canonicalize_var_v32f16: 3307; GFX11: ; %bb.0: 3308; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3309; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 3310; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 3311; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 3312; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 3313; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 3314; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 3315; GFX11-NEXT: v_pk_max_f16 v6, v6, v6 3316; GFX11-NEXT: v_pk_max_f16 v7, v7, v7 3317; GFX11-NEXT: v_pk_max_f16 v8, v8, v8 3318; GFX11-NEXT: v_pk_max_f16 v9, v9, v9 3319; GFX11-NEXT: v_pk_max_f16 v10, v10, v10 3320; GFX11-NEXT: v_pk_max_f16 v11, v11, v11 3321; GFX11-NEXT: v_pk_max_f16 v12, v12, v12 3322; GFX11-NEXT: v_pk_max_f16 v13, v13, v13 3323; GFX11-NEXT: v_pk_max_f16 v14, v14, v14 3324; GFX11-NEXT: v_pk_max_f16 v15, v15, v15 3325; GFX11-NEXT: s_setpc_b64 s[30:31] 3326 %canonicalized = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %val) 3327 ret <32 x half> %canonicalized 3328} 3329 3330define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { 3331; VI-LABEL: v_test_canonicalize_var_v64f16: 3332; VI: ; %bb.0: 3333; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3334; VI-NEXT: v_max_f16_sdwa v31, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3335; VI-NEXT: 
v_max_f16_e32 v0, v0, v0 3336; VI-NEXT: v_or_b32_e32 v0, v0, v31 3337; VI-NEXT: v_max_f16_sdwa v31, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3338; VI-NEXT: v_max_f16_e32 v1, v1, v1 3339; VI-NEXT: v_or_b32_e32 v1, v1, v31 3340; VI-NEXT: v_max_f16_sdwa v31, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3341; VI-NEXT: v_max_f16_e32 v2, v2, v2 3342; VI-NEXT: v_or_b32_e32 v2, v2, v31 3343; VI-NEXT: v_max_f16_sdwa v31, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3344; VI-NEXT: v_max_f16_e32 v3, v3, v3 3345; VI-NEXT: v_or_b32_e32 v3, v3, v31 3346; VI-NEXT: v_max_f16_sdwa v31, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3347; VI-NEXT: v_max_f16_e32 v4, v4, v4 3348; VI-NEXT: v_or_b32_e32 v4, v4, v31 3349; VI-NEXT: v_max_f16_sdwa v31, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3350; VI-NEXT: v_max_f16_e32 v5, v5, v5 3351; VI-NEXT: v_or_b32_e32 v5, v5, v31 3352; VI-NEXT: v_max_f16_sdwa v31, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3353; VI-NEXT: v_max_f16_e32 v6, v6, v6 3354; VI-NEXT: v_or_b32_e32 v6, v6, v31 3355; VI-NEXT: v_max_f16_sdwa v31, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3356; VI-NEXT: v_max_f16_e32 v7, v7, v7 3357; VI-NEXT: v_or_b32_e32 v7, v7, v31 3358; VI-NEXT: v_max_f16_sdwa v31, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3359; VI-NEXT: v_max_f16_e32 v8, v8, v8 3360; VI-NEXT: v_or_b32_e32 v8, v8, v31 3361; VI-NEXT: v_max_f16_sdwa v31, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3362; VI-NEXT: v_max_f16_e32 v9, v9, v9 3363; VI-NEXT: v_or_b32_e32 v9, v9, v31 3364; VI-NEXT: v_max_f16_sdwa v31, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3365; VI-NEXT: v_max_f16_e32 v10, v10, v10 3366; VI-NEXT: v_or_b32_e32 v10, v10, v31 3367; VI-NEXT: 
v_max_f16_sdwa v31, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3368; VI-NEXT: v_max_f16_e32 v11, v11, v11 3369; VI-NEXT: v_or_b32_e32 v11, v11, v31 3370; VI-NEXT: v_max_f16_sdwa v31, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3371; VI-NEXT: v_max_f16_e32 v12, v12, v12 3372; VI-NEXT: v_or_b32_e32 v12, v12, v31 3373; VI-NEXT: v_max_f16_sdwa v31, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3374; VI-NEXT: v_max_f16_e32 v13, v13, v13 3375; VI-NEXT: v_or_b32_e32 v13, v13, v31 3376; VI-NEXT: v_max_f16_sdwa v31, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3377; VI-NEXT: v_max_f16_e32 v14, v14, v14 3378; VI-NEXT: v_or_b32_e32 v14, v14, v31 3379; VI-NEXT: v_max_f16_sdwa v31, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3380; VI-NEXT: v_max_f16_e32 v15, v15, v15 3381; VI-NEXT: v_or_b32_e32 v15, v15, v31 3382; VI-NEXT: v_max_f16_sdwa v31, v16, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3383; VI-NEXT: v_max_f16_e32 v16, v16, v16 3384; VI-NEXT: v_or_b32_e32 v16, v16, v31 3385; VI-NEXT: v_max_f16_sdwa v31, v17, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3386; VI-NEXT: v_max_f16_e32 v17, v17, v17 3387; VI-NEXT: v_or_b32_e32 v17, v17, v31 3388; VI-NEXT: v_max_f16_sdwa v31, v18, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3389; VI-NEXT: v_max_f16_e32 v18, v18, v18 3390; VI-NEXT: v_or_b32_e32 v18, v18, v31 3391; VI-NEXT: v_max_f16_sdwa v31, v19, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3392; VI-NEXT: v_max_f16_e32 v19, v19, v19 3393; VI-NEXT: v_or_b32_e32 v19, v19, v31 3394; VI-NEXT: v_max_f16_sdwa v31, v20, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3395; VI-NEXT: v_max_f16_e32 v20, v20, v20 3396; VI-NEXT: v_or_b32_e32 v20, v20, v31 3397; VI-NEXT: v_max_f16_sdwa v31, 
v21, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3398; VI-NEXT: v_max_f16_e32 v21, v21, v21 3399; VI-NEXT: v_or_b32_e32 v21, v21, v31 3400; VI-NEXT: v_max_f16_sdwa v31, v22, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3401; VI-NEXT: v_max_f16_e32 v22, v22, v22 3402; VI-NEXT: v_or_b32_e32 v22, v22, v31 3403; VI-NEXT: v_max_f16_sdwa v31, v23, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3404; VI-NEXT: v_max_f16_e32 v23, v23, v23 3405; VI-NEXT: v_or_b32_e32 v23, v23, v31 3406; VI-NEXT: v_max_f16_sdwa v31, v24, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3407; VI-NEXT: v_max_f16_e32 v24, v24, v24 3408; VI-NEXT: v_or_b32_e32 v24, v24, v31 3409; VI-NEXT: v_max_f16_sdwa v31, v25, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3410; VI-NEXT: v_max_f16_e32 v25, v25, v25 3411; VI-NEXT: v_or_b32_e32 v25, v25, v31 3412; VI-NEXT: v_max_f16_sdwa v31, v26, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3413; VI-NEXT: v_max_f16_e32 v26, v26, v26 3414; VI-NEXT: v_or_b32_e32 v26, v26, v31 3415; VI-NEXT: v_max_f16_sdwa v31, v27, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3416; VI-NEXT: v_max_f16_e32 v27, v27, v27 3417; VI-NEXT: v_or_b32_e32 v27, v27, v31 3418; VI-NEXT: v_max_f16_sdwa v31, v28, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3419; VI-NEXT: v_max_f16_e32 v28, v28, v28 3420; VI-NEXT: v_or_b32_e32 v28, v28, v31 3421; VI-NEXT: v_max_f16_sdwa v31, v29, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3422; VI-NEXT: v_max_f16_e32 v29, v29, v29 3423; VI-NEXT: v_or_b32_e32 v29, v29, v31 3424; VI-NEXT: v_max_f16_sdwa v31, v30, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3425; VI-NEXT: v_max_f16_e32 v30, v30, v30 3426; VI-NEXT: v_or_b32_e32 v30, v30, v31 3427; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 
3428; VI-NEXT: s_waitcnt vmcnt(0) 3429; VI-NEXT: v_max_f16_sdwa v32, v31, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 3430; VI-NEXT: v_max_f16_e32 v31, v31, v31 3431; VI-NEXT: v_or_b32_e32 v31, v31, v32 3432; VI-NEXT: s_setpc_b64 s[30:31] 3433; 3434; GFX9-LABEL: v_test_canonicalize_var_v64f16: 3435; GFX9: ; %bb.0: 3436; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3437; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 3438; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 3439; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 3440; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 3441; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 3442; GFX9-NEXT: v_pk_max_f16 v4, v4, v4 3443; GFX9-NEXT: v_pk_max_f16 v5, v5, v5 3444; GFX9-NEXT: v_pk_max_f16 v6, v6, v6 3445; GFX9-NEXT: v_pk_max_f16 v7, v7, v7 3446; GFX9-NEXT: v_pk_max_f16 v8, v8, v8 3447; GFX9-NEXT: v_pk_max_f16 v9, v9, v9 3448; GFX9-NEXT: v_pk_max_f16 v10, v10, v10 3449; GFX9-NEXT: v_pk_max_f16 v11, v11, v11 3450; GFX9-NEXT: v_pk_max_f16 v12, v12, v12 3451; GFX9-NEXT: v_pk_max_f16 v13, v13, v13 3452; GFX9-NEXT: v_pk_max_f16 v14, v14, v14 3453; GFX9-NEXT: v_pk_max_f16 v15, v15, v15 3454; GFX9-NEXT: v_pk_max_f16 v16, v16, v16 3455; GFX9-NEXT: v_pk_max_f16 v17, v17, v17 3456; GFX9-NEXT: v_pk_max_f16 v18, v18, v18 3457; GFX9-NEXT: v_pk_max_f16 v19, v19, v19 3458; GFX9-NEXT: v_pk_max_f16 v20, v20, v20 3459; GFX9-NEXT: v_pk_max_f16 v21, v21, v21 3460; GFX9-NEXT: v_pk_max_f16 v22, v22, v22 3461; GFX9-NEXT: v_pk_max_f16 v23, v23, v23 3462; GFX9-NEXT: v_pk_max_f16 v24, v24, v24 3463; GFX9-NEXT: v_pk_max_f16 v25, v25, v25 3464; GFX9-NEXT: v_pk_max_f16 v26, v26, v26 3465; GFX9-NEXT: v_pk_max_f16 v27, v27, v27 3466; GFX9-NEXT: v_pk_max_f16 v28, v28, v28 3467; GFX9-NEXT: v_pk_max_f16 v29, v29, v29 3468; GFX9-NEXT: v_pk_max_f16 v30, v30, v30 3469; GFX9-NEXT: s_waitcnt vmcnt(0) 3470; GFX9-NEXT: v_pk_max_f16 v31, v31, v31 3471; GFX9-NEXT: s_setpc_b64 s[30:31] 3472; 3473; CI-LABEL: v_test_canonicalize_var_v64f16: 3474; CI: ; %bb.0: 3475; CI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3476; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 3477; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 3478; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 3479; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 3480; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 3481; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 3482; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 3483; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 3484; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 3485; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 3486; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 3487; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3488; CI-NEXT: v_or_b32_e32 v1, v1, v2 3489; CI-NEXT: v_cvt_f16_f32_e32 v2, v4 3490; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 3491; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 3492; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 3493; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 3494; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 3495; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 3496; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 3497; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 3498; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 3499; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 3500; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 3501; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3502; CI-NEXT: v_or_b32_e32 v2, v3, v2 3503; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 3504; CI-NEXT: v_cvt_f16_f32_e32 v6, v9 3505; CI-NEXT: v_cvt_f16_f32_e32 v9, v16 3506; CI-NEXT: v_cvt_f16_f32_e32 v16, v21 3507; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 3508; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 3509; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 3510; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 3511; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 3512; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 3513; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 3514; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 3515; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3516; CI-NEXT: v_or_b32_e32 v3, v4, v3 3517; CI-NEXT: v_cvt_f16_f32_e32 v4, v8 3518; CI-NEXT: v_cvt_f16_f32_e32 v8, v13 3519; CI-NEXT: v_cvt_f16_f32_e32 v13, v20 3520; CI-NEXT: v_cvt_f16_f32_e32 v20, v25 3521; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 3522; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 3523; 
CI-NEXT: v_cvt_f32_f16_e32 v13, v13 3524; CI-NEXT: v_cvt_f16_f32_e32 v21, v28 3525; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 3526; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 3527; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 3528; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 3529; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3530; CI-NEXT: v_or_b32_e32 v4, v5, v4 3531; CI-NEXT: v_cvt_f16_f32_e32 v5, v10 3532; CI-NEXT: v_cvt_f16_f32_e32 v10, v15 3533; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 3534; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 3535; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 3536; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 3537; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 3538; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 3539; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 3540; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 3541; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 3542; CI-NEXT: v_or_b32_e32 v5, v6, v5 3543; CI-NEXT: v_cvt_f16_f32_e32 v6, v12 3544; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8 3545; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 3546; CI-NEXT: s_waitcnt vmcnt(3) 3547; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3548; CI-NEXT: s_waitcnt vmcnt(2) 3549; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3550; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 3551; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3552; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3553; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 3554; CI-NEXT: v_or_b32_e32 v6, v7, v6 3555; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3556; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3557; CI-NEXT: v_cvt_f16_f32_e32 v7, v14 3558; CI-NEXT: v_cvt_f16_f32_e32 v14, v19 3559; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3560; CI-NEXT: v_or_b32_e32 v31, v32, v31 3561; CI-NEXT: v_add_i32_e32 v32, vcc, 0x7c, v0 3562; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3563; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 3564; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 3565; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 3566; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 3567; CI-NEXT: v_cvt_f16_f32_e32 v19, v26 3568; CI-NEXT: 
v_cvt_f16_f32_e32 v7, v7 3569; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 3570; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 3571; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 3572; CI-NEXT: v_or_b32_e32 v7, v8, v7 3573; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 3574; CI-NEXT: v_cvt_f16_f32_e32 v9, v18 3575; CI-NEXT: v_or_b32_e32 v8, v10, v8 3576; CI-NEXT: v_cvt_f16_f32_e32 v10, v17 3577; CI-NEXT: v_cvt_f16_f32_e32 v17, v24 3578; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 3579; CI-NEXT: v_cvt_f16_f32_e32 v18, v23 3580; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 3581; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 3582; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 3583; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 3584; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 3585; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 3586; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 3587; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 3588; CI-NEXT: v_or_b32_e32 v9, v10, v9 3589; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 3590; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 3591; CI-NEXT: v_or_b32_e32 v10, v14, v10 3592; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 3593; CI-NEXT: v_or_b32_e32 v17, v18, v17 3594; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 3595; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 3596; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 3597; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 3598; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 3599; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 3600; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 3601; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 3602; CI-NEXT: v_or_b32_e32 v13, v16, v13 3603; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 3604; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 3605; CI-NEXT: v_or_b32_e32 v19, v20, v19 3606; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 3607; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 3608; CI-NEXT: v_or_b32_e32 v20, v22, v20 3609; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 3610; CI-NEXT: s_waitcnt vmcnt(6) 3611; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 3612; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 3613; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 3614; CI-NEXT: v_cvt_f32_f16_e32 
v11, v11 3615; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 3616; CI-NEXT: s_waitcnt vmcnt(5) 3617; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 3618; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 3619; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 3620; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 3621; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 3622; CI-NEXT: v_or_b32_e32 v21, v22, v21 3623; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 3624; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 3625; CI-NEXT: s_waitcnt vmcnt(3) 3626; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3627; CI-NEXT: s_waitcnt vmcnt(2) 3628; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3629; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3630; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3631; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3632; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3633; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3634; CI-NEXT: v_or_b32_e32 v31, v32, v31 3635; CI-NEXT: v_add_i32_e32 v32, vcc, 0x78, v0 3636; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3637; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 3638; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 3639; CI-NEXT: s_waitcnt vmcnt(1) 3640; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3641; CI-NEXT: s_waitcnt vmcnt(0) 3642; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3643; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3644; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3645; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3646; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3647; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3648; CI-NEXT: v_or_b32_e32 v31, v32, v31 3649; CI-NEXT: v_add_i32_e32 v32, vcc, 0x74, v0 3650; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3651; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 3652; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 3653; CI-NEXT: s_waitcnt vmcnt(1) 3654; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3655; CI-NEXT: s_waitcnt vmcnt(0) 3656; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3657; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3658; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3659; CI-NEXT: v_cvt_f16_f32_e32 v31, 
v31 3660; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3661; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3662; CI-NEXT: v_or_b32_e32 v31, v32, v31 3663; CI-NEXT: v_add_i32_e32 v32, vcc, 0x70, v0 3664; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3665; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 3666; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 3667; CI-NEXT: s_waitcnt vmcnt(1) 3668; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3669; CI-NEXT: s_waitcnt vmcnt(0) 3670; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3671; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3672; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3673; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3674; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3675; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3676; CI-NEXT: v_or_b32_e32 v31, v32, v31 3677; CI-NEXT: v_add_i32_e32 v32, vcc, 0x6c, v0 3678; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3679; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 3680; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 3681; CI-NEXT: s_waitcnt vmcnt(1) 3682; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3683; CI-NEXT: s_waitcnt vmcnt(0) 3684; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3685; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3686; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3687; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3688; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3689; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3690; CI-NEXT: v_or_b32_e32 v31, v32, v31 3691; CI-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 3692; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3693; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 3694; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 3695; CI-NEXT: s_waitcnt vmcnt(1) 3696; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3697; CI-NEXT: s_waitcnt vmcnt(0) 3698; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3699; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3700; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3701; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3702; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3703; CI-NEXT: 
v_lshlrev_b32_e32 v31, 16, v31 3704; CI-NEXT: v_or_b32_e32 v31, v32, v31 3705; CI-NEXT: v_add_i32_e32 v32, vcc, 0x64, v0 3706; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3707; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 3708; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 3709; CI-NEXT: s_waitcnt vmcnt(1) 3710; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3711; CI-NEXT: s_waitcnt vmcnt(0) 3712; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3713; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3714; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3715; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3716; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3717; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3718; CI-NEXT: v_or_b32_e32 v31, v32, v31 3719; CI-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 3720; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3721; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 3722; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 3723; CI-NEXT: s_waitcnt vmcnt(1) 3724; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3725; CI-NEXT: s_waitcnt vmcnt(0) 3726; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3727; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3728; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3729; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3730; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3731; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3732; CI-NEXT: v_or_b32_e32 v31, v32, v31 3733; CI-NEXT: v_add_i32_e32 v32, vcc, 0x5c, v0 3734; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3735; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 3736; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 3737; CI-NEXT: s_waitcnt vmcnt(1) 3738; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3739; CI-NEXT: s_waitcnt vmcnt(0) 3740; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3741; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3742; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3743; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3744; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3745; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3746; CI-NEXT: v_or_b32_e32 
v31, v32, v31 3747; CI-NEXT: v_add_i32_e32 v32, vcc, 0x58, v0 3748; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3749; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 3750; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 3751; CI-NEXT: s_waitcnt vmcnt(1) 3752; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3753; CI-NEXT: s_waitcnt vmcnt(0) 3754; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3755; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3756; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3757; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3758; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3759; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3760; CI-NEXT: v_or_b32_e32 v31, v32, v31 3761; CI-NEXT: v_add_i32_e32 v32, vcc, 0x54, v0 3762; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3763; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 3764; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 3765; CI-NEXT: s_waitcnt vmcnt(1) 3766; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3767; CI-NEXT: s_waitcnt vmcnt(0) 3768; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3769; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3770; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3771; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3772; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3773; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3774; CI-NEXT: v_or_b32_e32 v31, v32, v31 3775; CI-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 3776; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3777; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 3778; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 3779; CI-NEXT: s_waitcnt vmcnt(1) 3780; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3781; CI-NEXT: s_waitcnt vmcnt(0) 3782; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3783; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3784; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3785; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3786; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3787; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3788; CI-NEXT: v_or_b32_e32 v31, v32, v31 3789; CI-NEXT: v_add_i32_e32 v32, vcc, 0x4c, v0 
3790; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3791; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 3792; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 3793; CI-NEXT: s_waitcnt vmcnt(1) 3794; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3795; CI-NEXT: s_waitcnt vmcnt(0) 3796; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3797; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 3798; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 3799; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 3800; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 3801; CI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 3802; CI-NEXT: v_or_b32_e32 v31, v32, v31 3803; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 3804; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen 3805; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 3806; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 3807; CI-NEXT: s_waitcnt vmcnt(1) 3808; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 3809; CI-NEXT: s_waitcnt vmcnt(0) 3810; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 3811; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 3812; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 3813; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 3814; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 3815; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 3816; CI-NEXT: v_or_b32_e32 v14, v15, v14 3817; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 3818; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 3819; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 3820; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 3821; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 3822; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 3823; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 3824; CI-NEXT: v_or_b32_e32 v12, v12, v15 3825; CI-NEXT: v_or_b32_e32 v11, v16, v11 3826; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 3827; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen 3828; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 3829; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen 3830; CI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 3831; CI-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen 3832; CI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 3833; CI-NEXT: 
buffer_store_dword v21, v11, s[0:3], 0 offen 3834; CI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 3835; CI-NEXT: buffer_store_dword v20, v11, s[0:3], 0 offen 3836; CI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 3837; CI-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen 3838; CI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 3839; CI-NEXT: buffer_store_dword v17, v11, s[0:3], 0 offen 3840; CI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 3841; CI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen 3842; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 3843; CI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen 3844; CI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 3845; CI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen 3846; CI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 3847; CI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen 3848; CI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 3849; CI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen 3850; CI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 3851; CI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen 3852; CI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 3853; CI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen 3854; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 3855; CI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen 3856; CI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 3857; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen 3858; CI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 3859; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen 3860; CI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 3861; CI-NEXT: s_waitcnt vmcnt(0) 3862; CI-NEXT: s_setpc_b64 s[30:31] 3863; 3864; GFX11-LABEL: v_test_canonicalize_var_v64f16: 3865; GFX11: ; %bb.0: 3866; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3867; GFX11-NEXT: scratch_load_b32 v31, off, s32 3868; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 3869; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 3870; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 3871; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 3872; GFX11-NEXT: v_pk_max_f16 v4, v4, v4 3873; GFX11-NEXT: v_pk_max_f16 v5, v5, v5 3874; GFX11-NEXT: v_pk_max_f16 v6, 
v6, v6 3875; GFX11-NEXT: v_pk_max_f16 v7, v7, v7 3876; GFX11-NEXT: v_pk_max_f16 v8, v8, v8 3877; GFX11-NEXT: v_pk_max_f16 v9, v9, v9 3878; GFX11-NEXT: v_pk_max_f16 v10, v10, v10 3879; GFX11-NEXT: v_pk_max_f16 v11, v11, v11 3880; GFX11-NEXT: v_pk_max_f16 v12, v12, v12 3881; GFX11-NEXT: v_pk_max_f16 v13, v13, v13 3882; GFX11-NEXT: v_pk_max_f16 v14, v14, v14 3883; GFX11-NEXT: v_pk_max_f16 v15, v15, v15 3884; GFX11-NEXT: v_pk_max_f16 v16, v16, v16 3885; GFX11-NEXT: v_pk_max_f16 v17, v17, v17 3886; GFX11-NEXT: v_pk_max_f16 v18, v18, v18 3887; GFX11-NEXT: v_pk_max_f16 v19, v19, v19 3888; GFX11-NEXT: v_pk_max_f16 v20, v20, v20 3889; GFX11-NEXT: v_pk_max_f16 v21, v21, v21 3890; GFX11-NEXT: v_pk_max_f16 v22, v22, v22 3891; GFX11-NEXT: v_pk_max_f16 v23, v23, v23 3892; GFX11-NEXT: v_pk_max_f16 v24, v24, v24 3893; GFX11-NEXT: v_pk_max_f16 v25, v25, v25 3894; GFX11-NEXT: v_pk_max_f16 v26, v26, v26 3895; GFX11-NEXT: v_pk_max_f16 v27, v27, v27 3896; GFX11-NEXT: v_pk_max_f16 v28, v28, v28 3897; GFX11-NEXT: v_pk_max_f16 v29, v29, v29 3898; GFX11-NEXT: v_pk_max_f16 v30, v30, v30 3899; GFX11-NEXT: s_waitcnt vmcnt(0) 3900; GFX11-NEXT: v_pk_max_f16 v31, v31, v31 3901; GFX11-NEXT: s_setpc_b64 s[30:31] 3902 %canonicalized = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %val) 3903 ret <64 x half> %canonicalized 3904} 3905 3906attributes #0 = { nounwind readnone } 3907attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 3908attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } 3909attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 3910