; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s

; Each function below builds a two-element 16-bit vector whose high element
; comes from a load. The gfx900 and gfx1010 configurations are run both with
; and without +enable-flat-scratch and use separate check prefixes for the
; two modes; both gfx1100 configurations share one prefix.
; The check lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Both elements are loaded from addrspace(5) off a null base: element 0 from
; half index 1 (the checks show it as byte offset 2) and element 1 from
; offset 0. The checks expect a 16-bit load into v0 followed by a d16_hi load
; that targets the same register.
define <2 x half> @chain_hi_to_lo_private() {
; GFX900-LABEL: chain_hi_to_lo_private:
; GFX900:       ; %bb.0: ; %bb
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
; GFX900-NEXT:    s_nop 0
; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_private:
; FLATSCR:       ; %bb.0: ; %bb
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s0, 2
; FLATSCR-NEXT:    scratch_load_ushort v0, off, s0
; FLATSCR-NEXT:    s_mov_b32 s0, 0
; FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private:
; GFX10_DEFAULT:       ; %bb.0: ; %bb
; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10_DEFAULT-NEXT:    s_clause 0x1
; GFX10_DEFAULT-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0
; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private:
; FLATSCR_GFX10:       ; %bb.0: ; %bb
; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR_GFX10-NEXT:    s_mov_b32 s0, 2
; FLATSCR_GFX10-NEXT:    scratch_load_ushort v0, off, s0
; FLATSCR_GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; FLATSCR_GFX10-NEXT:    s_mov_b32 s0, 0
; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v0, off, s0
; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_private:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 2
; GFX11-NEXT:    scratch_load_u16 v0, off, s0
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    scratch_load_d16_hi_b16 v0, off, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds half, ptr addrspace(5) null, i64 1
  %load_lo = load half, ptr addrspace(5) %gep_lo
  %load_hi = load half, ptr addrspace(5) null

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Same shape as above, but the two addrspace(5) loads use unrelated pointer
; arguments (%base_lo for element 0, %base_hi for element 1).
define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base_lo, ptr addrspace(5) %base_hi) {
; GFX900-LABEL: chain_hi_to_lo_private_different_bases:
; GFX900:       ; %bb.0: ; %bb
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX900-NEXT:    s_nop 0
; GFX900-NEXT:    buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases:
; FLATSCR:       ; %bb.0: ; %bb
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    scratch_load_ushort v0, v0, off
; FLATSCR-NEXT:    s_nop 0
; FLATSCR-NEXT:    scratch_load_short_d16_hi v0, v1, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_different_bases:
; GFX10_DEFAULT:       ; %bb.0: ; %bb
; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10_DEFAULT-NEXT:    s_clause 0x1
; GFX10_DEFAULT-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_different_bases:
; FLATSCR_GFX10:       ; %bb.0: ; %bb
; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR_GFX10-NEXT:    scratch_load_ushort v0, v0, off
; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v0, v1, off
; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_load_u16 v0, v0, off
; GFX11-NEXT:    scratch_load_d16_hi_b16 v0, v1, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %load_lo = load half, ptr addrspace(5) %base_lo
  %load_hi = load half, ptr addrspace(5) %base_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Element 0 is produced by an fadd rather than a load; only element 1 is
; loaded (from addrspace(5)). The checks expect the add result in v1 and a
; d16_hi load into that same register. The spelling "arithmatic" is kept
; because the LABEL checks depend on the symbol name.
define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
; GFX900-LABEL: chain_hi_to_lo_arithmatic:
; GFX900:       ; %bb.0: ; %bb
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_add_f16_e32 v1, 1.0, v1
; GFX900-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v0, v1
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_arithmatic:
; FLATSCR:       ; %bb.0: ; %bb
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    v_add_f16_e32 v1, 1.0, v1
; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10_DEFAULT-LABEL: chain_hi_to_lo_arithmatic:
; GFX10_DEFAULT:       ; %bb.0: ; %bb
; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10_DEFAULT-NEXT:    v_add_f16_e32 v1, 1.0, v1
; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v0, v1
; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR_GFX10-LABEL: chain_hi_to_lo_arithmatic:
; FLATSCR_GFX10:       ; %bb.0: ; %bb
; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR_GFX10-NEXT:    v_add_f16_e32 v1, 1.0, v1
; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_arithmatic:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %arith_lo = fadd half %in, 1.0
  %load_hi = load half, ptr addrspace(5) %base

  %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; addrspace(3) variant with a null base: element 0 is loaded from half
; index 1 (byte offset 2 in the checks) and element 1 from offset 0. The
; checks expect a 16-bit DS load into v0 followed by a d16_hi DS load into
; the same register.
define <2 x half> @chain_hi_to_lo_group() {
; GCN-LABEL: chain_hi_to_lo_group:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    ds_read_u16 v0, v1 offset:2
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_read_u16_d16_hi v0, v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: chain_hi_to_lo_group:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_read_u16 v0, v1 offset:2
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    ds_read_u16_d16_hi v0, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_group:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_load_u16 v0, v1 offset:2
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    ds_load_u16_d16_hi v0, v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds half, ptr addrspace(3) null, i64 1
  %load_lo = load half, ptr addrspace(3) %gep_lo
  %load_hi = load half, ptr addrspace(3) null

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; addrspace(3) variant where the two loads use unrelated pointer arguments.
define <2 x half> @chain_hi_to_lo_group_different_bases(ptr addrspace(3) %base_lo, ptr addrspace(3) %base_hi) {
; GCN-LABEL: chain_hi_to_lo_group_different_bases:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    ds_read_u16 v0, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_read_u16_d16_hi v0, v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: chain_hi_to_lo_group_different_bases:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    ds_read_u16 v0, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    ds_read_u16_d16_hi v0, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_group_different_bases:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    ds_load_u16 v0, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    ds_load_u16_d16_hi v0, v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %load_lo = load half, ptr addrspace(3) %base_lo
  %load_hi = load half, ptr addrspace(3) %base_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; addrspace(1) variant with a null base: element 0 from half index 1,
; element 1 from offset 0. The checks expect the 64-bit addresses 2 and 0 to
; be materialized in VGPR pairs before each load.
define <2 x half> @chain_hi_to_lo_global() {
; GCN-LABEL: chain_hi_to_lo_global:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 2
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_ushort v0, v[0:1], off
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    global_load_short_d16_hi v0, v[1:2], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: chain_hi_to_lo_global:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 2
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    global_load_short_d16_hi v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_global:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, 2
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds half, ptr addrspace(1) null, i64 1
  %load_lo = load half, ptr addrspace(1) %gep_lo
  %load_hi = load half, ptr addrspace(1) null

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; addrspace(1) variant where the two loads use unrelated pointer arguments.
define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_lo, ptr addrspace(1) %base_hi) {
; GCN-LABEL: chain_hi_to_lo_global_different_bases:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    global_load_ushort v0, v[0:1], off
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    global_load_short_d16_hi v0, v[2:3], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: chain_hi_to_lo_global_different_bases:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
; GFX10-NEXT:    global_load_short_d16_hi v0, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %load_lo = load half, ptr addrspace(1) %base_lo
  %load_hi = load half, ptr addrspace(1) %base_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Default-address-space (plain `ptr`) variant with a null base: element 0
; from half index 1, element 1 from offset 0. The checks expect a wait on
; both vmcnt and lgkmcnt between the two flat loads.
define <2 x half> @chain_hi_to_lo_flat() {
; GCN-LABEL: chain_hi_to_lo_flat:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 2
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    flat_load_ushort v0, v[0:1]
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT:    flat_load_short_d16_hi v0, v[1:2]
; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: chain_hi_to_lo_flat:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 2
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    flat_load_ushort v0, v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_load_short_d16_hi v0, v[1:2]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_flat:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, 2
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    flat_load_u16 v0, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds half, ptr null, i64 1
  %load_lo = load half, ptr %gep_lo
  %load_hi = load half, ptr null

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Default-address-space variant where the two loads use unrelated pointer
; arguments.
define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_hi) {
; GCN-LABEL: chain_hi_to_lo_flat_different_bases:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    flat_load_ushort v0, v[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT:    flat_load_short_d16_hi v0, v[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: chain_hi_to_lo_flat_different_bases:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_load_ushort v0, v[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_load_short_d16_hi v0, v[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    flat_load_u16 v0, v[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %load_lo = load half, ptr %base_lo
  %load_hi = load half, ptr %base_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Make sure we don't lose any of the private stores.
425define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 { 426; GFX900-LABEL: vload2_private: 427; GFX900: ; %bb.0: ; %entry 428; GFX900-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 429; GFX900-NEXT: v_mov_b32_e32 v2, 0 430; GFX900-NEXT: s_add_u32 s0, s0, s17 431; GFX900-NEXT: s_addc_u32 s1, s1, 0 432; GFX900-NEXT: s_waitcnt lgkmcnt(0) 433; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] 434; GFX900-NEXT: s_waitcnt vmcnt(0) 435; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 436; GFX900-NEXT: s_waitcnt vmcnt(0) 437; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 438; GFX900-NEXT: s_waitcnt vmcnt(0) 439; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2 440; GFX900-NEXT: s_waitcnt vmcnt(0) 441; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 442; GFX900-NEXT: s_mov_b32 s4, 0x5040100 443; GFX900-NEXT: s_waitcnt vmcnt(0) 444; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 445; GFX900-NEXT: s_waitcnt vmcnt(0) 446; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 447; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 448; GFX900-NEXT: s_waitcnt vmcnt(1) 449; GFX900-NEXT: v_mov_b32_e32 v1, v0 450; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4 451; GFX900-NEXT: s_waitcnt vmcnt(1) 452; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 453; GFX900-NEXT: s_waitcnt vmcnt(0) 454; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 455; GFX900-NEXT: s_endpgm 456; 457; FLATSCR-LABEL: vload2_private: 458; FLATSCR: ; %bb.0: ; %entry 459; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 460; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 461; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 462; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 463; FLATSCR-NEXT: s_mov_b32 s4, 0 464; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 465; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] 466; FLATSCR-NEXT: s_waitcnt vmcnt(0) 467; FLATSCR-NEXT: scratch_store_short off, v0, s4 468; 
FLATSCR-NEXT: s_waitcnt vmcnt(0) 469; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 470; FLATSCR-NEXT: s_waitcnt vmcnt(0) 471; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:2 472; FLATSCR-NEXT: s_waitcnt vmcnt(0) 473; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 474; FLATSCR-NEXT: s_mov_b32 s0, 0 475; FLATSCR-NEXT: s_waitcnt vmcnt(0) 476; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4 477; FLATSCR-NEXT: s_waitcnt vmcnt(0) 478; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 offset:2 479; FLATSCR-NEXT: scratch_load_ushort v3, off, s0 480; FLATSCR-NEXT: s_waitcnt vmcnt(1) 481; FLATSCR-NEXT: v_mov_b32_e32 v1, v0 482; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 483; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 484; FLATSCR-NEXT: s_waitcnt vmcnt(1) 485; FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0 486; FLATSCR-NEXT: s_waitcnt vmcnt(0) 487; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 488; FLATSCR-NEXT: s_endpgm 489; 490; GFX10_DEFAULT-LABEL: vload2_private: 491; GFX10_DEFAULT: ; %bb.0: ; %entry 492; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 493; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0 494; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s17 495; GFX10_DEFAULT-NEXT: s_addc_u32 s1, s1, 0 496; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0) 497; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] 498; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 499; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 500; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 501; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 502; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 503; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2 504; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 505; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 506; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 507; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 508; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 509; 
GFX10_DEFAULT-NEXT: s_clause 0x1 510; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 511; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 512; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1) 513; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0 514; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 515; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 516; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4 517; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 518; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 519; GFX10_DEFAULT-NEXT: s_endpgm 520; 521; FLATSCR_GFX10-LABEL: vload2_private: 522; FLATSCR_GFX10: ; %bb.0: ; %entry 523; FLATSCR_GFX10-NEXT: s_add_u32 s8, s8, s13 524; FLATSCR_GFX10-NEXT: s_addc_u32 s9, s9, 0 525; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 526; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 527; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 528; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v2, 0 529; FLATSCR_GFX10-NEXT: s_mov_b32 s4, 0 530; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0) 531; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] 532; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 533; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 534; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 535; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 536; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 537; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:2 538; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 539; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 540; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 541; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 542; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 543; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4 544; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 545; FLATSCR_GFX10-NEXT: s_clause 0x1 546; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 offset:2 547; FLATSCR_GFX10-NEXT: scratch_load_ushort v3, 
off, s0 548; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(1) 549; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v1, v0 550; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 551; FLATSCR_GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 552; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 553; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 554; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 555; FLATSCR_GFX10-NEXT: s_endpgm 556; 557; GFX11-LABEL: vload2_private: 558; GFX11: ; %bb.0: ; %entry 559; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 560; GFX11-NEXT: v_mov_b32_e32 v2, 0 561; GFX11-NEXT: s_waitcnt lgkmcnt(0) 562; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] 563; GFX11-NEXT: s_waitcnt vmcnt(0) 564; GFX11-NEXT: scratch_store_b16 off, v0, off dlc 565; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 566; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2 567; GFX11-NEXT: s_waitcnt vmcnt(0) 568; GFX11-NEXT: scratch_store_b16 off, v0, off offset:2 dlc 569; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 570; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4 571; GFX11-NEXT: s_waitcnt vmcnt(0) 572; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc 573; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 574; GFX11-NEXT: s_clause 0x1 575; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2 576; GFX11-NEXT: scratch_load_u16 v3, off, off 577; GFX11-NEXT: s_waitcnt vmcnt(1) 578; GFX11-NEXT: v_mov_b32_e32 v1, v0 579; GFX11-NEXT: s_waitcnt vmcnt(0) 580; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 581; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 582; GFX11-NEXT: s_waitcnt vmcnt(0) 583; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] 584; GFX11-NEXT: s_endpgm 585entry: 586 %loc = alloca [3 x i16], align 2, addrspace(5) 587 %tmp = load i16, ptr addrspace(1) %in, align 2 588 store volatile i16 %tmp, ptr addrspace(5) %loc 589 %arrayidx.1 = getelementptr inbounds i16, ptr addrspace(1) %in, i64 1 590 %tmp1 = load i16, ptr addrspace(1) %arrayidx.1, align 2 591 %loc.2.sroa_idx3 = getelementptr inbounds [3 x 
i16], ptr addrspace(5) %loc, i32 0, i32 1 592 store volatile i16 %tmp1, ptr addrspace(5) %loc.2.sroa_idx3 593 %arrayidx.2 = getelementptr inbounds i16, ptr addrspace(1) %in, i64 2 594 %tmp2 = load i16, ptr addrspace(1) %arrayidx.2, align 2 595 %loc.4.sroa_idx = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 2 596 store volatile i16 %tmp2, ptr addrspace(5) %loc.4.sroa_idx 597 %loc.0. = load <2 x i16>, ptr addrspace(5) %loc, align 2 598 store <2 x i16> %loc.0., ptr addrspace(1) %out, align 4 599 %loc.2.sroa_idx = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 1 600 %loc.2. = load <2 x i16>, ptr addrspace(5) %loc.2.sroa_idx, align 2 601 %arrayidx6 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 1 602 store <2 x i16> %loc.2., ptr addrspace(1) %arrayidx6, align 4 603 ret void 604} 605 606; There is another instruction between the misordered instruction and 607; the value dependent load, so a simple operand check is insufficient. 608define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) { 609; GCN-LABEL: chain_hi_to_lo_group_other_dep: 610; GCN: ; %bb.0: ; %bb 611; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 612; GCN-NEXT: ds_read_u16_d16_hi v1, v0 613; GCN-NEXT: s_waitcnt lgkmcnt(0) 614; GCN-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] 615; GCN-NEXT: ds_read_u16_d16 v1, v0 offset:2 616; GCN-NEXT: s_waitcnt lgkmcnt(0) 617; GCN-NEXT: v_mov_b32_e32 v0, v1 618; GCN-NEXT: s_setpc_b64 s[30:31] 619; 620; GFX10-LABEL: chain_hi_to_lo_group_other_dep: 621; GFX10: ; %bb.0: ; %bb 622; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 623; GFX10-NEXT: ds_read_u16_d16_hi v1, v0 624; GFX10-NEXT: s_waitcnt lgkmcnt(0) 625; GFX10-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] 626; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2 627; GFX10-NEXT: s_waitcnt lgkmcnt(0) 628; GFX10-NEXT: v_mov_b32_e32 v0, v1 629; GFX10-NEXT: s_setpc_b64 s[30:31] 630; 631; GFX11-LABEL: chain_hi_to_lo_group_other_dep: 632; GFX11: ; 
%bb.0: ; %bb 633; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 634; GFX11-NEXT: ds_load_u16_d16_hi v1, v0 635; GFX11-NEXT: s_waitcnt lgkmcnt(0) 636; GFX11-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] 637; GFX11-NEXT: ds_load_u16_d16 v1, v0 offset:2 638; GFX11-NEXT: s_waitcnt lgkmcnt(0) 639; GFX11-NEXT: v_mov_b32_e32 v0, v1 640; GFX11-NEXT: s_setpc_b64 s[30:31] 641bb: 642 %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1 643 %load_lo = load i16, ptr addrspace(3) %gep_lo 644 %load_hi = load i16, ptr addrspace(3) %ptr 645 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1 646 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> 647 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 648 ret <2 x i16> %result 649} 650 651; The volatile operations aren't put on the same chain 652define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %ptr) { 653; GFX900-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: 654; GFX900: ; %bb.0: ; %bb 655; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 656; GFX900-NEXT: ds_read_u16 v1, v0 offset:2 657; GFX900-NEXT: ds_read_u16_d16_hi v0, v0 658; GFX900-NEXT: s_mov_b32 s4, 0xffff 659; GFX900-NEXT: s_waitcnt lgkmcnt(0) 660; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] 661; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 662; GFX900-NEXT: s_setpc_b64 s[30:31] 663; 664; FLATSCR-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: 665; FLATSCR: ; %bb.0: ; %bb 666; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 667; FLATSCR-NEXT: ds_read_u16 v1, v0 offset:2 668; FLATSCR-NEXT: ds_read_u16_d16_hi v0, v0 669; FLATSCR-NEXT: s_mov_b32 s0, 0xffff 670; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) 671; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] 672; FLATSCR-NEXT: v_bfi_b32 v0, s0, v1, v0 673; FLATSCR-NEXT: s_setpc_b64 s[30:31] 674; 675; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: 676; GFX10: ; %bb.0: ; %bb 677; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 678; 
GFX10-NEXT: ds_read_u16 v1, v0 offset:2 679; GFX10-NEXT: ds_read_u16_d16_hi v0, v0 680; GFX10-NEXT: s_waitcnt lgkmcnt(0) 681; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] 682; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 683; GFX10-NEXT: s_setpc_b64 s[30:31] 684; 685; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: 686; GFX11: ; %bb.0: ; %bb 687; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 688; GFX11-NEXT: ds_load_u16 v1, v0 offset:2 689; GFX11-NEXT: ds_load_u16_d16_hi v0, v0 690; GFX11-NEXT: s_waitcnt lgkmcnt(0) 691; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] 692; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 693; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 694; GFX11-NEXT: s_setpc_b64 s[30:31] 695bb: 696 %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1 697 %load_lo = load volatile i16, ptr addrspace(3) %gep_lo 698 %load_hi = load volatile i16, ptr addrspace(3) %ptr 699 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1 700 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> 701 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 702 ret <2 x i16> %result 703} 704 705define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { 706; GFX900-LABEL: chain_hi_to_lo_private_other_dep: 707; GFX900: ; %bb.0: ; %bb 708; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 709; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen 710; GFX900-NEXT: s_waitcnt vmcnt(0) 711; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] 712; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 713; GFX900-NEXT: s_waitcnt vmcnt(0) 714; GFX900-NEXT: v_mov_b32_e32 v0, v1 715; GFX900-NEXT: s_setpc_b64 s[30:31] 716; 717; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep: 718; FLATSCR: ; %bb.0: ; %bb 719; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 720; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off 721; FLATSCR-NEXT: s_waitcnt vmcnt(0) 722; FLATSCR-NEXT: v_pk_add_u16 v1, v1, 12 
op_sel_hi:[1,0] 723; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2 724; FLATSCR-NEXT: s_waitcnt vmcnt(0) 725; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 726; FLATSCR-NEXT: s_setpc_b64 s[30:31] 727; 728; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep: 729; GFX10_DEFAULT: ; %bb.0: ; %bb 730; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 731; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen 732; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 733; GFX10_DEFAULT-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] 734; GFX10_DEFAULT-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 735; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) 736; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v0, v1 737; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] 738; 739; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep: 740; FLATSCR_GFX10: ; %bb.0: ; %bb 741; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 742; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off 743; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 744; FLATSCR_GFX10-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] 745; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2 746; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) 747; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 748; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] 749; 750; GFX11-LABEL: chain_hi_to_lo_private_other_dep: 751; GFX11: ; %bb.0: ; %bb 752; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 753; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off 754; GFX11-NEXT: s_waitcnt vmcnt(0) 755; GFX11-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] 756; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2 757; GFX11-NEXT: s_waitcnt vmcnt(0) 758; GFX11-NEXT: v_mov_b32_e32 v0, v1 759; GFX11-NEXT: s_setpc_b64 s[30:31] 760bb: 761 %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1 762 %load_lo = load i16, ptr addrspace(5) %gep_lo 763 %load_hi = load i16, ptr addrspace(5) %ptr 764 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1 
; Add 12 to both lanes of the hi-only vector, then insert the lo load into lane 0.
  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}

; Global-memory (addrspace 1) variant with both loads volatile: the hi half is
; loaded from %ptr into lane 1, 12 is added to both lanes, and the lo half
; loaded from %ptr + 1 element is then inserted into lane 0. The expected code
; loads lo into a separate register and merges it with v_bfi_b32.
define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
; GFX900-LABEL: chain_hi_to_lo_global_other_dep:
; GFX900:       ; %bb.0: ; %bb
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_ushort v2, v[0:1], off offset:2 glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_load_short_d16_hi v0, v[0:1], off glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_global_other_dep:
; FLATSCR:       ; %bb.0: ; %bb
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off offset:2 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    global_load_short_d16_hi v0, v[0:1], off glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s0, 0xffff
; FLATSCR-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; FLATSCR-NEXT:    v_bfi_b32 v0, s0, v2, v0
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: chain_hi_to_lo_global_other_dep:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v2, v[0:1], off offset:2 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_short_d16_hi v0, v[0:1], off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_global_other_dep:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v2, v[0:1], off offset:2 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 1
  %load_lo = load volatile i16, ptr addrspace(1) %gep_lo
  %load_hi = load volatile i16, ptr addrspace(1) %ptr
  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}

; Flat-address (addrspace 0) variant of the same volatile hi-load / add /
; lo-insert sequence. The expected code again keeps lo in a separate register
; and merges with v_bfi_b32; the GFX10 output computes the lo address with an
; explicit 64-bit add instead of an instruction offset.
define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
; GFX900-LABEL: chain_hi_to_lo_flat_other_dep:
; GFX900:       ; %bb.0: ; %bb
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    flat_load_ushort v2, v[0:1] offset:2 glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    flat_load_short_d16_hi v0, v[0:1] glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_flat_other_dep:
; FLATSCR:       ; %bb.0: ; %bb
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    flat_load_ushort v2, v[0:1] offset:2 glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    flat_load_short_d16_hi v0, v[0:1] glc
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s0, 0xffff
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; FLATSCR-NEXT:    v_bfi_b32 v0, s0, v2, v0
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: chain_hi_to_lo_flat_other_dep:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    flat_load_ushort v2, v[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    flat_load_short_d16_hi v0, v[0:1] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    flat_load_u16 v2, v[0:1] offset:2 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
  %load_lo = load volatile i16, ptr addrspace(0) %gep_lo
  %load_hi = load volatile i16, ptr addrspace(0) %ptr
  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}

; LDS (addrspace 3) variant where a store of 123 through %may.alias sits
; between the hi load from %ptr and the lo load from %ptr + 1 element. The
; expected code uses two plain 16-bit reads around the write and combines them
; with v_perm_b32.
define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, ptr addrspace(3) %may.alias) {
; GFX900-LABEL: chain_hi_to_lo_group_may_alias_store:
; GFX900:       ; %bb.0: ; %bb
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX900-NEXT:    ds_read_u16 v2, v0
; GFX900-NEXT:    ds_write_b16 v1, v3
; GFX900-NEXT:    ds_read_u16 v0, v0 offset:2
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_group_may_alias_store:
; FLATSCR:       ; %bb.0: ; %bb
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    v_mov_b32_e32 v3, 0x7b
; FLATSCR-NEXT:    ds_read_u16 v2, v0
; FLATSCR-NEXT:    ds_write_b16 v1, v3
; FLATSCR-NEXT:    ds_read_u16 v0, v0 offset:2
; FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    v_perm_b32 v0, v2, v0, s0
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    ds_read_u16 v3, v0
; GFX10-NEXT:    ds_write_b16 v1, v2
; GFX10-NEXT:    ds_read_u16 v0, v0 offset:2
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX11-NEXT:    ds_load_u16 v3, v0
; GFX11-NEXT:    ds_store_b16 v1, v2
; GFX11-NEXT:    ds_load_u16 v0, v0 offset:2
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
  %load_hi = load i16, ptr addrspace(3) %ptr
  store i16 123, ptr addrspace(3) %may.alias
  %load_lo = load i16, ptr addrspace(3) %gep_lo

  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
  %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}