; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Every function below loads a <4 x i32> through an addrspace(3) pointer and
; returns it. The functions differ only in the alignment written on the load;
; the check lines record the instruction sequence expected from each -mcpu
; for that alignment. Regenerate the check lines with the script named in the
; first line of this file rather than editing them by hand.

; No explicit alignment on the load. For every -mcpu the expected code is the
; same as for @load_lds_v4i32_align16 at the end of this file.
define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    ds_read_b64 v[2:3], v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    ds_load_b128 v[0:3], v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, ptr addrspace(3) %ptr
  ret <4 x i32> %load
}

; align 1: every -mcpu is expected to issue sixteen single-byte reads and
; rebuild the four dwords with shifts and ors. The tahiti output computes each
; byte address with v_add_i32 instead of using an offset: field on the read.
define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u8 v1, v0
; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX9-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX9-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX9-NEXT:    ds_read_u8 v15, v0 offset:14
; GFX9-NEXT:    ds_read_u8 v16, v0 offset:15
; GFX9-NEXT:    s_waitcnt lgkmcnt(14)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(12)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v4, v16, 8, v15
; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v5, v0
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v4, v3, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:15
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:14
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:13
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:11
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
; GFX7-NEXT:    ds_read_u8 v11, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v11
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v10
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u8 v1, v1
; GFX6-NEXT:    ds_read_u8 v2, v2
; GFX6-NEXT:    ds_read_u8 v3, v3
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v7, v7
; GFX6-NEXT:    ds_read_u8 v8, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 13, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 15, v0
; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 14, v0
; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 3, v0
; GFX6-NEXT:    v_add_i32_e32 v11, vcc, 2, v0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v7, v7
; GFX6-NEXT:    ds_read_u8 v9, v9
; GFX6-NEXT:    ds_read_u8 v10, v10
; GFX6-NEXT:    ds_read_u8 v11, v11
; GFX6-NEXT:    ds_read_u8 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_or_b32_e32 v4, v4, v9
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_or_b32_e32 v4, v4, v11
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    ds_read_u8 v1, v0
; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    ds_load_u8 v1, v0
; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
; GFX11-NEXT:    ds_load_u8 v4, v0 offset:3
; GFX11-NEXT:    ds_load_u8 v5, v0 offset:4
; GFX11-NEXT:    ds_load_u8 v6, v0 offset:5
; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
; GFX11-NEXT:    ds_load_u8 v8, v0 offset:7
; GFX11-NEXT:    ds_load_u8 v9, v0 offset:8
; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
; GFX11-NEXT:    ds_load_u8 v12, v0 offset:11
; GFX11-NEXT:    ds_load_u8 v13, v0 offset:12
; GFX11-NEXT:    ds_load_u8 v14, v0 offset:13
; GFX11-NEXT:    ds_load_u8 v15, v0 offset:14
; GFX11-NEXT:    ds_load_u8 v0, v0 offset:15
; GFX11-NEXT:    s_waitcnt lgkmcnt(14)
; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(12)
; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
; GFX11-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
; GFX11-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 1
  ret <4 x i32> %load
}

; align 2: every -mcpu is expected to issue eight 16-bit reads and combine
; each pair into one dword. As with align 1, the tahiti output materialises
; the addresses with v_add_i32 rather than an offset: field.
define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u16 v1, v0
; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u16 v3, v0 offset:12
; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
; GFX7-NEXT:    ds_read_u16 v1, v0 offset:4
; GFX7-NEXT:    ds_read_u16 v4, v0 offset:2
; GFX7-NEXT:    ds_read_u16 v5, v0
; GFX7-NEXT:    ds_read_u16 v6, v0 offset:6
; GFX7-NEXT:    ds_read_u16 v7, v0 offset:10
; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 14, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u16 v1, v1
; GFX6-NEXT:    ds_read_u16 v2, v2
; GFX6-NEXT:    ds_read_u16 v3, v3
; GFX6-NEXT:    ds_read_u16 v4, v4
; GFX6-NEXT:    ds_read_u16 v5, v5
; GFX6-NEXT:    ds_read_u16 v6, v6
; GFX6-NEXT:    ds_read_u16 v7, v7
; GFX6-NEXT:    ds_read_u16 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    ds_read_u16 v1, v0
; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    ds_load_u16 v1, v0
; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
; GFX11-NEXT:    ds_load_u16 v4, v0 offset:6
; GFX11-NEXT:    ds_load_u16 v5, v0 offset:8
; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
; GFX11-NEXT:    ds_load_u16 v7, v0 offset:12
; GFX11-NEXT:    ds_load_u16 v8, v0 offset:14
; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 2
  ret <4 x i32> %load
}

; align 4: expected as two paired 32-bit reads (ds_read2_b32, spelled
; ds_load_2addr_b32 for gfx1100); the tahiti output uses four separate
; ds_read_b32 reads instead.
define <4 x i32> @load_lds_v4i32_align4(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b32 v2, v2
; GFX6-NEXT:    ds_read_b32 v3, v3
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    ds_read_b32 v1, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 4
  ret <4 x i32> %load
}

; align 8: expected as one paired 64-bit read (ds_read2_b64, spelled
; ds_load_2addr_b64 for gfx1100); the tahiti output uses two ds_read_b64
; reads with the second address computed by v_add_i32.
define <4 x i32> @load_lds_v4i32_align8(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    ds_read_b64 v[2:3], v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    ds_load_2addr_b64 v[0:3], v0 offset1:1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 8
  ret <4 x i32> %load
}

; align 16: expected as a single 128-bit read (ds_read_b128, spelled
; ds_load_b128 for gfx1100); the tahiti output still uses two ds_read_b64
; reads, the same sequence it is expected to produce for align 8.
define <4 x i32> @load_lds_v4i32_align16(ptr addrspace(3) %ptr) {
; GFX9-LABEL: load_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    ds_read_b64 v[2:3], v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    ds_load_b128 v[0:3], v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 16
  ret <4 x i32> %load
}