; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Checks the instructions selected for a <4 x i32> store through an
; addrspace(3) pointer as the alignment on the store varies: unspecified,
; then 1, 2, 4, 8 and 16 bytes. Each kernel is compiled for gfx900, hawaii,
; tahiti, gfx1010 and gfx1100 (see the RUN lines above).
;
; The check lines are generated; after a codegen change, regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; No alignment is given on the store. The expected code is the same as for
; the explicit align 16 kernel at the end of this file.
define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s6
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s6
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v4, s4
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    v_mov_b32_e32 v3, s1
; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v4, s6
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v4, s6
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
; GFX11-NEXT:    ds_store_b128 v4, v[0:3]
; GFX11-NEXT:    s_endpgm
  store <4 x i32> %x, ptr addrspace(3) %out
  ret void
}

; align 1: the vector is written one byte at a time (sixteen ds_write_b8 /
; ds_store_b8 style stores). On gfx900, gfx1010 and gfx1100 byte 2 of each
; dword is written with the *_d16_hi form; hawaii and tahiti shift every
; upper byte down with s_lshr_b32 instead.
define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
; GFX9-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s4, s3, 8
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    s_lshr_b32 s3, s3, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:13
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_lshr_b32 s2, s2, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_lshr_b32 s2, s1, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_lshr_b32 s1, s1, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_lshr_b32 s0, s0, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_lshr_b32 s4, s3, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    s_lshr_b32 s4, s3, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:13
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s6
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX6-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_lshr_b32 s4, s3, 8
; GFX6-NEXT:    ds_write_b8 v0, v1
; GFX6-NEXT:    v_mov_b32_e32 v1, s4
; GFX6-NEXT:    s_lshr_b32 s4, s3, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:13
; GFX6-NEXT:    v_mov_b32_e32 v1, s4
; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:14
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
; GFX10-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-NEXT:    s_lshr_b32 s4, s3, 8
; GFX10-NEXT:    s_lshr_b32 s3, s3, 24
; GFX10-NEXT:    s_lshr_b32 s6, s1, 8
; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
; GFX10-NEXT:    v_mov_b32_e32 v8, s2
; GFX10-NEXT:    v_mov_b32_e32 v4, s0
; GFX10-NEXT:    v_mov_b32_e32 v5, s4
; GFX10-NEXT:    v_mov_b32_e32 v6, s3
; GFX10-NEXT:    v_mov_b32_e32 v7, s5
; GFX10-NEXT:    v_mov_b32_e32 v9, s6
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:12
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:8
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:4
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:6
; GFX10-NEXT:    ds_write_b8 v0, v4
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v4 offset:2
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:13
; GFX10-NEXT:    ds_write_b8 v0, v6 offset:15
; GFX10-NEXT:    ds_write_b8 v0, v7 offset:9
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    s_lshr_b32 s1, s0, 8
; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    v_mov_b32_e32 v3, s0
; GFX10-NEXT:    ds_write_b8 v0, v8 offset:11
; GFX10-NEXT:    ds_write_b8 v0, v9 offset:5
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:1
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    s_lshr_b32 s4, s3, 8
; GFX11-NEXT:    s_lshr_b32 s3, s3, 24
; GFX11-NEXT:    s_lshr_b32 s5, s2, 8
; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT:    s_lshr_b32 s2, s2, 24
; GFX11-NEXT:    s_lshr_b32 s6, s1, 8
; GFX11-NEXT:    v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s5
; GFX11-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s6
; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s4
; GFX11-NEXT:    s_lshr_b32 s1, s1, 24
; GFX11-NEXT:    s_lshr_b32 s7, s0, 8
; GFX11-NEXT:    s_lshr_b32 s0, s0, 24
; GFX11-NEXT:    ds_store_b8 v0, v2 offset:8
; GFX11-NEXT:    ds_store_b8_d16_hi v0, v2 offset:10
; GFX11-NEXT:    ds_store_b8 v0, v1 offset:12
; GFX11-NEXT:    ds_store_b8 v0, v4
; GFX11-NEXT:    ds_store_b8_d16_hi v0, v4 offset:2
; GFX11-NEXT:    ds_store_b8 v0, v3 offset:4
; GFX11-NEXT:    ds_store_b8 v0, v5 offset:13
; GFX11-NEXT:    ds_store_b8_d16_hi v0, v1 offset:14
; GFX11-NEXT:    ds_store_b8 v0, v6 offset:15
; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v10, s7
; GFX11-NEXT:    v_mov_b32_e32 v11, s0
; GFX11-NEXT:    ds_store_b8 v0, v7 offset:9
; GFX11-NEXT:    ds_store_b8 v0, v8 offset:11
; GFX11-NEXT:    ds_store_b8 v0, v9 offset:5
; GFX11-NEXT:    ds_store_b8_d16_hi v0, v3 offset:6
; GFX11-NEXT:    ds_store_b8 v0, v1 offset:7
; GFX11-NEXT:    ds_store_b8 v0, v10 offset:1
; GFX11-NEXT:    ds_store_b8 v0, v11 offset:3
; GFX11-NEXT:    s_endpgm
  store <4 x i32> %x, ptr addrspace(3) %out, align 1
  ret void
}

; align 2: the vector is written as eight 16-bit stores. gfx900, gfx1010 and
; gfx1100 store the upper half of each dword with the *_d16_hi form; hawaii
; and tahiti first shift it down with s_lshr_b32 ..., 16.
define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
; GFX9-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    ds_write_b16 v0, v1
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX7-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
; GFX7-NEXT:    ds_write_b16 v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:14
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s6
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX6-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
; GFX6-NEXT:    ds_write_b16 v0, v1
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:14
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-NEXT:    v_mov_b32_e32 v4, s0
; GFX10-NEXT:    ds_write_b16 v0, v1 offset:12
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
; GFX10-NEXT:    ds_write_b16 v0, v2 offset:8
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
; GFX10-NEXT:    ds_write_b16 v0, v3 offset:4
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:6
; GFX10-NEXT:    ds_write_b16 v0, v4
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v4 offset:2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT:    v_mov_b32_e32 v4, s2
; GFX11-NEXT:    ds_store_b16_d16_hi v0, v1 offset:14
; GFX11-NEXT:    ds_store_b16 v0, v2
; GFX11-NEXT:    ds_store_b16 v0, v3 offset:4
; GFX11-NEXT:    ds_store_b16 v0, v4 offset:8
; GFX11-NEXT:    ds_store_b16 v0, v1 offset:12
; GFX11-NEXT:    ds_store_b16_d16_hi v0, v4 offset:10
; GFX11-NEXT:    ds_store_b16_d16_hi v0, v3 offset:6
; GFX11-NEXT:    ds_store_b16_d16_hi v0, v2 offset:2
; GFX11-NEXT:    s_endpgm
  store <4 x i32> %x, ptr addrspace(3) %out, align 2
  ret void
}

; align 4: every target writes the vector with two paired 32-bit stores
; (ds_write2_b32, spelled ds_store_2addr_b32 on gfx1100).
define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s6
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    v_mov_b32_e32 v2, s3
; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    v_mov_b32_e32 v2, s1
; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, s3
; GFX10-NEXT:    v_mov_b32_e32 v3, s0
; GFX10-NEXT:    v_mov_b32_e32 v4, s1
; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
; GFX10-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT:    v_mov_b32_e32 v4, s3
; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT:    ds_store_2addr_b32 v0, v3, v4 offset0:2 offset1:3
; GFX11-NEXT:    s_endpgm
  store <4 x i32> %x, ptr addrspace(3) %out, align 4
  ret void
}

; align 8: every target writes the vector with one paired 64-bit store
; (ds_write2_b64, spelled ds_store_2addr_b64 on gfx1100).
define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s6
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s6
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v4, s4
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    v_mov_b32_e32 v3, s3
; GFX6-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v4, s6
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v4, s6
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1
; GFX11-NEXT:    s_endpgm
  store <4 x i32> %x, ptr addrspace(3) %out, align 8
  ret void
}

; align 16: a single 128-bit store (ds_write_b128, spelled ds_store_b128 on
; gfx1100) is expected everywhere except tahiti, whose checks use
; ds_write2_b64 instead.
define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s6
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v4, s6
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v4i32_align16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    v_mov_b32_e32 v4, s4
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    v_mov_b32_e32 v3, s1
; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v4, s6
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v4i32_align16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v4, s6
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
; GFX11-NEXT:    ds_store_b128 v4, v[0:3]
; GFX11-NEXT:    s_endpgm
  store <4 x i32> %x, ptr addrspace(3) %out, align 16
  ret void
}