; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Each kernel below stores one <3 x i32> (96 bits) through a
; ptr addrspace(3) argument; the kernels differ only in the alignment
; attached to the store. The assertions pin the exact instruction sequence
; llc emits for each of the five -mcpu values listed above, so they must be
; regenerated with the update script rather than edited by hand.

; Store with no explicit alignment.
; Expected on gfx900, hawaii, gfx1010 and gfx1100: a single 96-bit DS write
; (ds_write_b96 / ds_store_b96). Expected on tahiti: a 32-bit write at
; offset 8 followed by a 64-bit write at offset 0.
define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NEXT:    ; kill: killed $sgpr4_sgpr5
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v3i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, s6
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX10-NEXT:    ; kill: killed $sgpr4_sgpr5
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    ds_write_b96 v3, v[0:2]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v3i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    ds_store_b96 v3, v[0:2]
; GFX11-NEXT:    s_endpgm
  store <3 x i32> %x, ptr addrspace(3) %out
  ret void
}

; Store with align 1.
; Expected on every target: twelve single-byte DS writes covering offsets
; 0 through 11. gfx900, gfx1010 and gfx1100 use the *_d16_hi byte-write form
; for offsets 2, 6 and 10; hawaii and tahiti instead shift each dword right
; by 16 with s_lshr_b32 and use the plain byte write.
define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:8
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
; GFX9-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_lshr_b32 s2, s2, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_lshr_b32 s2, s1, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_lshr_b32 s1, s1, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_lshr_b32 s0, s0, 24
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v3i32_align1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s6
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    v_mov_b32_e32 v2, s1
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:8
; GFX6-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
; GFX6-NEXT:    ds_write_b8 v0, v1
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
; GFX6-NEXT:    v_mov_b32_e32 v1, s3
; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    v_mov_b32_e32 v3, s0
; GFX10-NEXT:    s_lshr_b32 s3, s2, 8
; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
; GFX10-NEXT:    s_lshr_b32 s4, s1, 8
; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
; GFX10-NEXT:    s_lshr_b32 s5, s0, 8
; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
; GFX10-NEXT:    v_mov_b32_e32 v4, s3
; GFX10-NEXT:    v_mov_b32_e32 v5, s2
; GFX10-NEXT:    v_mov_b32_e32 v6, s4
; GFX10-NEXT:    v_mov_b32_e32 v7, s1
; GFX10-NEXT:    v_mov_b32_e32 v8, s5
; GFX10-NEXT:    v_mov_b32_e32 v9, s0
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:8
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
; GFX10-NEXT:    ds_write_b8 v0, v3
; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:2
; GFX10-NEXT:    ds_write_b8 v0, v4 offset:9
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:11
; GFX10-NEXT:    ds_write_b8 v0, v6 offset:5
; GFX10-NEXT:    ds_write_b8 v0, v7 offset:7
; GFX10-NEXT:    ds_write_b8 v0, v8 offset:1
; GFX10-NEXT:    ds_write_b8 v0, v9 offset:3
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0
; GFX11-NEXT:    s_lshr_b32 s3, s2, 8
; GFX11-NEXT:    s_lshr_b32 s2, s2, 24
; GFX11-NEXT:    s_lshr_b32 s4, s1, 8
; GFX11-NEXT:    s_lshr_b32 s1, s1, 24
; GFX11-NEXT:    s_lshr_b32 s5, s0, 8
; GFX11-NEXT:    s_lshr_b32 s0, s0, 24
; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2
; GFX11-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s1
; GFX11-NEXT:    v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s0
; GFX11-NEXT:    ds_store_b8 v0, v1 offset:8
; GFX11-NEXT:    ds_store_b8 v0, v3
; GFX11-NEXT:    ds_store_b8_d16_hi v0, v3 offset:2
; GFX11-NEXT:    ds_store_b8 v0, v2 offset:4
; GFX11-NEXT:    ds_store_b8 v0, v4 offset:9
; GFX11-NEXT:    ds_store_b8_d16_hi v0, v1 offset:10
; GFX11-NEXT:    ds_store_b8 v0, v5 offset:11
; GFX11-NEXT:    ds_store_b8 v0, v6 offset:5
; GFX11-NEXT:    ds_store_b8_d16_hi v0, v2 offset:6
; GFX11-NEXT:    ds_store_b8 v0, v7 offset:7
; GFX11-NEXT:    ds_store_b8 v0, v8 offset:1
; GFX11-NEXT:    ds_store_b8 v0, v9 offset:3
; GFX11-NEXT:    s_endpgm
  store <3 x i32> %x, ptr addrspace(3) %out, align 1
  ret void
}

; Store with align 2.
; Expected on every target: six 16-bit DS writes at offsets 0, 2, 4, 6, 8
; and 10. gfx900, gfx1010 and gfx1100 write the upper halves with the
; *_d16_hi form; hawaii and tahiti shift right by 16 with s_lshr_b32 first.
define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    ds_write_b16 v0, v1 offset:8
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
; GFX9-NEXT:    ds_write_b16 v0, v2 offset:4
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:6
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    ds_write_b16 v0, v1
; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:8
; GFX7-NEXT:    ds_write_b16 v0, v2 offset:4
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
; GFX7-NEXT:    ds_write_b16 v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v3i32_align2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s6
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    v_mov_b32_e32 v2, s1
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:8
; GFX6-NEXT:    ds_write_b16 v0, v2 offset:4
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
; GFX6-NEXT:    ds_write_b16 v0, v1
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    v_mov_b32_e32 v3, s0
; GFX10-NEXT:    ds_write_b16 v0, v1 offset:8
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
; GFX10-NEXT:    ds_write_b16 v0, v2 offset:4
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:6
; GFX10-NEXT:    ds_write_b16 v0, v3
; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT:    ds_store_b16_d16_hi v0, v1 offset:10
; GFX11-NEXT:    ds_store_b16 v0, v2
; GFX11-NEXT:    ds_store_b16 v0, v3 offset:4
; GFX11-NEXT:    ds_store_b16 v0, v1 offset:8
; GFX11-NEXT:    ds_store_b16_d16_hi v0, v3 offset:6
; GFX11-NEXT:    ds_store_b16_d16_hi v0, v2 offset:2
; GFX11-NEXT:    s_endpgm
  store <3 x i32> %x, ptr addrspace(3) %out, align 2
  ret void
}

; Store with align 4.
; Expected on every target: one two-dword DS write with offset1:1
; (ds_write2_b32 / ds_store_2addr_b32) for the first two elements plus one
; 32-bit DS write at offset 8 for the third; only the order of the two
; writes differs between targets.
define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    ds_write_b32 v0, v3 offset:8
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    ds_write_b32 v0, v1 offset:8
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v3i32_align4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s6
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    ds_write_b32 v0, v1 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:8
; GFX10-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT:    ds_store_b32 v0, v3 offset:8
; GFX11-NEXT:    s_endpgm
  store <3 x i32> %x, ptr addrspace(3) %out, align 4
  ret void
}

; Store with align 8.
; Expected on every target: a 32-bit DS write at offset 8 followed by a
; 64-bit DS write at offset 0.
define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    ds_write_b32 v2, v1 offset:8
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    ds_write_b64 v2, v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v3i32_align8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, s6
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v2, s6
; GFX10-NEXT:    v_mov_b32_e32 v3, s2
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:8
; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    ds_store_b32 v2, v3 offset:8
; GFX11-NEXT:    ds_store_b64 v2, v[0:1]
; GFX11-NEXT:    s_endpgm
  store <3 x i32> %x, ptr addrspace(3) %out, align 8
  ret void
}

; Store with align 16.
; Expected on gfx900, hawaii, gfx1010 and gfx1100: a single 96-bit DS write
; (ds_write_b96 / ds_store_b96). Expected on tahiti: a 32-bit write at
; offset 8 followed by a 64-bit write at offset 0.
define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX9-NEXT:    ; kill: killed $sgpr4_sgpr5
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
; GFX7-NEXT:    s_endpgm
;
; GFX6-LABEL: store_lds_v3i32_align16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, s6
; GFX6-NEXT:    v_mov_b32_e32 v1, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
; GFX6-NEXT:    s_endpgm
;
; GFX10-LABEL: store_lds_v3i32_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX10-NEXT:    ; kill: killed $sgpr4_sgpr5
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    ds_write_b96 v3, v[0:2]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_lds_v3i32_align16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    ds_store_b96 v3, v[0:2]
; GFX11-NEXT:    s_endpgm
  store <3 x i32> %x, ptr addrspace(3) %out, align 16
  ret void
}