1*46e78230SJason Molenda /// BUILT with 2*46e78230SJason Molenda /// xcrun -sdk macosx.internal clang -mcpu=apple-m4 -g sme.c -o sme 3*46e78230SJason Molenda 4*46e78230SJason Molenda #include <stdint.h> 5*46e78230SJason Molenda #include <stdio.h> 6*46e78230SJason Molenda #include <stdlib.h> 7*46e78230SJason Molenda 8*46e78230SJason Molenda void write_sve_regs() { 9*46e78230SJason Molenda asm volatile("ptrue p0.b\n\t"); 10*46e78230SJason Molenda asm volatile("ptrue p1.h\n\t"); 11*46e78230SJason Molenda asm volatile("ptrue p2.s\n\t"); 12*46e78230SJason Molenda asm volatile("ptrue p3.d\n\t"); 13*46e78230SJason Molenda asm volatile("pfalse p4.b\n\t"); 14*46e78230SJason Molenda asm volatile("ptrue p5.b\n\t"); 15*46e78230SJason Molenda asm volatile("ptrue p6.h\n\t"); 16*46e78230SJason Molenda asm volatile("ptrue p7.s\n\t"); 17*46e78230SJason Molenda asm volatile("ptrue p8.d\n\t"); 18*46e78230SJason Molenda asm volatile("pfalse p9.b\n\t"); 19*46e78230SJason Molenda asm volatile("ptrue p10.b\n\t"); 20*46e78230SJason Molenda asm volatile("ptrue p11.h\n\t"); 21*46e78230SJason Molenda asm volatile("ptrue p12.s\n\t"); 22*46e78230SJason Molenda asm volatile("ptrue p13.d\n\t"); 23*46e78230SJason Molenda asm volatile("pfalse p14.b\n\t"); 24*46e78230SJason Molenda asm volatile("ptrue p15.b\n\t"); 25*46e78230SJason Molenda 26*46e78230SJason Molenda asm volatile("cpy z0.b, p0/z, #1\n\t"); 27*46e78230SJason Molenda asm volatile("cpy z1.b, p5/z, #2\n\t"); 28*46e78230SJason Molenda asm volatile("cpy z2.b, p10/z, #3\n\t"); 29*46e78230SJason Molenda asm volatile("cpy z3.b, p15/z, #4\n\t"); 30*46e78230SJason Molenda asm volatile("cpy z4.b, p0/z, #5\n\t"); 31*46e78230SJason Molenda asm volatile("cpy z5.b, p5/z, #6\n\t"); 32*46e78230SJason Molenda asm volatile("cpy z6.b, p10/z, #7\n\t"); 33*46e78230SJason Molenda asm volatile("cpy z7.b, p15/z, #8\n\t"); 34*46e78230SJason Molenda asm volatile("cpy z8.b, p0/z, #9\n\t"); 35*46e78230SJason Molenda asm volatile("cpy z9.b, p5/z, #10\n\t"); 36*46e78230SJason Molenda asm volatile("cpy z10.b, p10/z, #11\n\t"); 37*46e78230SJason Molenda asm volatile("cpy z11.b, p15/z, #12\n\t"); 38*46e78230SJason Molenda asm volatile("cpy z12.b, p0/z, #13\n\t"); 39*46e78230SJason Molenda asm volatile("cpy z13.b, p5/z, #14\n\t"); 40*46e78230SJason Molenda asm volatile("cpy z14.b, p10/z, #15\n\t"); 41*46e78230SJason Molenda asm volatile("cpy z15.b, p15/z, #16\n\t"); 42*46e78230SJason Molenda asm volatile("cpy z16.b, p0/z, #17\n\t"); 43*46e78230SJason Molenda asm volatile("cpy z17.b, p5/z, #18\n\t"); 44*46e78230SJason Molenda asm volatile("cpy z18.b, p10/z, #19\n\t"); 45*46e78230SJason Molenda asm volatile("cpy z19.b, p15/z, #20\n\t"); 46*46e78230SJason Molenda asm volatile("cpy z20.b, p0/z, #21\n\t"); 47*46e78230SJason Molenda asm volatile("cpy z21.b, p5/z, #22\n\t"); 48*46e78230SJason Molenda asm volatile("cpy z22.b, p10/z, #23\n\t"); 49*46e78230SJason Molenda asm volatile("cpy z23.b, p15/z, #24\n\t"); 50*46e78230SJason Molenda asm volatile("cpy z24.b, p0/z, #25\n\t"); 51*46e78230SJason Molenda asm volatile("cpy z25.b, p5/z, #26\n\t"); 52*46e78230SJason Molenda asm volatile("cpy z26.b, p10/z, #27\n\t"); 53*46e78230SJason Molenda asm volatile("cpy z27.b, p15/z, #28\n\t"); 54*46e78230SJason Molenda asm volatile("cpy z28.b, p0/z, #29\n\t"); 55*46e78230SJason Molenda asm volatile("cpy z29.b, p5/z, #30\n\t"); 56*46e78230SJason Molenda asm volatile("cpy z30.b, p10/z, #31\n\t"); 57*46e78230SJason Molenda asm volatile("cpy z31.b, p15/z, #32\n\t"); 58*46e78230SJason Molenda } 59*46e78230SJason Molenda 60*46e78230SJason Molenda #define MAX_VL_BYTES 256 61*46e78230SJason Molenda void set_za_register(int svl, int value_offset) { 62*46e78230SJason Molenda uint8_t data[MAX_VL_BYTES]; 63*46e78230SJason Molenda 64*46e78230SJason Molenda // ldr za will actually wrap the selected vector row, by the number of rows 65*46e78230SJason Molenda // you have. So setting one that didn't exist would actually set one that did. 66*46e78230SJason Molenda // That's why we need the streaming vector length here. 67*46e78230SJason Molenda for (int i = 0; i < svl; ++i) { 68*46e78230SJason Molenda // This may involve instructions that require the smefa64 extension. 69*46e78230SJason Molenda for (int j = 0; j < MAX_VL_BYTES; j++) 70*46e78230SJason Molenda data[j] = i + value_offset; 71*46e78230SJason Molenda // Each one of these loads a VL sized row of ZA. 72*46e78230SJason Molenda asm volatile("mov w12, %w0\n\t" 73*46e78230SJason Molenda "ldr za[w12, 0], [%1]\n\t" ::"r"(i), 74*46e78230SJason Molenda "r"(&data) 75*46e78230SJason Molenda : "w12"); 76*46e78230SJason Molenda } 77*46e78230SJason Molenda } 78*46e78230SJason Molenda 79*46e78230SJason Molenda static uint16_t arm_sme_svl_b(void) { 80*46e78230SJason Molenda uint64_t ret = 0; 81*46e78230SJason Molenda asm volatile("rdsvl %[ret], #1" : [ret] "=r"(ret)); 82*46e78230SJason Molenda return (uint16_t)ret; 83*46e78230SJason Molenda } 84*46e78230SJason Molenda 85*46e78230SJason Molenda void arm_sme2_set_zt0() { 86*46e78230SJason Molenda #define ZTO_LEN (512 / 8) 87*46e78230SJason Molenda uint8_t data[ZTO_LEN]; 88*46e78230SJason Molenda for (unsigned i = 0; i < ZTO_LEN; ++i) 89*46e78230SJason Molenda data[i] = i + 0; 90*46e78230SJason Molenda 91*46e78230SJason Molenda asm volatile("ldr zt0, [%0]" ::"r"(&data)); 92*46e78230SJason Molenda #undef ZT0_LEN 93*46e78230SJason Molenda } 94*46e78230SJason Molenda 95*46e78230SJason Molenda int main() { 96*46e78230SJason Molenda printf("Enable SME mode\n"); // break before sme 97*46e78230SJason Molenda 98*46e78230SJason Molenda asm volatile("smstart"); 99*46e78230SJason Molenda 100*46e78230SJason Molenda write_sve_regs(); 101*46e78230SJason Molenda 102*46e78230SJason Molenda set_za_register(arm_sme_svl_b(), 4); 103*46e78230SJason Molenda 104*46e78230SJason Molenda arm_sme2_set_zt0(); 105*46e78230SJason Molenda 106*46e78230SJason Molenda int c = 10; // break while sme 107*46e78230SJason Molenda c += 5; 108*46e78230SJason Molenda c += 5; 109*46e78230SJason Molenda 110*46e78230SJason Molenda asm volatile("smstop"); 111*46e78230SJason Molenda 112*46e78230SJason Molenda printf("SME mode disabled\n"); // break after sme 113*46e78230SJason Molenda } 114