1 /*
2 * intel pci-express 10Gb ethernet driver for 8259[89]
3 * copyright © 2007, coraid, inc.
4 * depessimised and made to work on the 82599 at bell labs, 2013.
5 *
6 * 82599 requests should ideally not cross a 4KB (page) boundary.
7 */
8 #include "u.h"
9 #include "../port/lib.h"
10 #include "mem.h"
11 #include "dat.h"
12 #include "fns.h"
13 #include "io.h"
14 #include "../port/error.h"
15 #include "../port/netif.h"
16 #include "etherif.h"
17
/*
 * advance ring index x by one, wrapping with mask m.
 * callers in this file pass m = ring size - 1, so the ring size
 * must be a power of 2 for the wrap to be correct.
 */
#define NEXTPOW2(x, m)	(((x)+1) & (m))
19
/* buffer size, descriptor alignment and ring-size tunables */
enum {
	Rbsz	= ETHERMAXTU+32, /* +slop is for vlan headers, crcs, etc. */
	Descalign= 128,		/* 599 manual needs 128-byte alignment */

	/* tunable parameters */
	Goslow	= 0,		/* flag: go slow by throttling intrs, etc. */
	/*
	 * these were 256, 1024 & 128, but 30, 47 and 1 are usually ample;
	 * however cpu servers and terminals can need more receive buffers
	 * due to bursts of traffic.
	 */
	Nrd	= 128,		/* rx ring size; multiple of 8, power of 2 for NEXTPOW2 */
	Nrb	= 1024,		/* attach() puts 2*Nrb receive buffers in the pool */
	Ntd	= 32,		/* tx ring size; multiple of 8, power of 2 for NEXTPOW2 */
};
35
/*
 * device register offsets.  each is the byte offset divided by 4
 * because the registers are accessed as elements of the u32int
 * array Ctlr.reg.
 */
enum {
	/* general */
	Ctrl		= 0x00000/4,	/* Device Control */
	Status		= 0x00008/4,	/* Device Status */
	Ctrlext		= 0x00018/4,	/* Extended Device Control */
	Esdp		= 0x00020/4,	/* extended sdp control */
	Esodp		= 0x00028/4,	/* extended od sdp control (i2cctl on 599) */
	Ledctl		= 0x00200/4,	/* led control */
	Tcptimer	= 0x0004c/4,	/* tcp timer */
	Ecc		= 0x110b0/4,	/* errata ecc control magic (pcie intr cause on 599) */

	/* nvm */
	Eec		= 0x10010/4,	/* eeprom/flash control */
	Eerd		= 0x10014/4,	/* eeprom read */
	Fla		= 0x1001c/4,	/* flash access */
	Flop		= 0x1013c/4,	/* flash opcode */
	Grc		= 0x10200/4,	/* general rx control */

	/* interrupt */
	Icr		= 0x00800/4,	/* interrupt cause read */
	Ics		= 0x00808/4,	/* " set */
	Ims		= 0x00880/4,	/* " mask read/set (actually enable) */
	Imc		= 0x00888/4,	/* " mask clear */
	Iac		= 0x00810/4,	/* " auto clear */
	Iam		= 0x00890/4,	/* " auto mask enable */
	Itr		= 0x00820/4,	/* " throttling rate regs (0-19) */
	Ivar		= 0x00900/4,	/* " vector allocation regs. */
	/* msi interrupt */
	Msixt		= 0x0000/4,	/* msix table (bar3) */
	Msipba		= 0x2000/4,	/* msix pending bit array (bar3) */
	Pbacl		= 0x11068/4,	/* pba clear */
	Gpie		= 0x00898/4,	/* general purpose int enable */

	/* flow control */
	Pfctop		= 0x03008/4,	/* priority flow ctl type opcode */
	Fcttv		= 0x03200/4,	/* " transmit timer value (0-3) */
	Fcrtl		= 0x03220/4,	/* " rx threshold low (0-7) +8n */
	Fcrth		= 0x03260/4,	/* " rx threshold high (0-7) +8n */
	Rcrtv		= 0x032a0/4,	/* " refresh value threshold */
	Tfcs		= 0x0ce00/4,	/* " tx status */

	/* rx dma */
	Rbal		= 0x01000/4,	/* rx desc base low (0-63) +0x40n */
	Rbah		= 0x01004/4,	/* " high */
	Rdlen		= 0x01008/4,	/* " length */
	Rdh		= 0x01010/4,	/* " head */
	Rdt		= 0x01018/4,	/* " tail */
	Rxdctl		= 0x01028/4,	/* " control */

	Srrctl		= 0x02100/4,	/* split & replication rx ctl. array */
	Dcarxctl	= 0x02200/4,	/* rx dca control */
	Rdrxctl		= 0x02f00/4,	/* rx dma control */
	Rxpbsize	= 0x03c00/4,	/* rx packet buffer size */
	Rxctl		= 0x03000/4,	/* rx control */
	Dropen		= 0x03d04/4,	/* drop enable control (598 only) */

	/* rx */
	Rxcsum		= 0x05000/4,	/* rx checksum control */
	Rfctl		= 0x05008/4,	/* rx filter control */
	Mta		= 0x05200/4,	/* multicast table array (0-127) */
	Ral98		= 0x05400/4,	/* rx address low (598) */
	Rah98		= 0x05404/4,	/* rx address high (598) */
	Ral99		= 0x0a200/4,	/* rx address low array (599) */
	Rah99		= 0x0a204/4,	/* rx address high array (599) */
	Psrtype		= 0x05480/4,	/* packet split rx type. */
	Vfta		= 0x0a000/4,	/* vlan filter table array. */
	Fctrl		= 0x05080/4,	/* filter control */
	Vlnctrl		= 0x05088/4,	/* vlan control */
	Msctctrl	= 0x05090/4,	/* multicast control */
	Mrqc		= 0x05818/4,	/* multiple rx queues cmd */
	Vmdctl		= 0x0581c/4,	/* vmdq control (598 only) */
	Imir		= 0x05a80/4,	/* immediate irq rx (0-7) (598 only) */
	Imirext		= 0x05aa0/4,	/* immediate irq rx ext (598 only) */
	Imirvp		= 0x05ac0/4,	/* immediate irq vlan priority (598 only) */
	Reta		= 0x05c00/4,	/* redirection table */
	Rssrk		= 0x05c80/4,	/* rss random key */

	/* tx */
	Tdbal		= 0x06000/4,	/* tx desc base low +0x40n array */
	Tdbah		= 0x06004/4,	/* " high */
	Tdlen		= 0x06008/4,	/* " len */
	Tdh		= 0x06010/4,	/* " head */
	Tdt		= 0x06018/4,	/* " tail */
	Txdctl		= 0x06028/4,	/* " control */
	Tdwbal		= 0x06038/4,	/* " write-back address low */
	Tdwbah		= 0x0603c/4,	/* " write-back address high */

	Dtxctl98	= 0x07e00/4,	/* tx dma control (598 only) */
	Dtxctl99	= 0x04a80/4,	/* tx dma control (599 only) */
	Tdcatxctrl98	= 0x07200/4,	/* tx dca register (0-15) (598 only) */
	Tdcatxctrl99	= 0x0600c/4,	/* tx dca register (0-127) (599 only) */
	Tipg		= 0x0cb00/4,	/* tx inter-packet gap (598 only) */
	Txpbsize	= 0x0cc00/4,	/* tx packet-buffer size (0-15) */

	/* mac */
	Hlreg0		= 0x04240/4,	/* highlander control reg 0 */
	Hlreg1		= 0x04244/4,	/* highlander control reg 1 (ro) */
	Msca		= 0x0425c/4,	/* mdi signal cmd & addr */
	Msrwd		= 0x04260/4,	/* mdi single rw data */
	Mhadd		= 0x04268/4,	/* mac addr high & max frame */
	Pcss1		= 0x04288/4,	/* xgxs status 1 */
	Pcss2		= 0x0428c/4,
	Xpcss		= 0x04290/4,	/* 10gb-x pcs status */
	Serdesc		= 0x04298/4,	/* serdes control */
	Macs		= 0x0429c/4,	/* fifo control & report */
	Autoc		= 0x042a0/4,	/* autodetect control & status */
	Links		= 0x042a4/4,	/* link status */
	Links2		= 0x04324/4,	/* 599 only */
	Autoc2		= 0x042a8/4,
};
146
/* Ctlr.flag value, then bit (or shift) definitions grouped by register */
enum {
	Factive		= 1<<0,		/* Ctlr.flag: controller claimed by pnp() */
	Enable		= 1<<31,	/* used with Rah9[89], Fcrtl and Fcrth */

	/* Ctrl */
	Rst		= 1<<26,	/* full nic reset */

	/* Txdctl */
	Ten		= 1<<25,

	/* Dtxctl99 */
	Te		= 1<<0,		/* dma tx enable */

	/* Fctrl */
	Bam		= 1<<10,	/* broadcast accept mode */
	Upe		= 1<<9,		/* unicast promiscuous */
	Mpe		= 1<<8,		/* multicast promiscuous */

	/* Rxdctl */
	Pthresh		= 0,		/* prefetch threshold shift in bits */
	Hthresh		= 8,		/* host buffer minimum threshold " */
	Wthresh		= 16,		/* writeback threshold */
	Renable		= 1<<25,

	/* Rxctl */
	Rxen		= 1<<0,
	Dmbyps		= 1<<1,		/* descr. monitor bypass (598 only) */

	/* Rdrxctl */
	Rdmt½		= 0,		/* 598 */
	Rdmt¼		= 1,		/* 598 */
	Rdmt⅛		= 2,		/* 598 */
	Crcstrip	= 1<<1,		/* 599 */
	Rscfrstsize	= 037<<17,	/* 599; should be zero */

	/* Rxcsum */
	Ippcse		= 1<<12,	/* ip payload checksum enable */

	/* Eerd */
	EEstart		= 1<<0,		/* Start Read */
	EEdone		= 1<<1,		/* Read done */

	/* interrupts */
	Irx0		= 1<<0,		/* driver defined */
	Itx0		= 1<<1,		/* driver defined */
	Lsc		= 1<<20,	/* link status change */

	/* Links */
	Lnkup		= 1<<30,
	Lnkspd		= 1<<29,

	/* Hlreg0 */
	Txcrcen		= 1<<0,		/* add crc during xmit */
	Rxcrcstrip	= 1<<1,		/* strip crc during recv */
	Jumboen		= 1<<2,
	Txpaden		= 1<<10,	/* pad short frames during xmit */

	/* Autoc */
	Flu		= 1<<0,		/* force link up */
	Lmsshift	= 13,		/* link mode select shift */
	Lmsmask		= 7,
};
209
typedef struct Ctlr Ctlr;
typedef struct Rd Rd;
typedef struct Td Td;

/* one statistics register: its byte offset and the label ifstat prints */
typedef struct {
	uint	reg;
	char	*name;
} Stat;
218
/*
 * statistics registers accumulated by readstats() into Ctlr.stats,
 * in the same order.  reg is a byte offset (readstats converts it to
 * a u32int index); ifstat() prints at most the first 10 characters
 * of name.
 */
Stat stattab[] = {
	0x4000,	"crc error",
	0x4004,	"illegal byte",
	0x4008,	"short packet",
	0x3fa0,	"missed pkt0",
	0x4034,	"mac local flt",
	0x4038,	"mac rmt flt",
	0x4040,	"rx length err",
	0x3f60,	"xon tx",
	0xcf60,	"xon rx",
	0x3f68,	"xoff tx",
	0xcf68,	"xoff rx",
	0x405c,	"rx 040",
	0x4060,	"rx 07f",
	0x4064,	"rx 100",
	0x4068,	"rx 200",
	0x406c,	"rx 3ff",
	0x4070,	"rx big",
	0x4074,	"rx ok",
	0x4078,	"rx bcast",
	0x3fc0,	"rx no buf0",
	0x40a4,	"rx runt",
	0x40a8,	"rx frag",
	0x40ac,	"rx ovrsz",
	0x40b0,	"rx jab",
	0x40d0,	"rx pkt",

	0x40d4,	"tx pkt",
	0x40d8,	"tx 040",
	0x40dc,	"tx 07f",
	0x40e0,	"tx 100",
	0x40e4,	"tx 200",
	0x40e8,	"tx 3ff",
	0x40ec,	"tx big",
	0x40f4,	"tx bcast",
	0x4120,	"xsum err",
};
256
/* receive descriptor (Rd) status bits */
enum {
	Pif	= 1<<7,	/* past exact filter (sic) */
	Ipcs	= 1<<6,	/* ip checksum calculated */
	L4cs	= 1<<5,	/* layer 4 checksum calculated (presumably; was labelled "layer 2") */
	Tcpcs	= 1<<4,	/* tcp checksum calculated */
	Vp	= 1<<3,	/* 802.1q packet matched vet */
	Ixsm	= 1<<2,	/* ignore checksum */
	Reop	= 1<<1,	/* end of packet */
	Rdd	= 1<<0,	/* descriptor done */
};
268
/*
 * receive descriptor as laid out in the rx ring (Ctlr.rdba).
 * replenish() fills in addr[0] and clears status; rproc() reads
 * status and length once Rdd is set.
 */
struct Rd {			/* Receive Descriptor */
	u32int	addr[2];	/* buffer address; only addr[0] is written here */
	ushort	length;		/* received byte count, read by rproc() */
	ushort	cksum;
	uchar	status;		/* Rdd, Reop, etc. */
	uchar	errors;
	ushort	vlan;
};
277
/* transmit descriptor (Td) command and status bits */
enum {
	/* Td cmd */
	Rs	= 1<<3,		/* report status */
	Ic	= 1<<2,		/* insert checksum */
	Ifcs	= 1<<1,		/* insert FCS (ethernet crc) */
	Teop	= 1<<0,		/* end of packet */

	/* Td status */
	Tdd	= 1<<0,		/* descriptor done */
};
288
/*
 * transmit descriptor as laid out in the tx ring (Ctlr.tdba).
 * transmit() fills in addr[0], length and cmd; cleanup() tests and
 * clears status.
 */
struct Td {			/* Transmit Descriptor */
	u32int	addr[2];	/* buffer address; only addr[0] is written here */
	ushort	length;		/* bytes to send */
	uchar	cso;
	uchar	cmd;		/* Rs, Ifcs, Teop, etc. */
	uchar	status;		/* Tdd */
	uchar	css;
	ushort	vlan;
};
298
299 struct Ctlr {
300 Pcidev *p;
301 Ether *edev;
302 int type;
303
304 /* virtual */
305 u32int *reg;
306 u32int *msix; /* unused */
307
308 /* physical */
309 u32int *physreg;
310 u32int *physmsix; /* unused */
311
312 uchar flag;
313 int nrd;
314 int ntd;
315 int nrb; /* # bufs this Ctlr has in the pool */
316 uint rbsz;
317 int procsrunning;
318 int attached;
319
320 Watermark wmrb;
321 Watermark wmrd;
322 Watermark wmtd;
323
324 QLock slock;
325 QLock alock; /* attach lock */
326 QLock tlock;
327 Rendez lrendez;
328 Rendez trendez;
329 Rendez rrendez;
330
331 uint im; /* interrupt mask */
332 uint lim;
333 uint rim;
334 uint tim;
335 Lock imlock;
336
337 Rd* rdba; /* receive descriptor base address */
338 Block** rb; /* receive buffers */
339 int rdt; /* receive descriptor tail */
340 int rdfree; /* rx descriptors awaiting packets */
341
342 Td* tdba; /* transmit descriptor base address */
343 int tdh; /* transmit descriptor head */
344 int tdt; /* transmit descriptor tail */
345 Block** tb; /* transmit buffers */
346
347 uchar ra[Eaddrlen]; /* receive address */
348 uchar mta[128]; /* multicast table array */
349 ulong stats[nelem(stattab)];
350 uint speeds[3];
351 };
352
/* values of Ctlr.type, assigned by scan() from the pci device id */
enum {
	I82598 = 1,
	I82599,
};
357
static Ctlr	*ctlrtab[4];	/* controllers found by scan() */
static int	nctlr;		/* # of entries in use in ctlrtab */
static Lock	rblock;		/* guards rbpool and nrbfull */
static Block	*rbpool;	/* free receive buffers, linked through next */
static int	nrbfull;	/* # of rcv Blocks with data awaiting processing */
363
364 static void
readstats(Ctlr * ctlr)365 readstats(Ctlr *ctlr)
366 {
367 int i;
368
369 qlock(&ctlr->slock);
370 for(i = 0; i < nelem(ctlr->stats); i++)
371 ctlr->stats[i] += ctlr->reg[stattab[i].reg >> 2];
372 qunlock(&ctlr->slock);
373 }
374
/*
 * link speed in mbps, indexed by the link state lproc() computes:
 * 0 when the link is down, 1 when up with Lnkspd clear,
 * 2 when up with Lnkspd set.
 */
static int speedtab[] = {
	0,
	1000,
	10000,
};
380
381 static long
ifstat(Ether * edev,void * a,long n,ulong offset)382 ifstat(Ether *edev, void *a, long n, ulong offset)
383 {
384 uint i, *t;
385 char *s, *p, *e;
386 Ctlr *ctlr;
387
388 ctlr = edev->ctlr;
389 p = s = malloc(READSTR);
390 if(p == nil)
391 error(Enomem);
392 e = p + READSTR;
393
394 readstats(ctlr);
395 for(i = 0; i < nelem(stattab); i++)
396 if(ctlr->stats[i] > 0)
397 p = seprint(p, e, "%.10s %uld\n", stattab[i].name,
398 ctlr->stats[i]);
399 t = ctlr->speeds;
400 p = seprint(p, e, "speeds: 0:%d 1000:%d 10000:%d\n", t[0], t[1], t[2]);
401 p = seprint(p, e, "mtu: min:%d max:%d\n", edev->minmtu, edev->maxmtu);
402 p = seprint(p, e, "rdfree %d rdh %d rdt %d\n", ctlr->rdfree, ctlr->reg[Rdt],
403 ctlr->reg[Rdh]);
404 p = seprintmark(p, e, &ctlr->wmrb);
405 p = seprintmark(p, e, &ctlr->wmrd);
406 p = seprintmark(p, e, &ctlr->wmtd);
407 USED(p);
408 n = readstr(offset, a, n, s);
409 free(s);
410
411 return n;
412 }
413
414 static void
ienable(Ctlr * ctlr,int i)415 ienable(Ctlr *ctlr, int i)
416 {
417 ilock(&ctlr->imlock);
418 ctlr->im |= i;
419 ctlr->reg[Ims] = ctlr->im;
420 iunlock(&ctlr->imlock);
421 }
422
423 static int
lim(void * v)424 lim(void *v)
425 {
426 return ((Ctlr*)v)->lim != 0;
427 }
428
429 static void
lproc(void * v)430 lproc(void *v)
431 {
432 int r, i;
433 Ctlr *ctlr;
434 Ether *e;
435
436 e = v;
437 ctlr = e->ctlr;
438 for (;;) {
439 r = ctlr->reg[Links];
440 e->link = (r & Lnkup) != 0;
441 i = 0;
442 if(e->link)
443 i = 1 + ((r & Lnkspd) != 0);
444 ctlr->speeds[i]++;
445 e->mbps = speedtab[i];
446 ctlr->lim = 0;
447 ienable(ctlr, Lsc);
448 sleep(&ctlr->lrendez, lim, ctlr);
449 ctlr->lim = 0;
450 }
451 }
452
/*
 * control-message handler: no messages are supported, so every
 * request raises Ebadarg.  the return only satisfies the prototype.
 */
static long
ctl(Ether *, void *, long)
{
	error(Ebadarg);
	return -1;
}
459
460 static Block*
rballoc(void)461 rballoc(void)
462 {
463 Block *bp;
464
465 ilock(&rblock);
466 if((bp = rbpool) != nil){
467 rbpool = bp->next;
468 bp->next = 0;
469 ainc(&bp->ref); /* prevent bp from being freed */
470 }
471 iunlock(&rblock);
472 return bp;
473 }
474
475 void
rbfree(Block * b)476 rbfree(Block *b)
477 {
478 b->rp = b->wp = (uchar*)PGROUND((uintptr)b->base);
479 b->flag &= ~(Bipck | Budpck | Btcpck | Bpktck);
480 ilock(&rblock);
481 b->next = rbpool;
482 rbpool = b;
483 nrbfull--;
484 iunlock(&rblock);
485 }
486
487 static int
cleanup(Ctlr * ctlr,int tdh)488 cleanup(Ctlr *ctlr, int tdh)
489 {
490 Block *b;
491 uint m, n;
492
493 m = ctlr->ntd - 1;
494 while(ctlr->tdba[n = NEXTPOW2(tdh, m)].status & Tdd){
495 tdh = n;
496 b = ctlr->tb[tdh];
497 ctlr->tb[tdh] = 0;
498 if (b)
499 freeb(b);
500 ctlr->tdba[tdh].status = 0;
501 }
502 return tdh;
503 }
504
505 void
transmit(Ether * e)506 transmit(Ether *e)
507 {
508 uint i, m, tdt, tdh;
509 Ctlr *ctlr;
510 Block *b;
511 Td *t;
512
513 ctlr = e->ctlr;
514 if(!canqlock(&ctlr->tlock)){
515 ienable(ctlr, Itx0);
516 return;
517 }
518 tdh = ctlr->tdh = cleanup(ctlr, ctlr->tdh);
519 tdt = ctlr->tdt;
520 m = ctlr->ntd - 1;
521 for(i = 0; ; i++){
522 if(NEXTPOW2(tdt, m) == tdh){ /* ring full? */
523 ienable(ctlr, Itx0);
524 break;
525 }
526 if((b = qget(e->oq)) == nil)
527 break;
528 assert(ctlr->tdba != nil);
529 t = ctlr->tdba + tdt;
530 t->addr[0] = PCIWADDR(b->rp);
531 t->length = BLEN(b);
532 t->cmd = Ifcs | Teop;
533 if (!Goslow)
534 t->cmd |= Rs;
535 ctlr->tb[tdt] = b;
536 /* note size of queue of tds awaiting transmission */
537 notemark(&ctlr->wmtd, (tdt + Ntd - tdh) % Ntd);
538 tdt = NEXTPOW2(tdt, m);
539 }
540 if(i) {
541 coherence();
542 ctlr->reg[Tdt] = ctlr->tdt = tdt; /* make new Tds active */
543 coherence();
544 ienable(ctlr, Itx0);
545 }
546 qunlock(&ctlr->tlock);
547 }
548
549 static int
tim(void * c)550 tim(void *c)
551 {
552 return ((Ctlr*)c)->tim != 0;
553 }
554
555 static void
tproc(void * v)556 tproc(void *v)
557 {
558 Ctlr *ctlr;
559 Ether *e;
560
561 e = v;
562 ctlr = e->ctlr;
563 for (;;) {
564 sleep(&ctlr->trendez, tim, ctlr); /* xmit interrupt kicks us */
565 ctlr->tim = 0;
566 transmit(e);
567 }
568 }
569
/*
 * (re)initialise the receive side: disable reception, release any
 * Blocks still held in the rx ring, program the filters, buffer size
 * and descriptor ring registers, then enable the queue and the
 * receiver.  the statement order is a hardware programming sequence.
 */
static void
rxinit(Ctlr *ctlr)
{
	int i, is598, autoc;
	ulong until;
	Block *b;

	/* quiesce the receiver and queue before touching the ring */
	ctlr->reg[Rxctl] &= ~Rxen;
	ctlr->reg[Rxdctl] = 0;
	for(i = 0; i < ctlr->nrd; i++){
		b = ctlr->rb[i];
		ctlr->rb[i] = 0;
		if(b)
			freeb(b);
	}
	ctlr->rdfree = 0;

	coherence();
	/* accept broadcasts; not promiscuous */
	ctlr->reg[Fctrl] |= Bam;
	ctlr->reg[Fctrl] &= ~(Upe | Mpe);

	/* intel gets some csums wrong (e.g., errata 44) */
	ctlr->reg[Rxcsum] &= ~Ippcse;
	ctlr->reg[Hlreg0] &= ~Jumboen;		/* jumbos are a bad idea */
	ctlr->reg[Hlreg0] |= Txcrcen | Rxcrcstrip | Txpaden;
	/* buffer size rounded up to a multiple of 1024, in units of 1024 */
	ctlr->reg[Srrctl] = (ctlr->rbsz + 1024 - 1) / 1024;
	ctlr->reg[Mhadd] = ctlr->rbsz << 16;

	ctlr->reg[Rbal] = PCIWADDR(ctlr->rdba);
	ctlr->reg[Rbah] = 0;
	ctlr->reg[Rdlen] = ctlr->nrd*sizeof(Rd); /* must be multiple of 128 */
	ctlr->reg[Rdh] = 0;
	ctlr->reg[Rdt] = ctlr->rdt = 0;
	coherence();

	is598 = (ctlr->type == I82598);
	if (is598)
		ctlr->reg[Rdrxctl] = Rdmt¼;
	else {
		ctlr->reg[Rdrxctl] |= Crcstrip;
		ctlr->reg[Rdrxctl] &= ~Rscfrstsize;
	}
	if (Goslow && is598)
		ctlr->reg[Rxdctl] = 8<<Wthresh | 8<<Pthresh | 4<<Hthresh | Renable;
	else
		ctlr->reg[Rxdctl] = Renable;
	coherence();

	/*
	 * don't wait forever like an idiot (and hang the system),
	 * maybe it's disconnected.
	 */
	until = TK2MS(MACHP(0)->ticks) + 250;	/* give up after 250ms */
	while (!(ctlr->reg[Rxdctl] & Renable) && TK2MS(MACHP(0)->ticks) < until)
		;
	if(!(ctlr->reg[Rxdctl] & Renable))
		print("#l%d: Renable didn't come on, might be disconnected\n",
			ctlr->edev->ctlrno);

	ctlr->reg[Rxctl] |= Rxen | (is598? Dmbyps: 0);

	if (is598){
		autoc = ctlr->reg[Autoc];
		/* what is this rubbish and why do we care? */
		print("#l%d: autoc %#ux; lms %d (3 is 10g sfp)\n",
			ctlr->edev->ctlrno, autoc, (autoc>>Lmsshift) & Lmsmask);
		ctlr->reg[Autoc] |= Flu;
		coherence();
		delay(50);
	}
}
641
642 static void
replenish(Ctlr * ctlr,uint rdh)643 replenish(Ctlr *ctlr, uint rdh)
644 {
645 int rdt, m, i;
646 Block *b;
647 Rd *r;
648
649 m = ctlr->nrd - 1;
650 i = 0;
651 for(rdt = ctlr->rdt; NEXTPOW2(rdt, m) != rdh; rdt = NEXTPOW2(rdt, m)){
652 r = ctlr->rdba + rdt;
653 if((b = rballoc()) == nil){
654 print("#l%d: no buffers\n", ctlr->edev->ctlrno);
655 break;
656 }
657 ctlr->rb[rdt] = b;
658 r->addr[0] = PCIWADDR(b->rp);
659 r->status = 0;
660 ctlr->rdfree++;
661 i++;
662 }
663 if(i) {
664 coherence();
665 ctlr->reg[Rdt] = ctlr->rdt = rdt; /* hand back recycled rdescs */
666 coherence();
667 }
668 }
669
670 static int
rim(void * v)671 rim(void *v)
672 {
673 return ((Ctlr*)v)->rim != 0;
674 }
675
/*
 * receive kproc: refill the rx ring, enable the receive interrupt
 * and sleep until interrupt() notes one; then pass every completed
 * descriptor's Block up via etheriq(), refilling the ring as it
 * drains.  rdh is the driver's own head index into the ring.
 * never returns.
 */
void
rproc(void *v)
{
	int passed;
	uint m, rdh;
	Block *bp;
	Ctlr *ctlr;
	Ether *e;
	Rd *r;

	e = v;
	ctlr = e->ctlr;
	m = ctlr->nrd - 1;
	for (rdh = 0; ; ) {
		replenish(ctlr, rdh);
		ienable(ctlr, Irx0);
		sleep(&ctlr->rrendez, rim, ctlr);
		passed = 0;
		for (;;) {
			ctlr->rim = 0;
			r = ctlr->rdba + rdh;
			if(!(r->status & Rdd))
				break;		/* wait for pkts to arrive */
			bp = ctlr->rb[rdh];
			ctlr->rb[rdh] = 0;
			/* oversize packets are reported but still passed up */
			if (r->length > ETHERMAXTU)
				print("#l%d: got jumbo of %d bytes\n",
					e->ctlrno, r->length);
			bp->wp += r->length;
			bp->lim = bp->wp;	/* lie like a dog */
//			r->status = 0;

			/* count the buffer as awaiting processing; rbfree() decrements */
			ilock(&rblock);
			nrbfull++;
			iunlock(&rblock);
			notemark(&ctlr->wmrb, nrbfull);
			etheriq(e, bp, 1);

			passed++;
			ctlr->rdfree--;
			rdh = NEXTPOW2(rdh, m);
			/* top up once at least 16 descriptors are without buffers */
			if (ctlr->rdfree <= ctlr->nrd - 16)
				replenish(ctlr, rdh);
		}
		/* note how many rds had full buffers */
		notemark(&ctlr->wmrd, passed);
	}
}
724
725 static void
promiscuous(void * a,int on)726 promiscuous(void *a, int on)
727 {
728 Ctlr *ctlr;
729 Ether *e;
730
731 e = a;
732 ctlr = e->ctlr;
733 if(on)
734 ctlr->reg[Fctrl] |= Upe | Mpe;
735 else
736 ctlr->reg[Fctrl] &= ~(Upe | Mpe);
737 }
738
739 static void
multicast(void * a,uchar * ea,int on)740 multicast(void *a, uchar *ea, int on)
741 {
742 int b, i;
743 Ctlr *ctlr;
744 Ether *e;
745
746 e = a;
747 ctlr = e->ctlr;
748
749 /*
750 * multiple ether addresses can hash to the same filter bit,
751 * so it's never safe to clear a filter bit.
752 * if we want to clear filter bits, we need to keep track of
753 * all the multicast addresses in use, clear all the filter bits,
754 * then set the ones corresponding to in-use addresses.
755 */
756 i = ea[5] >> 1;
757 b = (ea[5]&1)<<4 | ea[4]>>4;
758 b = 1 << b;
759 if(on)
760 ctlr->mta[i] |= b;
761 // else
762 // ctlr->mta[i] &= ~b;
763 ctlr->reg[Mta+i] = ctlr->mta[i];
764 }
765
766 static void
freemem(Ctlr * ctlr)767 freemem(Ctlr *ctlr)
768 {
769 Block *b;
770
771 while(b = rballoc()){
772 b->free = 0;
773 freeb(b);
774 }
775 free(ctlr->rdba);
776 ctlr->rdba = nil;
777 free(ctlr->tdba);
778 ctlr->tdba = nil;
779 free(ctlr->rb);
780 ctlr->rb = nil;
781 free(ctlr->tb);
782 ctlr->tb = nil;
783 }
784
785 static int
detach(Ctlr * ctlr)786 detach(Ctlr *ctlr)
787 {
788 int i, is598;
789
790 ctlr->reg[Imc] = ~0;
791 ctlr->reg[Ctrl] |= Rst;
792 for(i = 0; i < 100; i++){
793 delay(1);
794 if((ctlr->reg[Ctrl] & Rst) == 0)
795 break;
796 }
797 if (i >= 100)
798 return -1;
799 is598 = (ctlr->type == I82598);
800 if (is598) { /* errata */
801 delay(50);
802 ctlr->reg[Ecc] &= ~(1<<21 | 1<<18 | 1<<9 | 1<<6);
803 }
804
805 /* not cleared by reset; kill it manually. */
806 for(i = 1; i < 16; i++)
807 ctlr->reg[is598? Rah98: Rah99] &= ~Enable;
808 for(i = 0; i < 128; i++)
809 ctlr->reg[Mta + i] = 0;
810 for(i = 1; i < (is598? 640: 128); i++)
811 ctlr->reg[Vfta + i] = 0;
812
813 // freemem(ctlr); // TODO
814 ctlr->attached = 0;
815 return 0;
816 }
817
/*
 * Ether shutdown and detach hook: reset the controller via detach().
 * detach's status is ignored and driver memory is not freed.
 */
static void
shutdown(Ether *e)
{
	detach(e->ctlr);
//	freemem(e->ctlr);
}
824
825 /* ≤ 20ms */
826 static ushort
eeread(Ctlr * ctlr,int i)827 eeread(Ctlr *ctlr, int i)
828 {
829 ctlr->reg[Eerd] = EEstart | i<<2;
830 while((ctlr->reg[Eerd] & EEdone) == 0)
831 ;
832 return ctlr->reg[Eerd] >> 16;
833 }
834
/*
 * validate the eeprom and load the mac address into ctlr->ra.
 * returns 0 on success, -1 if the signature bits of word 0 or the
 * checksum are wrong.
 */
static int
eeload(Ctlr *ctlr)
{
	ushort u, v, p, l, i, j;

	if((eeread(ctlr, 0) & 0xc0) != 0x40)
		return -1;
	/* checksum: the first 0x40 words ... */
	u = 0;
	for(i = 0; i < 0x40; i++)
		u +=  eeread(ctlr, i);
	/*
	 * ... plus each section pointed to by words 3-0xe; the word at
	 * the pointer is the section's length and is not itself summed.
	 */
	for(i = 3; i < 0xf; i++){
		p = eeread(ctlr, i);
		l = eeread(ctlr, p++);
		if((int)p + l + 1 > 0xffff)
			continue;	/* section would run past 16-bit addressing */
		for(j = p; j < p + l; j++)
			u += eeread(ctlr, j);
	}
	if(u != 0xbaba)
		return -1;
	/* Status bit 3 picks which pointer word to follow; presumably the lan port -- TODO confirm */
	if(ctlr->reg[Status] & (1<<3))
		u = eeread(ctlr, 10);
	else
		u = eeread(ctlr, 9);
	u++;
	/* three words of address, low byte of each word first */
	for(i = 0; i < Eaddrlen;){
		v = eeread(ctlr, u + i/2);
		ctlr->ra[i++] = v;
		ctlr->ra[i++] = v>>8;
	}
	/* last address byte is offset by bits 2-3 of Status */
	ctlr->ra[5] += (ctlr->reg[Status] & 0xc) >> 2;
	return 0;
}
868
/*
 * reset the controller and bring it to a known state: reset via
 * detach(), load the mac address from eeprom and program it into
 * receive-address entry 0, zero the statistics, then set up flow
 * control, interrupt vector mapping and interrupt throttling.
 * returns 0 on success, -1 if the reset or the eeprom load fails.
 */
static int
reset(Ctlr *ctlr)
{
	int i, is598;
	uchar *p;

	if(detach(ctlr)){
		print("82598: reset timeout\n");
		return -1;
	}
	if(eeload(ctlr)){
		print("82598: eeprom failure\n");
		return -1;
	}
	p = ctlr->ra;
	is598 = (ctlr->type == I82598);
	ctlr->reg[is598? Ral98: Ral99] = p[3]<<24 | p[2]<<16 | p[1]<<8 | p[0];
	ctlr->reg[is598? Rah98: Rah99] = p[5]<<8 | p[4] | Enable;

	/* read the counters, then discard what was accumulated */
	readstats(ctlr);
	for(i = 0; i<nelem(ctlr->stats); i++)
		ctlr->stats[i] = 0;

	ctlr->reg[Ctrlext] |= 1 << 16;	/* required by errata (spec change 4) */
	if (Goslow) {
		/* make some guesses for flow control */
		ctlr->reg[Fcrtl] = 0x10000 | Enable;
		ctlr->reg[Fcrth] = 0x40000 | Enable;
		ctlr->reg[Rcrtv] = 0x6000;
	} else
		ctlr->reg[Fcrtl] = ctlr->reg[Fcrth] = ctlr->reg[Rcrtv] = 0;

	/* configure interrupt mapping (don't ask) */
	ctlr->reg[Ivar+0] = 0 | 1<<7;
	ctlr->reg[Ivar+64/4] = 1 | 1<<7;
//	ctlr->reg[Ivar+97/4] = (2 | 1<<7) << (8*(97%4));

	if (Goslow) {
		/* interrupt throttling goes here. */
		for(i = Itr; i < Itr + 20; i++)
			ctlr->reg[i] = 128;		/* ¼µs intervals */
		ctlr->reg[Itr + Itx0] = 256;
	} else {					/* don't throttle */
		for(i = Itr; i < Itr + 20; i++)
			ctlr->reg[i] = 0;		/* ¼µs intervals */
		ctlr->reg[Itr + Itx0] = 0;
	}
	return 0;
}
918
919 static void
txinit(Ctlr * ctlr)920 txinit(Ctlr *ctlr)
921 {
922 Block *b;
923 int i;
924
925 if (Goslow)
926 ctlr->reg[Txdctl] = 16<<Wthresh | 16<<Pthresh;
927 else
928 ctlr->reg[Txdctl] = 0;
929 if (ctlr->type == I82599)
930 ctlr->reg[Dtxctl99] = 0;
931 coherence();
932 for(i = 0; i < ctlr->ntd; i++){
933 b = ctlr->tb[i];
934 ctlr->tb[i] = 0;
935 if(b)
936 freeb(b);
937 }
938
939 assert(ctlr->tdba != nil);
940 memset(ctlr->tdba, 0, ctlr->ntd * sizeof(Td));
941 ctlr->reg[Tdbal] = PCIWADDR(ctlr->tdba);
942 ctlr->reg[Tdbah] = 0;
943 ctlr->reg[Tdlen] = ctlr->ntd*sizeof(Td); /* must be multiple of 128 */
944 ctlr->reg[Tdh] = 0;
945 ctlr->tdh = ctlr->ntd - 1;
946 ctlr->reg[Tdt] = ctlr->tdt = 0;
947 coherence();
948 if (ctlr->type == I82599)
949 ctlr->reg[Dtxctl99] |= Te;
950 coherence();
951 ctlr->reg[Txdctl] |= Ten;
952 coherence();
953 while (!(ctlr->reg[Txdctl] & Ten))
954 ;
955 }
956
/*
 * Ether attach hook.  on first use, allocate the descriptor rings,
 * the Block pointer arrays and the receive-buffer pool; then, if not
 * already attached, initialise the receiver and transmitter, start
 * the three kprocs (once only) and set up the watermarks.
 * serialised by alock.  on error the controller is reset, driver
 * memory is freed and the error is re-raised.
 */
static void
attach(Ether *e)
{
	Block *b;
	Ctlr *ctlr;
	char buf[KNAMELEN];

	ctlr = e->ctlr;
	ctlr->edev = e;			/* point back to Ether* */
	qlock(&ctlr->alock);
	if(waserror()){
		reset(ctlr);
		freemem(ctlr);
		qunlock(&ctlr->alock);
		nexterror();
	}
	/* rdba == nil means nothing is allocated yet (or freemem has run) */
	if(ctlr->rdba == nil) {
		ctlr->nrd = Nrd;
		ctlr->ntd = Ntd;
		ctlr->rdba = mallocalign(ctlr->nrd * sizeof *ctlr->rdba,
			Descalign, 0, 0);
		ctlr->tdba = mallocalign(ctlr->ntd * sizeof *ctlr->tdba,
			Descalign, 0, 0);
		ctlr->rb = malloc(ctlr->nrd * sizeof(Block *));
		ctlr->tb = malloc(ctlr->ntd * sizeof(Block *));
		if (ctlr->rdba == nil || ctlr->tdba == nil ||
		    ctlr->rb == nil || ctlr->tb == nil)
			error(Enomem);

		/*
		 * fill the pool: with rbfree installed as its free
		 * routine, freeing a new Block puts it on rbpool.
		 */
		for(ctlr->nrb = 0; ctlr->nrb < 2*Nrb; ctlr->nrb++){
			b = allocb(ctlr->rbsz + BY2PG);	/* see rbfree() */
			if(b == nil)
				error(Enomem);
			b->free = rbfree;
			freeb(b);
		}
	}
	if (!ctlr->attached) {
		rxinit(ctlr);
		txinit(ctlr);
		/* the frees above decremented nrbfull; start the count afresh */
		nrbfull = 0;
		if (!ctlr->procsrunning) {
			snprint(buf, sizeof buf, "#l%dl", e->ctlrno);
			kproc(buf, lproc, e);
			snprint(buf, sizeof buf, "#l%dr", e->ctlrno);
			kproc(buf, rproc, e);
			snprint(buf, sizeof buf, "#l%dt", e->ctlrno);
			kproc(buf, tproc, e);
			ctlr->procsrunning = 1;
		}
		initmark(&ctlr->wmrb, Nrb, "rcv bufs unprocessed");
		initmark(&ctlr->wmrd, Nrd-1, "rcv descrs processed at once");
		initmark(&ctlr->wmtd, Ntd-1, "xmit descr queue len");
		ctlr->attached = 1;
	}
	qunlock(&ctlr->alock);
	poperror();
}
1015
1016 static void
interrupt(Ureg *,void * v)1017 interrupt(Ureg*, void *v)
1018 {
1019 int icr, im;
1020 Ctlr *ctlr;
1021 Ether *e;
1022
1023 e = v;
1024 ctlr = e->ctlr;
1025 ilock(&ctlr->imlock);
1026 ctlr->reg[Imc] = ~0; /* disable all intrs */
1027 im = ctlr->im;
1028 while((icr = ctlr->reg[Icr] & ctlr->im) != 0){
1029 if(icr & Irx0){
1030 im &= ~Irx0;
1031 ctlr->rim = Irx0;
1032 wakeup(&ctlr->rrendez);
1033 }
1034 if(icr & Itx0){
1035 im &= ~Itx0;
1036 ctlr->tim = Itx0;
1037 wakeup(&ctlr->trendez);
1038 }
1039 if(icr & Lsc){
1040 im &= ~Lsc;
1041 ctlr->lim = Lsc;
1042 wakeup(&ctlr->lrendez);
1043 }
1044 }
1045 ctlr->reg[Ims] = ctlr->im = im; /* enable only intrs we didn't service */
1046 iunlock(&ctlr->imlock);
1047 }
1048
1049 static void
scan(void)1050 scan(void)
1051 {
1052 int pciregs, pcimsix, type;
1053 ulong io, iomsi;
1054 void *mem, *memmsi;
1055 Ctlr *ctlr;
1056 Pcidev *p;
1057
1058 p = 0;
1059 while(p = pcimatch(p, Vintel, 0)){
1060 switch(p->did){
1061 case 0x10b6: /* 82598 backplane */
1062 case 0x10c6: /* 82598 af dual port */
1063 case 0x10c7: /* 82598 af single port */
1064 case 0x10dd: /* 82598 at cx4 */
1065 case 0x10ec: /* 82598 at cx4 dual port */
1066 pcimsix = 3;
1067 type = I82598;
1068 break;
1069 case 0x10f7: /* 82599 kx/kx4 */
1070 case 0x10f8: /* 82599 kx/kx4/kx */
1071 case 0x10f9: /* 82599 cx4 */
1072 case 0x10fb: /* 82599 sfi/sfp+ */
1073 case 0x10fc: /* 82599 xaui/bx4 */
1074 case 0x1557: /* 82599 single-port sfi */
1075 pcimsix = 4;
1076 type = I82599;
1077 break;
1078 default:
1079 continue;
1080 }
1081 pciregs = 0;
1082 if(nctlr >= nelem(ctlrtab)){
1083 print("i82598: too many controllers\n");
1084 return;
1085 }
1086
1087 io = p->mem[pciregs].bar & ~0xf;
1088 mem = vmap(io, p->mem[pciregs].size);
1089 if(mem == nil){
1090 print("i82598: can't map regs %#p\n",
1091 p->mem[pciregs].bar);
1092 continue;
1093 }
1094
1095 iomsi = p->mem[pcimsix].bar & ~0xf;
1096 memmsi = vmap(iomsi, p->mem[pcimsix].size);
1097 if(memmsi == nil){
1098 print("i82598: can't map msi-x regs %#p\n",
1099 p->mem[pcimsix].bar);
1100 vunmap(mem, p->mem[pciregs].size);
1101 continue;
1102 }
1103
1104 ctlr = malloc(sizeof *ctlr);
1105 if(ctlr == nil) {
1106 vunmap(mem, p->mem[pciregs].size);
1107 vunmap(memmsi, p->mem[pcimsix].size);
1108 error(Enomem);
1109 }
1110 ctlr->p = p;
1111 ctlr->type = type;
1112 ctlr->physreg = (u32int*)io;
1113 ctlr->physmsix = (u32int*)iomsi;
1114 ctlr->reg = (u32int*)mem;
1115 ctlr->msix = (u32int*)memmsi; /* unused */
1116 ctlr->rbsz = Rbsz;
1117 if(reset(ctlr)){
1118 print("i82598: can't reset\n");
1119 free(ctlr);
1120 vunmap(mem, p->mem[pciregs].size);
1121 vunmap(memmsi, p->mem[pcimsix].size);
1122 continue;
1123 }
1124 pcisetbme(p);
1125 ctlrtab[nctlr++] = ctlr;
1126 }
1127 }
1128
1129 static int
pnp(Ether * e)1130 pnp(Ether *e)
1131 {
1132 int i;
1133 Ctlr *ctlr;
1134
1135 if(nctlr == 0)
1136 scan();
1137 ctlr = nil;
1138 for(i = 0; i < nctlr; i++){
1139 ctlr = ctlrtab[i];
1140 if(ctlr == nil || ctlr->flag & Factive)
1141 continue;
1142 if(e->port == 0 || e->port == (ulong)ctlr->reg)
1143 break;
1144 }
1145 if (i >= nctlr)
1146 return -1;
1147 ctlr->flag |= Factive;
1148 e->ctlr = ctlr;
1149 e->port = (uintptr)ctlr->physreg;
1150 e->irq = ctlr->p->intl;
1151 e->tbdf = ctlr->p->tbdf;
1152 e->mbps = 10000;
1153 e->maxmtu = ETHERMAXTU;
1154 memmove(e->ea, ctlr->ra, Eaddrlen);
1155
1156 e->arg = e;
1157 e->attach = attach;
1158 e->detach = shutdown;
1159 e->transmit = transmit;
1160 e->interrupt = interrupt;
1161 e->ifstat = ifstat;
1162 e->shutdown = shutdown;
1163 e->ctl = ctl;
1164 e->multicast = multicast;
1165 e->promiscuous = promiscuous;
1166
1167 return 0;
1168 }
1169
/* register pnp() as the probe routine for both card names this driver answers to */
void
ether82598link(void)
{
	addethercard("i82598", pnp);
	addethercard("i10gbe", pnp);
}
1176