xref: /plan9/sys/src/9/pc/ether82598.c (revision 588d0145e19f8596f2f4442d05dd8a9eda147983)
1 /*
2  * intel pci-express 10Gb ethernet driver for 8259[89]
3  * copyright © 2007, coraid, inc.
4  * depessimised and made to work on the 82599 at bell labs, 2013.
5  *
6  * 82599 requests should ideally not cross a 4KB (page) boundary.
7  */
8 #include "u.h"
9 #include "../port/lib.h"
10 #include "mem.h"
11 #include "dat.h"
12 #include "fns.h"
13 #include "io.h"
14 #include "../port/error.h"
15 #include "../port/netif.h"
16 #include "etherif.h"
17 
/*
 * advance ring index x by one, wrapping with mask m.
 * callers pass m = ring size - 1, so the ring size must be a power of 2.
 * (despite the name, this does not compute a power of 2.)
 */
#define NEXTPOW2(x, m)	(((x)+1) & (m))
19 
/* buffer size, descriptor alignment and ring sizes */
enum {
	Rbsz	= ETHERMAXTU+32, /* +slop is for vlan headers, crcs, etc. */
	Descalign= 128,		/* 599 manual needs 128-byte alignment */

	/* tunable parameters */
	Goslow	= 0,		/* flag: go slow by throttling intrs, etc. */
	/* were 256, 1024 & 64, but 30, 47 and 1 are ample. */
	Nrd	= 64,		/* # rx descriptors; multiple of 8, power of 2 for NEXTPOW2 */
	Nrb	= 128,		/* attach() puts 2*Nrb receive buffers in the pool */
	Ntd	= 32,		/* # tx descriptors; multiple of 8, power of 2 for NEXTPOW2 */
};
31 
/*
 * device registers, expressed as indices into the u32int register
 * array Ctlr.reg (i.e. byte offset / 4).
 */
enum {
	/* general */
	Ctrl		= 0x00000/4,	/* Device Control */
	Status		= 0x00008/4,	/* Device Status */
	Ctrlext		= 0x00018/4,	/* Extended Device Control */
	Esdp		= 0x00020/4,	/* extended sdp control */
	Esodp		= 0x00028/4,	/* extended od sdp control (i2cctl on 599) */
	Ledctl		= 0x00200/4,	/* led control */
	Tcptimer	= 0x0004c/4,	/* tcp timer */
	Ecc		= 0x110b0/4,	/* errata ecc control magic (pcie intr cause on 599) */

	/* nvm */
	Eec		= 0x10010/4,	/* eeprom/flash control */
	Eerd		= 0x10014/4,	/* eeprom read */
	Fla		= 0x1001c/4,	/* flash access */
	Flop		= 0x1013c/4,	/* flash opcode */
	Grc		= 0x10200/4,	/* general rx control */

	/* interrupt */
	Icr		= 0x00800/4,	/* interrupt cause read */
	Ics		= 0x00808/4,	/* " set */
	Ims		= 0x00880/4,	/* " mask read/set (actually enable) */
	Imc		= 0x00888/4,	/* " mask clear */
	Iac		= 0x00810/4,	/* " auto clear */
	Iam		= 0x00890/4,	/* " auto mask enable */
	Itr		= 0x00820/4,	/* " throttling rate regs (0-19) */
	Ivar		= 0x00900/4,	/* " vector allocation regs. */
	/* msi interrupt */
	Msixt		= 0x0000/4,	/* msix table (bar3) */
	Msipba		= 0x2000/4,	/* msix pending bit array (bar3) */
	Pbacl		= 0x11068/4,	/* pba clear */
	Gpie		= 0x00898/4,	/* general purpose int enable */

	/* flow control */
	Pfctop		= 0x03008/4,	/* priority flow ctl type opcode */
	Fcttv		= 0x03200/4,	/* " transmit timer value (0-3) */
	Fcrtl		= 0x03220/4,	/* " rx threshold low (0-7) +8n */
	Fcrth		= 0x03260/4,	/* " rx threshold high (0-7) +8n */
	Rcrtv		= 0x032a0/4,	/* " refresh value threshold */
	Tfcs		= 0x0ce00/4,	/* " tx status */

	/* rx dma */
	Rbal		= 0x01000/4,	/* rx desc base low (0-63) +0x40n */
	Rbah		= 0x01004/4,	/* " high */
	Rdlen		= 0x01008/4,	/* " length */
	Rdh		= 0x01010/4,	/* " head */
	Rdt		= 0x01018/4,	/* " tail */
	Rxdctl		= 0x01028/4,	/* " control */

	Srrctl		= 0x02100/4,	/* split & replication rx ctl. array */
	Dcarxctl	= 0x02200/4,	/* rx dca control */
	Rdrxctl		= 0x02f00/4,	/* rx dma control */
	Rxpbsize	= 0x03c00/4,	/* rx packet buffer size */
	Rxctl		= 0x03000/4,	/* rx control */
	Dropen		= 0x03d04/4,	/* drop enable control (598 only) */

	/* rx */
	Rxcsum		= 0x05000/4,	/* rx checksum control */
	Rfctl		= 0x05008/4,	/* rx filter control */
	Mta		= 0x05200/4,	/* multicast table array (0-127) */
	Ral98		= 0x05400/4,	/* rx address low (598) */
	Rah98		= 0x05404/4,
	Ral99		= 0x0a200/4,	/* rx address low array (599) */
	Rah99		= 0x0a204/4,
	Psrtype		= 0x05480/4,	/* packet split rx type. */
	Vfta		= 0x0a000/4,	/* vlan filter table array. */
	Fctrl		= 0x05080/4,	/* filter control */
	Vlnctrl		= 0x05088/4,	/* vlan control */
	Msctctrl	= 0x05090/4,	/* multicast control */
	Mrqc		= 0x05818/4,	/* multiple rx queues cmd */
	Vmdctl		= 0x0581c/4,	/* vmdq control (598 only) */
	Imir		= 0x05a80/4,	/* immediate irq rx (0-7) (598 only) */
	Imirext		= 0x05aa0/4,	/* immediate irq rx ext (598 only) */
	Imirvp		= 0x05ac0/4,	/* immediate irq vlan priority (598 only) */
	Reta		= 0x05c00/4,	/* redirection table */
	Rssrk		= 0x05c80/4,	/* rss random key */

	/* tx */
	Tdbal		= 0x06000/4,	/* tx desc base low +0x40n array */
	Tdbah		= 0x06004/4,	/* " high */
	Tdlen		= 0x06008/4,	/* " len */
	Tdh		= 0x06010/4,	/* " head */
	Tdt		= 0x06018/4,	/* " tail */
	Txdctl		= 0x06028/4,	/* " control */
	Tdwbal		= 0x06038/4,	/* " write-back address low */
	Tdwbah		= 0x0603c/4,

	Dtxctl98	= 0x07e00/4,	/* tx dma control (598 only) */
	Dtxctl99	= 0x04a80/4,	/* tx dma control (599 only) */
	Tdcatxctrl98	= 0x07200/4,	/* tx dca register (0-15) (598 only) */
	Tdcatxctrl99	= 0x0600c/4,	/* tx dca register (0-127) (599 only) */
	Tipg		= 0x0cb00/4,	/* tx inter-packet gap (598 only) */
	Txpbsize	= 0x0cc00/4,	/* tx packet-buffer size (0-15) */

	/* mac */
	Hlreg0		= 0x04240/4,	/* highlander control reg 0 */
	Hlreg1		= 0x04244/4,	/* highlander control reg 1 (ro) */
	Msca		= 0x0425c/4,	/* mdi signal cmd & addr */
	Msrwd		= 0x04260/4,	/* mdi single rw data */
	Mhadd		= 0x04268/4,	/* mac addr high & max frame */
	Pcss1		= 0x04288/4,	/* xgxs status 1 */
	Pcss2		= 0x0428c/4,
	Xpcss		= 0x04290/4,	/* 10gb-x pcs status */
	Serdesc		= 0x04298/4,	/* serdes control */
	Macs		= 0x0429c/4,	/* fifo control & report */
	Autoc		= 0x042a0/4,	/* autodetect control & status */
	Links		= 0x042a4/4,	/* link status */
	Links2		= 0x04324/4,	/* 599 only */
	Autoc2		= 0x042a8/4,
};
142 
/* driver flags, and bit definitions for the registers above */
enum {
	Factive		= 1<<0,		/* Ctlr.flag: controller claimed by pnp() */
	Enable		= 1<<31,	/* enable bit used in Rah and Fcrtl/Fcrth */

	/* Ctrl */
	Rst		= 1<<26,	/* full nic reset */

	/* Txdctl */
	Ten		= 1<<25,

	/* Dtxctl99 */
	Te		= 1<<0,		/* dma tx enable */

	/* Fctrl */
	Bam		= 1<<10,	/* broadcast accept mode */
	Upe 		= 1<<9,		/* unicast promiscuous */
	Mpe 		= 1<<8,		/* multicast promiscuous */

	/* Rxdctl */
	Pthresh		= 0,		/* prefetch threshold shift in bits */
	Hthresh		= 8,		/* host buffer minimum threshold " */
	Wthresh		= 16,		/* writeback threshold */
	Renable		= 1<<25,

	/* Rxctl */
	Rxen		= 1<<0,
	Dmbyps		= 1<<1,		/* descr. monitor bypass (598 only) */

	/* Rdrxctl */
	Rdmt½		= 0,		/* 598 */
	Rdmt¼		= 1,		/* 598 */
	Rdmt⅛		= 2,		/* 598 */
	Crcstrip	= 1<<1,		/* 599 */
	Rscfrstsize	= 037<<17,	/* 599; should be zero */

	/* Rxcsum */
	Ippcse		= 1<<12,	/* ip payload checksum enable */

	/* Eerd */
	EEstart		= 1<<0,		/* Start Read */
	EEdone		= 1<<1,		/* Read done */

	/* interrupts */
	Irx0		= 1<<0,		/* driver defined */
	Itx0		= 1<<1,		/* driver defined */
	Lsc		= 1<<20,	/* link status change */

	/* Links */
	Lnkup		= 1<<30,
	Lnkspd		= 1<<29,

	/* Hlreg0 */
	Txcrcen		= 1<<0,		/* add crc during xmit */
	Rxcrcstrip	= 1<<1,		/* strip crc during recv */
	Jumboen		= 1<<2,
	Txpaden		= 1<<10,	/* pad short frames during xmit */

	/* Autoc */
	Flu		= 1<<0,		/* force link up */
	Lmsshift	= 13,		/* link mode select shift */
	Lmsmask		= 7,
};
205 
typedef struct Ctlr Ctlr;
typedef struct Rd Rd;
typedef struct Td Td;

/*
 * one statistics register: reg is its byte offset (readstats
 * shifts it right by 2 to index Ctlr.reg), name is the label
 * printed by ifstat.
 */
typedef struct {
	uint	reg;
	char	*name;
} Stat;
214 
215 Stat stattab[] = {
216 	0x4000,	"crc error",
217 	0x4004,	"illegal byte",
218 	0x4008,	"short packet",
219 	0x3fa0,	"missed pkt0",
220 	0x4034,	"mac local flt",
221 	0x4038,	"mac rmt flt",
222 	0x4040,	"rx length err",
223 	0x3f60,	"xon tx",
224 	0xcf60,	"xon rx",
225 	0x3f68,	"xoff tx",
226 	0xcf68,	"xoff rx",
227 	0x405c,	"rx 040",
228 	0x4060,	"rx 07f",
229 	0x4064,	"rx 100",
230 	0x4068,	"rx 200",
231 	0x406c,	"rx 3ff",
232 	0x4070,	"rx big",
233 	0x4074,	"rx ok",
234 	0x4078,	"rx bcast",
235 	0x3fc0,	"rx no buf0",
236 	0x40a4,	"rx runt",
237 	0x40a8,	"rx frag",
238 	0x40ac,	"rx ovrsz",
239 	0x40b0,	"rx jab",
240 	0x40d0,	"rx pkt",
241 
242 	0x40d4,	"tx pkt",
243 	0x40d8,	"tx 040",
244 	0x40dc,	"tx 07f",
245 	0x40e0,	"tx 100",
246 	0x40e4,	"tx 200",
247 	0x40e8,	"tx 3ff",
248 	0x40ec,	"tx big",
249 	0x40f4,	"tx bcast",
250 	0x4120,	"xsum err",
251 };
252 
/* receive descriptor (Rd) status bits; only Rdd is tested in this file */
enum {
	Pif	= 1<<7,	/* past exact filter (sic) */
	Ipcs	= 1<<6,	/* ip checksum calculated */
	L4cs	= 1<<5,	/* layer 2 (sic; NOTE(review): name suggests a layer-4 checksum bit — confirm) */
	Tcpcs	= 1<<4,	/* tcp checksum calculated */
	Vp	= 1<<3,	/* 802.1q packet matched vet */
	Ixsm	= 1<<2,	/* ignore checksum */
	Reop	= 1<<1,	/* end of packet */
	Rdd	= 1<<0,	/* descriptor done */
};
264 
/*
 * receive descriptor, one per rx ring slot.
 * replenish() stores the buffer's PCI address in addr[0] and clears
 * status; rproc() waits for Rdd in status and then reads length.
 * addr[1] is never written by this driver.
 */
struct Rd {			/* Receive Descriptor */
	u32int	addr[2];
	ushort	length;
	ushort	cksum;
	uchar	status;
	uchar	errors;
	ushort	vlan;
};
273 
/* transmit descriptor (Td) command and status bits */
enum {
	/* Td cmd */
	Rs	= 1<<3,		/* report status */
	Ic	= 1<<2,		/* insert checksum */
	Ifcs	= 1<<1,		/* insert FCS (ethernet crc) */
	Teop	= 1<<0,		/* end of packet */

	/* Td status */
	Tdd	= 1<<0,		/* descriptor done */
};
284 
/*
 * transmit descriptor, one per tx ring slot.
 * transmit() fills in addr[0], length and cmd; cleanup() tests
 * status for Tdd and then clears it.  addr[1] is only ever zeroed
 * (by the memset in txinit).
 */
struct Td {			/* Transmit Descriptor */
	u32int	addr[2];
	ushort	length;
	uchar	cso;
	uchar	cmd;
	uchar	status;
	uchar	css;
	ushort	vlan;
};
294 
295 struct Ctlr {
296 	Pcidev	*p;
297 	Ether	*edev;
298 	int	type;
299 
300 	/* virtual */
301 	u32int	*reg;
302 	u32int	*msix;			/* unused */
303 
304 	/* physical */
305 	u32int	*physreg;
306 	u32int	*physmsix;		/* unused */
307 
308 	uchar	flag;
309 	int	nrd;
310 	int	ntd;
311 	int	nrb;			/* # bufs this Ctlr has in the pool */
312 	uint	rbsz;
313 	int	procsrunning;
314 	int	attached;
315 
316 	Watermark wmrb;
317 	Watermark wmrd;
318 	Watermark wmtd;
319 
320 	QLock	slock;
321 	QLock	alock;			/* attach lock */
322 	QLock	tlock;
323 	Rendez	lrendez;
324 	Rendez	trendez;
325 	Rendez	rrendez;
326 
327 	uint	im;			/* interrupt mask */
328 	uint	lim;
329 	uint	rim;
330 	uint	tim;
331 	Lock	imlock;
332 
333 	Rd*	rdba;			/* receive descriptor base address */
334 	Block**	rb;			/* receive buffers */
335 	int	rdt;			/* receive descriptor tail */
336 	int	rdfree;			/* rx descriptors awaiting packets */
337 
338 	Td*	tdba;			/* transmit descriptor base address */
339 	int	tdh;			/* transmit descriptor head */
340 	int	tdt;			/* transmit descriptor tail */
341 	Block**	tb;			/* transmit buffers */
342 
343 	uchar	ra[Eaddrlen];		/* receive address */
344 	uchar	mta[128];		/* multicast table array */
345 	ulong	stats[nelem(stattab)];
346 	uint	speeds[3];
347 };
348 
/* values of Ctlr.type, chosen by scan() from the pci device id */
enum {
	I82598 = 1,
	I82599,
};
353 
static	Ctlr	*ctlrtab[4];	/* controllers found by scan() */
static	int	nctlr;		/* # of entries used in ctlrtab */
static	Lock	rblock;		/* guards rbpool and nrbfull */
static	Block	*rbpool;	/* free receive Blocks, shared by all controllers */
static	int	nrbfull;  /* # of rcv Blocks with data awaiting processing */
359 
360 static void
readstats(Ctlr * ctlr)361 readstats(Ctlr *ctlr)
362 {
363 	int i;
364 
365 	qlock(&ctlr->slock);
366 	for(i = 0; i < nelem(ctlr->stats); i++)
367 		ctlr->stats[i] += ctlr->reg[stattab[i].reg >> 2];
368 	qunlock(&ctlr->slock);
369 }
370 
/*
 * link speed in Mb/s as reported in Ether.mbps by lproc, indexed the
 * same way as Ctlr.speeds: 0 = link down, 1 = 1Gb, 2 = 10Gb.
 */
static int speedtab[] = { 0, 1000, 10000, };
376 
377 static long
ifstat(Ether * edev,void * a,long n,ulong offset)378 ifstat(Ether *edev, void *a, long n, ulong offset)
379 {
380 	uint i, *t;
381 	char *s, *p, *e;
382 	Ctlr *ctlr;
383 
384 	ctlr = edev->ctlr;
385 	p = s = malloc(READSTR);
386 	if(p == nil)
387 		error(Enomem);
388 	e = p + READSTR;
389 
390 	readstats(ctlr);
391 	for(i = 0; i < nelem(stattab); i++)
392 		if(ctlr->stats[i] > 0)
393 			p = seprint(p, e, "%.10s  %uld\n", stattab[i].name,
394 				ctlr->stats[i]);
395 	t = ctlr->speeds;
396 	p = seprint(p, e, "speeds: 0:%d 1000:%d 10000:%d\n", t[0], t[1], t[2]);
397 	p = seprint(p, e, "mtu: min:%d max:%d\n", edev->minmtu, edev->maxmtu);
398 	p = seprint(p, e, "rdfree %d rdh %d rdt %d\n", ctlr->rdfree, ctlr->reg[Rdt],
399 		ctlr->reg[Rdh]);
400 	p = seprintmark(p, e, &ctlr->wmrb);
401 	p = seprintmark(p, e, &ctlr->wmrd);
402 	p = seprintmark(p, e, &ctlr->wmtd);
403 	USED(p);
404 	n = readstr(offset, a, n, s);
405 	free(s);
406 
407 	return n;
408 }
409 
410 static void
ienable(Ctlr * ctlr,int i)411 ienable(Ctlr *ctlr, int i)
412 {
413 	ilock(&ctlr->imlock);
414 	ctlr->im |= i;
415 	ctlr->reg[Ims] = ctlr->im;
416 	iunlock(&ctlr->imlock);
417 }
418 
419 static int
lim(void * v)420 lim(void *v)
421 {
422 	return ((Ctlr*)v)->lim != 0;
423 }
424 
425 static void
lproc(void * v)426 lproc(void *v)
427 {
428 	int r, i;
429 	Ctlr *ctlr;
430 	Ether *e;
431 
432 	e = v;
433 	ctlr = e->ctlr;
434 	for (;;) {
435 		r = ctlr->reg[Links];
436 		e->link = (r & Lnkup) != 0;
437 		i = 0;
438 		if(e->link)
439 			i = 1 + ((r & Lnkspd) != 0);
440 		ctlr->speeds[i]++;
441 		e->mbps = speedtab[i];
442 		ctlr->lim = 0;
443 		ienable(ctlr, Lsc);
444 		sleep(&ctlr->lrendez, lim, ctlr);
445 		ctlr->lim = 0;
446 	}
447 }
448 
/*
 * control-message handler installed as Ether.ctl by pnp().
 * no control messages are supported: every request raises Ebadarg.
 * the return statement only satisfies the compiler.
 */
static long
ctl(Ether *, void *, long)
{
	error(Ebadarg);
	return -1;
}
455 
456 static Block*
rballoc(void)457 rballoc(void)
458 {
459 	Block *bp;
460 
461 	ilock(&rblock);
462 	if((bp = rbpool) != nil){
463 		rbpool = bp->next;
464 		bp->next = 0;
465 		_xinc(&bp->ref);	/* prevent bp from being freed */
466 	}
467 	iunlock(&rblock);
468 	return bp;
469 }
470 
471 void
rbfree(Block * b)472 rbfree(Block *b)
473 {
474 	b->rp = b->wp = (uchar*)PGROUND((uintptr)b->base);
475  	b->flag &= ~(Bipck | Budpck | Btcpck | Bpktck);
476 	ilock(&rblock);
477 	b->next = rbpool;
478 	rbpool = b;
479 	nrbfull--;
480 	iunlock(&rblock);
481 }
482 
483 static int
cleanup(Ctlr * ctlr,int tdh)484 cleanup(Ctlr *ctlr, int tdh)
485 {
486 	Block *b;
487 	uint m, n;
488 
489 	m = ctlr->ntd - 1;
490 	while(ctlr->tdba[n = NEXTPOW2(tdh, m)].status & Tdd){
491 		tdh = n;
492 		b = ctlr->tb[tdh];
493 		ctlr->tb[tdh] = 0;
494 		if (b)
495 			freeb(b);
496 		ctlr->tdba[tdh].status = 0;
497 	}
498 	return tdh;
499 }
500 
501 void
transmit(Ether * e)502 transmit(Ether *e)
503 {
504 	uint i, m, tdt, tdh;
505 	Ctlr *ctlr;
506 	Block *b;
507 	Td *t;
508 
509 	ctlr = e->ctlr;
510 	if(!canqlock(&ctlr->tlock)){
511 		ienable(ctlr, Itx0);
512 		return;
513 	}
514 	tdh = ctlr->tdh = cleanup(ctlr, ctlr->tdh);
515 	tdt = ctlr->tdt;
516 	m = ctlr->ntd - 1;
517 	for(i = 0; ; i++){
518 		if(NEXTPOW2(tdt, m) == tdh){	/* ring full? */
519 			ienable(ctlr, Itx0);
520 			break;
521 		}
522 		if((b = qget(e->oq)) == nil)
523 			break;
524 		assert(ctlr->tdba != nil);
525 		t = ctlr->tdba + tdt;
526 		t->addr[0] = PCIWADDR(b->rp);
527 		t->length = BLEN(b);
528 		t->cmd = Ifcs | Teop;
529 		if (!Goslow)
530 			t->cmd |= Rs;
531 		ctlr->tb[tdt] = b;
532 		/* note size of queue of tds awaiting transmission */
533 		notemark(&ctlr->wmtd, (tdt + Ntd - tdh) % Ntd);
534 		tdt = NEXTPOW2(tdt, m);
535 	}
536 	if(i) {
537 		coherence();
538 		ctlr->reg[Tdt] = ctlr->tdt = tdt;  /* make new Tds active */
539 		coherence();
540 		ienable(ctlr, Itx0);
541 	}
542 	qunlock(&ctlr->tlock);
543 }
544 
545 static int
tim(void * c)546 tim(void *c)
547 {
548 	return ((Ctlr*)c)->tim != 0;
549 }
550 
551 static void
tproc(void * v)552 tproc(void *v)
553 {
554 	Ctlr *ctlr;
555 	Ether *e;
556 
557 	e = v;
558 	ctlr = e->ctlr;
559 	for (;;) {
560 		sleep(&ctlr->trendez, tim, ctlr); /* xmit interrupt kicks us */
561 		ctlr->tim = 0;
562 		transmit(e);
563 	}
564 }
565 
/*
 * (re)initialise the receiver: disable rx, release any Blocks still
 * attached to the rx ring, set the filter, checksum and frame-size
 * registers, program the descriptor ring base/length/head/tail, then
 * re-enable the rx queue and the receiver.  on the 82598 it also
 * prints Autoc and forces the link up.
 */
static void
rxinit(Ctlr *ctlr)
{
	int i, is598, autoc;
	ulong until;
	Block *b;

	/* stop the receiver and the rx queue before touching the ring */
	ctlr->reg[Rxctl] &= ~Rxen;
	ctlr->reg[Rxdctl] = 0;
	for(i = 0; i < ctlr->nrd; i++){
		b = ctlr->rb[i];
		ctlr->rb[i] = 0;
		if(b)
			freeb(b);
	}
	ctlr->rdfree = 0;

	coherence();
	/* accept broadcasts; no unicast or multicast promiscuity */
	ctlr->reg[Fctrl] |= Bam;
	ctlr->reg[Fctrl] &= ~(Upe | Mpe);

	/* intel gets some csums wrong (e.g., errata 44) */
	ctlr->reg[Rxcsum] &= ~Ippcse;
	ctlr->reg[Hlreg0] &= ~Jumboen;		/* jumbos are a bad idea */
	ctlr->reg[Hlreg0] |= Txcrcen | Rxcrcstrip | Txpaden;
	/* rx buffer size, rounded up to units of 1024 bytes */
	ctlr->reg[Srrctl] = (ctlr->rbsz + 1024 - 1) / 1024;
	ctlr->reg[Mhadd] = ctlr->rbsz << 16;

	/* describe the (empty) descriptor ring to the hardware */
	ctlr->reg[Rbal] = PCIWADDR(ctlr->rdba);
	ctlr->reg[Rbah] = 0;
	ctlr->reg[Rdlen] = ctlr->nrd*sizeof(Rd); /* must be multiple of 128 */
	ctlr->reg[Rdh] = 0;
	ctlr->reg[Rdt] = ctlr->rdt = 0;
	coherence();

	is598 = (ctlr->type == I82598);
	if (is598)
		ctlr->reg[Rdrxctl] = Rdmt¼;
	else {
		ctlr->reg[Rdrxctl] |= Crcstrip;
		ctlr->reg[Rdrxctl] &= ~Rscfrstsize;
	}
	if (Goslow && is598)
		ctlr->reg[Rxdctl] = 8<<Wthresh | 8<<Pthresh | 4<<Hthresh | Renable;
	else
		ctlr->reg[Rxdctl] = Renable;
	coherence();

	/*
	 * don't wait forever like an idiot (and hang the system),
	 * maybe it's disconnected.
	 */
	until = TK2MS(MACHP(0)->ticks) + 250;
	while (!(ctlr->reg[Rxdctl] & Renable) && TK2MS(MACHP(0)->ticks) < until)
		;
	if(!(ctlr->reg[Rxdctl] & Renable))
		print("#l%d: Renable didn't come on, might be disconnected\n",
			ctlr->edev->ctlrno);

	ctlr->reg[Rxctl] |= Rxen | (is598? Dmbyps: 0);

	if (is598){
		autoc = ctlr->reg[Autoc];
		/* what is this rubbish and why do we care? */
		print("#l%d: autoc %#ux; lms %d (3 is 10g sfp)\n",
			ctlr->edev->ctlrno, autoc, (autoc>>Lmsshift) & Lmsmask);
		ctlr->reg[Autoc] |= Flu;
		coherence();
		delay(50);
	}
}
637 
638 static void
replenish(Ctlr * ctlr,uint rdh)639 replenish(Ctlr *ctlr, uint rdh)
640 {
641 	int rdt, m, i;
642 	Block *b;
643 	Rd *r;
644 
645 	m = ctlr->nrd - 1;
646 	i = 0;
647 	for(rdt = ctlr->rdt; NEXTPOW2(rdt, m) != rdh; rdt = NEXTPOW2(rdt, m)){
648 		r = ctlr->rdba + rdt;
649 		if((b = rballoc()) == nil){
650 			print("#l%d: no buffers\n", ctlr->edev->ctlrno);
651 			break;
652 		}
653 		ctlr->rb[rdt] = b;
654 		r->addr[0] = PCIWADDR(b->rp);
655 		r->status = 0;
656 		ctlr->rdfree++;
657 		i++;
658 	}
659 	if(i) {
660 		coherence();
661 		ctlr->reg[Rdt] = ctlr->rdt = rdt; /* hand back recycled rdescs */
662 		coherence();
663 	}
664 }
665 
666 static int
rim(void * v)667 rim(void *v)
668 {
669 	return ((Ctlr*)v)->rim != 0;
670 }
671 
/*
 * receive kernel process.  each outer pass refills the rx ring,
 * enables the rx interrupt and sleeps until interrupt() sets
 * ctlr->rim.  it then walks the ring from rdh, passing the Block of
 * every completed descriptor (Rdd set) to etheriq, and stops at the
 * first descriptor that is not yet done.  never returns.
 */
void
rproc(void *v)
{
	int passed;
	uint m, rdh;
	Block *bp;
	Ctlr *ctlr;
	Ether *e;
	Rd *r;

	e = v;
	ctlr = e->ctlr;
	m = ctlr->nrd - 1;
	for (rdh = 0; ; ) {
		replenish(ctlr, rdh);
		ienable(ctlr, Irx0);
		sleep(&ctlr->rrendez, rim, ctlr);
		passed = 0;
		for (;;) {
			/* cleared before each status check, not after */
			ctlr->rim = 0;
			r = ctlr->rdba + rdh;
			if(!(r->status & Rdd))
				break;		/* wait for pkts to arrive */
			bp = ctlr->rb[rdh];
			ctlr->rb[rdh] = 0;
			/* oversize frames are reported but still passed up */
			if (r->length > ETHERMAXTU)
				print("#l%d: got jumbo of %d bytes\n",
					e->ctlrno, r->length);
			bp->wp += r->length;
			bp->lim = bp->wp;		/* lie like a dog */
//			r->status = 0;

			ilock(&rblock);
			nrbfull++;
			iunlock(&rblock);
			notemark(&ctlr->wmrb, nrbfull);
			etheriq(e, bp, 1);

			passed++;
			ctlr->rdfree--;
			rdh = NEXTPOW2(rdh, m);
			/* top up the ring once 16 or more slots are empty */
			if (ctlr->rdfree <= ctlr->nrd - 16)
				replenish(ctlr, rdh);
		}
		/* note how many rds had full buffers */
		notemark(&ctlr->wmrd, passed);
	}
}
720 
721 static void
promiscuous(void * a,int on)722 promiscuous(void *a, int on)
723 {
724 	Ctlr *ctlr;
725 	Ether *e;
726 
727 	e = a;
728 	ctlr = e->ctlr;
729 	if(on)
730 		ctlr->reg[Fctrl] |= Upe | Mpe;
731 	else
732 		ctlr->reg[Fctrl] &= ~(Upe | Mpe);
733 }
734 
735 static void
multicast(void * a,uchar * ea,int on)736 multicast(void *a, uchar *ea, int on)
737 {
738 	int b, i;
739 	Ctlr *ctlr;
740 	Ether *e;
741 
742 	e = a;
743 	ctlr = e->ctlr;
744 
745 	/*
746 	 * multiple ether addresses can hash to the same filter bit,
747 	 * so it's never safe to clear a filter bit.
748 	 * if we want to clear filter bits, we need to keep track of
749 	 * all the multicast addresses in use, clear all the filter bits,
750 	 * then set the ones corresponding to in-use addresses.
751 	 */
752 	i = ea[5] >> 1;
753 	b = (ea[5]&1)<<4 | ea[4]>>4;
754 	b = 1 << b;
755 	if(on)
756 		ctlr->mta[i] |= b;
757 //	else
758 //		ctlr->mta[i] &= ~b;
759 	ctlr->reg[Mta+i] = ctlr->mta[i];
760 }
761 
762 static void
freemem(Ctlr * ctlr)763 freemem(Ctlr *ctlr)
764 {
765 	Block *b;
766 
767 	while(b = rballoc()){
768 		b->free = 0;
769 		freeb(b);
770 	}
771 	free(ctlr->rdba);
772 	ctlr->rdba = nil;
773 	free(ctlr->tdba);
774 	ctlr->tdba = nil;
775 	free(ctlr->rb);
776 	ctlr->rb = nil;
777 	free(ctlr->tb);
778 	ctlr->tb = nil;
779 }
780 
781 static int
detach(Ctlr * ctlr)782 detach(Ctlr *ctlr)
783 {
784 	int i, is598;
785 
786 	ctlr->reg[Imc] = ~0;
787 	ctlr->reg[Ctrl] |= Rst;
788 	for(i = 0; i < 100; i++){
789 		delay(1);
790 		if((ctlr->reg[Ctrl] & Rst) == 0)
791 			break;
792 	}
793 	if (i >= 100)
794 		return -1;
795 	is598 = (ctlr->type == I82598);
796 	if (is598) {			/* errata */
797 		delay(50);
798 		ctlr->reg[Ecc] &= ~(1<<21 | 1<<18 | 1<<9 | 1<<6);
799 	}
800 
801 	/* not cleared by reset; kill it manually. */
802 	for(i = 1; i < 16; i++)
803 		ctlr->reg[is598? Rah98: Rah99] &= ~Enable;
804 	for(i = 0; i < 128; i++)
805 		ctlr->reg[Mta + i] = 0;
806 	for(i = 1; i < (is598? 640: 128); i++)
807 		ctlr->reg[Vfta + i] = 0;
808 
809 //	freemem(ctlr);			// TODO
810 	ctlr->attached = 0;
811 	return 0;
812 }
813 
814 static void
shutdown(Ether * e)815 shutdown(Ether *e)
816 {
817 	detach(e->ctlr);
818 //	freemem(e->ctlr);
819 }
820 
821 /* ≤ 20ms */
822 static ushort
eeread(Ctlr * ctlr,int i)823 eeread(Ctlr *ctlr, int i)
824 {
825 	ctlr->reg[Eerd] = EEstart | i<<2;
826 	while((ctlr->reg[Eerd] & EEdone) == 0)
827 		;
828 	return ctlr->reg[Eerd] >> 16;
829 }
830 
/*
 * validate the eeprom and load the ethernet address into ctlr->ra.
 * returns 0 on success, -1 if the signature bits in word 0 or the
 * checksum are wrong.
 *
 * the checksum is the 16-bit sum (u is a ushort, so it wraps) of
 * words 0-0x3f plus the sections pointed to by words 3-0xe;
 * it must equal 0xbaba.
 */
static int
eeload(Ctlr *ctlr)
{
	ushort u, v, p, l, i, j;

	if((eeread(ctlr, 0) & 0xc0) != 0x40)
		return -1;
	u = 0;
	for(i = 0; i < 0x40; i++)
		u +=  eeread(ctlr, i);
	for(i = 3; i < 0xf; i++){
		/* word i points at a section; its first word is the length */
		p = eeread(ctlr, i);
		l = eeread(ctlr, p++);
		/* skip sections that would run past the 16-bit address space */
		if((int)p + l + 1 > 0xffff)
			continue;
		for(j = p; j < p + l; j++)
			u += eeread(ctlr, j);
	}
	if(u != 0xbaba)
		return -1;
	/* Status bit 3 selects which pointer word (10 or 9) to follow */
	if(ctlr->reg[Status] & (1<<3))
		u = eeread(ctlr, 10);
	else
		u = eeread(ctlr, 9);
	u++;
	/* each eeprom word holds two address bytes, low byte first */
	for(i = 0; i < Eaddrlen;){
		v = eeread(ctlr, u + i/2);
		ctlr->ra[i++] = v;
		ctlr->ra[i++] = v>>8;
	}
	/* presumably a per-port offset from Status bits 2-3 — TODO confirm */
	ctlr->ra[5] += (ctlr->reg[Status] & 0xc) >> 2;
	return 0;
}
864 
/*
 * bring the controller to a known state: reset it (detach), load the
 * ethernet address from the eeprom into receive-address entry 0,
 * zero the statistics, and set up flow control, interrupt vector
 * mapping and interrupt throttling.
 * returns 0 on success, -1 if the reset or the eeprom load failed.
 */
static int
reset(Ctlr *ctlr)
{
	int i, is598;
	uchar *p;

	if(detach(ctlr)){
		print("82598: reset timeout\n");
		return -1;
	}
	if(eeload(ctlr)){
		print("82598: eeprom failure\n");
		return -1;
	}
	p = ctlr->ra;
	is598 = (ctlr->type == I82598);
	ctlr->reg[is598? Ral98: Ral99] = p[3]<<24 | p[2]<<16 | p[1]<<8 | p[0];
	ctlr->reg[is598? Rah98: Rah99] = p[5]<<8 | p[4] | Enable;

	/* read the counters once, then discard what was accumulated */
	readstats(ctlr);
	for(i = 0; i<nelem(ctlr->stats); i++)
		ctlr->stats[i] = 0;

	ctlr->reg[Ctrlext] |= 1 << 16;	/* required by errata (spec change 4) */
	if (Goslow) {
		/* make some guesses for flow control */
		ctlr->reg[Fcrtl] = 0x10000 | Enable;
		ctlr->reg[Fcrth] = 0x40000 | Enable;
		ctlr->reg[Rcrtv] = 0x6000;
	} else
		ctlr->reg[Fcrtl] = ctlr->reg[Fcrth] = ctlr->reg[Rcrtv] = 0;

	/* configure interrupt mapping (don't ask) */
	ctlr->reg[Ivar+0] =     0 | 1<<7;
	ctlr->reg[Ivar+64/4] =  1 | 1<<7;
//	ctlr->reg[Ivar+97/4] = (2 | 1<<7) << (8*(97%4));

	if (Goslow) {
		/* interrupt throttling goes here. */
		for(i = Itr; i < Itr + 20; i++)
			ctlr->reg[i] = 128;		/* ¼µs intervals */
		ctlr->reg[Itr + Itx0] = 256;
	} else {					/* don't throttle */
		for(i = Itr; i < Itr + 20; i++)
			ctlr->reg[i] = 0;		/* ¼µs intervals */
		ctlr->reg[Itr + Itx0] = 0;
	}
	return 0;
}
914 
/*
 * (re)initialise the transmitter: set the tx thresholds, release any
 * Blocks still attached to the tx ring, zero the descriptors, program
 * the ring base/length/head/tail, then enable tx dma (82599) and the
 * tx queue and spin until the hardware reports the queue enabled.
 */
static void
txinit(Ctlr *ctlr)
{
	Block *b;
	int i;

	if (Goslow)
		ctlr->reg[Txdctl] = 16<<Wthresh | 16<<Pthresh;
	else
		ctlr->reg[Txdctl] = 0;
	if (ctlr->type == I82599)
		ctlr->reg[Dtxctl99] = 0;
	coherence();
	for(i = 0; i < ctlr->ntd; i++){
		b = ctlr->tb[i];
		ctlr->tb[i] = 0;
		if(b)
			freeb(b);
	}

	assert(ctlr->tdba != nil);
	memset(ctlr->tdba, 0, ctlr->ntd * sizeof(Td));
	ctlr->reg[Tdbal] = PCIWADDR(ctlr->tdba);
	ctlr->reg[Tdbah] = 0;
	ctlr->reg[Tdlen] = ctlr->ntd*sizeof(Td); /* must be multiple of 128 */
	ctlr->reg[Tdh] = 0;
	/*
	 * cleanup() examines the slot after tdh first, so starting at
	 * the last slot makes it begin with slot 0.
	 */
	ctlr->tdh = ctlr->ntd - 1;
	ctlr->reg[Tdt] = ctlr->tdt = 0;
	coherence();
	if (ctlr->type == I82599)
		ctlr->reg[Dtxctl99] |= Te;
	coherence();
	ctlr->reg[Txdctl] |= Ten;
	coherence();
	/* no timeout: waits until the enable bit reads back set */
	while (!(ctlr->reg[Txdctl] & Ten))
		;
}
952 
/*
 * Ether attach hook.  on first use allocate the descriptor rings,
 * the Block pointer arrays and the receive buffer pool; then, if the
 * controller is not already attached, initialise rx and tx, start
 * the three kernel processes (once only) and set up the watermarks.
 * serialised by ctlr->alock.  on error the controller is reset, its
 * memory released and the error re-raised.
 */
static void
attach(Ether *e)
{
	Block *b;
	Ctlr *ctlr;
	char buf[KNAMELEN];

	ctlr = e->ctlr;
	ctlr->edev = e;			/* point back to Ether* */
	qlock(&ctlr->alock);
	if(waserror()){
		reset(ctlr);
		freemem(ctlr);
		qunlock(&ctlr->alock);
		nexterror();
	}
	if(ctlr->rdba == nil) {
		ctlr->nrd = Nrd;
		ctlr->ntd = Ntd;
		ctlr->rdba = mallocalign(ctlr->nrd * sizeof *ctlr->rdba,
			Descalign, 0, 0);
		ctlr->tdba = mallocalign(ctlr->ntd * sizeof *ctlr->tdba,
			Descalign, 0, 0);
		ctlr->rb = malloc(ctlr->nrd * sizeof(Block *));
		ctlr->tb = malloc(ctlr->ntd * sizeof(Block *));
		if (ctlr->rdba == nil || ctlr->tdba == nil ||
		    ctlr->rb == nil || ctlr->tb == nil)
			error(Enomem);

		/*
		 * populate the pool: with free set to rbfree, freeb
		 * pushes each new Block onto rbpool.
		 */
		for(ctlr->nrb = 0; ctlr->nrb < 2*Nrb; ctlr->nrb++){
			b = allocb(ctlr->rbsz + BY2PG);	/* see rbfree() */
			if(b == nil)
				error(Enomem);
			b->free = rbfree;
			freeb(b);
		}
	}
	if (!ctlr->attached) {
		rxinit(ctlr);
		txinit(ctlr);
		/* rbfree decremented nrbfull while the pool was filled */
		nrbfull = 0;
		if (!ctlr->procsrunning) {
			snprint(buf, sizeof buf, "#l%dl", e->ctlrno);
			kproc(buf, lproc, e);
			snprint(buf, sizeof buf, "#l%dr", e->ctlrno);
			kproc(buf, rproc, e);
			snprint(buf, sizeof buf, "#l%dt", e->ctlrno);
			kproc(buf, tproc, e);
			ctlr->procsrunning = 1;
		}
		initmark(&ctlr->wmrb, Nrb, "rcv bufs unprocessed");
		initmark(&ctlr->wmrd, Nrd-1, "rcv descrs processed at once");
		initmark(&ctlr->wmtd, Ntd-1, "xmit descr queue len");
		ctlr->attached = 1;
	}
	qunlock(&ctlr->alock);
	poperror();
}
1011 
/*
 * interrupt handler.  with all interrupts masked, repeatedly read the
 * cause register; for each enabled cause found (rx, tx, link change)
 * set the matching flag in the Ctlr, wake the process that handles it
 * and drop that cause from the mask.  each process re-enables its own
 * interrupt with ienable() when it is ready for another.
 * v is the Ether* registered by pnp().
 */
static void
interrupt(Ureg*, void *v)
{
	int icr, im;
	Ctlr *ctlr;
	Ether *e;

	e = v;
	ctlr = e->ctlr;
	ilock(&ctlr->imlock);
	ctlr->reg[Imc] = ~0;			/* disable all intrs */
	im = ctlr->im;
	/* causes are tested against the mask as it was on entry */
	while((icr = ctlr->reg[Icr] & ctlr->im) != 0){
		if(icr & Irx0){
			im &= ~Irx0;
			ctlr->rim = Irx0;
			wakeup(&ctlr->rrendez);
		}
		if(icr & Itx0){
			im &= ~Itx0;
			ctlr->tim = Itx0;
			wakeup(&ctlr->trendez);
		}
		if(icr & Lsc){
			im &= ~Lsc;
			ctlr->lim = Lsc;
			wakeup(&ctlr->lrendez);
		}
	}
	ctlr->reg[Ims] = ctlr->im = im; /* enable only intrs we didn't service */
	iunlock(&ctlr->imlock);
}
1044 
/*
 * find supported intel controllers on the pci bus.  for each one:
 * map its register bar and its msi-x bar, allocate and fill in a
 * Ctlr, reset the chip, enable bus mastering and record the Ctlr in
 * ctlrtab.  devices that can't be mapped or reset are skipped;
 * scanning stops once ctlrtab is full.
 */
static void
scan(void)
{
	int pciregs, pcimsix, type;
	ulong io, iomsi;
	void *mem, *memmsi;
	Ctlr *ctlr;
	Pcidev *p;

	p = 0;
	while(p = pcimatch(p, Vintel, 0)){
		/* the device id gives the chip type and the msi-x bar number */
		switch(p->did){
		case 0x10b6:		/* 82598 backplane */
		case 0x10c6:		/* 82598 af dual port */
		case 0x10c7:		/* 82598 af single port */
		case 0x10dd:		/* 82598 at cx4 */
		case 0x10ec:		/* 82598 at cx4 dual port */
			pcimsix = 3;
			type = I82598;
			break;
		case 0x10f7:		/* 82599 kx/kx4 */
		case 0x10f8:		/* 82599 kx/kx4/kx */
		case 0x10f9:		/* 82599 cx4 */
		case 0x10fb:		/* 82599 sfi/sfp+ */
		case 0x10fc:		/* 82599 xaui/bx4 */
		case 0x1557:		/* 82599 single-port sfi */
			pcimsix = 4;
			type = I82599;
			break;
		default:
			continue;
		}
		pciregs = 0;
		if(nctlr >= nelem(ctlrtab)){
			print("i82598: too many controllers\n");
			return;
		}

		io = p->mem[pciregs].bar & ~0xf;
		mem = vmap(io, p->mem[pciregs].size);
		if(mem == nil){
			print("i82598: can't map regs %#p\n",
				p->mem[pciregs].bar);
			continue;
		}

		iomsi = p->mem[pcimsix].bar & ~0xf;
		memmsi = vmap(iomsi, p->mem[pcimsix].size);
		if(memmsi == nil){
			print("i82598: can't map msi-x regs %#p\n",
				p->mem[pcimsix].bar);
			vunmap(mem, p->mem[pciregs].size);
			continue;
		}

		ctlr = malloc(sizeof *ctlr);
		if(ctlr == nil) {
			vunmap(mem, p->mem[pciregs].size);
			vunmap(memmsi, p->mem[pcimsix].size);
			error(Enomem);
		}
		ctlr->p = p;
		ctlr->type = type;
		ctlr->physreg = (u32int*)io;
		ctlr->physmsix = (u32int*)iomsi;
		ctlr->reg = (u32int*)mem;
		ctlr->msix = (u32int*)memmsi;	/* unused */
		ctlr->rbsz = Rbsz;
		if(reset(ctlr)){
			print("i82598: can't reset\n");
			free(ctlr);
			vunmap(mem, p->mem[pciregs].size);
			vunmap(memmsi, p->mem[pcimsix].size);
			continue;
		}
		pcisetbme(p);
		ctlrtab[nctlr++] = ctlr;
	}
}
1124 
1125 static int
pnp(Ether * e)1126 pnp(Ether *e)
1127 {
1128 	int i;
1129 	Ctlr *ctlr;
1130 
1131 	if(nctlr == 0)
1132 		scan();
1133 	ctlr = nil;
1134 	for(i = 0; i < nctlr; i++){
1135 		ctlr = ctlrtab[i];
1136 		if(ctlr == nil || ctlr->flag & Factive)
1137 			continue;
1138 		if(e->port == 0 || e->port == (ulong)ctlr->reg)
1139 			break;
1140 	}
1141 	if (i >= nctlr)
1142 		return -1;
1143 	ctlr->flag |= Factive;
1144 	e->ctlr = ctlr;
1145 	e->port = (uintptr)ctlr->physreg;
1146 	e->irq = ctlr->p->intl;
1147 	e->tbdf = ctlr->p->tbdf;
1148 	e->mbps = 10000;
1149 	e->maxmtu = ETHERMAXTU;
1150 	memmove(e->ea, ctlr->ra, Eaddrlen);
1151 
1152 	e->arg = e;
1153 	e->attach = attach;
1154 	e->detach = shutdown;
1155 	e->transmit = transmit;
1156 	e->interrupt = interrupt;
1157 	e->ifstat = ifstat;
1158 	e->shutdown = shutdown;
1159 	e->ctl = ctl;
1160 	e->multicast = multicast;
1161 	e->promiscuous = promiscuous;
1162 
1163 	return 0;
1164 }
1165 
/*
 * driver entry point: register pnp() as the probe routine under
 * both card names, "i82598" and "i10gbe".
 */
void
ether82598link(void)
{
	addethercard("i82598", pnp);
	addethercard("i10gbe", pnp);
}
1171 }
1172