xref: /plan9-contrib/sys/src/9/pc/ether82598.c (revision 61d44851dbae9c6db4696bac4b180d884ecea735)
1 /*
2  * intel pci-express 10Gb ethernet driver for 8259[89]
3  * copyright © 2007, coraid, inc.
4  * depessimised and made to work on the 82599 at bell labs, 2013.
5  *
6  * 82599 requests should ideally not cross a 4KB (page) boundary.
7  */
8 #include "u.h"
9 #include "../port/lib.h"
10 #include "mem.h"
11 #include "dat.h"
12 #include "fns.h"
13 #include "io.h"
14 #include "../port/error.h"
15 #include "../port/netif.h"
16 #include "etherif.h"
17 
/*
 * advance ring index x by one, wrapping with mask m.
 * callers pass m = ring size - 1, so the ring size must be a power of 2.
 */
#define NEXTPOW2(x, m)	(((x)+1) & (m))
19 
/*
 * buffer and ring sizing.  Nrd and Ntd are used as the descriptor ring
 * sizes in attach(), which also allocates 2*Nrb receive Blocks.
 */
enum {
	Rbsz	= ETHERMAXTU+32, /* +slop is for vlan headers, crcs, etc. */
	Descalign= 128,		/* 599 manual needs 128-byte alignment */

	/* tunable parameters */
	Goslow	= 0,		/* flag: go slow by throttling intrs, etc. */
	/*
	 * these were 256, 1024 & 128, but 30, 47 and 1 are usually ample;
	 * however cpu servers and terminals can need more receive buffers
	 * due to bursts of traffic.
	 */
	Nrd	= 128,		/* multiple of 8, power of 2 for NEXTPOW2 */
	Nrb	= 1024,
	Ntd	= 32,		/* multiple of 8, power of 2 for NEXTPOW2 */
};
35 
/*
 * device register offsets.  each value is a byte offset divided by 4,
 * i.e., an index into the u32int register array ctlr->reg.
 */
enum {
	/* general */
	Ctrl		= 0x00000/4,	/* Device Control */
	Status		= 0x00008/4,	/* Device Status */
	Ctrlext		= 0x00018/4,	/* Extended Device Control */
	Esdp		= 0x00020/4,	/* extended sdp control */
	Esodp		= 0x00028/4,	/* extended od sdp control (i2cctl on 599) */
	Ledctl		= 0x00200/4,	/* led control */
	Tcptimer	= 0x0004c/4,	/* tcp timer */
	Ecc		= 0x110b0/4,	/* errata ecc control magic (pcie intr cause on 599) */

	/* nvm */
	Eec		= 0x10010/4,	/* eeprom/flash control */
	Eerd		= 0x10014/4,	/* eeprom read */
	Fla		= 0x1001c/4,	/* flash access */
	Flop		= 0x1013c/4,	/* flash opcode */
	Grc		= 0x10200/4,	/* general rx control */

	/* interrupt */
	Icr		= 0x00800/4,	/* interrupt cause read */
	Ics		= 0x00808/4,	/* " set */
	Ims		= 0x00880/4,	/* " mask read/set (actually enable) */
	Imc		= 0x00888/4,	/* " mask clear */
	Iac		= 0x00810/4,	/* " auto clear */
	Iam		= 0x00890/4,	/* " auto mask enable */
	Itr		= 0x00820/4,	/* " throttling rate regs (0-19) */
	Ivar		= 0x00900/4,	/* " vector allocation regs. */
	/* msi interrupt */
	Msixt		= 0x0000/4,	/* msix table (bar3) */
	Msipba		= 0x2000/4,	/* msix pending bit array (bar3) */
	Pbacl		= 0x11068/4,	/* pba clear */
	Gpie		= 0x00898/4,	/* general purpose int enable */

	/* flow control */
	Pfctop		= 0x03008/4,	/* priority flow ctl type opcode */
	Fcttv		= 0x03200/4,	/* " transmit timer value (0-3) */
	Fcrtl		= 0x03220/4,	/* " rx threshold low (0-7) +8n */
	Fcrth		= 0x03260/4,	/* " rx threshold high (0-7) +8n */
	Rcrtv		= 0x032a0/4,	/* " refresh value threshold */
	Tfcs		= 0x0ce00/4,	/* " tx status */

	/* rx dma */
	Rbal		= 0x01000/4,	/* rx desc base low (0-63) +0x40n */
	Rbah		= 0x01004/4,	/* " high */
	Rdlen		= 0x01008/4,	/* " length */
	Rdh		= 0x01010/4,	/* " head */
	Rdt		= 0x01018/4,	/* " tail */
	Rxdctl		= 0x01028/4,	/* " control */

	Srrctl		= 0x02100/4,	/* split & replication rx ctl. array */
	Dcarxctl	= 0x02200/4,	/* rx dca control */
	Rdrxctl		= 0x02f00/4,	/* rx dma control */
	Rxpbsize	= 0x03c00/4,	/* rx packet buffer size */
	Rxctl		= 0x03000/4,	/* rx control */
	Dropen		= 0x03d04/4,	/* drop enable control (598 only) */

	/* rx */
	Rxcsum		= 0x05000/4,	/* rx checksum control */
	Rfctl		= 0x05008/4,	/* rx filter control */
	Mta		= 0x05200/4,	/* multicast table array (0-127) */
	Ral98		= 0x05400/4,	/* rx address low (598) */
	Rah98		= 0x05404/4,
	Ral99		= 0x0a200/4,	/* rx address low array (599) */
	Rah99		= 0x0a204/4,
	Psrtype		= 0x05480/4,	/* packet split rx type. */
	Vfta		= 0x0a000/4,	/* vlan filter table array. */
	Fctrl		= 0x05080/4,	/* filter control */
	Vlnctrl		= 0x05088/4,	/* vlan control */
	Msctctrl	= 0x05090/4,	/* multicast control */
	Mrqc		= 0x05818/4,	/* multiple rx queues cmd */
	Vmdctl		= 0x0581c/4,	/* vmdq control (598 only) */
	Imir		= 0x05a80/4,	/* immediate irq rx (0-7) (598 only) */
	Imirext		= 0x05aa0/4,	/* immediate irq rx ext (598 only) */
	Imirvp		= 0x05ac0/4,	/* immediate irq vlan priority (598 only) */
	Reta		= 0x05c00/4,	/* redirection table */
	Rssrk		= 0x05c80/4,	/* rss random key */

	/* tx */
	Tdbal		= 0x06000/4,	/* tx desc base low +0x40n array */
	Tdbah		= 0x06004/4,	/* " high */
	Tdlen		= 0x06008/4,	/* " len */
	Tdh		= 0x06010/4,	/* " head */
	Tdt		= 0x06018/4,	/* " tail */
	Txdctl		= 0x06028/4,	/* " control */
	Tdwbal		= 0x06038/4,	/* " write-back address low */
	Tdwbah		= 0x0603c/4,

	Dtxctl98	= 0x07e00/4,	/* tx dma control (598 only) */
	Dtxctl99	= 0x04a80/4,	/* tx dma control (599 only) */
	Tdcatxctrl98	= 0x07200/4,	/* tx dca register (0-15) (598 only) */
	Tdcatxctrl99	= 0x0600c/4,	/* tx dca register (0-127) (599 only) */
	Tipg		= 0x0cb00/4,	/* tx inter-packet gap (598 only) */
	Txpbsize	= 0x0cc00/4,	/* tx packet-buffer size (0-15) */

	/* mac */
	Hlreg0		= 0x04240/4,	/* highlander control reg 0 */
	Hlreg1		= 0x04244/4,	/* highlander control reg 1 (ro) */
	Msca		= 0x0425c/4,	/* mdi signal cmd & addr */
	Msrwd		= 0x04260/4,	/* mdi single rw data */
	Mhadd		= 0x04268/4,	/* mac addr high & max frame */
	Pcss1		= 0x04288/4,	/* xgxs status 1 */
	Pcss2		= 0x0428c/4,
	Xpcss		= 0x04290/4,	/* 10gb-x pcs status */
	Serdesc		= 0x04298/4,	/* serdes control */
	Macs		= 0x0429c/4,	/* fifo control & report */
	Autoc		= 0x042a0/4,	/* autodetect control & status */
	Links		= 0x042a4/4,	/* link status */
	Links2		= 0x04324/4,	/* 599 only */
	Autoc2		= 0x042a8/4,
};
146 
/*
 * driver flags plus bit masks and shift counts for the registers above,
 * grouped by the register they apply to.
 */
enum {
	Factive		= 1<<0,		/* Ctlr.flag: claimed by pnp() */
	Enable		= 1<<31,

	/* Ctrl */
	Rst		= 1<<26,	/* full nic reset */

	/* Txdctl */
	Ten		= 1<<25,

	/* Dtxctl99 */
	Te		= 1<<0,		/* dma tx enable */

	/* Fctrl */
	Bam		= 1<<10,	/* broadcast accept mode */
	Upe 		= 1<<9,		/* unicast promiscuous */
	Mpe 		= 1<<8,		/* multicast promiscuous */

	/* Rxdctl */
	Pthresh		= 0,		/* prefetch threshold shift in bits */
	Hthresh		= 8,		/* host buffer minimum threshold " */
	Wthresh		= 16,		/* writeback threshold */
	Renable		= 1<<25,

	/* Rxctl */
	Rxen		= 1<<0,
	Dmbyps		= 1<<1,		/* descr. monitor bypass (598 only) */

	/* Rdrxctl */
	Rdmt½		= 0,		/* 598 */
	Rdmt¼		= 1,		/* 598 */
	Rdmt⅛		= 2,		/* 598 */
	Crcstrip	= 1<<1,		/* 599 */
	Rscfrstsize	= 037<<17,	/* 599; should be zero */

	/* Rxcsum */
	Ippcse		= 1<<12,	/* ip payload checksum enable */

	/* Eerd */
	EEstart		= 1<<0,		/* Start Read */
	EEdone		= 1<<1,		/* Read done */

	/* interrupts */
	Irx0		= 1<<0,		/* driver defined */
	Itx0		= 1<<1,		/* driver defined */
	Lsc		= 1<<20,	/* link status change */

	/* Links */
	Lnkup		= 1<<30,
	Lnkspd		= 1<<29,

	/* Hlreg0 */
	Txcrcen		= 1<<0,		/* add crc during xmit */
	Rxcrcstrip	= 1<<1,		/* strip crc during recv */
	Jumboen		= 1<<2,
	Txpaden		= 1<<10,	/* pad short frames during xmit */

	/* Autoc */
	Flu		= 1<<0,		/* force link up */
	Lmsshift	= 13,		/* link mode select shift */
	Lmsmask		= 7,
};
209 
typedef struct Ctlr Ctlr;
typedef struct Rd Rd;
typedef struct Td Td;

/* one statistics counter: its register and the label ifstat() prints */
typedef struct {
	uint	reg;		/* byte offset; readstats() uses reg>>2 as index */
	char	*name;
} Stat;
218 
/*
 * statistics registers sampled by readstats() and reported by ifstat().
 * ctlr->stats[] is sized from and indexed in parallel with this table.
 */
Stat stattab[] = {
	0x4000,	"crc error",
	0x4004,	"illegal byte",
	0x4008,	"short packet",
	0x3fa0,	"missed pkt0",
	0x4034,	"mac local flt",
	0x4038,	"mac rmt flt",
	0x4040,	"rx length err",
	0x3f60,	"xon tx",
	0xcf60,	"xon rx",
	0x3f68,	"xoff tx",
	0xcf68,	"xoff rx",
	0x405c,	"rx 040",
	0x4060,	"rx 07f",
	0x4064,	"rx 100",
	0x4068,	"rx 200",
	0x406c,	"rx 3ff",
	0x4070,	"rx big",
	0x4074,	"rx ok",
	0x4078,	"rx bcast",
	0x3fc0,	"rx no buf0",
	0x40a4,	"rx runt",
	0x40a8,	"rx frag",
	0x40ac,	"rx ovrsz",
	0x40b0,	"rx jab",
	0x40d0,	"rx pkt",

	0x40d4,	"tx pkt",
	0x40d8,	"tx 040",
	0x40dc,	"tx 07f",
	0x40e0,	"tx 100",
	0x40e4,	"tx 200",
	0x40e8,	"tx 3ff",
	0x40ec,	"tx big",
	0x40f4,	"tx bcast",
	0x4120,	"xsum err",
};
256 
/* receive descriptor status bits (Rd.status) */
enum {
	Pif	= 1<<7,	/* past exact filter (sic) */
	Ipcs	= 1<<6,	/* ip checksum calculated */
	L4cs	= 1<<5,	/* presumably layer-4 checksum calculated (comment used to say "layer 2") - confirm */
	Tcpcs	= 1<<4,	/* tcp checksum calculated */
	Vp	= 1<<3,	/* 802.1q packet matched vet */
	Ixsm	= 1<<2,	/* ignore checksum */
	Reop	= 1<<1,	/* end of packet */
	Rdd	= 1<<0,	/* descriptor done */
};
268 
struct Rd {			/* Receive Descriptor */
	u32int	addr[2];	/* buffer address; only addr[0] is written by replenish() */
	ushort	length;		/* received length, read by rproc() */
	ushort	cksum;
	uchar	status;		/* Rdd etc. */
	uchar	errors;
	ushort	vlan;
};
277 
/* transmit descriptor command and status bits (Td.cmd, Td.status) */
enum {
	/* Td cmd */
	Rs	= 1<<3,		/* report status */
	Ic	= 1<<2,		/* insert checksum */
	Ifcs	= 1<<1,		/* insert FCS (ethernet crc) */
	Teop	= 1<<0,		/* end of packet */

	/* Td status */
	Tdd	= 1<<0,		/* descriptor done */
};
288 
struct Td {			/* Transmit Descriptor */
	u32int	addr[2];	/* buffer address; only addr[0] is written by transmit() */
	ushort	length;		/* bytes to send */
	uchar	cso;
	uchar	cmd;		/* Rs, Ifcs, Teop, ... */
	uchar	status;		/* Tdd */
	uchar	css;
	ushort	vlan;
};
298 
299 struct Ctlr {
300 	Pcidev	*p;
301 	Ether	*edev;
302 	int	type;
303 
304 	/* virtual */
305 	u32int	*reg;
306 	u32int	*msix;			/* unused */
307 
308 	/* physical */
309 	u32int	*physreg;
310 	u32int	*physmsix;		/* unused */
311 
312 	uchar	flag;
313 	int	nrd;
314 	int	ntd;
315 	int	nrb;			/* # bufs this Ctlr has in the pool */
316 	uint	rbsz;
317 	int	procsrunning;
318 	int	attached;
319 
320 	Watermark wmrb;
321 	Watermark wmrd;
322 	Watermark wmtd;
323 
324 	QLock	slock;
325 	QLock	alock;			/* attach lock */
326 	QLock	tlock;
327 	Rendez	lrendez;
328 	Rendez	trendez;
329 	Rendez	rrendez;
330 
331 	uint	im;			/* interrupt mask */
332 	uint	lim;
333 	uint	rim;
334 	uint	tim;
335 	Lock	imlock;
336 
337 	Rd*	rdba;			/* receive descriptor base address */
338 	Block**	rb;			/* receive buffers */
339 	int	rdt;			/* receive descriptor tail */
340 	int	rdfree;			/* rx descriptors awaiting packets */
341 
342 	Td*	tdba;			/* transmit descriptor base address */
343 	int	tdh;			/* transmit descriptor head */
344 	int	tdt;			/* transmit descriptor tail */
345 	Block**	tb;			/* transmit buffers */
346 
347 	uchar	ra[Eaddrlen];		/* receive address */
348 	uchar	mta[128];		/* multicast table array */
349 	ulong	stats[nelem(stattab)];
350 	uint	speeds[3];
351 };
352 
/* controller families, stored in Ctlr.type by scan() */
enum {
	I82598 = 1,
	I82599,
};
357 
static	Ctlr	*ctlrtab[4];	/* controllers found by scan() */
static	int	nctlr;		/* # of entries used in ctlrtab */
static	Lock	rblock;		/* guards rbpool and nrbfull */
static	Block	*rbpool;	/* free receive Blocks, linked via next */
static	int	nrbfull;  /* # of rcv Blocks with data awaiting processing */
363 
364 static void
readstats(Ctlr * ctlr)365 readstats(Ctlr *ctlr)
366 {
367 	int i;
368 
369 	qlock(&ctlr->slock);
370 	for(i = 0; i < nelem(ctlr->stats); i++)
371 		ctlr->stats[i] += ctlr->reg[stattab[i].reg >> 2];
372 	qunlock(&ctlr->slock);
373 }
374 
/* link speed in Mb/s, indexed by the value lproc() derives from Links */
static int speedtab[] = {
	0,
	1000,
	10000,
};
380 
381 static long
ifstat(Ether * edev,void * a,long n,ulong offset)382 ifstat(Ether *edev, void *a, long n, ulong offset)
383 {
384 	uint i, *t;
385 	char *s, *p, *e;
386 	Ctlr *ctlr;
387 
388 	ctlr = edev->ctlr;
389 	p = s = malloc(READSTR);
390 	if(p == nil)
391 		error(Enomem);
392 	e = p + READSTR;
393 
394 	readstats(ctlr);
395 	for(i = 0; i < nelem(stattab); i++)
396 		if(ctlr->stats[i] > 0)
397 			p = seprint(p, e, "%.10s  %uld\n", stattab[i].name,
398 				ctlr->stats[i]);
399 	t = ctlr->speeds;
400 	p = seprint(p, e, "speeds: 0:%d 1000:%d 10000:%d\n", t[0], t[1], t[2]);
401 	p = seprint(p, e, "mtu: min:%d max:%d\n", edev->minmtu, edev->maxmtu);
402 	p = seprint(p, e, "rdfree %d rdh %d rdt %d\n", ctlr->rdfree, ctlr->reg[Rdt],
403 		ctlr->reg[Rdh]);
404 	p = seprintmark(p, e, &ctlr->wmrb);
405 	p = seprintmark(p, e, &ctlr->wmrd);
406 	p = seprintmark(p, e, &ctlr->wmtd);
407 	USED(p);
408 	n = readstr(offset, a, n, s);
409 	free(s);
410 
411 	return n;
412 }
413 
414 static void
ienable(Ctlr * ctlr,int i)415 ienable(Ctlr *ctlr, int i)
416 {
417 	ilock(&ctlr->imlock);
418 	ctlr->im |= i;
419 	ctlr->reg[Ims] = ctlr->im;
420 	iunlock(&ctlr->imlock);
421 }
422 
423 static int
lim(void * v)424 lim(void *v)
425 {
426 	return ((Ctlr*)v)->lim != 0;
427 }
428 
429 static void
lproc(void * v)430 lproc(void *v)
431 {
432 	int r, i;
433 	Ctlr *ctlr;
434 	Ether *e;
435 
436 	e = v;
437 	ctlr = e->ctlr;
438 	for (;;) {
439 		r = ctlr->reg[Links];
440 		e->link = (r & Lnkup) != 0;
441 		i = 0;
442 		if(e->link)
443 			i = 1 + ((r & Lnkspd) != 0);
444 		ctlr->speeds[i]++;
445 		e->mbps = speedtab[i];
446 		ctlr->lim = 0;
447 		ienable(ctlr, Lsc);
448 		sleep(&ctlr->lrendez, lim, ctlr);
449 		ctlr->lim = 0;
450 	}
451 }
452 
/*
 * control-message hook installed as e->ctl: no messages are
 * supported, so every request raises Ebadarg.
 */
static long
ctl(Ether *, void *, long)
{
	error(Ebadarg);
	return -1;		/* follows error(), which raises Ebadarg */
}
459 
460 static Block*
rballoc(void)461 rballoc(void)
462 {
463 	Block *bp;
464 
465 	ilock(&rblock);
466 	if((bp = rbpool) != nil){
467 		rbpool = bp->next;
468 		bp->next = 0;
469 		ainc(&bp->ref);	/* prevent bp from being freed */
470 	}
471 	iunlock(&rblock);
472 	return bp;
473 }
474 
475 void
rbfree(Block * b)476 rbfree(Block *b)
477 {
478 	b->rp = b->wp = (uchar*)PGROUND((uintptr)b->base);
479  	b->flag &= ~(Bipck | Budpck | Btcpck | Bpktck);
480 	ilock(&rblock);
481 	b->next = rbpool;
482 	rbpool = b;
483 	nrbfull--;
484 	iunlock(&rblock);
485 }
486 
487 static int
cleanup(Ctlr * ctlr,int tdh)488 cleanup(Ctlr *ctlr, int tdh)
489 {
490 	Block *b;
491 	uint m, n;
492 
493 	m = ctlr->ntd - 1;
494 	while(ctlr->tdba[n = NEXTPOW2(tdh, m)].status & Tdd){
495 		tdh = n;
496 		b = ctlr->tb[tdh];
497 		ctlr->tb[tdh] = 0;
498 		if (b)
499 			freeb(b);
500 		ctlr->tdba[tdh].status = 0;
501 	}
502 	return tdh;
503 }
504 
/*
 * move Blocks from the output queue e->oq onto the transmit descriptor
 * ring and hand them to the NIC.  installed as e->transmit and also
 * called by tproc.  if tlock is already held, just enable the transmit
 * interrupt and return.
 */
void
transmit(Ether *e)
{
	uint i, m, tdt, tdh;
	Ctlr *ctlr;
	Block *b;
	Td *t;

	ctlr = e->ctlr;
	if(!canqlock(&ctlr->tlock)){
		ienable(ctlr, Itx0);
		return;
	}
	/* reclaim descriptors the NIC has finished with */
	tdh = ctlr->tdh = cleanup(ctlr, ctlr->tdh);
	tdt = ctlr->tdt;
	m = ctlr->ntd - 1;
	for(i = 0; ; i++){		/* i counts Blocks queued this call */
		if(NEXTPOW2(tdt, m) == tdh){	/* ring full? */
			ienable(ctlr, Itx0);
			break;
		}
		if((b = qget(e->oq)) == nil)
			break;
		assert(ctlr->tdba != nil);
		t = ctlr->tdba + tdt;
		t->addr[0] = PCIWADDR(b->rp);
		t->length = BLEN(b);
		t->cmd = Ifcs | Teop;
		if (!Goslow)
			t->cmd |= Rs;
		ctlr->tb[tdt] = b;	/* freed later by cleanup() */
		/* note size of queue of tds awaiting transmission */
		notemark(&ctlr->wmtd, (tdt + Ntd - tdh) % Ntd);
		tdt = NEXTPOW2(tdt, m);
	}
	if(i) {
		coherence();
		ctlr->reg[Tdt] = ctlr->tdt = tdt;  /* make new Tds active */
		coherence();
		ienable(ctlr, Itx0);
	}
	qunlock(&ctlr->tlock);
}
548 
549 static int
tim(void * c)550 tim(void *c)
551 {
552 	return ((Ctlr*)c)->tim != 0;
553 }
554 
555 static void
tproc(void * v)556 tproc(void *v)
557 {
558 	Ctlr *ctlr;
559 	Ether *e;
560 
561 	e = v;
562 	ctlr = e->ctlr;
563 	for (;;) {
564 		sleep(&ctlr->trendez, tim, ctlr); /* xmit interrupt kicks us */
565 		ctlr->tim = 0;
566 		transmit(e);
567 	}
568 }
569 
/*
 * (re)initialise the receive side: disable reception, free any Blocks
 * still held in the ring, program the filters, buffer size and
 * descriptor ring registers, then re-enable the ring and the receiver.
 * called from attach().
 */
static void
rxinit(Ctlr *ctlr)
{
	int i, is598, autoc;
	ulong until;
	Block *b;

	ctlr->reg[Rxctl] &= ~Rxen;
	ctlr->reg[Rxdctl] = 0;
	/* release Blocks left in the ring */
	for(i = 0; i < ctlr->nrd; i++){
		b = ctlr->rb[i];
		ctlr->rb[i] = 0;
		if(b)
			freeb(b);
	}
	ctlr->rdfree = 0;

	coherence();
	/* accept broadcasts; no promiscuous modes */
	ctlr->reg[Fctrl] |= Bam;
	ctlr->reg[Fctrl] &= ~(Upe | Mpe);

	/* intel gets some csums wrong (e.g., errata 44) */
	ctlr->reg[Rxcsum] &= ~Ippcse;
	ctlr->reg[Hlreg0] &= ~Jumboen;		/* jumbos are a bad idea */
	ctlr->reg[Hlreg0] |= Txcrcen | Rxcrcstrip | Txpaden;
	/* rbsz rounded up to a whole number of 1024-byte units */
	ctlr->reg[Srrctl] = (ctlr->rbsz + 1024 - 1) / 1024;
	ctlr->reg[Mhadd] = ctlr->rbsz << 16;

	ctlr->reg[Rbal] = PCIWADDR(ctlr->rdba);
	ctlr->reg[Rbah] = 0;
	ctlr->reg[Rdlen] = ctlr->nrd*sizeof(Rd); /* must be multiple of 128 */
	ctlr->reg[Rdh] = 0;
	ctlr->reg[Rdt] = ctlr->rdt = 0;
	coherence();

	is598 = (ctlr->type == I82598);
	if (is598)
		ctlr->reg[Rdrxctl] = Rdmt¼;
	else {
		ctlr->reg[Rdrxctl] |= Crcstrip;
		ctlr->reg[Rdrxctl] &= ~Rscfrstsize;
	}
	if (Goslow && is598)
		ctlr->reg[Rxdctl] = 8<<Wthresh | 8<<Pthresh | 4<<Hthresh | Renable;
	else
		ctlr->reg[Rxdctl] = Renable;
	coherence();

	/*
	 * don't wait forever like an idiot (and hang the system),
	 * maybe it's disconnected.
	 */
	until = TK2MS(MACHP(0)->ticks) + 250;
	while (!(ctlr->reg[Rxdctl] & Renable) && TK2MS(MACHP(0)->ticks) < until)
		;
	if(!(ctlr->reg[Rxdctl] & Renable))
		print("#l%d: Renable didn't come on, might be disconnected\n",
			ctlr->edev->ctlrno);

	ctlr->reg[Rxctl] |= Rxen | (is598? Dmbyps: 0);

	if (is598){
		autoc = ctlr->reg[Autoc];
		/* what is this rubbish and why do we care? */
		print("#l%d: autoc %#ux; lms %d (3 is 10g sfp)\n",
			ctlr->edev->ctlrno, autoc, (autoc>>Lmsshift) & Lmsmask);
		ctlr->reg[Autoc] |= Flu;
		coherence();
		delay(50);
	}
}
641 
642 static void
replenish(Ctlr * ctlr,uint rdh)643 replenish(Ctlr *ctlr, uint rdh)
644 {
645 	int rdt, m, i;
646 	Block *b;
647 	Rd *r;
648 
649 	m = ctlr->nrd - 1;
650 	i = 0;
651 	for(rdt = ctlr->rdt; NEXTPOW2(rdt, m) != rdh; rdt = NEXTPOW2(rdt, m)){
652 		r = ctlr->rdba + rdt;
653 		if((b = rballoc()) == nil){
654 			print("#l%d: no buffers\n", ctlr->edev->ctlrno);
655 			break;
656 		}
657 		ctlr->rb[rdt] = b;
658 		r->addr[0] = PCIWADDR(b->rp);
659 		r->status = 0;
660 		ctlr->rdfree++;
661 		i++;
662 	}
663 	if(i) {
664 		coherence();
665 		ctlr->reg[Rdt] = ctlr->rdt = rdt; /* hand back recycled rdescs */
666 		coherence();
667 	}
668 }
669 
670 static int
rim(void * v)671 rim(void *v)
672 {
673 	return ((Ctlr*)v)->rim != 0;
674 }
675 
/*
 * receive kproc: keep the descriptor ring stocked with buffers, sleep
 * until interrupt() signals a receive interrupt, then pass every
 * completed descriptor's Block up via etheriq().  never returns.
 */
void
rproc(void *v)
{
	int passed;
	uint m, rdh;
	Block *bp;
	Ctlr *ctlr;
	Ether *e;
	Rd *r;

	e = v;
	ctlr = e->ctlr;
	m = ctlr->nrd - 1;
	for (rdh = 0; ; ) {		/* rdh: next descriptor to examine */
		replenish(ctlr, rdh);
		ienable(ctlr, Irx0);
		sleep(&ctlr->rrendez, rim, ctlr);
		passed = 0;
		for (;;) {
			ctlr->rim = 0;
			r = ctlr->rdba + rdh;
			if(!(r->status & Rdd))
				break;		/* wait for pkts to arrive */
			bp = ctlr->rb[rdh];
			ctlr->rb[rdh] = 0;
			/* oversize frames are reported but still passed up */
			if (r->length > ETHERMAXTU)
				print("#l%d: got jumbo of %d bytes\n",
					e->ctlrno, r->length);
			bp->wp += r->length;
			bp->lim = bp->wp;		/* lie like a dog */
//			r->status = 0;

			ilock(&rblock);
			nrbfull++;
			iunlock(&rblock);
			notemark(&ctlr->wmrb, nrbfull);
			etheriq(e, bp, 1);

			passed++;
			ctlr->rdfree--;
			rdh = NEXTPOW2(rdh, m);
			/* top up once 16 or more descriptors lack buffers */
			if (ctlr->rdfree <= ctlr->nrd - 16)
				replenish(ctlr, rdh);
		}
		/* note how many rds had full buffers */
		notemark(&ctlr->wmrd, passed);
	}
}
724 
725 static void
promiscuous(void * a,int on)726 promiscuous(void *a, int on)
727 {
728 	Ctlr *ctlr;
729 	Ether *e;
730 
731 	e = a;
732 	ctlr = e->ctlr;
733 	if(on)
734 		ctlr->reg[Fctrl] |= Upe | Mpe;
735 	else
736 		ctlr->reg[Fctrl] &= ~(Upe | Mpe);
737 }
738 
739 static void
multicast(void * a,uchar * ea,int on)740 multicast(void *a, uchar *ea, int on)
741 {
742 	int b, i;
743 	Ctlr *ctlr;
744 	Ether *e;
745 
746 	e = a;
747 	ctlr = e->ctlr;
748 
749 	/*
750 	 * multiple ether addresses can hash to the same filter bit,
751 	 * so it's never safe to clear a filter bit.
752 	 * if we want to clear filter bits, we need to keep track of
753 	 * all the multicast addresses in use, clear all the filter bits,
754 	 * then set the ones corresponding to in-use addresses.
755 	 */
756 	i = ea[5] >> 1;
757 	b = (ea[5]&1)<<4 | ea[4]>>4;
758 	b = 1 << b;
759 	if(on)
760 		ctlr->mta[i] |= b;
761 //	else
762 //		ctlr->mta[i] &= ~b;
763 	ctlr->reg[Mta+i] = ctlr->mta[i];
764 }
765 
766 static void
freemem(Ctlr * ctlr)767 freemem(Ctlr *ctlr)
768 {
769 	Block *b;
770 
771 	while(b = rballoc()){
772 		b->free = 0;
773 		freeb(b);
774 	}
775 	free(ctlr->rdba);
776 	ctlr->rdba = nil;
777 	free(ctlr->tdba);
778 	ctlr->tdba = nil;
779 	free(ctlr->rb);
780 	ctlr->rb = nil;
781 	free(ctlr->tb);
782 	ctlr->tb = nil;
783 }
784 
785 static int
detach(Ctlr * ctlr)786 detach(Ctlr *ctlr)
787 {
788 	int i, is598;
789 
790 	ctlr->reg[Imc] = ~0;
791 	ctlr->reg[Ctrl] |= Rst;
792 	for(i = 0; i < 100; i++){
793 		delay(1);
794 		if((ctlr->reg[Ctrl] & Rst) == 0)
795 			break;
796 	}
797 	if (i >= 100)
798 		return -1;
799 	is598 = (ctlr->type == I82598);
800 	if (is598) {			/* errata */
801 		delay(50);
802 		ctlr->reg[Ecc] &= ~(1<<21 | 1<<18 | 1<<9 | 1<<6);
803 	}
804 
805 	/* not cleared by reset; kill it manually. */
806 	for(i = 1; i < 16; i++)
807 		ctlr->reg[is598? Rah98: Rah99] &= ~Enable;
808 	for(i = 0; i < 128; i++)
809 		ctlr->reg[Mta + i] = 0;
810 	for(i = 1; i < (is598? 640: 128); i++)
811 		ctlr->reg[Vfta + i] = 0;
812 
813 //	freemem(ctlr);			// TODO
814 	ctlr->attached = 0;
815 	return 0;
816 }
817 
818 static void
shutdown(Ether * e)819 shutdown(Ether *e)
820 {
821 	detach(e->ctlr);
822 //	freemem(e->ctlr);
823 }
824 
825 /* ≤ 20ms */
826 static ushort
eeread(Ctlr * ctlr,int i)827 eeread(Ctlr *ctlr, int i)
828 {
829 	ctlr->reg[Eerd] = EEstart | i<<2;
830 	while((ctlr->reg[Eerd] & EEdone) == 0)
831 		;
832 	return ctlr->reg[Eerd] >> 16;
833 }
834 
/*
 * validate the eeprom and load the ethernet address into ctlr->ra.
 * returns 0 on success, -1 if the signature bits of word 0 or the
 * checksum are wrong.
 */
static int
eeload(Ctlr *ctlr)
{
	ushort u, v, p, l, i, j;

	if((eeread(ctlr, 0) & 0xc0) != 0x40)
		return -1;
	/* checksum: sum the first 0x40 words ... */
	u = 0;
	for(i = 0; i < 0x40; i++)
		u +=  eeread(ctlr, i);
	/* ... plus the sections that words 3-0xe point at */
	for(i = 3; i < 0xf; i++){
		p = eeread(ctlr, i);		/* section pointer */
		l = eeread(ctlr, p++);		/* section length, stored at the pointer */
		if((int)p + l + 1 > 0xffff)	/* section would run past the address space */
			continue;
		for(j = p; j < p + l; j++)
			u += eeread(ctlr, j);
	}
	if(u != 0xbaba)
		return -1;
	/* word 10 or 9, chosen by Status bit 3, points at the address block */
	if(ctlr->reg[Status] & (1<<3))
		u = eeread(ctlr, 10);
	else
		u = eeread(ctlr, 9);
	u++;
	/* each word holds two address bytes, low byte first */
	for(i = 0; i < Eaddrlen;){
		v = eeread(ctlr, u + i/2);
		ctlr->ra[i++] = v;
		ctlr->ra[i++] = v>>8;
	}
	/* offset the last byte by Status bits 2-3 (presumably the port number - confirm) */
	ctlr->ra[5] += (ctlr->reg[Status] & 0xc) >> 2;
	return 0;
}
868 
/*
 * reset the controller and bring it to a known state: load the
 * ethernet address from eeprom into receive-address entry 0, zero the
 * statistics, and set up flow control, interrupt mapping and
 * interrupt throttling.  returns 0 on success, -1 on failure.
 */
static int
reset(Ctlr *ctlr)
{
	int i, is598;
	uchar *p;

	if(detach(ctlr)){
		print("82598: reset timeout\n");
		return -1;
	}
	if(eeload(ctlr)){
		print("82598: eeprom failure\n");
		return -1;
	}
	p = ctlr->ra;
	is598 = (ctlr->type == I82598);
	ctlr->reg[is598? Ral98: Ral99] = p[3]<<24 | p[2]<<16 | p[1]<<8 | p[0];
	ctlr->reg[is598? Rah98: Rah99] = p[5]<<8 | p[4] | Enable;

	/* read the counters once, then discard the totals */
	readstats(ctlr);
	for(i = 0; i<nelem(ctlr->stats); i++)
		ctlr->stats[i] = 0;

	ctlr->reg[Ctrlext] |= 1 << 16;	/* required by errata (spec change 4) */
	if (Goslow) {
		/* make some guesses for flow control */
		ctlr->reg[Fcrtl] = 0x10000 | Enable;
		ctlr->reg[Fcrth] = 0x40000 | Enable;
		ctlr->reg[Rcrtv] = 0x6000;
	} else
		ctlr->reg[Fcrtl] = ctlr->reg[Fcrth] = ctlr->reg[Rcrtv] = 0;

	/* configure interrupt mapping (don't ask) */
	ctlr->reg[Ivar+0] =     0 | 1<<7;
	ctlr->reg[Ivar+64/4] =  1 | 1<<7;
//	ctlr->reg[Ivar+97/4] = (2 | 1<<7) << (8*(97%4));

	if (Goslow) {
		/* interrupt throttling goes here. */
		for(i = Itr; i < Itr + 20; i++)
			ctlr->reg[i] = 128;		/* ¼µs intervals */
		ctlr->reg[Itr + Itx0] = 256;
	} else {					/* don't throttle */
		for(i = Itr; i < Itr + 20; i++)
			ctlr->reg[i] = 0;		/* ¼µs intervals */
		ctlr->reg[Itr + Itx0] = 0;
	}
	return 0;
}
918 
/*
 * (re)initialise the transmit side: free any Blocks still held in the
 * ring, zero the descriptors, program the ring registers and enable
 * transmission.  called from attach().  spins without timeout until
 * the NIC reports the ring enabled.
 */
static void
txinit(Ctlr *ctlr)
{
	Block *b;
	int i;

	if (Goslow)
		ctlr->reg[Txdctl] = 16<<Wthresh | 16<<Pthresh;
	else
		ctlr->reg[Txdctl] = 0;
	if (ctlr->type == I82599)
		ctlr->reg[Dtxctl99] = 0;
	coherence();
	/* release Blocks left in the ring */
	for(i = 0; i < ctlr->ntd; i++){
		b = ctlr->tb[i];
		ctlr->tb[i] = 0;
		if(b)
			freeb(b);
	}

	assert(ctlr->tdba != nil);
	memset(ctlr->tdba, 0, ctlr->ntd * sizeof(Td));
	ctlr->reg[Tdbal] = PCIWADDR(ctlr->tdba);
	ctlr->reg[Tdbah] = 0;
	ctlr->reg[Tdlen] = ctlr->ntd*sizeof(Td); /* must be multiple of 128 */
	ctlr->reg[Tdh] = 0;
	/* cleanup() examines the slot after tdh, so start one before slot 0 */
	ctlr->tdh = ctlr->ntd - 1;
	ctlr->reg[Tdt] = ctlr->tdt = 0;
	coherence();
	if (ctlr->type == I82599)
		ctlr->reg[Dtxctl99] |= Te;
	coherence();
	ctlr->reg[Txdctl] |= Ten;
	coherence();
	while (!(ctlr->reg[Txdctl] & Ten))
		;
}
956 
/*
 * e->attach hook.  on first use, allocate the descriptor rings, the
 * Block pointer arrays and the receive buffer pool; then, if not
 * already attached, initialise the receive and transmit sides and
 * start the link, receive and transmit kprocs.  on error, the
 * controller is reset, its memory freed and the error re-raised.
 */
static void
attach(Ether *e)
{
	Block *b;
	Ctlr *ctlr;
	char buf[KNAMELEN];

	ctlr = e->ctlr;
	ctlr->edev = e;			/* point back to Ether* */
	qlock(&ctlr->alock);
	if(waserror()){
		reset(ctlr);
		freemem(ctlr);
		qunlock(&ctlr->alock);
		nexterror();
	}
	if(ctlr->rdba == nil) {		/* nothing allocated yet */
		ctlr->nrd = Nrd;
		ctlr->ntd = Ntd;
		ctlr->rdba = mallocalign(ctlr->nrd * sizeof *ctlr->rdba,
			Descalign, 0, 0);
		ctlr->tdba = mallocalign(ctlr->ntd * sizeof *ctlr->tdba,
			Descalign, 0, 0);
		ctlr->rb = malloc(ctlr->nrd * sizeof(Block *));
		ctlr->tb = malloc(ctlr->ntd * sizeof(Block *));
		if (ctlr->rdba == nil || ctlr->tdba == nil ||
		    ctlr->rb == nil || ctlr->tb == nil)
			error(Enomem);

		/* freeb() calls rbfree(), which puts each Block in the pool */
		for(ctlr->nrb = 0; ctlr->nrb < 2*Nrb; ctlr->nrb++){
			b = allocb(ctlr->rbsz + BY2PG);	/* see rbfree() */
			if(b == nil)
				error(Enomem);
			b->free = rbfree;
			freeb(b);
		}
	}
	if (!ctlr->attached) {
		rxinit(ctlr);
		txinit(ctlr);
		nrbfull = 0;
		if (!ctlr->procsrunning) {
			snprint(buf, sizeof buf, "#l%dl", e->ctlrno);
			kproc(buf, lproc, e);
			snprint(buf, sizeof buf, "#l%dr", e->ctlrno);
			kproc(buf, rproc, e);
			snprint(buf, sizeof buf, "#l%dt", e->ctlrno);
			kproc(buf, tproc, e);
			ctlr->procsrunning = 1;
		}
		initmark(&ctlr->wmrb, Nrb, "rcv bufs unprocessed");
		initmark(&ctlr->wmrd, Nrd-1, "rcv descrs processed at once");
		initmark(&ctlr->wmtd, Ntd-1, "xmit descr queue len");
		ctlr->attached = 1;
	}
	qunlock(&ctlr->alock);
	poperror();
}
1015 
/*
 * interrupt handler.  for each enabled cause that is pending, flag the
 * matching kproc and wake it; the cause stays masked until that kproc
 * re-enables it with ienable().
 */
static void
interrupt(Ureg*, void *v)
{
	int icr, im;
	Ctlr *ctlr;
	Ether *e;

	e = v;
	ctlr = e->ctlr;
	ilock(&ctlr->imlock);
	ctlr->reg[Imc] = ~0;			/* disable all intrs */
	im = ctlr->im;
	/* loop until no enabled cause is pending */
	while((icr = ctlr->reg[Icr] & ctlr->im) != 0){
		if(icr & Irx0){
			im &= ~Irx0;
			ctlr->rim = Irx0;
			wakeup(&ctlr->rrendez);
		}
		if(icr & Itx0){
			im &= ~Itx0;
			ctlr->tim = Itx0;
			wakeup(&ctlr->trendez);
		}
		if(icr & Lsc){
			im &= ~Lsc;
			ctlr->lim = Lsc;
			wakeup(&ctlr->lrendez);
		}
	}
	ctlr->reg[Ims] = ctlr->im = im; /* enable only intrs we didn't service */
	iunlock(&ctlr->imlock);
}
1048 
/*
 * find all supported intel 82598 and 82599 pci devices, map their
 * register and msi-x bars, allocate and reset a Ctlr for each, and
 * record the ones that reset successfully in ctlrtab.  stops once
 * ctlrtab is full.
 */
static void
scan(void)
{
	int pciregs, pcimsix, type;
	ulong io, iomsi;
	void *mem, *memmsi;
	Ctlr *ctlr;
	Pcidev *p;

	p = 0;
	while(p = pcimatch(p, Vintel, 0)){
		/* pcimsix is the bar holding the msi-x table */
		switch(p->did){
		case 0x10b6:		/* 82598 backplane */
		case 0x10c6:		/* 82598 af dual port */
		case 0x10c7:		/* 82598 af single port */
		case 0x10dd:		/* 82598 at cx4 */
		case 0x10ec:		/* 82598 at cx4 dual port */
			pcimsix = 3;
			type = I82598;
			break;
		case 0x10f7:		/* 82599 kx/kx4 */
		case 0x10f8:		/* 82599 kx/kx4/kx */
		case 0x10f9:		/* 82599 cx4 */
		case 0x10fb:		/* 82599 sfi/sfp+ */
		case 0x10fc:		/* 82599 xaui/bx4 */
		case 0x1557:		/* 82599 single-port sfi */
			pcimsix = 4;
			type = I82599;
			break;
		default:
			continue;
		}
		pciregs = 0;
		if(nctlr >= nelem(ctlrtab)){
			print("i82598: too many controllers\n");
			return;
		}

		io = p->mem[pciregs].bar & ~0xf;	/* strip bar flag bits */
		mem = vmap(io, p->mem[pciregs].size);
		if(mem == nil){
			print("i82598: can't map regs %#p\n",
				p->mem[pciregs].bar);
			continue;
		}

		iomsi = p->mem[pcimsix].bar & ~0xf;
		memmsi = vmap(iomsi, p->mem[pcimsix].size);
		if(memmsi == nil){
			print("i82598: can't map msi-x regs %#p\n",
				p->mem[pcimsix].bar);
			vunmap(mem, p->mem[pciregs].size);
			continue;
		}

		ctlr = malloc(sizeof *ctlr);
		if(ctlr == nil) {
			vunmap(mem, p->mem[pciregs].size);
			vunmap(memmsi, p->mem[pcimsix].size);
			error(Enomem);
		}
		ctlr->p = p;
		ctlr->type = type;
		ctlr->physreg = (u32int*)io;
		ctlr->physmsix = (u32int*)iomsi;
		ctlr->reg = (u32int*)mem;
		ctlr->msix = (u32int*)memmsi;	/* unused */
		ctlr->rbsz = Rbsz;
		if(reset(ctlr)){
			print("i82598: can't reset\n");
			free(ctlr);
			vunmap(mem, p->mem[pciregs].size);
			vunmap(memmsi, p->mem[pcimsix].size);
			continue;
		}
		pcisetbme(p);
		ctlrtab[nctlr++] = ctlr;
	}
}
1128 
1129 static int
pnp(Ether * e)1130 pnp(Ether *e)
1131 {
1132 	int i;
1133 	Ctlr *ctlr;
1134 
1135 	if(nctlr == 0)
1136 		scan();
1137 	ctlr = nil;
1138 	for(i = 0; i < nctlr; i++){
1139 		ctlr = ctlrtab[i];
1140 		if(ctlr == nil || ctlr->flag & Factive)
1141 			continue;
1142 		if(e->port == 0 || e->port == (ulong)ctlr->reg)
1143 			break;
1144 	}
1145 	if (i >= nctlr)
1146 		return -1;
1147 	ctlr->flag |= Factive;
1148 	e->ctlr = ctlr;
1149 	e->port = (uintptr)ctlr->physreg;
1150 	e->irq = ctlr->p->intl;
1151 	e->tbdf = ctlr->p->tbdf;
1152 	e->mbps = 10000;
1153 	e->maxmtu = ETHERMAXTU;
1154 	memmove(e->ea, ctlr->ra, Eaddrlen);
1155 
1156 	e->arg = e;
1157 	e->attach = attach;
1158 	e->detach = shutdown;
1159 	e->transmit = transmit;
1160 	e->interrupt = interrupt;
1161 	e->ifstat = ifstat;
1162 	e->shutdown = shutdown;
1163 	e->ctl = ctl;
1164 	e->multicast = multicast;
1165 	e->promiscuous = promiscuous;
1166 
1167 	return 0;
1168 }
1169 
1170 void
ether82598link(void)1171 ether82598link(void)
1172 {
1173 	addethercard("i82598", pnp);
1174 	addethercard("i10gbe", pnp);
1175 }
1176