ChangeSet 1.823.3.10, 2002/11/11 17:21:19-08:00, david-b@pacbell.net

[PATCH] ehci-hcd: use dummy td when queueing

Instead of trying to catch all the "race with HC" cases when
appending to a live QH, this switches to keeping a disabled
"dummy" TD at the end of every hardware queue.  The HC won't cache
disabled TDs, so it only ever sees "always good" pointers: no races.

As a side benefit, it's now safe to skip the completion irq on
most TDs queued through the scatterlist calls, so a single 64 KByte
usb-storage request will typically cost just one irq (or less!)
for its DATA stage, even without tuning ehci irq latency.
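
The heart of the change is the append path in ehci-q.c (last hunk
below).  As a rough, self-contained model of the swap -- struct td,
struct queue, LIST_END and append() here are illustrative stand-ins,
not the driver's own names or structures -- it works like this:

/* Simplified model of the dummy-TD swap; see ehci.h and the
 * ehci-q.c hunks below for the real structures and helpers.
 */
#include <stdint.h>
#include <string.h>

#define LIST_END	1	/* "T" bit set: pointer not valid */

struct td {
	uint32_t	hw_next;	/* dma of next td, or LIST_END */
	uint32_t	hw_token;	/* 0 == inactive, HC won't cache it */
	uint32_t	qtd_dma;	/* this td's own dma address */
};

struct queue {
	struct td	*dummy;		/* inactive td ending the hw chain */
};

/* Splice tds first..last (already linked via hw_next) onto a live
 * queue.  The HC never sees half-built work: everything it can reach
 * stays inactive until the final token store.
 */
static void append (struct queue *q, struct td *first, struct td *last)
{
	struct td	*dummy = q->dummy;	/* hw chain already ends here */
	uint32_t	dummy_dma = dummy->qtd_dma;
	uint32_t	recycled_dma = first->qtd_dma;
	uint32_t	token = first->hw_token;

	/* keep the spliced-in work inactive for now */
	first->hw_token = 0;

	/* the old dummy takes over the first td's contents, but keeps
	 * its own dma address, which the end of the hw chain points at
	 */
	*dummy = *first;
	dummy->qtd_dma = dummy_dma;
	if (last == first)		/* single-td request */
		last = dummy;

	/* recycle the first td's memory as the new, inactive dummy */
	memset (first, 0, sizeof *first);
	first->hw_next = LIST_END;
	first->qtd_dma = recycled_dma;
	q->dummy = first;

	/* hw chain must keep ending with an inactive dummy */
	last->hw_next = recycled_dma;

	/* one ordered store (after wmb() in the driver) activates the
	 * whole new chain; until now the HC saw only inactive tds
	 */
	dummy->hw_token = token;
}

The real code does the same dance on the driver's qtd_list and pairs
the final token store with wmb(), as the last ehci-q.c hunk shows.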


diff -Nru a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c
--- a/drivers/usb/host/ehci-dbg.c	Thu Nov 14 14:12:39 2002
+++ b/drivers/usb/host/ehci-dbg.c	Thu Nov 14 14:12:39 2002
@@ -272,6 +272,7 @@
 static void qh_lines (struct ehci_qh *qh, char **nextp, unsigned *sizep)
 {
 	u32			scratch;
+	u32			hw_curr;
 	struct list_head	*entry;
 	struct ehci_qtd		*td;
 	unsigned		temp;
@@ -279,20 +280,22 @@
 	char			*next = *nextp;
 
 	scratch = cpu_to_le32p (&qh->hw_info1);
-	temp = snprintf (next, size, "qh/%p dev%d %cs ep%d %08x %08x",
+	hw_curr = cpu_to_le32p (&qh->hw_current);
+	temp = snprintf (next, size, "qh/%p dev%d %cs ep%d %08x %08x (%08x %08x)",
 			qh, scratch & 0x007f,
 			speed_char (scratch),
 			(scratch >> 8) & 0x000f,
-			scratch, cpu_to_le32p (&qh->hw_info2));
+			scratch, cpu_to_le32p (&qh->hw_info2),
+			hw_curr, cpu_to_le32p (&qh->hw_token));
 	size -= temp;
 	next += temp;
 
 	list_for_each (entry, &qh->qtd_list) {
-		td = list_entry (entry, struct ehci_qtd,
-				qtd_list);
+		td = list_entry (entry, struct ehci_qtd, qtd_list);
 		scratch = cpu_to_le32p (&td->hw_token);
 		temp = snprintf (next, size,
-				"\n\ttd/%p %s len=%d %08x urb %p",
+				"\n\t%std/%p %s len=%d %08x urb %p",
+				(hw_curr == td->qtd_dma) ? "*" : "",
 				td, ({ char *tmp;
 				 switch ((scratch>>8)&0x03) {
 				 case 0: tmp = "out"; break;
@@ -552,8 +555,8 @@
 	size -= temp;
 	next += temp;
 
-	temp = snprintf (next, size, "complete %ld unlink %ld qpatch %ld\n",
-		ehci->stats.complete, ehci->stats.unlink, ehci->stats.qpatch);
+	temp = snprintf (next, size, "complete %ld unlink %ld\n",
+		ehci->stats.complete, ehci->stats.unlink);
 	size -= temp;
 	next += temp;
 #endif
diff -Nru a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
--- a/drivers/usb/host/ehci-hcd.c	Thu Nov 14 14:12:39 2002
+++ b/drivers/usb/host/ehci-hcd.c	Thu Nov 14 14:12:39 2002
@@ -494,8 +494,8 @@
 #ifdef	EHCI_STATS
 	dbg ("irq normal %ld err %ld reclaim %ld",
 		ehci->stats.normal, ehci->stats.error, ehci->stats.reclaim);
-	dbg ("complete %ld unlink %ld qpatch %ld",
-		ehci->stats.complete, ehci->stats.unlink, ehci->stats.qpatch);
+	dbg ("complete %ld unlink %ld",
+		ehci->stats.complete, ehci->stats.unlink);
 #endif
 
 	dbg_status (ehci, "ehci_stop completed", readl (&ehci->regs->status));
diff -Nru a/drivers/usb/host/ehci-mem.c b/drivers/usb/host/ehci-mem.c
--- a/drivers/usb/host/ehci-mem.c	Thu Nov 14 14:12:39 2002
+++ b/drivers/usb/host/ehci-mem.c	Thu Nov 14 14:12:39 2002
@@ -58,19 +58,23 @@
 
 /* Allocate the key transfer structures from the previously allocated pool */
 
+static void ehci_qtd_init (struct ehci_qtd *qtd, dma_addr_t dma)
+{
+	memset (qtd, 0, sizeof *qtd);
+	qtd->qtd_dma = dma;
+	qtd->hw_next = EHCI_LIST_END;
+	qtd->hw_alt_next = EHCI_LIST_END;
+	INIT_LIST_HEAD (&qtd->qtd_list);
+}
+
 static struct ehci_qtd *ehci_qtd_alloc (struct ehci_hcd *ehci, int flags)
 {
 	struct ehci_qtd		*qtd;
 	dma_addr_t		dma;
 
 	qtd = pci_pool_alloc (ehci->qtd_pool, flags, &dma);
-	if (qtd != 0) {
-		memset (qtd, 0, sizeof *qtd);
-		qtd->qtd_dma = dma;
-		qtd->hw_next = EHCI_LIST_END;
-		qtd->hw_alt_next = EHCI_LIST_END;
-		INIT_LIST_HEAD (&qtd->qtd_list);
-	}
+	if (qtd != 0)
+		ehci_qtd_init (qtd, dma);
 	return qtd;
 }
 
@@ -87,12 +91,21 @@
 
 	qh = (struct ehci_qh *)
 		pci_pool_alloc (ehci->qh_pool, flags, &dma);
-	if (qh) {
-		memset (qh, 0, sizeof *qh);
-		atomic_set (&qh->refcount, 1);
-		qh->qh_dma = dma;
-		// INIT_LIST_HEAD (&qh->qh_list);
-		INIT_LIST_HEAD (&qh->qtd_list);
+	if (!qh)
+		return qh;
+
+	memset (qh, 0, sizeof *qh);
+	atomic_set (&qh->refcount, 1);
+	qh->qh_dma = dma;
+	// INIT_LIST_HEAD (&qh->qh_list);
+	INIT_LIST_HEAD (&qh->qtd_list);
+
+	/* dummy td enables safe urb queuing */
+	qh->dummy = ehci_qtd_alloc (ehci, flags);
+	if (qh->dummy == 0) {
+		dbg ("no dummy td");
+		pci_pool_free (ehci->qh_pool, qh, qh->qh_dma);
+		qh = 0;
 	}
 	return qh;
 }
@@ -115,6 +128,8 @@
 		dbg ("unused qh not empty!");
 		BUG ();
 	}
+	if (qh->dummy)
+		ehci_qtd_free (ehci, qh->dummy);
 	pci_pool_free (ehci->qh_pool, qh, qh->qh_dma);
 }
 
diff -Nru a/drivers/usb/host/ehci-q.c b/drivers/usb/host/ehci-q.c
--- a/drivers/usb/host/ehci-q.c	Thu Nov 14 14:12:39 2002
+++ b/drivers/usb/host/ehci-q.c	Thu Nov 14 14:12:39 2002
@@ -85,7 +85,7 @@
 
 /* update halted (but potentially linked) qh */
 
-static inline void qh_update (struct ehci_qh *qh, struct ehci_qtd *qtd)
+static void qh_update (struct ehci_qh *qh, struct ehci_qtd *qtd)
 {
 	qh->hw_current = 0;
 	qh->hw_qtd_next = QTD_NEXT (qtd->qtd_dma);
@@ -221,17 +221,6 @@
 		struct urb	*urb = qtd->urb;
 		u32		token = 0;
 
-		/* hc's on-chip qh overlay cache can overwrite our idea of
-		 * next qtd ptrs, if we appended a qtd while the queue was
-		 * advancing.  (because we don't use dummy qtds.)
-		 */
-		if (cpu_to_le32 (qtd->qtd_dma) == qh->hw_current
-				&& qtd->hw_next != qh->hw_qtd_next) {
-			qh->hw_alt_next = qtd->hw_alt_next;
-			qh->hw_qtd_next = qtd->hw_next;
-			COUNT (ehci->stats.qpatch);
-		}
-
 		/* clean up any state from previous QTD ...*/
 		if (last) {
 			if (likely (last->urb != urb)) {
@@ -495,8 +484,7 @@
 	}
 
 	/* by default, enable interrupt on urb completion */
-// ... do it always, unless we switch over to dummy qtds
-//	if (likely (!(urb->transfer_flags & URB_NO_INTERRUPT)))
+	if (likely (!(urb->transfer_flags & URB_NO_INTERRUPT)))
 		qtd->hw_token |= __constant_cpu_to_le32 (QTD_IOC);
 	return head;
 
@@ -661,8 +649,15 @@
 
 	/* initialize sw and hw queues with these qtds */
 	if (!list_empty (qtd_list)) {
+		struct ehci_qtd		*qtd;
+
+		/* hc's list view ends with dummy td; we might update it */
+		qtd = list_entry (qtd_list->prev, struct ehci_qtd, qtd_list);
+		qtd->hw_next = QTD_NEXT (qh->dummy->qtd_dma);
+
 		list_splice (qtd_list, &qh->qtd_list);
-		qh_update (qh, list_entry (qtd_list->next, struct ehci_qtd, qtd_list));
+		qtd = list_entry (qtd_list->next, struct ehci_qtd, qtd_list);
+		qh_update (qh, qtd);
 	} else {
 		qh->hw_qtd_next = qh->hw_alt_next = EHCI_LIST_END;
 	}
@@ -767,39 +762,48 @@
 
 		/* append to tds already queued to this qh? */
 		if (unlikely (!list_empty (&qh->qtd_list) && qtd)) {
-			struct ehci_qtd		*last_qtd;
-			u32			hw_next;
+			struct ehci_qtd		*dummy;
+			dma_addr_t		dma;
+			u32			token;
+
+			/* to avoid racing the HC, use the dummy td instead of
+			 * the first td of our list (becomes new dummy).  both
+			 * tds stay deactivated until we're done, when the
+			 * HC is allowed to fetch the old dummy (4.10.2).
+			 */
+			token = qtd->hw_token;
+			qtd->hw_token = 0;
+			dummy = qh->dummy;
+			// dbg ("swap td %p with dummy %p", qtd, dummy);
+
+			dma = dummy->qtd_dma;
+			*dummy = *qtd;
+			dummy->qtd_dma = dma;
+			list_del (&qtd->qtd_list);
+			list_add (&dummy->qtd_list, qtd_list);
 
-			/* update the last qtd's "next" pointer */
-			// dbg_qh ("non-empty qh", ehci, qh);
-			last_qtd = list_entry (qh->qtd_list.prev,
-					struct ehci_qtd, qtd_list);
-			hw_next = QTD_NEXT (qtd->qtd_dma);
-			last_qtd->hw_next = hw_next;
+			ehci_qtd_init (qtd, qtd->qtd_dma);
+			qh->dummy = qtd;
 
-			/* previous urb allows short rx? maybe optimize. */
-			if (!(last_qtd->urb->transfer_flags & URB_SHORT_NOT_OK)
-					&& (epnum & 0x10)) {
-				// only the last QTD for now
-				last_qtd->hw_alt_next = hw_next;
-			}
+			/* hc must see the new dummy at list end */
+			qtd = list_entry (qh->qtd_list.prev,
+					struct ehci_qtd, qtd_list);
+			qtd->hw_next = QTD_NEXT (dma);
 
-			/* qh_completions() may need to patch the qh overlay if
-			 * the hc was advancing this queue while we appended.
-			 * we know it can: last_qtd->hw_token has IOC set.
-			 *
-			 * or:  use a dummy td (so the overlay gets the next td
-			 * only when we set its active bit); fewer irqs.
-			 */
+			/* let the hc process these next qtds */
 			wmb ();
+			dummy->hw_token = token;
 
 		/* no URB queued */
 		} else {
-			// dbg_qh ("empty qh", ehci, qh);
+			struct ehci_qtd		*last_qtd;
 
-			/* NOTE: we already canceled any queued URBs
-			 * when the endpoint halted.
-			 */
+			/* make sure hc sees current dummy at the end */
+			last_qtd = list_entry (qtd_list->prev,
+					struct ehci_qtd, qtd_list);
+			last_qtd->hw_next = QTD_NEXT (qh->dummy->qtd_dma);
+
+			// dbg_qh ("empty qh", ehci, qh);
 
 			/* usb_clear_halt() means qh data toggle gets reset */
 			if (unlikely (!usb_gettoggle (urb->dev,
diff -Nru a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
--- a/drivers/usb/host/ehci.h	Thu Nov 14 14:12:39 2002
+++ b/drivers/usb/host/ehci.h	Thu Nov 14 14:12:39 2002
@@ -31,11 +31,6 @@
 	/* termination of urbs from core */
 	unsigned long		complete;
 	unsigned long		unlink;
-
-	/* qhs patched to recover from td queueing race
-	 * (can avoid by using 'dummy td', allowing fewer irqs)
-	 */
-	unsigned long		qpatch;
 };
 
 /* ehci_hcd->lock guards shared data against other CPUs:
@@ -311,6 +306,7 @@
 	dma_addr_t		qh_dma;		/* address of qh */
 	union ehci_shadow	qh_next;	/* ptr to qh; or periodic */
 	struct list_head	qtd_list;	/* sw qtd list */
+	struct ehci_qtd		*dummy;
 
 	atomic_t		refcount;
 
