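Keep track of the number of I/Os that have completed, and, when
calculating the average latency, add in a fudge factor that assumes
all in-flight I/Os finished *now*.  Previously the summed start times
included I/Os that were still pending while the summed end times did
not, which skewed the average; this gives nice stable results.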
--- diff/drivers/md/dm-iostats.c	2002-12-17 17:44:14.000000000 +0000
+++ source/drivers/md/dm-iostats.c	2002-12-17 17:38:52.000000000 +0000
@@ -28,6 +28,7 @@
 	 * These fields are only present if we are recording the
 	 * io latency.
 	 */
+	unsigned long long end_counts[2];
 	unsigned long long start[2];
 	unsigned long long end[2];
 };
@@ -56,7 +57,7 @@
 	if (latency)
 		size = sizeof(*ic);
 	else
-		size = (size_t) &((struct iostats_c *) NULL)->start;
+		size = (size_t) &((struct iostats_c *) NULL)->end_counts;
 
 	ic = kmalloc(size, GFP_KERNEL);
 	if (!ic) {
@@ -132,8 +133,10 @@
 
 	spin_lock_irqsave(&ic->lock, flags);
 
-	if (test_bit(IOF_LATENCY, &ic->flags))
+	if (test_bit(IOF_LATENCY, &ic->flags)) {
+		ic->end_counts[rw]++;
 		ic->end[rw] += (unsigned long long) jiffies;
+	}
 
 	spin_unlock_irqrestore(&ic->lock, flags);
 }
@@ -161,15 +164,19 @@
  */
 static unsigned long calc_latency(struct iostats_c *ic, int index)
 {
-	unsigned long long delta, n;
+	unsigned long long average_start, average_end;
 	unsigned long latency;
 
 	if (!ic->ios[index])
 		return 0;
 
-	delta = ic->end[index] - ic->start[index];
-	div64(delta, ic->ios[index], &n);
-	latency = (unsigned long) n;
+	div64(ic->start[index], ic->ios[index], &average_start);
+
+	/* add a little fudge factor in here for the still pending ios */
+	div64(ic->end[index] +
+	      (jiffies * (ic->ios[index] - ic->end_counts[index])),
+	      ic->ios[index], &average_end);
+	latency = (unsigned long) (average_end - average_start);
 
 	latency *= 1000;
 	latency /= HZ;
