Date:	Mon, 9 Feb 1998 23:44:53 +0100 (CET)
From:	MOLNAR Ingo <mingo@chiara.csoma.elte.hu>
To:	linux-smp@vger.rutgers.edu
Subject: [Patch] SMP-IOAPIC NMI Software Watchdog driver, io-apic-patch-2.1.85-D


'look ma, no more hard lockups in Linux'

This patch brings a vanilla 2.1.85 kernel up to date with the latest IO-APIC code.

Probably the most interesting change is the new software watchdog driver,
which uses the IO-APIC to send periodic broadcast NMIs to all CPUs in the
system. The driver detects every case where a CPU has a soft or hard
lockup, and generates an artificial oops:

  LOCKUP on CPU1, forcing oops
  Unable to handle kernel NULL pointer dereference at virtual address 00000000
  current->tss.cr3 = 016ee000, %cr3 = 016ee000
  *pde = 00000000
  Oops: 0002
  CPU:    1
  EIP:    0010:[<c010b10c>]
  EFLAGS: 00013082
  [...]

Because these broadcast interrupts are NMIs, all lockups are detected,
no matter whether the CPU has its IRQs disabled or not. The logic for deciding
whether a CPU is locked up is quite straightforward as well: the local APIC
timer IRQs have to be serviced periodically. When they are for some reason
delayed for more than 500 milliseconds, the NMI watchdog generates an oops.

The 'watchdog source IRQ' defaults to IRQ0, but so that the driver can also
be used on boards with broken IO-APICs, I've made it configurable. The driver
detects these buggy cases correctly and turns the feature off if necessary.
IRQ1 is the next most natural choice. [With IRQ1 as the source, pound the
keyboard for a while when there is a lockup, to generate enough IRQs.]

Comments, suggestions and reports are welcome,

-- mingo


--- 2.1.85/linux/drivers/char/Config.in	Fri Feb  6 02:28:17 1998
+++ linux/drivers/char/Config.in	Sun Feb 15 11:01:29 1998
@@ -103,6 +103,12 @@
   tristate '   Software Watchdog' CONFIG_SOFT_WATCHDOG
   tristate '   Berkshire Products PC Watchdog' CONFIG_PCWATCHDOG
   tristate '   Acquire SBC Watchdog Timer' CONFIG_ACQUIRE_WDT
+  if [ "$SMP" = "1" ]; then
+     bool  '   SMP-IOAPIC NMI Software Watchdog' CONFIG_NMI_WATCHDOG
+     if [ "$CONFIG_NMI_WATCHDOG" = "y" ]; then
+        int '     watchdog source IRQ' CONFIG_NMI_WATCHDOG_IRQ 0
+     fi
+  fi
 fi
 bool 'Enhanced Real Time Clock Support' CONFIG_RTC
 if [ "$CONFIG_ALPHA_BOOK1" = "y" ]; then
--- 2.1.85/linux/arch/i386/kernel/traps.c	Sat Jan  3 09:44:18 1998
+++ linux/arch/i386/kernel/traps.c	Sun Feb 15 11:24:22 1998
@@ -22,6 +22,7 @@
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -238,12 +239,15 @@
 	unlock_kernel();
 }
 
+#ifndef CONFIG_NMI_WATCHDOG
 static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
 	printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
 	printk("You probably have a hardware problem with your RAM chips\n");
-}	
+}
+#endif	
 
+#ifndef CONFIG_NMI_WATCHDOG
 static void io_check_error(unsigned char reason, struct pt_regs * regs)
 {
 	unsigned long i;
@@ -259,14 +263,18 @@
 	reason &= ~8;
 	outb(reason, 0x61);
 }
+#endif
 
+#ifndef CONFIG_NMI_WATCHDOG
 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
 {
 	printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
 	printk("Dazed and confused, but trying to continue\n");
 	printk("Do you have a strange power saving mode enabled?\n");
 }
+#endif
 
+#ifndef CONFIG_NMI_WATCHDOG
 asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
 {
 	unsigned char reason = inb(0x61);
@@ -280,6 +288,59 @@
 	if (!(reason & 0xc0))
 		unknown_nmi_error(reason, regs);
 }
+#else
+
+/*
+ * FIXME: we assume here that the NMI came from the IO-APIC. It's a quite safe
+ * assumption in most cases, but if anyone knows a way to distinguish between
+ * NMI reasons, please speak up ... [i doubt that the IO-APIC does IO port 0x61
+ * correctly]
+ */
+
+extern atomic_t apic_timer_irqs [NR_CPUS];
+extern spinlock_t console_lock;
+
+asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
+{
+	/*
+	 * the best way to detect wether a CPU has a 'hard lockup' problem
+	 * is to check it's local APIC timer IRQ counts. If they are not
+	 * changing then that CPU has some problem.
+	 *
+	 * as these watchdog NMI IRQs are broadcasted to every CPU, here
+	 * we only have to check the current processor.
+	 *
+	 * since NMIs dont listen to _any_ locks, we have to be extremely
+	 * careful not to rely on unsafe variables. The printk might lock
+	 * up though, so we have to break up console_lock first ...
+	 * [when there will be more tty-related locks, break them up
+	 *  here too!]
+	 */
+
+	static atomic_t last_irq_sums [NR_CPUS] = { ATOMIC_INIT(0), };
+	static atomic_t alert_counter [NR_CPUS] = { ATOMIC_INIT(0), };
+
+	int sum, cpu = hard_smp_processor_id();
+
+	sum = atomic_read(apic_timer_irqs+cpu);
+
+	if (atomic_read(last_irq_sums+cpu) == sum) {
+		/*
+		 * Ayiee, looks like this CPU is stuck ...
+		 * wait a few IRQs (half a second) before doing the oops ...
+		 */
+		atomic_inc(alert_counter+cpu);
+		if (atomic_read(alert_counter+cpu) == HZ/2) {
+			spin_unlock(&console_lock);
+			printk("NMI Watchdog detected LOCKUP on CPU%d, forcing oops\n", cpu);
+			*(int *)0=0;
+		}
+	} else {
+		atomic_set(last_irq_sums+cpu,sum);
+		atomic_set(alert_counter+cpu,0);
+	}
+}
+#endif
 
 asmlinkage void do_debug(struct pt_regs * regs, long error_code)
 {
--- 2.1.85/linux/arch/i386/kernel/irq.c	Tue Feb 10 08:43:02 1998
+++ linux/arch/i386/kernel/irq.c	Fri Feb 13 09:06:03 1998
@@ -719,6 +719,14 @@
 	irq_handles[irq]->enable(irq);
 }
 
+void make_8259A_irq (unsigned int irq)
+{
+	io_apic_irqs &= ~(1<<irq);
+	irq_handles[irq] = &i8259A_irq_type;
+	disable_irq(irq);
+	enable_irq(irq);
+}
+
 /*
  * Careful! The 8259A is a fragile beast, it pretty
  * much _has_ to be done exactly like this (mask it
--- 2.1.85/linux/arch/i386/kernel/smp.c	Tue Feb 10 08:43:02 1998
+++ linux/arch/i386/kernel/smp.c	Sun Feb 15 11:22:44 1998
@@ -148,6 +148,8 @@
 int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, };
 extern int mp_irq_entries;
 extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { -1, };
+int mp_current_pci_id = 0;
 
 /* #define SMP_DEBUG */
 
@@ -336,9 +338,13 @@
 					mp_bus_id_to_type[m->mpc_busid] =
 						MP_BUS_ISA;
 				else
-				if (strncmp(m->mpc_bustype,"PCI",3) == 0)
+				if (strncmp(m->mpc_bustype,"PCI",3) == 0) {
 					mp_bus_id_to_type[m->mpc_busid] =
 						MP_BUS_PCI;
+					mp_bus_id_to_pci_bus[m->mpc_busid] =
+						mp_current_pci_id;
+					mp_current_pci_id++;
+				}
 				mpt+=sizeof(*m);
 				count+=sizeof(*m);
 				break;
@@ -1404,6 +1410,9 @@
  * [ if a single-CPU system runs an SMP kernel then we call the local
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  */
+#ifdef CONFIG_NMI_WATCHDOG
+atomic_t apic_timer_irqs [NR_CPUS] = { ATOMIC_INIT(0), };
+#endif
 void smp_apic_timer_interrupt(struct pt_regs * regs)
 {
 	/*
@@ -1412,7 +1421,17 @@
 	 * want to be able to accept NMI tlb invalidates
 	 * during this time.
 	 */
+
+#ifdef CONFIG_NMI_WATCHDOG
+	int cpu = hard_smp_processor_id();
+	/*
+	 * the only thing that can lock an NMI is an unACK-ed APIC ...
+	 */
+	atomic_inc(apic_timer_irqs+cpu);
+#endif
+
 	ack_APIC_irq ();
+
 
 	smp_local_timer_interrupt(regs);
 }
--- 2.1.85/linux/arch/i386/kernel/irq.h	Tue Feb 10 08:43:02 1998
+++ linux/arch/i386/kernel/irq.h	Fri Feb 13 09:06:00 1998
@@ -18,6 +18,7 @@
 void setup_IO_APIC (void);
 void init_IO_APIC_traps(void);
 int IO_APIC_get_PCI_irq_vector (int bus, int slot, int fn);
+void make_8259A_irq (unsigned int irq);
 
 #ifdef __SMP__
  extern unsigned int io_apic_irqs;
@@ -34,6 +35,7 @@
 	MP_BUS_PCI
 };
 extern int mp_bus_id_to_type [MAX_MP_BUSSES];
+extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES];
 extern char ioapic_OEM_ID [16];
 extern char ioapic_Product_ID [16];
 
--- 2.1.85/linux/arch/i386/kernel/io_apic.c	Tue Feb 10 08:43:01 1998
+++ linux/arch/i386/kernel/io_apic.c	Sun Feb 15 11:12:02 1998
@@ -35,6 +35,9 @@
  * spontaneously, GCC should not cache it
  */
 volatile unsigned int * io_apic_reg = NULL;
+#ifdef CONFIG_NMI_WATCHDOG
+int nmi_pin = -1;
+#endif
 
 /*
  * The structure of the IO-APIC:
@@ -62,6 +65,7 @@
 	__u32	vector		:  8,
 		delivery_mode	:  3,	/* 000: FIXED
 					 * 001: lowest prio
+					 * 100: NMI
 					 * 111: ExtInt
 					 */
 		dest_mode	:  1,	/* 0: physical, 1: logical */
@@ -191,10 +195,12 @@
 {
 	int i;
 
-	for (i=mp_irq_entries-1; i>=0; i--) {
-		if (mp_irqs[i].mpc_dstirq == pin)
+	for (i=0; i<mp_irq_entries; i++)
+		if ( (mp_irqs[i].mpc_irqtype == 0x00) &&
+			(mp_irqs[i].mpc_dstirq == pin))
+
 			return i;
-	}
+
 	return -1;
 }
 
@@ -268,6 +274,21 @@
 
 		if (!IO_APIC_IRQ(irq))
 			continue;
+		if (mp_irqs[i].mpc_irqtype)
+			continue;
+
+#ifdef CONFIG_NMI_WATCHDOG
+		if (irq==CONFIG_NMI_WATCHDOG_IRQ) {
+			entry.delivery_mode = 4; /* broadcast NMI */
+			make_8259A_irq(irq);
+			/*
+			 * Remember which register has the NMI IRQ entry,
+			 * so we can turn it off in case there is some
+			 * incompatibility
+			 */
+			nmi_pin = i;
+		}
+#endif
 
 		entry.vector = IO_APIC_GATE_OFFSET + (irq<<3);
 
@@ -397,11 +418,11 @@
 	for (i=0; i<mp_irq_entries; i++) {
 		int lbus = mp_irqs[i].mpc_srcbus;
 
-		if (IO_APIC_IRQ(i) &&
+		if (IO_APIC_IRQ(mp_irqs[i].mpc_dstirq) &&
 		    (mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
 		    !mp_irqs[i].mpc_irqtype &&
-		    (bus == mp_irqs[i].mpc_srcbus) &&
-		    (slot == (mp_irqs[i].mpc_srcbusirq >> 2)) &&
+		    (bus == mp_bus_id_to_pci_bus[mp_irqs[i].mpc_srcbus]) &&
+		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f)) &&
 		    (pci_pin == (mp_irqs[i].mpc_srcbusirq & 3)))
 
 			return mp_irqs[i].mpc_dstirq;
@@ -409,6 +430,30 @@
 	return -1;
 }
 
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ *	- timer IRQ defaults to IO-APIC IRQ
+ *	- if this function detects that timer IRQs are defunct, then we fall
+ *	  back to ISA timer IRQs
+ */
+static int timer_irq_works (void)
+{
+	unsigned int t1=jiffies;
+	unsigned long flags;
+
+	save_flags(flags);
+	sti();
+
+	udelay(100*1000);
+
+	if (jiffies-t1>1)
+		return 1;
+
+	return 0;
+}
+
 void print_IO_APIC (void)
 {
 	int i;
@@ -579,7 +624,7 @@
 		pirqs_enabled)
 	{
 		printk("ENABLING IO-APIC IRQs\n");
-		io_apic_irqs = ~((1<<0)|(1<<2)|(1<<13));
+		io_apic_irqs = ~((1<<2)|(1<<13));
 	} else {
 		if (ioapic_blacklisted())
 			printk(" blacklisted board, DISABLING IO-APIC IRQs\n");
@@ -592,6 +637,26 @@
 
 	init_IO_APIC_traps();
 	setup_IO_APIC_irqs ();
+
+#ifdef CONFIG_NMI_WATCHDOG
+	if (nmi_pin == -1)
+		printk(".. NMI watchdog has invalid source IRQ.\n");
+	else
+		printk("NMI Watchdog activated on source IRQ %d\n",
+						CONFIG_NMI_WATCHDOG_IRQ);
+#endif
+
+	if (!timer_irq_works ()) {
+		make_8259A_irq(0);
+		if (!timer_irq_works ())
+			panic("IO-APIC + timer doesnt work!");
+		printk("..MP-BIOS bug: i8254 timer not connected to IO-APIC\n");
+		printk("..falling back to 8259A-based timer interrupt\n");
+#ifdef CONFIG_NMI_WATCHDOG
+		if ((nmi_pin != -1) && (CONFIG_NMI_WATCHDOG_IRQ == 0))
+			printk(".. NMI Watchdog disabled as source IRQ is timer!\n");
+#endif
+	}
 
 	printk("nr of MP irq sources: %d.\n", mp_irq_entries);
 	printk("nr of IOAPIC registers: %d.\n", nr_ioapic_registers);