[PATCH] x86-64: Allow to run a program when a machine check event is detected

author Andi Kleen <ak@suse.de>

Tue, 13 Feb 2007 12:26:23 +0000 (13:26 +0100)

committer Andi Kleen <andi@basil.nowhere.org>

Tue, 13 Feb 2007 12:26:23 +0000 (13:26 +0100)
author Andi Kleen <ak@suse.de>
Tue, 13 Feb 2007 12:26:23 +0000 (13:26 +0100)
committer Andi Kleen <andi@basil.nowhere.org>
Tue, 13 Feb 2007 12:26:23 +0000 (13:26 +0100)
diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck

new file mode 100644 (file)

index 0000000..068a6d9
--- /dev/null
+++ b/Documentation/x86_64/machinecheck
@@ -0,0 +1,70 @@
+
+Configurable sysfs parameters for the x86-64 machine check code.
+
+Machine checks report internal hardware error conditions detected
+by the CPU. Uncorrected errors typically cause a machine check
+(often with panic), corrected ones cause a machine check log entry.
+
+Machine checks are organized in banks (normally associated with
+a hardware subsystem) and subevents in a bank. The exact meaning
+of the banks and subevents is CPU specific.
+
+mcelog knows how to decode them.
+
+When you see the "Machine check errors logged" message in the system
+log then mcelog should run to collect and decode machine check entries
+from /dev/mcelog. Normally mcelog should be run regularly from a cronjob.
+
+Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN
+(N = CPU number)
+
+The directory contains some configurable entries:
+
+Entries:
+
+bankNctl
+(N bank number)
+       64bit Hex bitmask enabling/disabling specific subevents for bank N
+       When a bit in the bitmask is zero then the respective
+       subevent will not be reported.
+       By default all events are enabled.
+       Note that the BIOS maintains another mask to disable specific events
+       per bank.  This is not visible here.
+
+The following entries appear for each CPU, but they are truly shared
+between all CPUs.
+
+check_interval
+       How often to poll for corrected machine check errors, in seconds
+       (Note output is hexadecimal). Default 5 minutes.
+
+tolerant
+       Tolerance level. When a machine check exception occurs for a non
+       corrected machine check the kernel can take different actions.
+       Since machine check exceptions can happen any time it is sometimes
+       risky for the kernel to kill a process because it defies
+       normal kernel locking rules. The tolerance level configures
+       how hard the kernel tries to recover even at some risk of deadlock.
+
+       0: always panic,
+       1: panic if deadlock possible,
+       2: try to avoid panic,
+       3: never panic or exit (for testing only)
+
+       Default: 1
+
+       Note this only makes a difference if the CPU allows recovery
+       from a machine check exception. Current x86 CPUs generally do not.
+
+trigger
+       Program to run when a machine check event is detected.
+       This is an alternative to running mcelog regularly from cron
+       and allows events to be detected faster.
+
+TBD document entries for AMD threshold interrupt configuration
+
+For more details about the x86 machine check architecture
+see the Intel and AMD architecture manuals from their developer websites.
+
+For more details about the architecture
+see http://one.firstfloor.org/~andi/mce.pdf
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c

index bdb54a2c9f186be6acbb4eed72edcf943de9e8f7..8011a8e1c7d41d8a5baf2876c87697b37eec39ca 100644 (file)
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -19,6 +19,7 @@
  #include <linux/cpu.h>
  #include <linux/percpu.h>
  #include <linux/ctype.h>
+#include <linux/kmod.h>
  #include <asm/processor.h> 
  #include <asm/msr.h>
  #include <asm/mce.h>
@@ -42,6 +43,10 @@ static unsigned long console_logged;
  static int notify_user;
  static int rip_msr;
  static int mce_bootlog = 1;
+static atomic_t mce_events;
+
+static char trigger[128];
+static char *trigger_argv[2] = { trigger, NULL };
  
  /*
   * Lockless MCE logging infrastructure.
@@ -57,6 +62,7 @@ struct mce_log mcelog = {
  void mce_log(struct mce *mce)
  {
         unsigned next, entry;
+       atomic_inc(&mce_events);
         mce->finished = 0;
         wmb();
         for (;;) {
@@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
         }
  }
  
+/*
+ * Start the user-configured trigger program if machine check events have
+ * been logged since the last time it was started.
+ *
+ * mce_events counts every call to mce_log(); mce_logged remembers the count
+ * at which the trigger was last fired.  The helper is started with wait = -1,
+ * i.e. without waiting for it.
+ */
+static void do_mce_trigger(void)
+{
+       static atomic_t mce_logged;
+       int seen = atomic_read(&mce_events);
+
+       /* Nothing new has been logged since the last run. */
+       if (seen == atomic_read(&mce_logged))
+               return;
+       /* No trigger program has been configured. */
+       if (!trigger[0])
+               return;
+
+       /* Small race window, but should be harmless.  */
+       atomic_set(&mce_logged, seen);
+       call_usermodehelper(trigger, trigger_argv, NULL, -1);
+}
+
  /* 
   * The actual machine check handler
   */
@@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
         }
  
         /* Never do anything final in the polling timer */
-       if (!regs)
+       if (!regs) {
+               /* Normal interrupt context here. Call trigger for any new
+                  events. */
+               do_mce_trigger();
                 goto out;
+       }
  
         /* If we didn't find an uncorrectable error, pick
            the last one (shouldn't happen, just being safe). */
@@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
         }                                                                          \
         static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
  
+/* TBD should generate these dynamically based on number of available banks */
  ACCESSOR(bank0ctl,bank[0],mce_restart())
  ACCESSOR(bank1ctl,bank[1],mce_restart())
  ACCESSOR(bank2ctl,bank[2],mce_restart())
  ACCESSOR(bank3ctl,bank[3],mce_restart())
  ACCESSOR(bank4ctl,bank[4],mce_restart())
  ACCESSOR(bank5ctl,bank[5],mce_restart())
-static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
-       &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-       &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
+
+/*
+ * sysfs show handler for "trigger": copy the configured trigger path into
+ * buf, followed by a newline.  Returns the number of bytes placed in buf,
+ * not counting the terminating NUL.
+ */
+static ssize_t show_trigger(struct sys_device *s, char *buf)
+{
+       size_t n = strlen(trigger);
+
+       memcpy(buf, trigger, n);
+       buf[n] = '\n';
+       buf[n + 1] = 0;
+       return n + 1;
+}
+
+/*
+ * sysfs store handler for "trigger": remember the path of the program to
+ * run when a machine check event is detected.
+ *
+ * The input is truncated to fit the trigger buffer and cut at the first
+ * newline, so "echo /path > trigger" stores just the path.
+ * Returns the length of the stored string, counting the newline if one
+ * was present.
+ */
+static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+{
+       char *p;
+       int len;
+
+       strncpy(trigger, buf, sizeof(trigger));
+       /* strncpy() leaves the buffer unterminated when buf is too long. */
+       trigger[sizeof(trigger)-1] = 0;
+       len = strlen(trigger);
+       /* strchr() returns NULL when there is no newline, so test the
+          pointer itself rather than the character it points to. */
+       p = strchr(trigger, '\n');
+       if (p)
+               *p = 0;
+       return len;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
  ACCESSOR(tolerant,tolerant,)
  ACCESSOR(check_interval,check_interval,mce_restart())
+static struct sysdev_attribute *mce_attributes[] = { /* sysfs files created for each CPU's machinecheck device */
+       &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
+       &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
+       &attr_tolerant, &attr_check_interval, &attr_trigger,
+       NULL /* terminator: the create/remove loops stop at this entry */
+};
  
  /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
  static __cpuinit int mce_create_device(unsigned int cpu)
@@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu)
         err = sysdev_register(&per_cpu(device_mce,cpu));
  
         if (!err) {
-               for (i = 0; i < banks; i++)
+               for (i = 0; mce_attributes[i]; i++)
                         sysdev_create_file(&per_cpu(device_mce,cpu),
-                               bank_attributes[i]);
-               sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
-               sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+                               mce_attributes[i]);
         }
         return err;
  }
@@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu)
  {
         int i;
  
-       for (i = 0; i < banks; i++)
+       for (i = 0; mce_attributes[i]; i++)
                 sysdev_remove_file(&per_cpu(device_mce,cpu),
-                       bank_attributes[i]);
-       sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
-       sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+                       mce_attributes[i]);
         sysdev_unregister(&per_cpu(device_mce,cpu));
         memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
  }
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c

index cd8dbe57b33a18c75b63263c2a35fce2a559d6fc..d0bd5d66e103d8d491dc728cb06a82dec86425ba 100644 (file)
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -220,6 +220,10 @@ asmlinkage void mce_threshold_interrupt(void)
                              (high & MASK_LOCKED_HI))
                                 continue;
  
+                       /* Log the machine check that caused the threshold
+                          event. */
+                       do_machine_check(NULL, 0);
+
                         if (high & MASK_OVERFLOW_HI) {
                                 rdmsrl(address, m.misc);
                                 rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h

index 5a11146d6d9cc753f6eb82face2fb15aba39c9c6..177e92b4019beba32d3eef9b645cba0c66c8cd41 100644 (file)
--- a/include/asm-x86_64/mce.h
+++ b/include/asm-x86_64/mce.h
@@ -103,6 +103,8 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
  
  extern atomic_t mce_entry;
  
+extern void do_machine_check(struct pt_regs *, long);
+
  #endif
  
  #endif
diff --git a/kernel/kmod.c b/kernel/kmod.c

index 3a7379aa31ca6b0a92bec377a23e5c9debed72db..796276141e51902bd466b1f5b81b5220c109ca4a 100644 (file)
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data)
                         sub_info->retval = ret;
         }
  
-       complete(sub_info->complete);
+       if (sub_info->wait < 0)
+               kfree(sub_info);
+       else
+               complete(sub_info->complete);
         return 0;
  }
  
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work)
                 pid = kernel_thread(____call_usermodehelper, sub_info,
                                     CLONE_VFORK | SIGCHLD);
  
+       if (wait < 0)
+               return;
+
         if (pid < 0) {
                 sub_info->retval = pid;
                 complete(sub_info->complete);
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work)
   * @envp: null-terminated environment list
   * @session_keyring: session keyring for process (NULL for an empty keyring)
   * @wait: wait for the application to finish and return status.
+ *        when -1 don't wait at all, but you get no useful error back when
+ *        the program couldn't be exec'ed. This makes it safe to call
+ *        from interrupt context.
   *
   * Runs a user-space application.  The application is started
   * asynchronously if wait is not set, and runs as a child of keventd.
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
                              struct key *session_keyring, int wait)
  {
         DECLARE_COMPLETION_ONSTACK(done);
-       struct subprocess_info sub_info = {
-               .work           = __WORK_INITIALIZER(sub_info.work,
-                                                    __call_usermodehelper),
-               .complete       = &done,
-               .path           = path,
-               .argv           = argv,
-               .envp           = envp,
-               .ring           = session_keyring,
-               .wait           = wait,
-               .retval         = 0,
-       };
+       struct subprocess_info *sub_info;
+       int retval;
  
         if (!khelper_wq)
                 return -EBUSY;
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
         if (path[0] == '\0')
                 return 0;
  
-       queue_work(khelper_wq, &sub_info.work);
+       sub_info = kzalloc(sizeof(struct subprocess_info),  GFP_ATOMIC);
+       if (!sub_info)
+               return -ENOMEM;
+
+       INIT_WORK(&sub_info->work, __call_usermodehelper);
+       sub_info->complete = &done;
+       sub_info->path = path;
+       sub_info->argv = argv;
+       sub_info->envp = envp;
+       sub_info->ring = session_keyring;
+       sub_info->wait = wait;
+
+       queue_work(khelper_wq, &sub_info->work);
+       if (wait < 0) /* task has freed sub_info */
+               return 0;
         wait_for_completion(&done);
-       return sub_info.retval;
+       retval = sub_info->retval;
+       kfree(sub_info);
+       return retval;
  }
  EXPORT_SYMBOL(call_usermodehelper_keys);
author	Andi Kleen <ak@suse.de>
	Tue, 13 Feb 2007 12:26:23 +0000 (13:26 +0100)
committer	Andi Kleen <andi@basil.nowhere.org>
	Tue, 13 Feb 2007 12:26:23 +0000 (13:26 +0100)
Documentation/x86_64/machinecheck	[new file with mode: 0644]	patch \| blob
arch/x86_64/kernel/mce.c		patch \| blob \| history
arch/x86_64/kernel/mce_amd.c		patch \| blob \| history
include/asm-x86_64/mce.h		patch \| blob \| history
kernel/kmod.c		patch \| blob \| history