Index: linux-2.6.22-rc5/arch/i386/Kconfig =================================================================== --- linux-2.6.22-rc5.orig/arch/i386/Kconfig 2007-06-17 08:52:04.000000000 +0200 +++ linux-2.6.22-rc5/arch/i386/Kconfig 2007-06-17 08:52:08.000000000 +0200 @@ -18,6 +18,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config CLOCKSOURCE_WATCHDOG bool default y @@ -1053,6 +1057,8 @@ endif # APM source "arch/i386/kernel/cpu/cpufreq/Kconfig" +source "drivers/cpuidle/Kconfig" + endmenu menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" Index: linux-2.6.22-rc5/arch/x86_64/Kconfig =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/Kconfig 2007-06-17 08:52:04.000000000 +0200 +++ linux-2.6.22-rc5/arch/x86_64/Kconfig 2007-06-17 08:52:10.000000000 +0200 @@ -28,10 +28,22 @@ config GENERIC_TIME bool default y +config GENERIC_CLOCKEVENTS_BROADCAST + bool + default y + +config GENERIC_CLOCKEVENTS + bool + default y + config GENERIC_TIME_VSYSCALL bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config ZONE_DMA32 bool default y @@ -126,6 +138,8 @@ source "init/Kconfig" menu "Processor type and features" +source "kernel/time/Kconfig" + choice prompt "Subarchitecture Type" default X86_PC @@ -698,6 +712,8 @@ source "drivers/acpi/Kconfig" source "arch/x86_64/kernel/cpufreq/Kconfig" +source "drivers/cpuidle/Kconfig" + endmenu menu "Bus options (PCI etc.)" Index: linux-2.6.22-rc5/drivers/Makefile =================================================================== --- linux-2.6.22-rc5.orig/drivers/Makefile 2007-06-17 08:52:04.000000000 +0200 +++ linux-2.6.22-rc5/drivers/Makefile 2007-06-17 08:52:04.000000000 +0200 @@ -70,6 +70,7 @@ obj-$(CONFIG_EDAC) += edac/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-$(CONFIG_MMC) += mmc/ obj-$(CONFIG_NEW_LEDS) += leds/ obj-$(CONFIG_INFINIBAND) += infiniband/ Index: linux-2.6.22-rc5/drivers/cpuidle/Kconfig =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/Kconfig 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,39 @@ +menu "CPU idle PM support" + +config CPU_IDLE + bool "CPU idle PM support" + help + CPU idle is a generic framework for supporting software-controlled + idle processor power management. It includes modular cross-platform + governors that can be swapped during runtime. + + If you're using a mobile platform that supports CPU idle PM (e.g. + an ACPI-capable notebook), you should say Y here. + +if CPU_IDLE + +comment "Governors" + +config CPU_IDLE_GOV_LADDER + tristate "'ladder' governor" + depends on CPU_IDLE + default y + help + This cpuidle governor promotes and demotes through the supported idle + states using residency time and bus master activity as metrics. This + algorithm was originally introduced in the old ACPI processor driver. + +config CPU_IDLE_GOV_MENU + tristate "'menu' governor" + depends on CPU_IDLE && NO_HZ + default y + help + This cpuidle governor evaluates all available states and chooses the + deepest state that meets all of the following constraints: BM activity, + expected time until next timer interrupt, and last break event time + delta. It is designed to minimize power consumption. Currently + dynticks is required. + +endif # CPU_IDLE + +endmenu Index: linux-2.6.22-rc5/drivers/cpuidle/Makefile =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/Makefile 2007-06-17 08:52:04.000000000 +0200 @@ -0,0 +1,5 @@ +# +# Makefile for cpuidle. +# + +obj-y += cpuidle.o driver.o governor.o sysfs.o governors/ Index: linux-2.6.22-rc5/drivers/cpuidle/cpuidle.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/cpuidle.c 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,306 @@ +/* + * cpuidle.c - core cpuidle infrastructure + * + * (C) 2006-2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cpuidle.h" + +DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices); +EXPORT_PER_CPU_SYMBOL_GPL(cpuidle_devices); + +DEFINE_MUTEX(cpuidle_lock); +LIST_HEAD(cpuidle_detected_devices); +static void (*pm_idle_old)(void); + +/** + * cpuidle_idle_call - the main idle loop + * + * NOTE: no locks or semaphores should be used here + */ +static void cpuidle_idle_call(void) +{ + struct cpuidle_device *dev = __get_cpu_var(cpuidle_devices); + struct cpuidle_state *target_state; + int next_state; + + /* check if the device is ready */ + if (!dev || dev->status != CPUIDLE_STATUS_DOIDLE) { + if (pm_idle_old) + pm_idle_old(); + else + local_irq_enable(); + return; + } + + /* ask the governor for the next state */ + next_state = cpuidle_curr_governor->select(dev); + if (need_resched()) + return; + target_state = &dev->states[next_state]; + + /* enter the state and update stats */ + dev->last_residency = target_state->enter(dev, target_state); + dev->last_state = target_state; + target_state->time += dev->last_residency; + target_state->usage++; + + /* give the governor an opportunity to reflect on the outcome */ + if (cpuidle_curr_governor->reflect) + cpuidle_curr_governor->reflect(dev); +} + +/** + * cpuidle_install_idle_handler - installs the cpuidle idle loop handler + */ +void cpuidle_install_idle_handler(void) +{ + if (pm_idle != cpuidle_idle_call) { + /* Make sure all changes finished before we switch to new idle */ + smp_wmb(); + pm_idle = cpuidle_idle_call; + } +} + +/** + * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler + */ +void cpuidle_uninstall_idle_handler(void) +{ + if (pm_idle != pm_idle_old) { + pm_idle = pm_idle_old; + cpu_idle_wait(); + } +} + +/** + * cpuidle_rescan_device - prepares for a new state configuration + * @dev: the target device + * + * Must be called with cpuidle_lock aquired. + */ +void cpuidle_rescan_device(struct cpuidle_device *dev) +{ + int i; + + if (cpuidle_curr_governor->scan) + cpuidle_curr_governor->scan(dev); + + for (i = 0; i < dev->state_count; i++) { + dev->states[i].usage = 0; + dev->states[i].time = 0; + } +} + +/** + * cpuidle_add_device - attaches the driver to a CPU instance + * @sys_dev: the system device (driver model CPU representation) + */ +static int cpuidle_add_device(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + struct cpuidle_device *dev; + + dev = per_cpu(cpuidle_devices, cpu); + + mutex_lock(&cpuidle_lock); + if (cpu_is_offline(cpu)) { + mutex_unlock(&cpuidle_lock); + return 0; + } + + if (!dev) { + dev = kzalloc(sizeof(struct cpuidle_device), GFP_KERNEL); + if (!dev) { + mutex_unlock(&cpuidle_lock); + return -ENOMEM; + } + init_completion(&dev->kobj_unregister); + per_cpu(cpuidle_devices, cpu) = dev; + } + dev->cpu = cpu; + + if (dev->status & CPUIDLE_STATUS_DETECTED) { + mutex_unlock(&cpuidle_lock); + return 0; + } + + cpuidle_add_sysfs(sys_dev); + + if (cpuidle_curr_driver) { + if (cpuidle_attach_driver(dev)) + goto err_ret; + } + + if (cpuidle_curr_governor) { + if (cpuidle_attach_governor(dev)) { + cpuidle_detach_driver(dev); + goto err_ret; + } + } + + if (cpuidle_device_can_idle(dev)) + cpuidle_install_idle_handler(); + + list_add(&dev->device_list, &cpuidle_detected_devices); + dev->status |= CPUIDLE_STATUS_DETECTED; + +err_ret: + mutex_unlock(&cpuidle_lock); + + return 0; +} + +/** + * __cpuidle_remove_device - detaches the driver from a CPU instance + * @sys_dev: the system device (driver model CPU representation) + * + * Must be called with cpuidle_lock aquired. + */ +static int __cpuidle_remove_device(struct sys_device *sys_dev) +{ + struct cpuidle_device *dev; + + dev = per_cpu(cpuidle_devices, sys_dev->id); + + if (!(dev->status & CPUIDLE_STATUS_DETECTED)) { + return 0; + } + dev->status &= ~CPUIDLE_STATUS_DETECTED; + /* NOTE: we don't wait because the cpu is already offline */ + if (cpuidle_curr_governor) + cpuidle_detach_governor(dev); + if (cpuidle_curr_driver) + cpuidle_detach_driver(dev); + cpuidle_remove_sysfs(sys_dev); + list_del(&dev->device_list); + wait_for_completion(&dev->kobj_unregister); + per_cpu(cpuidle_devices, sys_dev->id) = NULL; + kfree(dev); + + return 0; +} + +/** + * cpuidle_remove_device - detaches the driver from a CPU instance + * @sys_dev: the system device (driver model CPU representation) + */ +static int cpuidle_remove_device(struct sys_device *sys_dev) +{ + int ret; + mutex_lock(&cpuidle_lock); + ret = __cpuidle_remove_device(sys_dev); + mutex_unlock(&cpuidle_lock); + + return ret; +} + +static struct sysdev_driver cpuidle_sysdev_driver = { + .add = cpuidle_add_device, + .remove = cpuidle_remove_device, +}; + +static int cpuidle_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev((unsigned long)hcpu); + + switch (action) { + case CPU_ONLINE: + cpuidle_add_device(sys_dev); + break; + case CPU_DOWN_PREPARE: + mutex_lock(&cpuidle_lock); + break; + case CPU_DEAD: + __cpuidle_remove_device(sys_dev); + mutex_unlock(&cpuidle_lock); + break; + case CPU_DOWN_FAILED: + mutex_unlock(&cpuidle_lock); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpuidle_cpu_notifier = +{ + .notifier_call = cpuidle_cpu_callback, +}; + +#ifdef CONFIG_SMP + +static void smp_callback(void *v) +{ + /* we already woke the CPU up, nothing more to do */ +} + +/* + * This function gets called when a part of the kernel has a new latency + * requirement. This means we need to get all processors out of their C-state, + * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that + * wakes them all right up. + */ +static int cpuidle_latency_notify(struct notifier_block *b, + unsigned long l, void *v) +{ + smp_call_function(smp_callback, NULL, 0, 1); + return NOTIFY_OK; +} + +static struct notifier_block cpuidle_latency_notifier = { + .notifier_call = cpuidle_latency_notify, +}; + +#define latency_notifier_init(x) do { register_latency_notifier(x); } while (0) + +#else /* CONFIG_SMP */ + +#define latency_notifier_init(x) do { } while (0) + +#endif /* CONFIG_SMP */ + +/** + * cpuidle_init - core initializer + */ +static int __init cpuidle_init(void) +{ + int ret; + + pm_idle_old = pm_idle; + + ret = cpuidle_add_class_sysfs(&cpu_sysdev_class); + if (ret) + return ret; + + register_hotcpu_notifier(&cpuidle_cpu_notifier); + + ret = sysdev_driver_register(&cpu_sysdev_class, &cpuidle_sysdev_driver); + + if (ret) { + cpuidle_remove_class_sysfs(&cpu_sysdev_class); + printk(KERN_ERR "cpuidle: failed to initialize\n"); + return ret; + } + + latency_notifier_init(&cpuidle_latency_notifier); + + return 0; +} + +core_initcall(cpuidle_init); Index: linux-2.6.22-rc5/drivers/cpuidle/cpuidle.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/cpuidle.h 2007-06-17 08:52:04.000000000 +0200 @@ -0,0 +1,50 @@ +/* + * cpuidle.h - The internal header file + */ + +#ifndef __DRIVER_CPUIDLE_H +#define __DRIVER_CPUIDLE_H + +#include + +/* For internal use only */ +extern struct cpuidle_governor *cpuidle_curr_governor; +extern struct cpuidle_driver *cpuidle_curr_driver; +extern struct list_head cpuidle_drivers; +extern struct list_head cpuidle_governors; +extern struct list_head cpuidle_detected_devices; +extern struct mutex cpuidle_lock; + +/* idle loop */ +extern void cpuidle_install_idle_handler(void); +extern void cpuidle_uninstall_idle_handler(void); +extern void cpuidle_rescan_device(struct cpuidle_device *dev); + +/* drivers */ +extern int cpuidle_attach_driver(struct cpuidle_device *dev); +extern void cpuidle_detach_driver(struct cpuidle_device *dev); +extern int cpuidle_switch_driver(struct cpuidle_driver *drv); + +/* governors */ +extern int cpuidle_attach_governor(struct cpuidle_device *dev); +extern void cpuidle_detach_governor(struct cpuidle_device *dev); +extern int cpuidle_switch_governor(struct cpuidle_governor *gov); + +/* sysfs */ +extern int cpuidle_add_class_sysfs(struct sysdev_class *cls); +extern void cpuidle_remove_class_sysfs(struct sysdev_class *cls); +extern int cpuidle_add_driver_sysfs(struct cpuidle_device *device); +extern void cpuidle_remove_driver_sysfs(struct cpuidle_device *device); +extern int cpuidle_add_sysfs(struct sys_device *sysdev); +extern void cpuidle_remove_sysfs(struct sys_device *sysdev); + +/** + * cpuidle_device_can_idle - determines if a CPU can utilize the idle loop + * @dev: the target CPU + */ +static inline int cpuidle_device_can_idle(struct cpuidle_device *dev) +{ + return (dev->status == CPUIDLE_STATUS_DOIDLE); +} + +#endif /* __DRIVER_CPUIDLE_H */ Index: linux-2.6.22-rc5/drivers/cpuidle/driver.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/driver.c 2007-06-17 08:52:05.000000000 +0200 @@ -0,0 +1,276 @@ +/* + * driver.c - driver support + * + * (C) 2006-2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. + */ + +#include +#include +#include + +#include "cpuidle.h" + +LIST_HEAD(cpuidle_drivers); +struct cpuidle_driver *cpuidle_curr_driver; + + +/** + * cpuidle_attach_driver - attaches a driver to a CPU + * @dev: the target CPU + * + * Must be called with cpuidle_lock aquired. + */ +int cpuidle_attach_driver(struct cpuidle_device *dev) +{ + int ret; + + if (dev->status & CPUIDLE_STATUS_DRIVER_ATTACHED) + return -EIO; + + if (!try_module_get(cpuidle_curr_driver->owner)) + return -EINVAL; + + ret = cpuidle_curr_driver->init(dev); + if (ret) { + module_put(cpuidle_curr_driver->owner); + printk(KERN_INFO "cpuidle: driver %s failed to attach to " + "cpu %d\n", cpuidle_curr_driver->name, dev->cpu); + } else { + if (dev->status & CPUIDLE_STATUS_GOVERNOR_ATTACHED) + cpuidle_rescan_device(dev); + smp_wmb(); + dev->status |= CPUIDLE_STATUS_DRIVER_ATTACHED; + cpuidle_add_driver_sysfs(dev); + } + + return ret; +} + +/** + * cpuidle_detach_govenor - detaches a driver from a CPU + * @dev: the target CPU + * + * Must be called with cpuidle_lock aquired. + */ +void cpuidle_detach_driver(struct cpuidle_device *dev) +{ + if (dev->status & CPUIDLE_STATUS_DRIVER_ATTACHED) { + cpuidle_remove_driver_sysfs(dev); + dev->status &= ~CPUIDLE_STATUS_DRIVER_ATTACHED; + if (cpuidle_curr_driver->exit) + cpuidle_curr_driver->exit(dev); + module_put(cpuidle_curr_driver->owner); + } +} + +/** + * __cpuidle_find_driver - finds a driver of the specified name + * @str: the name + * + * Must be called with cpuidle_lock aquired. + */ +static struct cpuidle_driver * __cpuidle_find_driver(const char *str) +{ + struct cpuidle_driver *drv; + + list_for_each_entry(drv, &cpuidle_drivers, driver_list) + if (!strnicmp(str, drv->name, CPUIDLE_NAME_LEN)) + return drv; + + return NULL; +} + +/** + * cpuidle_switch_driver - changes the driver + * @drv: the new target driver + * + * NOTE: "drv" can be NULL to specify disabled + * Must be called with cpuidle_lock aquired. + */ +int cpuidle_switch_driver(struct cpuidle_driver *drv) +{ + struct cpuidle_device *dev; + + if (drv == cpuidle_curr_driver) + return -EINVAL; + + cpuidle_uninstall_idle_handler(); + + if (cpuidle_curr_driver) + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_detach_driver(dev); + + cpuidle_curr_driver = drv; + + if (drv) { + int ret = 1; + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + if (cpuidle_attach_driver(dev) == 0) + ret = 0; + + /* If attach on all devices fail, switch to NULL driver */ + if (ret) + cpuidle_curr_driver = NULL; + + if (cpuidle_curr_driver && cpuidle_curr_governor) { + printk(KERN_INFO "cpuidle: using driver %s\n", + drv->name); + cpuidle_install_idle_handler(); + } + } + + return 0; +} + +/** + * cpuidle_register_driver - registers a driver + * @drv: the driver + */ +int cpuidle_register_driver(struct cpuidle_driver *drv) +{ + int ret = -EEXIST; + + if (!drv || !drv->init) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + if (__cpuidle_find_driver(drv->name) == NULL) { + ret = 0; + list_add_tail(&drv->driver_list, &cpuidle_drivers); + if (!cpuidle_curr_driver) + cpuidle_switch_driver(drv); + } + mutex_unlock(&cpuidle_lock); + + return ret; +} + +EXPORT_SYMBOL_GPL(cpuidle_register_driver); + +/** + * cpuidle_unregister_driver - unregisters a driver + * @drv: the driver + */ +void cpuidle_unregister_driver(struct cpuidle_driver *drv) +{ + if (!drv) + return; + + mutex_lock(&cpuidle_lock); + if (drv == cpuidle_curr_driver) + cpuidle_switch_driver(NULL); + list_del(&drv->driver_list); + mutex_unlock(&cpuidle_lock); +} + +EXPORT_SYMBOL_GPL(cpuidle_unregister_driver); + +static void __cpuidle_force_redetect(struct cpuidle_device *dev) +{ + cpuidle_remove_driver_sysfs(dev); + cpuidle_curr_driver->redetect(dev); + cpuidle_add_driver_sysfs(dev); +} + +/** + * cpuidle_force_redetect - redetects the idle states of a CPU + * + * @dev: the CPU to redetect + * @drv: the target driver + * + * Generally, the driver will call this when the supported states set has + * changed. (e.g. as the result of an ACPI transition to battery power) + */ +int cpuidle_force_redetect(struct cpuidle_device *dev, + struct cpuidle_driver *drv) +{ + int uninstalled = 0; + + mutex_lock(&cpuidle_lock); + + if (drv != cpuidle_curr_driver) { + mutex_unlock(&cpuidle_lock); + return 0; + } + + if (!(dev->status & CPUIDLE_STATUS_DRIVER_ATTACHED) || + !cpuidle_curr_driver->redetect) { + mutex_unlock(&cpuidle_lock); + return -EIO; + } + + if (cpuidle_device_can_idle(dev)) { + uninstalled = 1; + cpuidle_uninstall_idle_handler(); + } + + __cpuidle_force_redetect(dev); + + if (cpuidle_device_can_idle(dev)) { + cpuidle_rescan_device(dev); + cpuidle_install_idle_handler(); + } + + /* other devices are still ok */ + if (uninstalled) + cpuidle_install_idle_handler(); + + mutex_unlock(&cpuidle_lock); + + return 0; +} + +EXPORT_SYMBOL_GPL(cpuidle_force_redetect); + +/** + * cpuidle_force_redetect_devices - redetects the idle states of all CPUs + * + * @drv: the target driver + * + * Generally, the driver will call this when the supported states set has + * changed. (e.g. as the result of an ACPI transition to battery power) + */ +int cpuidle_force_redetect_devices(struct cpuidle_driver *drv) +{ + struct cpuidle_device *dev; + int ret = 0; + + mutex_lock(&cpuidle_lock); + + if (drv != cpuidle_curr_driver) + goto out; + + if (!cpuidle_curr_driver->redetect) { + ret = -EIO; + goto out; + } + + cpuidle_uninstall_idle_handler(); + + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + __cpuidle_force_redetect(dev); + + cpuidle_install_idle_handler(); +out: + mutex_unlock(&cpuidle_lock); + return ret; +} + +EXPORT_SYMBOL_GPL(cpuidle_force_redetect_devices); + +/** + * cpuidle_get_bm_activity - determines if BM activity has occured + */ +int cpuidle_get_bm_activity(void) +{ + if (cpuidle_curr_driver->bm_check) + return cpuidle_curr_driver->bm_check(); + else + return 0; +} +EXPORT_SYMBOL_GPL(cpuidle_get_bm_activity); + Index: linux-2.6.22-rc5/drivers/cpuidle/governor.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/governor.c 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,187 @@ +/* + * governor.c - governor support + * + * (C) 2006-2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. + */ + +#include +#include +#include + +#include "cpuidle.h" + +LIST_HEAD(cpuidle_governors); +struct cpuidle_governor *cpuidle_curr_governor; + + +/** + * cpuidle_attach_governor - attaches a governor to a CPU + * @dev: the target CPU + * + * Must be called with cpuidle_lock aquired. + */ +int cpuidle_attach_governor(struct cpuidle_device *dev) +{ + int ret = 0; + + if(dev->status & CPUIDLE_STATUS_GOVERNOR_ATTACHED) + return -EIO; + + if (!try_module_get(cpuidle_curr_governor->owner)) + return -EINVAL; + + if (cpuidle_curr_governor->init) + ret = cpuidle_curr_governor->init(dev); + if (ret) { + module_put(cpuidle_curr_governor->owner); + printk(KERN_ERR "cpuidle: governor %s failed to attach to cpu %d\n", + cpuidle_curr_governor->name, dev->cpu); + } else { + if (dev->status & CPUIDLE_STATUS_DRIVER_ATTACHED) + cpuidle_rescan_device(dev); + smp_wmb(); + dev->status |= CPUIDLE_STATUS_GOVERNOR_ATTACHED; + } + + return ret; +} + +/** + * cpuidle_detach_govenor - detaches a governor from a CPU + * @dev: the target CPU + * + * Must be called with cpuidle_lock aquired. + */ +void cpuidle_detach_governor(struct cpuidle_device *dev) +{ + if (dev->status & CPUIDLE_STATUS_GOVERNOR_ATTACHED) { + dev->status &= ~CPUIDLE_STATUS_GOVERNOR_ATTACHED; + if (cpuidle_curr_governor->exit) + cpuidle_curr_governor->exit(dev); + module_put(cpuidle_curr_governor->owner); + } +} + +/** + * __cpuidle_find_governor - finds a governor of the specified name + * @str: the name + * + * Must be called with cpuidle_lock aquired. + */ +static struct cpuidle_governor * __cpuidle_find_governor(const char *str) +{ + struct cpuidle_governor *gov; + + list_for_each_entry(gov, &cpuidle_governors, governor_list) + if (!strnicmp(str, gov->name, CPUIDLE_NAME_LEN)) + return gov; + + return NULL; +} + +/** + * cpuidle_switch_governor - changes the governor + * @gov: the new target governor + * + * NOTE: "gov" can be NULL to specify disabled + * Must be called with cpuidle_lock aquired. + */ +int cpuidle_switch_governor(struct cpuidle_governor *gov) +{ + struct cpuidle_device *dev; + + if (gov == cpuidle_curr_governor) + return -EINVAL; + + cpuidle_uninstall_idle_handler(); + + if (cpuidle_curr_governor) + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_detach_governor(dev); + + cpuidle_curr_governor = gov; + + if (gov) { + list_for_each_entry(dev, &cpuidle_detected_devices, device_list) + cpuidle_attach_governor(dev); + if (cpuidle_curr_driver) + cpuidle_install_idle_handler(); + printk(KERN_INFO "cpuidle: using governor %s\n", gov->name); + } + + return 0; +} + +/** + * cpuidle_register_governor - registers a governor + * @gov: the governor + */ +int cpuidle_register_governor(struct cpuidle_governor *gov) +{ + int ret = -EEXIST; + + if (!gov || !gov->select) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + if (__cpuidle_find_governor(gov->name) == NULL) { + ret = 0; + list_add_tail(&gov->governor_list, &cpuidle_governors); + if (!cpuidle_curr_governor || + cpuidle_curr_governor->rating < gov->rating) + cpuidle_switch_governor(gov); + } + mutex_unlock(&cpuidle_lock); + + return ret; +} + +EXPORT_SYMBOL_GPL(cpuidle_register_governor); + +/** + * cpuidle_replace_governor - find a replacement governor + * @exclude_rating: the rating that will be skipped while looking for + * new governor. + */ +struct cpuidle_governor *cpuidle_replace_governor(int exclude_rating) +{ + struct cpuidle_governor *gov; + struct cpuidle_governor *ret_gov = NULL; + unsigned int max_rating = 0; + + list_for_each_entry(gov, &cpuidle_governors, governor_list) { + if (gov->rating == exclude_rating) + continue; + if (gov->rating > max_rating) { + max_rating = gov->rating; + ret_gov = gov; + } + } + + return ret_gov; +} + +/** + * cpuidle_unregister_governor - unregisters a governor + * @gov: the governor + */ +void cpuidle_unregister_governor(struct cpuidle_governor *gov) +{ + if (!gov) + return; + + mutex_lock(&cpuidle_lock); + if (gov == cpuidle_curr_governor) { + struct cpuidle_governor *new_gov; + new_gov = cpuidle_replace_governor(gov->rating); + cpuidle_switch_governor(new_gov); + } + list_del(&gov->governor_list); + mutex_unlock(&cpuidle_lock); +} + +EXPORT_SYMBOL_GPL(cpuidle_unregister_governor); Index: linux-2.6.22-rc5/drivers/cpuidle/governors/Makefile =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/governors/Makefile 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,6 @@ +# +# Makefile for cpuidle governors. +# + +obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o +obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o Index: linux-2.6.22-rc5/drivers/cpuidle/governors/ladder.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/governors/ladder.c 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,228 @@ +/* + * ladder.c - the residency ladder algorithm + * + * Copyright (C) 2001, 2002 Andy Grover + * Copyright (C) 2001, 2002 Paul Diefenbaugh + * Copyright (C) 2004, 2005 Dominik Brodowski + * + * (C) 2006-2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define PROMOTION_COUNT 4 +#define DEMOTION_COUNT 1 + +/* + * bm_history -- bit-mask with a bit per jiffy of bus-master activity + * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms + * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms + * 100 HZ: 0x0000000F: 4 jiffies = 40ms + * reduce history for more aggressive entry into C3 + */ +static unsigned int bm_history __read_mostly = + (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1)); +module_param(bm_history, uint, 0644); + +struct ladder_device_state { + struct { + u32 promotion_count; + u32 demotion_count; + u32 promotion_time; + u32 demotion_time; + u32 bm; + } threshold; + struct { + int promotion_count; + int demotion_count; + } stats; +}; + +struct ladder_device { + struct ladder_device_state states[CPUIDLE_STATE_MAX]; + unsigned int bm_check:1; + unsigned long bm_check_timestamp; + unsigned long bm_activity; /* FIXME: bm activity should be global */ + int last_state_idx; +}; + +/** + * ladder_do_selection - prepares private data for a state change + * @ldev: the ladder device + * @old_idx: the current state index + * @new_idx: the new target state index + */ +static inline void ladder_do_selection(struct ladder_device *ldev, + int old_idx, int new_idx) +{ + ldev->states[old_idx].stats.promotion_count = 0; + ldev->states[old_idx].stats.demotion_count = 0; + ldev->last_state_idx = new_idx; +} + +/** + * ladder_select_state - selects the next state to enter + * @dev: the CPU + */ +static int ladder_select_state(struct cpuidle_device *dev) +{ + struct ladder_device *ldev = dev->governor_data; + struct ladder_device_state *last_state; + int last_residency, last_idx = ldev->last_state_idx; + + if (unlikely(!ldev)) + return 0; + + last_state = &ldev->states[last_idx]; + + /* demote if within BM threshold */ + if (ldev->bm_check) { + unsigned long diff; + + diff = jiffies - ldev->bm_check_timestamp; + if (diff > 31) + diff = 31; + + ldev->bm_activity <<= diff; + if (cpuidle_get_bm_activity()) + ldev->bm_activity |= ((1 << diff) - 1); + + ldev->bm_check_timestamp = jiffies; + if ((last_idx > 0) && + (last_state->threshold.bm & ldev->bm_activity)) { + ladder_do_selection(ldev, last_idx, last_idx - 1); + return last_idx - 1; + } + } + + if (dev->states[last_idx].flags & CPUIDLE_FLAG_TIME_VALID) + last_residency = cpuidle_get_last_residency(dev) - dev->states[last_idx].exit_latency; + else + last_residency = last_state->threshold.promotion_time + 1; + + /* consider promotion */ + if (last_idx < dev->state_count - 1 && + last_residency > last_state->threshold.promotion_time && + dev->states[last_idx + 1].exit_latency <= system_latency_constraint()) { + last_state->stats.promotion_count++; + last_state->stats.demotion_count = 0; + if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { + ladder_do_selection(ldev, last_idx, last_idx + 1); + return last_idx + 1; + } + } + + /* consider demotion */ + if (last_idx > 0 && + last_residency < last_state->threshold.demotion_time) { + last_state->stats.demotion_count++; + last_state->stats.promotion_count = 0; + if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) { + ladder_do_selection(ldev, last_idx, last_idx - 1); + return last_idx - 1; + } + } + + /* otherwise remain at the current state */ + return last_idx; +} + +/** + * ladder_scan_device - scans a CPU's states and does setup + * @dev: the CPU + */ +static void ladder_scan_device(struct cpuidle_device *dev) +{ + int i, bm_check = 0; + struct ladder_device *ldev = dev->governor_data; + struct ladder_device_state *lstate; + struct cpuidle_state *state; + + ldev->last_state_idx = 0; + ldev->bm_check_timestamp = 0; + ldev->bm_activity = 0; + + for (i = 0; i < dev->state_count; i++) { + state = &dev->states[i]; + lstate = &ldev->states[i]; + + lstate->stats.promotion_count = 0; + lstate->stats.demotion_count = 0; + + lstate->threshold.promotion_count = PROMOTION_COUNT; + lstate->threshold.demotion_count = DEMOTION_COUNT; + + if (i < dev->state_count - 1) + lstate->threshold.promotion_time = state->exit_latency; + if (i > 0) + lstate->threshold.demotion_time = state->exit_latency; + if (state->flags & CPUIDLE_FLAG_CHECK_BM) { + lstate->threshold.bm = bm_history; + bm_check = 1; + } else + lstate->threshold.bm = 0; + } + + ldev->bm_check = bm_check; +} + +/** + * ladder_init_device - initializes a CPU-instance + * @dev: the CPU + */ +static int ladder_init_device(struct cpuidle_device *dev) +{ + dev->governor_data = kmalloc(sizeof(struct ladder_device), GFP_KERNEL); + + return !dev->governor_data; +} + +/** + * ladder_exit_device - exits a CPU-instance + * @dev: the CPU + */ +static void ladder_exit_device(struct cpuidle_device *dev) +{ + kfree(dev->governor_data); +} + +static struct cpuidle_governor ladder_governor = { + .name = "ladder", + .rating = 10, + .init = ladder_init_device, + .exit = ladder_exit_device, + .scan = ladder_scan_device, + .select = ladder_select_state, + .owner = THIS_MODULE, +}; + +/** + * init_ladder - initializes the governor + */ +static int __init init_ladder(void) +{ + return cpuidle_register_governor(&ladder_governor); +} + +/** + * exit_ladder - exits the governor + */ +static void __exit exit_ladder(void) +{ + cpuidle_unregister_governor(&ladder_governor); +} + +MODULE_LICENSE("GPL"); +module_init(init_ladder); +module_exit(exit_ladder); Index: linux-2.6.22-rc5/drivers/cpuidle/sysfs.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/sysfs.c 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,393 @@ +/* + * sysfs.c - sysfs support + * + * (C) 2006-2007 Shaohua Li + * + * This code is licenced under the GPL. + */ + +#include +#include +#include +#include + +#include "cpuidle.h" + +static unsigned int sysfs_switch; +static int __init cpuidle_sysfs_setup(char *unused) +{ + sysfs_switch = 1; + return 1; +} +__setup("cpuidle_sysfs_switch", cpuidle_sysfs_setup); + +static ssize_t show_available_drivers(struct sys_device *dev, char *buf) +{ + ssize_t i = 0; + struct cpuidle_driver *tmp; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_drivers, driver_list) { + if (i >= (ssize_t)((PAGE_SIZE/sizeof(char)) - CPUIDLE_NAME_LEN - 2)) + goto out; + i += scnprintf(&buf[i], CPUIDLE_NAME_LEN, "%s ", tmp->name); + } +out: + i+= sprintf(&buf[i], "\n"); + mutex_unlock(&cpuidle_lock); + return i; +} + +static ssize_t show_available_governors(struct sys_device *dev, char *buf) +{ + ssize_t i = 0; + struct cpuidle_governor *tmp; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_governors, governor_list) { + if (i >= (ssize_t)((PAGE_SIZE/sizeof(char)) - CPUIDLE_NAME_LEN - 2)) + goto out; + i += scnprintf(&buf[i], CPUIDLE_NAME_LEN, "%s ", tmp->name); + } + if (list_empty(&cpuidle_governors)) + i+= sprintf(&buf[i], "no governors"); +out: + i+= sprintf(&buf[i], "\n"); + mutex_unlock(&cpuidle_lock); + return i; +} + +static ssize_t show_current_driver(struct sys_device *dev, char *buf) +{ + ssize_t ret; + + mutex_lock(&cpuidle_lock); + ret = sprintf(buf, "%s\n", cpuidle_curr_driver->name); + mutex_unlock(&cpuidle_lock); + return ret; +} + +static ssize_t store_current_driver(struct sys_device *dev, + const char *buf, size_t count) +{ + char str[CPUIDLE_NAME_LEN]; + int len = count; + struct cpuidle_driver *tmp, *found = NULL; + + if (len > CPUIDLE_NAME_LEN) + len = CPUIDLE_NAME_LEN; + + if (sscanf(buf, "%s", str) != 1) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_drivers, driver_list) { + if (strncmp(tmp->name, str, CPUIDLE_NAME_LEN) == 0) { + found = tmp; + break; + } + } + if (found) + cpuidle_switch_driver(found); + mutex_unlock(&cpuidle_lock); + + return count; +} + +static ssize_t show_current_governor(struct sys_device *dev, char *buf) +{ + ssize_t i; + + mutex_lock(&cpuidle_lock); + if (cpuidle_curr_governor) + i = sprintf(buf, "%s\n", cpuidle_curr_governor->name); + else + i = sprintf(buf, "no governor\n"); + mutex_unlock(&cpuidle_lock); + + return i; +} + +static ssize_t store_current_governor(struct sys_device *dev, + const char *buf, size_t count) +{ + char str[CPUIDLE_NAME_LEN]; + int len = count; + struct cpuidle_governor *tmp, *found = NULL; + + if (len > CPUIDLE_NAME_LEN) + len = CPUIDLE_NAME_LEN; + + if (sscanf(buf, "%s", str) != 1) + return -EINVAL; + + mutex_lock(&cpuidle_lock); + list_for_each_entry(tmp, &cpuidle_governors, governor_list) { + if (strncmp(tmp->name, str, CPUIDLE_NAME_LEN) == 0) { + found = tmp; + break; + } + } + if (found) + cpuidle_switch_governor(found); + mutex_unlock(&cpuidle_lock); + + return count; +} + +static SYSDEV_ATTR(current_driver_ro, 0444, show_current_driver, NULL); +static SYSDEV_ATTR(current_governor_ro, 0444, show_current_governor, NULL); + +static struct attribute *cpuclass_default_attrs[] = { + &attr_current_driver_ro.attr, + &attr_current_governor_ro.attr, + NULL +}; + +static SYSDEV_ATTR(available_drivers, 0444, show_available_drivers, NULL); +static SYSDEV_ATTR(available_governors, 0444, show_available_governors, NULL); +static SYSDEV_ATTR(current_driver, 0644, show_current_driver, + store_current_driver); +static SYSDEV_ATTR(current_governor, 0644, show_current_governor, + store_current_governor); + +static struct attribute *cpuclass_switch_attrs[] = { + &attr_available_drivers.attr, + &attr_available_governors.attr, + &attr_current_driver.attr, + &attr_current_governor.attr, + NULL +}; + +static struct attribute_group cpuclass_attr_group = { + .attrs = cpuclass_default_attrs, + .name = "cpuidle", +}; + +/** + * cpuidle_add_class_sysfs - add CPU global sysfs attributes + */ +int cpuidle_add_class_sysfs(struct sysdev_class *cls) +{ + if (sysfs_switch) + cpuclass_attr_group.attrs = cpuclass_switch_attrs; + + return sysfs_create_group(&cls->kset.kobj, &cpuclass_attr_group); +} + +/** + * cpuidle_remove_class_sysfs - remove CPU global sysfs attributes + */ +void cpuidle_remove_class_sysfs(struct sysdev_class *cls) +{ + sysfs_remove_group(&cls->kset.kobj, &cpuclass_attr_group); +} + +struct cpuidle_attr { + struct attribute attr; + ssize_t (*show)(struct cpuidle_device *, char *); + ssize_t (*store)(struct cpuidle_device *, const char *, size_t count); +}; + +#define define_one_ro(_name, show) \ + static struct cpuidle_attr attr_##_name = __ATTR(_name, 0444, show, NULL) +#define define_one_rw(_name, show, store) \ + static struct cpuidle_attr attr_##_name = __ATTR(_name, 0644, show, store) + +#define kobj_to_cpuidledev(k) container_of(k, struct cpuidle_device, kobj) +#define attr_to_cpuidleattr(a) container_of(a, struct cpuidle_attr, attr) +static ssize_t cpuidle_show(struct kobject * kobj, struct attribute * attr ,char * buf) +{ + int ret = -EIO; + struct cpuidle_device *dev = kobj_to_cpuidledev(kobj); + struct cpuidle_attr * cattr = attr_to_cpuidleattr(attr); + + if (cattr->show) { + mutex_lock(&cpuidle_lock); + ret = cattr->show(dev, buf); + mutex_unlock(&cpuidle_lock); + } + return ret; +} + +static ssize_t cpuidle_store(struct kobject * kobj, struct attribute * attr, + const char * buf, size_t count) +{ + int ret = -EIO; + struct cpuidle_device *dev = kobj_to_cpuidledev(kobj); + struct cpuidle_attr * cattr = attr_to_cpuidleattr(attr); + + if (cattr->store) { + mutex_lock(&cpuidle_lock); + ret = cattr->store(dev, buf, count); + mutex_unlock(&cpuidle_lock); + } + return ret; +} + +static struct sysfs_ops cpuidle_sysfs_ops = { + .show = cpuidle_show, + .store = cpuidle_store, +}; + +static void cpuidle_sysfs_release(struct kobject *kobj) +{ + struct cpuidle_device *dev = kobj_to_cpuidledev(kobj); + + complete(&dev->kobj_unregister); +} + +static struct kobj_type ktype_cpuidle = { + .sysfs_ops = &cpuidle_sysfs_ops, + .release = cpuidle_sysfs_release, +}; + +struct cpuidle_state_attr { + struct attribute attr; + ssize_t (*show)(struct cpuidle_state *, char *); + ssize_t (*store)(struct cpuidle_state *, const char *, size_t); +}; + +#define define_one_state_ro(_name, show) \ +static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0444, show, NULL) + +#define define_show_state_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \ +{ \ + return sprintf(buf, "%d\n", state->_name);\ +} + +define_show_state_function(exit_latency) +define_show_state_function(power_usage) +define_show_state_function(usage) +define_show_state_function(time) +define_one_state_ro(latency, show_state_exit_latency); +define_one_state_ro(power, show_state_power_usage); +define_one_state_ro(usage, show_state_usage); +define_one_state_ro(time, show_state_time); + +static struct attribute *cpuidle_state_default_attrs[] = { + &attr_latency.attr, + &attr_power.attr, + &attr_usage.attr, + &attr_time.attr, + NULL +}; + +#define kobj_to_state_obj(k) container_of(k, struct cpuidle_state_kobj, kobj) +#define kobj_to_state(k) (kobj_to_state_obj(k)->state) +#define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr) +static ssize_t cpuidle_state_show(struct kobject * kobj, + struct attribute * attr ,char * buf) +{ + int ret = -EIO; + struct cpuidle_state *state = kobj_to_state(kobj); + struct cpuidle_state_attr * cattr = attr_to_stateattr(attr); + + if (cattr->show) + ret = cattr->show(state, buf); + + return ret; +} + +static struct sysfs_ops cpuidle_state_sysfs_ops = { + .show = cpuidle_state_show, +}; + +static void cpuidle_state_sysfs_release(struct kobject *kobj) +{ + struct cpuidle_state_kobj *state_obj = kobj_to_state_obj(kobj); + + complete(&state_obj->kobj_unregister); +} + +static struct kobj_type ktype_state_cpuidle = { + .sysfs_ops = &cpuidle_state_sysfs_ops, + .default_attrs = cpuidle_state_default_attrs, + .release = cpuidle_state_sysfs_release, +}; + +static void inline cpuidle_free_state_kobj(struct cpuidle_device *device, int i) +{ + kobject_unregister(&device->kobjs[i]->kobj); + wait_for_completion(&device->kobjs[i]->kobj_unregister); + kfree(device->kobjs[i]); + device->kobjs[i] = NULL; +} + +/** + * cpuidle_add_driver_sysfs - adds driver-specific sysfs attributes + * @device: the target device + */ +int cpuidle_add_driver_sysfs(struct cpuidle_device *device) +{ + int i, ret = -ENOMEM; + struct cpuidle_state_kobj *kobj; + + /* state statistics */ + for (i = 0; i < device->state_count; i++) { + kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); + if (!kobj) + goto error_state; + kobj->state = &device->states[i]; + init_completion(&kobj->kobj_unregister); + + kobj->kobj.parent = &device->kobj; + kobj->kobj.ktype = &ktype_state_cpuidle; + kobject_set_name(&kobj->kobj, "state%d", i); + ret = kobject_register(&kobj->kobj); + if (ret) { + kfree(kobj); + goto error_state; + } + device->kobjs[i] = kobj; + } + + return 0; + +error_state: + for (i = i - 1; i >= 0; i--) + cpuidle_free_state_kobj(device, i); + return ret; +} + +/** + * cpuidle_remove_driver_sysfs - removes driver-specific sysfs attributes + * @device: the target device + */ +void cpuidle_remove_driver_sysfs(struct cpuidle_device *device) +{ + int i; + + for (i = 0; i < device->state_count; i++) + cpuidle_free_state_kobj(device, i); +} + +/** + * cpuidle_add_sysfs - creates a sysfs instance for the target device + * @sysdev: the target device + */ +int cpuidle_add_sysfs(struct sys_device *sysdev) +{ + int cpu = sysdev->id; + struct cpuidle_device *dev; + + dev = per_cpu(cpuidle_devices, cpu); + dev->kobj.parent = &sysdev->kobj; + dev->kobj.ktype = &ktype_cpuidle; + kobject_set_name(&dev->kobj, "%s", "cpuidle"); + return kobject_register(&dev->kobj); +} + +/** + * cpuidle_remove_sysfs - deletes a sysfs instance on the target device + * @sysdev: the target device + */ +void cpuidle_remove_sysfs(struct sys_device *sysdev) +{ + int cpu = sysdev->id; + struct cpuidle_device *dev; + + dev = per_cpu(cpuidle_devices, cpu); + kobject_unregister(&dev->kobj); +} Index: linux-2.6.22-rc5/include/linux/cpuidle.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/include/linux/cpuidle.h 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,190 @@ +/* + * cpuidle.h - a generic framework for CPU idle power management + * + * (C) 2007 Venkatesh Pallipadi + * Shaohua Li + * Adam Belay + * + * This code is licenced under the GPL. + */ + +#ifndef _LINUX_CPUIDLE_H +#define _LINUX_CPUIDLE_H + +#include +#include +#include +#include +#include + +#define CPUIDLE_STATE_MAX 8 +#define CPUIDLE_NAME_LEN 16 + +struct cpuidle_device; + + +/**************************** + * CPUIDLE DEVICE INTERFACE * + ****************************/ + +struct cpuidle_state { + char name[CPUIDLE_NAME_LEN]; + void *driver_data; + + unsigned int flags; + unsigned int exit_latency; /* in US */ + unsigned int power_usage; /* in mW */ + unsigned int target_residency; /* in US */ + + unsigned int usage; + unsigned int time; /* in US */ + + int (*enter) (struct cpuidle_device *dev, + struct cpuidle_state *state); +}; + +/* Idle State Flags */ +#define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ +#define CPUIDLE_FLAG_CHECK_BM (0x02) /* BM activity will exit state */ +#define CPUIDLE_FLAG_SHALLOW (0x10) /* low latency, minimal savings */ +#define CPUIDLE_FLAG_BALANCED (0x20) /* medium latency, moderate savings */ +#define CPUIDLE_FLAG_DEEP (0x40) /* high latency, large savings */ + +#define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) + +/** + * cpuidle_get_statedata - retrieves private driver state data + * @state: the state + */ +static inline void * cpuidle_get_statedata(struct cpuidle_state *state) +{ + return state->driver_data; +} + +/** + * cpuidle_set_statedata - stores private driver state data + * @state: the state + * @data: the private data + */ +static inline void +cpuidle_set_statedata(struct cpuidle_state *state, void *data) +{ + state->driver_data = data; +} + +struct cpuidle_state_kobj { + struct cpuidle_state *state; + struct completion kobj_unregister; + struct kobject kobj; +}; + +struct cpuidle_device { + unsigned int status; + int cpu; + + int last_residency; + int state_count; + struct cpuidle_state states[CPUIDLE_STATE_MAX]; + struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX]; + struct cpuidle_state *last_state; + + struct list_head device_list; + struct kobject kobj; + struct completion kobj_unregister; + void *governor_data; +}; + +DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); + +/* Device Status Flags */ +#define CPUIDLE_STATUS_DETECTED (0x1) +#define CPUIDLE_STATUS_DRIVER_ATTACHED (0x2) +#define CPUIDLE_STATUS_GOVERNOR_ATTACHED (0x4) +#define CPUIDLE_STATUS_DOIDLE (CPUIDLE_STATUS_DETECTED | \ + CPUIDLE_STATUS_DRIVER_ATTACHED | \ + CPUIDLE_STATUS_GOVERNOR_ATTACHED) + +/** + * cpuidle_get_last_residency - retrieves the last state's residency time + * @dev: the target CPU + * + * NOTE: this value is invalid if CPUIDLE_FLAG_TIME_VALID isn't set + */ +static inline int cpuidle_get_last_residency(struct cpuidle_device *dev) +{ + return dev->last_residency; +} + + +/**************************** + * CPUIDLE DRIVER INTERFACE * + ****************************/ + +struct cpuidle_driver { + char name[CPUIDLE_NAME_LEN]; + struct list_head driver_list; + + int (*init) (struct cpuidle_device *dev); + void (*exit) (struct cpuidle_device *dev); + int (*redetect) (struct cpuidle_device *dev); + + int (*bm_check) (void); + + struct module *owner; +}; + +#ifdef CONFIG_CPU_IDLE + +extern int cpuidle_register_driver(struct cpuidle_driver *drv); +extern void cpuidle_unregister_driver(struct cpuidle_driver *drv); +extern int cpuidle_force_redetect(struct cpuidle_device *dev, struct cpuidle_driver *drv); +extern int cpuidle_force_redetect_devices(struct cpuidle_driver *drv); + +#else + +static inline int cpuidle_register_driver(struct cpuidle_driver *drv) +{return 0;} +static inline void cpuidle_unregister_driver(struct cpuidle_driver *drv) { } +static inline int cpuidle_force_redetect(struct cpuidle_device *dev, struct cpuidle_driver *drv) +{return 0;} +static inline int cpuidle_force_redetect_devices(struct cpuidle_driver *drv) +{return 0;} + +#endif + +/****************************** + * CPUIDLE GOVERNOR INTERFACE * + ******************************/ + +struct cpuidle_governor { + char name[CPUIDLE_NAME_LEN]; + struct list_head governor_list; + unsigned int rating; + + int (*init) (struct cpuidle_device *dev); + void (*exit) (struct cpuidle_device *dev); + void (*scan) (struct cpuidle_device *dev); + + int (*select) (struct cpuidle_device *dev); + void (*reflect) (struct cpuidle_device *dev); + + struct module *owner; +}; + +#ifdef CONFIG_CPU_IDLE + +extern int cpuidle_register_governor(struct cpuidle_governor *gov); +extern void cpuidle_unregister_governor(struct cpuidle_governor *gov); +extern int cpuidle_get_bm_activity(void); + +#else + +static inline int cpuidle_register_governor(struct cpuidle_governor *gov) +{return 0;} +static inline void cpuidle_unregister_governor(struct cpuidle_governor *gov) { } +static inline int cpuidle_get_bm_activity(void) +{return 0;} + +#endif + +#endif /* _LINUX_CPUIDLE_H */ Index: linux-2.6.22-rc5/drivers/acpi/processor_core.c =================================================================== --- linux-2.6.22-rc5.orig/drivers/acpi/processor_core.c 2007-06-17 08:52:04.000000000 +0200 +++ linux-2.6.22-rc5/drivers/acpi/processor_core.c 2007-06-17 08:52:05.000000000 +0200 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -1022,11 +1023,15 @@ static int __init acpi_processor_init(vo acpi_processor_ppc_init(); + cpuidle_register_driver(&acpi_idle_driver); + acpi_do_set_cstate_limit = acpi_max_cstate_changed; return 0; } static void __exit acpi_processor_exit(void) { + acpi_do_set_cstate_limit = NULL; + cpuidle_unregister_driver(&acpi_idle_driver); acpi_processor_ppc_exit(); Index: linux-2.6.22-rc5/drivers/acpi/processor_idle.c =================================================================== --- linux-2.6.22-rc5.orig/drivers/acpi/processor_idle.c 2007-06-17 08:52:04.000000000 +0200 +++ linux-2.6.22-rc5/drivers/acpi/processor_idle.c 2007-06-17 08:52:09.000000000 +0200 @@ -40,6 +40,7 @@ #include /* need_resched() */ #include #include +#include /* * Include the apic definitions for x86 to have the APIC timer related defines @@ -62,25 +63,34 @@ #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_idle"); #define ACPI_PROCESSOR_FILE_POWER "power" -#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) -#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */ -#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */ -static void (*pm_idle_save) (void) __read_mostly; -module_param(max_cstate, uint, 0644); +#define PM_TIMER_TICKS_TO_US(p) (((p) * 1000)/(PM_TIMER_FREQUENCY/1000)) +#define C2_OVERHEAD 1 /* 1us */ +#define C3_OVERHEAD 1 /* 1us */ + +void acpi_max_cstate_changed(void) +{ + /* Driver will reset devices' max cstate limit */ + cpuidle_force_redetect_devices(&acpi_idle_driver); +} + +static int change_max_cstate(const char *val, struct kernel_param *kp) +{ + int max; + + max = simple_strtol(val, NULL, 0); + if (!max) + return -EINVAL; + max_cstate = max; + if (acpi_do_set_cstate_limit) + acpi_do_set_cstate_limit(); + return 0; +} + +module_param_call(max_cstate, change_max_cstate, param_get_uint, &max_cstate, 0644); static unsigned int nocst __read_mostly; module_param(nocst, uint, 0000); -/* - * bm_history -- bit-mask with a bit per jiffy of bus-master activity - * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms - * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms - * 100 HZ: 0x0000000F: 4 jiffies = 40ms - * reduce history for more aggressive entry into C3 - */ -static unsigned int bm_history __read_mostly = - (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1)); -module_param(bm_history, uint, 0644); /* -------------------------------------------------------------------------- Power Management -------------------------------------------------------------------------- */ @@ -166,88 +176,6 @@ static struct dmi_system_id __cpuinitdat {}, }; -static inline u32 ticks_elapsed(u32 t1, u32 t2) -{ - if (t2 >= t1) - return (t2 - t1); - else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER)) - return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); - else - return ((0xFFFFFFFF - t1) + t2); -} - -static void -acpi_processor_power_activate(struct acpi_processor *pr, - struct acpi_processor_cx *new) -{ - struct acpi_processor_cx *old; - - if (!pr || !new) - return; - - old = pr->power.state; - - if (old) - old->promotion.count = 0; - new->demotion.count = 0; - - /* Cleanup from old state. */ - if (old) { - switch (old->type) { - case ACPI_STATE_C3: - /* Disable bus master reload */ - if (new->type != ACPI_STATE_C3 && pr->flags.bm_check) - acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0); - break; - } - } - - /* Prepare to use new state. */ - switch (new->type) { - case ACPI_STATE_C3: - /* Enable bus master reload */ - if (old->type != ACPI_STATE_C3 && pr->flags.bm_check) - acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1); - break; - } - - pr->power.state = new; - - return; -} - -static void acpi_safe_halt(void) -{ - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - if (!need_resched()) - safe_halt(); - current_thread_info()->status |= TS_POLLING; -} - -static atomic_t c3_cpu_count; - -/* Common C-state entry for C2, C3, .. */ -static void acpi_cstate_enter(struct acpi_processor_cx *cstate) -{ - if (cstate->space_id == ACPI_CSTATE_FFH) { - /* Call into architectural FFH based C-state */ - acpi_processor_ffh_cstate_enter(cstate); - } else { - int unused; - /* IO port based C-state */ - inb(cstate->address); - /* Dummy wait op - must do something useless after P_LVL2 read - because chipsets cannot guarantee that STPCLK# signal - gets asserted in time to freeze execution properly. */ - unused = inl(acpi_gbl_FADT.xpm_timer_block.address); - } -} - #ifdef ARCH_APICTIMER_STOPS_ON_C3 /* @@ -275,21 +203,12 @@ static void acpi_timer_check_state(int s static void acpi_propagate_timer_broadcast(struct acpi_processor *pr) { -#ifdef CONFIG_GENERIC_CLOCKEVENTS unsigned long reason; reason = pr->power.timer_broadcast_on_state < INT_MAX ? CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF; clockevents_notify(reason, &pr->id); -#else - cpumask_t mask = cpumask_of_cpu(pr->id); - - if (pr->power.timer_broadcast_on_state < INT_MAX) - on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1); - else - on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1); -#endif } /* Power(C) State timer broadcast control */ @@ -297,8 +216,6 @@ static void acpi_state_timer_broadcast(s struct acpi_processor_cx *cx, int broadcast) { -#ifdef CONFIG_GENERIC_CLOCKEVENTS - int state = cx - pr->power.states; if (state >= pr->power.timer_broadcast_on_state) { @@ -308,7 +225,6 @@ static void acpi_state_timer_broadcast(s CLOCK_EVT_NOTIFY_BROADCAST_EXIT; clockevents_notify(reason, &pr->id); } -#endif } #else @@ -323,376 +239,6 @@ static void acpi_state_timer_broadcast(s } #endif - -static void acpi_processor_idle(void) -{ - struct acpi_processor *pr = NULL; - struct acpi_processor_cx *cx = NULL; - struct acpi_processor_cx *next_state = NULL; - int sleep_ticks = 0; - u32 t1, t2 = 0; - - pr = processors[smp_processor_id()]; - if (!pr) - return; - - /* - * Interrupts must be disabled during bus mastering calculations and - * for C2/C3 transitions. - */ - local_irq_disable(); - - /* - * Check whether we truly need to go idle, or should - * reschedule: - */ - if (unlikely(need_resched())) { - local_irq_enable(); - return; - } - - cx = pr->power.state; - if (!cx) { - if (pm_idle_save) - pm_idle_save(); - else - acpi_safe_halt(); - return; - } - - /* - * Check BM Activity - * ----------------- - * Check for bus mastering activity (if required), record, and check - * for demotion. - */ - if (pr->flags.bm_check) { - u32 bm_status = 0; - unsigned long diff = jiffies - pr->power.bm_check_timestamp; - - if (diff > 31) - diff = 31; - - pr->power.bm_activity <<= diff; - - acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); - if (bm_status) { - pr->power.bm_activity |= 0x1; - acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); - } - /* - * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect - * the true state of bus mastering activity; forcing us to - * manually check the BMIDEA bit of each IDE channel. - */ - else if (errata.piix4.bmisx) { - if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01) - || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01)) - pr->power.bm_activity |= 0x1; - } - - pr->power.bm_check_timestamp = jiffies; - - /* - * If bus mastering is or was active this jiffy, demote - * to avoid a faulty transition. Note that the processor - * won't enter a low-power state during this call (to this - * function) but should upon the next. - * - * TBD: A better policy might be to fallback to the demotion - * state (use it for this quantum only) istead of - * demoting -- and rely on duration as our sole demotion - * qualification. This may, however, introduce DMA - * issues (e.g. floppy DMA transfer overrun/underrun). - */ - if ((pr->power.bm_activity & 0x1) && - cx->demotion.threshold.bm) { - local_irq_enable(); - next_state = cx->demotion.state; - goto end; - } - } - -#ifdef CONFIG_HOTPLUG_CPU - /* - * Check for P_LVL2_UP flag before entering C2 and above on - * an SMP system. We do it here instead of doing it at _CST/P_LVL - * detection phase, to work cleanly with logical CPU hotplug. - */ - if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) && - !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) - cx = &pr->power.states[ACPI_STATE_C1]; -#endif - - /* - * Sleep: - * ------ - * Invoke the current Cx state to put the processor to sleep. - */ - if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - if (need_resched()) { - current_thread_info()->status |= TS_POLLING; - local_irq_enable(); - return; - } - } - - switch (cx->type) { - - case ACPI_STATE_C1: - /* - * Invoke C1. - * Use the appropriate idle routine, the one that would - * be used without acpi C-states. - */ - if (pm_idle_save) - pm_idle_save(); - else - acpi_safe_halt(); - - /* - * TBD: Can't get time duration while in C1, as resumes - * go to an ISR rather than here. Need to instrument - * base interrupt handler. - */ - sleep_ticks = 0xFFFFFFFF; - break; - - case ACPI_STATE_C2: - /* Get start time (ticks) */ - t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); - /* Invoke C2 */ - acpi_state_timer_broadcast(pr, cx, 1); - acpi_cstate_enter(cx); - /* Get end time (ticks) */ - t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); - -#ifdef CONFIG_GENERIC_TIME - /* TSC halts in C2, so notify users */ - mark_tsc_unstable("possible TSC halt in C2"); -#endif - /* Re-enable interrupts */ - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; - acpi_state_timer_broadcast(pr, cx, 0); - break; - - case ACPI_STATE_C3: - - if (pr->flags.bm_check) { - if (atomic_inc_return(&c3_cpu_count) == - num_online_cpus()) { - /* - * All CPUs are trying to go to C3 - * Disable bus master arbitration - */ - acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); - } - } else { - /* SMP with no shared cache... Invalidate cache */ - ACPI_FLUSH_CPU_CACHE(); - } - - /* Get start time (ticks) */ - t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); - /* Invoke C3 */ - acpi_state_timer_broadcast(pr, cx, 1); - acpi_cstate_enter(cx); - /* Get end time (ticks) */ - t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); - if (pr->flags.bm_check) { - /* Enable bus master arbitration */ - atomic_dec(&c3_cpu_count); - acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); - } - -#ifdef CONFIG_GENERIC_TIME - /* TSC halts in C3, so notify users */ - mark_tsc_unstable("TSC halts in C3"); -#endif - /* Re-enable interrupts */ - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; - acpi_state_timer_broadcast(pr, cx, 0); - break; - - default: - local_irq_enable(); - return; - } - cx->usage++; - if ((cx->type != ACPI_STATE_C1) && (sleep_ticks > 0)) - cx->time += sleep_ticks; - - next_state = pr->power.state; - -#ifdef CONFIG_HOTPLUG_CPU - /* Don't do promotion/demotion */ - if ((cx->type == ACPI_STATE_C1) && (num_online_cpus() > 1) && - !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) { - next_state = cx; - goto end; - } -#endif - - /* - * Promotion? - * ---------- - * Track the number of longs (time asleep is greater than threshold) - * and promote when the count threshold is reached. Note that bus - * mastering activity may prevent promotions. - * Do not promote above max_cstate. - */ - if (cx->promotion.state && - ((cx->promotion.state - pr->power.states) <= max_cstate)) { - if (sleep_ticks > cx->promotion.threshold.ticks && - cx->promotion.state->latency <= system_latency_constraint()) { - cx->promotion.count++; - cx->demotion.count = 0; - if (cx->promotion.count >= - cx->promotion.threshold.count) { - if (pr->flags.bm_check) { - if (! - (pr->power.bm_activity & cx-> - promotion.threshold.bm)) { - next_state = - cx->promotion.state; - goto end; - } - } else { - next_state = cx->promotion.state; - goto end; - } - } - } - } - - /* - * Demotion? - * --------- - * Track the number of shorts (time asleep is less than time threshold) - * and demote when the usage threshold is reached. - */ - if (cx->demotion.state) { - if (sleep_ticks < cx->demotion.threshold.ticks) { - cx->demotion.count++; - cx->promotion.count = 0; - if (cx->demotion.count >= cx->demotion.threshold.count) { - next_state = cx->demotion.state; - goto end; - } - } - } - - end: - /* - * Demote if current state exceeds max_cstate - * or if the latency of the current state is unacceptable - */ - if ((pr->power.state - pr->power.states) > max_cstate || - pr->power.state->latency > system_latency_constraint()) { - if (cx->demotion.state) - next_state = cx->demotion.state; - } - - /* - * New Cx State? - * ------------- - * If we're going to start using a new Cx state we must clean up - * from the previous and prepare to use the new. - */ - if (next_state != pr->power.state) - acpi_processor_power_activate(pr, next_state); -} - -static int acpi_processor_set_power_policy(struct acpi_processor *pr) -{ - unsigned int i; - unsigned int state_is_set = 0; - struct acpi_processor_cx *lower = NULL; - struct acpi_processor_cx *higher = NULL; - struct acpi_processor_cx *cx; - - - if (!pr) - return -EINVAL; - - /* - * This function sets the default Cx state policy (OS idle handler). - * Our scheme is to promote quickly to C2 but more conservatively - * to C3. We're favoring C2 for its characteristics of low latency - * (quick response), good power savings, and ability to allow bus - * mastering activity. Note that the Cx state policy is completely - * customizable and can be altered dynamically. - */ - - /* startup state */ - for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { - cx = &pr->power.states[i]; - if (!cx->valid) - continue; - - if (!state_is_set) - pr->power.state = cx; - state_is_set++; - break; - } - - if (!state_is_set) - return -ENODEV; - - /* demotion */ - for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { - cx = &pr->power.states[i]; - if (!cx->valid) - continue; - - if (lower) { - cx->demotion.state = lower; - cx->demotion.threshold.ticks = cx->latency_ticks; - cx->demotion.threshold.count = 1; - if (cx->type == ACPI_STATE_C3) - cx->demotion.threshold.bm = bm_history; - } - - lower = cx; - } - - /* promotion */ - for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) { - cx = &pr->power.states[i]; - if (!cx->valid) - continue; - - if (higher) { - cx->promotion.state = higher; - cx->promotion.threshold.ticks = cx->latency_ticks; - if (cx->type >= ACPI_STATE_C2) - cx->promotion.threshold.count = 4; - else - cx->promotion.threshold.count = 10; - if (higher->type == ACPI_STATE_C3) - cx->promotion.threshold.bm = bm_history; - } - - higher = cx; - } - - return 0; -} - static int acpi_processor_get_power_info_fadt(struct acpi_processor *pr) { @@ -910,7 +456,7 @@ static void acpi_processor_power_verify_ * Normalize the C2 latency to expidite policy */ cx->valid = 1; - cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); + cx->latency_ticks = cx->latency; return; } @@ -984,7 +530,7 @@ static void acpi_processor_power_verify_ * use this in our C3 policy */ cx->valid = 1; - cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); + cx->latency_ticks = cx->latency; return; } @@ -1050,18 +596,6 @@ static int acpi_processor_get_power_info pr->power.count = acpi_processor_power_verify(pr); /* - * Set Default Policy - * ------------------ - * Now that we know which states are supported, set the default - * policy. Note that this policy can be changed dynamically - * (e.g. encourage deeper sleeps to conserve battery life when - * not on AC). - */ - result = acpi_processor_set_power_policy(pr); - if (result) - return result; - - /* * if one state of type C2 or C3 is available, mark this * CPU as being "idle manageable" */ @@ -1078,9 +612,6 @@ static int acpi_processor_get_power_info int acpi_processor_cst_has_changed(struct acpi_processor *pr) { - int result = 0; - - if (!pr) return -EINVAL; @@ -1091,16 +622,9 @@ int acpi_processor_cst_has_changed(struc if (!pr->flags.power_setup_done) return -ENODEV; - /* Fall back to the default idle loop */ - pm_idle = pm_idle_save; - synchronize_sched(); /* Relies on interrupts forcing exit from idle. */ - - pr->flags.power = 0; - result = acpi_processor_get_power_info(pr); - if ((pr->flags.power == 1) && (pr->flags.power_setup_done)) - pm_idle = acpi_processor_idle; - - return result; + acpi_processor_get_power_info(pr); + return cpuidle_force_redetect(per_cpu(cpuidle_devices, pr->id), + &acpi_idle_driver); } /* proc interface */ @@ -1186,30 +710,6 @@ static const struct file_operations acpi .release = single_release, }; -#ifdef CONFIG_SMP -static void smp_callback(void *v) -{ - /* we already woke the CPU up, nothing more to do */ -} - -/* - * This function gets called when a part of the kernel has a new latency - * requirement. This means we need to get all processors out of their C-state, - * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that - * wakes them all right up. - */ -static int acpi_processor_latency_notify(struct notifier_block *b, - unsigned long l, void *v) -{ - smp_call_function(smp_callback, NULL, 0, 1); - return NOTIFY_OK; -} - -static struct notifier_block acpi_processor_latency_notifier = { - .notifier_call = acpi_processor_latency_notify, -}; -#endif - int __cpuinit acpi_processor_power_init(struct acpi_processor *pr, struct acpi_device *device) { @@ -1226,9 +726,6 @@ int __cpuinit acpi_processor_power_init( "ACPI: processor limited to max C-state %d\n", max_cstate); first_run++; -#ifdef CONFIG_SMP - register_latency_notifier(&acpi_processor_latency_notifier); -#endif } if (!pr) @@ -1245,6 +742,7 @@ int __cpuinit acpi_processor_power_init( acpi_processor_get_power_info(pr); + /* * Install the idle handler if processor power management is supported. * Note that we use previously set idle handler will be used on @@ -1257,11 +755,6 @@ int __cpuinit acpi_processor_power_init( printk(" C%d[C%d]", i, pr->power.states[i].type); printk(")\n"); - - if (pr->id == 0) { - pm_idle_save = pm_idle; - pm_idle = acpi_processor_idle; - } } /* 'power' [R] */ @@ -1289,21 +782,349 @@ int acpi_processor_power_exit(struct acp if (acpi_device_dir(device)) remove_proc_entry(ACPI_PROCESSOR_FILE_POWER, acpi_device_dir(device)); + return 0; +} - /* Unregister the idle handler when processor #0 is removed. */ - if (pr->id == 0) { - pm_idle = pm_idle_save; +/** + * ticks_elapsed - a helper function that determines how many ticks (in US) + * have elapsed between two PM Timer timestamps + * @t1: the start time + * @t2: the end time + */ +static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2) +{ + if (t2 >= t1) + return PM_TIMER_TICKS_TO_US(t2 - t1); + else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER)) + return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); + else + return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2); +} - /* - * We are about to unload the current idle thread pm callback - * (pm_idle), Wait for all processors to update cached/local - * copies of pm_idle before proceeding. - */ - cpu_idle_wait(); -#ifdef CONFIG_SMP - unregister_latency_notifier(&acpi_processor_latency_notifier); +static inline u32 ticks_elapsed(u32 t1, u32 t2) +{ + if (t2 >= t1) + return (t2 - t1); + else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER)) + return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); + else + return ((0xFFFFFFFF - t1) + t2); +} + +/** + * acpi_idle_update_bm_rld - updates the BM_RLD bit depending on target state + * @pr: the processor + * @target: the new target state + */ +static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr, + struct acpi_processor_cx *target) +{ + if (pr->flags.bm_rld_set && target->type != ACPI_STATE_C3) { + acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0); + pr->flags.bm_rld_set = 0; + } + + if (!pr->flags.bm_rld_set && target->type == ACPI_STATE_C3) { + acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1); + pr->flags.bm_rld_set = 1; + } +} + +/** + * acpi_idle_do_entry - a helper function that does C2 and C3 type entry + * @cx: cstate data + */ +static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) +{ + if (cx->space_id == ACPI_CSTATE_FFH) { + /* Call into architectural FFH based C-state */ + acpi_processor_ffh_cstate_enter(cx); + } else { + int unused; + /* IO port based C-state */ + inb(cx->address); + /* Dummy wait op - must do something useless after P_LVL2 read + because chipsets cannot guarantee that STPCLK# signal + gets asserted in time to freeze execution properly. */ + unused = inl(acpi_gbl_FADT.xpm_timer_block.address); + } +} + +/** + * acpi_idle_enter_c1 - enters an ACPI C1 state-type + * @dev: the target CPU + * @state: the state data + * + * This is equivalent to the HALT instruction. + */ +static int acpi_idle_enter_c1(struct cpuidle_device *dev, + struct cpuidle_state *state) +{ + struct acpi_processor *pr; + struct acpi_processor_cx *cx = cpuidle_get_statedata(state); + pr = processors[smp_processor_id()]; + + if (unlikely(!pr)) + return 0; + + if (pr->flags.bm_check) + acpi_idle_update_bm_rld(pr, cx); + + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we test + * NEED_RESCHED: + */ + smp_mb(); + if (!need_resched()) + safe_halt(); + current_thread_info()->status |= TS_POLLING; + + cx->usage++; + + return 0; +} + +/** + * acpi_idle_enter_c2 - enters an ACPI C2 state-type + * @dev: the target CPU + * @state: the state data + */ +static int acpi_idle_enter_c2(struct cpuidle_device *dev, + struct cpuidle_state *state) +{ + struct acpi_processor *pr; + struct acpi_processor_cx *cx = cpuidle_get_statedata(state); + u32 t1, t2; + pr = processors[smp_processor_id()]; + + if (unlikely(!pr)) + return 0; + + if (pr->flags.bm_check) + acpi_idle_update_bm_rld(pr, cx); + + local_irq_disable(); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we test + * NEED_RESCHED: + */ + smp_mb(); + + if (unlikely(need_resched())) { + current_thread_info()->status |= TS_POLLING; + local_irq_enable(); + return 0; + } + + t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + acpi_state_timer_broadcast(pr, cx, 1); + acpi_idle_do_entry(cx); + t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); + +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C2, so notify users */ + mark_tsc_unstable("possible TSC halt in C2"); #endif + + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; + + cx->usage++; + + acpi_state_timer_broadcast(pr, cx, 0); + cx->time += ticks_elapsed(t1, t2); + return ticks_elapsed_in_us(t1, t2); +} + +static int c3_cpu_count; +static DEFINE_SPINLOCK(c3_lock); + +/** + * acpi_idle_enter_c3 - enters an ACPI C3 state-type + * @dev: the target CPU + * @state: the state data + * + * Similar to C2 entry, except special bus master handling is needed. + */ +static int acpi_idle_enter_c3(struct cpuidle_device *dev, + struct cpuidle_state *state) +{ + struct acpi_processor *pr; + struct acpi_processor_cx *cx = cpuidle_get_statedata(state); + u32 t1, t2; + pr = processors[smp_processor_id()]; + + if (unlikely(!pr)) + return 0; + + if (pr->flags.bm_check) + acpi_idle_update_bm_rld(pr, cx); + + local_irq_disable(); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we test + * NEED_RESCHED: + */ + smp_mb(); + + if (unlikely(need_resched())) { + current_thread_info()->status |= TS_POLLING; + local_irq_enable(); + return 0; + } + + /* + * Must be done before busmaster disable as we might need to + * access HPET ! + */ + acpi_state_timer_broadcast(pr, cx, 1); + + /* disable bus master */ + if (pr->flags.bm_check) { + spin_lock(&c3_lock); + c3_cpu_count++; + if (c3_cpu_count == num_online_cpus()) { + /* + * All CPUs are trying to go to C3 + * Disable bus master arbitration + */ + acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); + } + spin_unlock(&c3_lock); + } else { + /* SMP with no shared cache... Invalidate cache */ + ACPI_FLUSH_CPU_CACHE(); } + /* Get start time (ticks) */ + t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + acpi_idle_do_entry(cx); + t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); + + if (pr->flags.bm_check) { + spin_lock(&c3_lock); + /* Enable bus master arbitration */ + if (c3_cpu_count == num_online_cpus()) + acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); + c3_cpu_count--; + spin_unlock(&c3_lock); + } + +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C3, so notify users */ + mark_tsc_unstable("TSC halts in C3"); +#endif + + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; + + cx->usage++; + + acpi_state_timer_broadcast(pr, cx, 0); + cx->time += ticks_elapsed(t1, t2); + return ticks_elapsed_in_us(t1, t2); +} + +/** + * acpi_idle_bm_check - checks if bus master activity was detected + */ +static int acpi_idle_bm_check(void) +{ + u32 bm_status = 0; + + acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); + if (bm_status) + acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); + /* + * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect + * the true state of bus mastering activity; forcing us to + * manually check the BMIDEA bit of each IDE channel. + */ + else if (errata.piix4.bmisx) { + if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01) + || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01)) + bm_status = 1; + } + return bm_status; +} + +/** + * acpi_idle_init - attaches the driver to a CPU + * @dev: the CPU + */ +static int acpi_idle_init(struct cpuidle_device *dev) +{ + int cpu = dev->cpu; + int i, count = 0; + struct acpi_processor_cx *cx; + struct cpuidle_state *state; + + struct acpi_processor *pr = processors[cpu]; + + if (!pr->flags.power_setup_done) + return -EINVAL; + + if (pr->flags.power == 0) { + return -EINVAL; + } + + for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) { + cx = &pr->power.states[i]; + state = &dev->states[count]; + + if (!cx->valid) + continue; + +#ifdef CONFIG_HOTPLUG_CPU + if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) && + !pr->flags.has_cst && + !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) + continue; +#endif + cpuidle_set_statedata(state, cx); + + state->exit_latency = cx->latency; + state->target_residency = cx->latency * 6; + state->power_usage = cx->power; + + state->flags = 0; + switch (cx->type) { + case ACPI_STATE_C1: + state->flags |= CPUIDLE_FLAG_SHALLOW; + state->enter = acpi_idle_enter_c1; + break; + + case ACPI_STATE_C2: + state->flags |= CPUIDLE_FLAG_BALANCED; + state->flags |= CPUIDLE_FLAG_TIME_VALID; + state->enter = acpi_idle_enter_c2; + break; + + case ACPI_STATE_C3: + state->flags |= CPUIDLE_FLAG_DEEP; + state->flags |= CPUIDLE_FLAG_TIME_VALID; + state->flags |= CPUIDLE_FLAG_CHECK_BM; + state->enter = acpi_idle_enter_c3; + break; + } + + count++; + } + + if (!count) + return -EINVAL; + + dev->state_count = count; return 0; } + +struct cpuidle_driver acpi_idle_driver = { + .name = "acpi_idle", + .init = acpi_idle_init, + .redetect = acpi_idle_init, + .bm_check = acpi_idle_bm_check, + .owner = THIS_MODULE, +}; Index: linux-2.6.22-rc5/include/acpi/processor.h =================================================================== --- linux-2.6.22-rc5.orig/include/acpi/processor.h 2007-06-17 08:52:04.000000000 +0200 +++ linux-2.6.22-rc5/include/acpi/processor.h 2007-06-17 08:52:05.000000000 +0200 @@ -161,6 +161,7 @@ struct acpi_processor_flags { u8 bm_check:1; u8 has_cst:1; u8 power_setup_done:1; + u8 bm_rld_set:1; }; struct acpi_processor { @@ -279,6 +280,8 @@ int acpi_processor_power_init(struct acp int acpi_processor_cst_has_changed(struct acpi_processor *pr); int acpi_processor_power_exit(struct acpi_processor *pr, struct acpi_device *device); +extern struct cpuidle_driver acpi_idle_driver; +void acpi_max_cstate_changed(void); /* in processor_thermal.c */ int acpi_processor_get_limit_info(struct acpi_processor *pr); Index: linux-2.6.22-rc5/Documentation/cpuidle/core.txt =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/Documentation/cpuidle/core.txt 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,17 @@ + + Supporting multiple CPU idle levels in kernel + + cpuidle + +General Information: + +Various CPUs today support multiple idle levels that are differentiated +by varying exit latencies and power consumption during idle. +cpuidle is a generic in-kernel infrastructure that separates +idle policy (governor) from idle mechanism (driver) and provides a +standardized infrastructure to support independent development of +governors and drivers. + +cpuidle resides under drivers/cpuidle. + + Index: linux-2.6.22-rc5/Documentation/cpuidle/driver.txt =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/Documentation/cpuidle/driver.txt 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,29 @@ + + + Supporting multiple CPU idle levels in kernel + + cpuidle drivers + + + + +cpuidle driver hooks into the cpuidle infrastructure and does the +architecture/platform dependent part of CPU idle states. Driver +provides the platform idle state detection capability and also +has mechanisms in place to support actusl entry-exit into a CPU idle state. + +cpuidle driver supports capability detection for a platform using the +init and exit routines. They will be called for each online CPU, with a +percpu cpuidle_driver object and driver should fill in cpuidle_states +inside cpuidle_driver depending on the CPU capability. + +Driver can handle dynamic state changes (like battery<->AC), by calling +force_redetect interface. + +It is possible to have more than one driver registered at the same time and +user can switch between drivers using /sysfs interface (when enabled). + +Interfaces: +int cpuidle_register_driver(struct cpuidle_driver *drv); +void cpuidle_unregister_driver(struct cpuidle_driver *drv); +int cpuidle_force_redetect(struct cpuidle_device *dev); Index: linux-2.6.22-rc5/Documentation/cpuidle/governor.txt =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/Documentation/cpuidle/governor.txt 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,28 @@ + + + + Supporting multiple CPU idle levels in kernel + + cpuidle governors + + + + +cpuidle governor is policy routine that decides what idle state to enter at +any given time. cpuidle core uses different callbacks to governor while +handling idle entry. +* select_state() callback where governor can determine next idle state to enter +* prepare_idle() callback is called before entering an idle state +* scan() callback is called after a driver forces redetection of the states + +More than one governor can be registered at the same time and +user can switch between drivers using /sysfs interface (when supported). + +More than one governor part is supported for developers to easily experiment +with different governors. By default, most optimal governor based on your +kernel configuration and platform will be selected by cpuidle. + +Interfaces: +int cpuidle_register_governor(struct cpuidle_governor *gov); +void cpuidle_unregister_governor(struct cpuidle_governor *gov); + Index: linux-2.6.22-rc5/Documentation/cpuidle/sysfs.txt =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/Documentation/cpuidle/sysfs.txt 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,35 @@ + + + Supporting multiple CPU idle levels in kernel + + cpuidle sysfs + +System global cpuidle related information and tunables are under +/sys/devices/system/cpu/cpuidle + +The current interfaces in this directory has self-explanatory names: +* current_driver_ro +* current_governor_ro + +With cpuidle_sysfs_switch boot option (meant for developer testing) +following objects are visible instead. +* available_drivers +* available_governors +* current_driver +* current_governor +In this case user can switch the driver, governor at run time by writing +onto current_driver and current_governor. + + +Per logical CPU specific cpuidle information are under +/sys/devices/system/cpu/cpuX/cpuidle +for each online cpu X + +Under this percpu directory, there is a directory for each idle state supported +by the driver, which in turn has +* latency : Latency to exit out of this idle state (in microseconds) +* power : Power consumed while in this idle state (in milliwatts) +* time : Total time spent in this idle state (in microseconds) +* usage : Number of times this state was entered (count) + + Index: linux-2.6.22-rc5/drivers/acpi/osl.c =================================================================== --- linux-2.6.22-rc5.orig/drivers/acpi/osl.c 2007-06-17 08:52:03.000000000 +0200 +++ linux-2.6.22-rc5/drivers/acpi/osl.c 2007-06-17 08:52:06.000000000 +0200 @@ -1056,6 +1056,17 @@ unsigned int max_cstate = ACPI_PROCESSOR EXPORT_SYMBOL(max_cstate); +void (*acpi_do_set_cstate_limit)(void); +EXPORT_SYMBOL(acpi_do_set_cstate_limit); + +void acpi_set_cstate_limit(unsigned int new_limit) +{ + max_cstate = new_limit; + if (acpi_do_set_cstate_limit) + acpi_do_set_cstate_limit(); +} +EXPORT_SYMBOL(acpi_set_cstate_limit); + /* * Acquire a spinlock. * Index: linux-2.6.22-rc5/include/linux/acpi.h =================================================================== --- linux-2.6.22-rc5.orig/include/linux/acpi.h 2007-06-17 08:52:03.000000000 +0200 +++ linux-2.6.22-rc5/include/linux/acpi.h 2007-06-17 08:52:05.000000000 +0200 @@ -206,11 +206,8 @@ static inline unsigned int acpi_get_csta { return max_cstate; } -static inline void acpi_set_cstate_limit(unsigned int new_limit) -{ - max_cstate = new_limit; - return; -} +extern void (*acpi_do_set_cstate_limit)(void); +extern void acpi_set_cstate_limit(unsigned int new_limit); #else static inline unsigned int acpi_get_cstate_limit(void) { return 0; } static inline void acpi_set_cstate_limit(unsigned int new_limit) { return; } Index: linux-2.6.22-rc5/arch/i386/kernel/process.c =================================================================== --- linux-2.6.22-rc5.orig/arch/i386/kernel/process.c 2007-06-17 08:52:02.000000000 +0200 +++ linux-2.6.22-rc5/arch/i386/kernel/process.c 2007-06-17 08:52:05.000000000 +0200 @@ -179,13 +179,14 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + tick_nohz_stop_sched_tick(); + check_pgt_cache(); rmb(); idle = pm_idle; Index: linux-2.6.22-rc5/include/linux/tick.h =================================================================== --- linux-2.6.22-rc5.orig/include/linux/tick.h 2007-06-17 08:52:02.000000000 +0200 +++ linux-2.6.22-rc5/include/linux/tick.h 2007-06-17 08:52:08.000000000 +0200 @@ -7,6 +7,7 @@ #define _LINUX_TICK_H #include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -40,6 +41,7 @@ enum tick_nohz_mode { * @idle_sleeps: Number of idle calls, where the sched tick was stopped * @idle_entrytime: Time when the idle call was entered * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @sleep_length: Duration of the current idle sleep */ struct tick_sched { struct hrtimer sched_timer; @@ -52,6 +54,7 @@ struct tick_sched { unsigned long idle_sleeps; ktime_t idle_entrytime; ktime_t idle_sleeptime; + ktime_t sleep_length; unsigned long last_jiffies; unsigned long next_jiffies; ktime_t idle_expires; @@ -100,10 +103,18 @@ static inline int tick_check_oneshot_cha extern void tick_nohz_stop_sched_tick(void); extern void tick_nohz_restart_sched_tick(void); extern void tick_nohz_update_jiffies(void); +extern ktime_t tick_nohz_get_sleep_length(void); +extern unsigned long tick_nohz_get_idle_jiffies(void); # else static inline void tick_nohz_stop_sched_tick(void) { } static inline void tick_nohz_restart_sched_tick(void) { } static inline void tick_nohz_update_jiffies(void) { } +static inline ktime_t tick_nohz_get_sleep_length(void) +{ + ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; + + return len; +} # endif /* !NO_HZ */ #endif Index: linux-2.6.22-rc5/kernel/softirq.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/softirq.c 2007-06-17 08:52:02.000000000 +0200 +++ linux-2.6.22-rc5/kernel/softirq.c 2007-06-17 08:52:05.000000000 +0200 @@ -303,11 +303,6 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -#ifdef CONFIG_NO_HZ - /* Make sure that timer wheel updates are propagated */ - if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) - tick_nohz_stop_sched_tick(); -#endif preempt_enable_no_resched(); } Index: linux-2.6.22-rc5/kernel/time/tick-sched.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/time/tick-sched.c 2007-06-17 08:52:02.000000000 +0200 +++ linux-2.6.22-rc5/kernel/time/tick-sched.c 2007-06-17 08:52:07.000000000 +0200 @@ -153,6 +153,7 @@ void tick_nohz_stop_sched_tick(void) unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; struct tick_sched *ts; ktime_t last_update, expires, now, delta; + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; int cpu; local_irq_save(flags); @@ -290,11 +291,36 @@ void tick_nohz_stop_sched_tick(void) out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; + ts->sleep_length = ktime_sub(dev->next_event, now); end: local_irq_restore(flags); } /** + * tick_nohz_get_sleep_length - return the length of the current sleep + * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_sleep_length(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + return ts->sleep_length; +} +EXPORT_SYMBOL_GPL(tick_nohz_get_sleep_length); + +/** + * tick_nohz_get_idle_jiffies - returns the current idle jiffie count + */ +unsigned long tick_nohz_get_idle_jiffies(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + return ts->idle_jiffies; +} +EXPORT_SYMBOL_GPL(tick_nohz_get_idle_jiffies); + +/** * nohz_restart_sched_tick - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle @@ -546,6 +572,7 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); + u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -554,8 +581,12 @@ void tick_setup_sched_timer(void) ts->sched_timer.function = tick_sched_timer; ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; - /* Get the next period */ + /* Get the next period (per cpu)*/ ts->sched_timer.expires = tick_init_jiffy_update(); + offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, NR_CPUS); + offset *= smp_processor_id(); + ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); Index: linux-2.6.22-rc5/arch/arm/kernel/process.c =================================================================== --- linux-2.6.22-rc5.orig/arch/arm/kernel/process.c 2007-06-17 08:52:02.000000000 +0200 +++ linux-2.6.22-rc5/arch/arm/kernel/process.c 2007-06-17 08:52:05.000000000 +0200 @@ -130,6 +130,7 @@ static void default_idle(void) else { local_irq_disable(); if (!need_resched()) { + tick_nohz_stop_sched_tick(); timer_dyn_reprogram(); arch_idle(); } @@ -160,7 +161,6 @@ void cpu_idle(void) if (!idle) idle = default_idle; leds_event(led_idle_start); - tick_nohz_stop_sched_tick(); while (!need_resched()) idle(); leds_event(led_idle_end); Index: linux-2.6.22-rc5/arch/sh/kernel/process.c =================================================================== --- linux-2.6.22-rc5.orig/arch/sh/kernel/process.c 2007-06-17 08:52:02.000000000 +0200 +++ linux-2.6.22-rc5/arch/sh/kernel/process.c 2007-06-17 08:52:05.000000000 +0200 @@ -62,8 +62,10 @@ void default_idle(void) clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); set_bl_bit(); - while (!need_resched()) + while (!need_resched()) { + tick_nohz_stop_sched_tick(); cpu_sleep(); + } clear_bl_bit(); set_thread_flag(TIF_POLLING_NRFLAG); } else @@ -82,7 +84,6 @@ void cpu_idle(void) if (!idle) idle = default_idle; - tick_nohz_stop_sched_tick(); while (!need_resched()) idle(); tick_nohz_restart_sched_tick(); Index: linux-2.6.22-rc5/arch/sparc64/kernel/process.c =================================================================== --- linux-2.6.22-rc5.orig/arch/sparc64/kernel/process.c 2007-06-17 08:52:02.000000000 +0200 +++ linux-2.6.22-rc5/arch/sparc64/kernel/process.c 2007-06-17 08:52:05.000000000 +0200 @@ -89,9 +89,10 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while(1) { - tick_nohz_stop_sched_tick(); - while (!need_resched()) + while (!need_resched()) { + tick_nohz_stop_sched_tick(); sparc64_yield(); + } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); Index: linux-2.6.22-rc5/drivers/cpuidle/governors/menu.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/drivers/cpuidle/governors/menu.c 2007-06-17 08:52:06.000000000 +0200 @@ -0,0 +1,181 @@ +/* + * menu.c - the menu idle governor + * + * Copyright (C) 2006-2007 Adam Belay + * + * This code is licenced under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define BM_HOLDOFF 20000 /* 20 ms */ +#define DEMOTION_THRESHOLD 5 +#define DEMOTION_TIMEOUT_MULTIPLIER 1000 + +struct menu_device { + int last_state_idx; + + int deepest_break_state; + struct timespec break_expire_time_ts; + int break_last_cnt; + + int deepest_bm_state; + int bm_elapsed_us; + int bm_holdoff_us; +}; + +static DEFINE_PER_CPU(struct menu_device, menu_devices); + +/** + * menu_select - selects the next idle state to enter + * @dev: the CPU + */ +static int menu_select(struct cpuidle_device *dev) +{ + struct menu_device *data = &__get_cpu_var(menu_devices); + int i, expected_us, max_state = dev->state_count; + + /* discard BM history because it is sticky */ + cpuidle_get_bm_activity(); + + /* determine the expected residency time */ + expected_us = (s32) ktime_to_ns(tick_nohz_get_sleep_length()) / 1000; + + /* determine the maximum state compatible with current BM status */ + if (cpuidle_get_bm_activity()) + data->bm_elapsed_us = 0; + if (data->bm_elapsed_us <= data->bm_holdoff_us) + max_state = data->deepest_bm_state + 1; + + /* determine the maximum state compatible with recent idle breaks */ + if (data->deepest_break_state >= 0) { + struct timespec now; + ktime_get_ts(&now); + if (timespec_compare(&data->break_expire_time_ts, &now) > 0) { + max_state = min(max_state, + data->deepest_break_state + 1); + } else { + data->deepest_break_state = -1; + } + } + + /* find the deepest idle state that satisfies our constraints */ + for (i = 1; i < max_state; i++) { + struct cpuidle_state *s = &dev->states[i]; + + if (s->target_residency > expected_us) + break; + + if (s->exit_latency > system_latency_constraint()) + break; + } + + if (data->last_state_idx != i - 1) + data->break_last_cnt = 0; + + data->last_state_idx = i - 1; + return i - 1; +} + +/** + * menu_reflect - attempts to guess what happened after entry + * @dev: the CPU + * + * NOTE: it's important to be fast here because this operation will add to + * the overall exit latency. + */ +static void menu_reflect(struct cpuidle_device *dev) +{ + struct menu_device *data = &__get_cpu_var(menu_devices); + int last_idx = data->last_state_idx; + int measured_us = cpuidle_get_last_residency(dev); + struct cpuidle_state *target = &dev->states[last_idx]; + + /* + * Ugh, this idle state doesn't support residency measurements, so we + * are basically lost in the dark. As a compromise, assume we slept + * for one full standard timer tick. However, be aware that this + * could potentially result in a suboptimal state transition. + */ + if (!(target->flags & CPUIDLE_FLAG_TIME_VALID)) + measured_us = USEC_PER_SEC / HZ; + + data->bm_elapsed_us += measured_us; + + if (data->last_state_idx == 0) + return; + + /* + * Did something other than the timer interrupt + * cause an early break event? + */ + if (unlikely(measured_us < target->target_residency)) { + if (data->break_last_cnt > DEMOTION_THRESHOLD) { + data->deepest_break_state = data->last_state_idx - 1; + ktime_get_ts(&data->break_expire_time_ts); + timespec_add_ns(&data->break_expire_time_ts, + target->target_residency * + DEMOTION_TIMEOUT_MULTIPLIER); + } else { + data->break_last_cnt++; + } + } else { + if (data->break_last_cnt > 0) + data->break_last_cnt--; + } +} + +/** + * menu_scan_device - scans a CPU's states and does setup + * @dev: the CPU + */ +static void menu_scan_device(struct cpuidle_device *dev) +{ + struct menu_device *data = &per_cpu(menu_devices, dev->cpu); + int i; + + data->last_state_idx = 0; + data->bm_elapsed_us = 0; + data->bm_holdoff_us = BM_HOLDOFF; + data->deepest_break_state = -1; + + for (i = 1; i < dev->state_count; i++) + if (dev->states[i].flags & CPUIDLE_FLAG_CHECK_BM) + break; + data->deepest_bm_state = i - 1; +} + +struct cpuidle_governor menu_governor = { + .name = "menu", + .rating = 20, + .scan = menu_scan_device, + .select = menu_select, + .reflect = menu_reflect, + .owner = THIS_MODULE, +}; + +/** + * init_menu - initializes the governor + */ +static int __init init_menu(void) +{ + return cpuidle_register_governor(&menu_governor); +} + +/** + * exit_menu - exits the governor + */ +static void __exit exit_menu(void) +{ + cpuidle_unregister_governor(&menu_governor); +} + +MODULE_LICENSE("GPL"); +module_init(init_menu); +module_exit(exit_menu); Index: linux-2.6.22-rc5/kernel/time/clockevents.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/time/clockevents.c 2007-06-17 08:52:01.000000000 +0200 +++ linux-2.6.22-rc5/kernel/time/clockevents.c 2007-06-17 08:52:07.000000000 +0200 @@ -205,47 +205,6 @@ void clockevents_exchange_device(struct } /** - * clockevents_request_device - */ -struct clock_event_device *clockevents_request_device(unsigned int features, - cpumask_t cpumask) -{ - struct clock_event_device *cur, *dev = NULL; - struct list_head *tmp; - - spin_lock(&clockevents_lock); - - list_for_each(tmp, &clockevent_devices) { - cur = list_entry(tmp, struct clock_event_device, list); - - if ((cur->features & features) == features && - cpus_equal(cpumask, cur->cpumask)) { - if (!dev || dev->rating < cur->rating) - dev = cur; - } - } - - clockevents_exchange_device(NULL, dev); - - spin_unlock(&clockevents_lock); - - return dev; -} - -/** - * clockevents_release_device - */ -void clockevents_release_device(struct clock_event_device *dev) -{ - spin_lock(&clockevents_lock); - - clockevents_exchange_device(dev, NULL); - clockevents_notify_released(); - - spin_unlock(&clockevents_lock); -} - -/** * clockevents_notify - notification about relevant events */ void clockevents_notify(unsigned long reason, void *arg) Index: linux-2.6.22-rc5/include/linux/clockchips.h =================================================================== --- linux-2.6.22-rc5.orig/include/linux/clockchips.h 2007-06-17 08:52:01.000000000 +0200 +++ linux-2.6.22-rc5/include/linux/clockchips.h 2007-06-17 08:52:07.000000000 +0200 @@ -23,6 +23,7 @@ enum clock_event_mode { CLOCK_EVT_MODE_SHUTDOWN, CLOCK_EVT_MODE_PERIODIC, CLOCK_EVT_MODE_ONESHOT, + CLOCK_EVT_MODE_RESUME, }; /* Clock event notification values */ @@ -119,10 +120,6 @@ extern void clockevents_register_device( extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); -extern -struct clock_event_device *clockevents_request_device(unsigned int features, - cpumask_t cpumask); -extern void clockevents_release_device(struct clock_event_device *dev); extern void clockevents_set_mode(struct clock_event_device *dev, enum clock_event_mode mode); extern int clockevents_register_notifier(struct notifier_block *nb); Index: linux-2.6.22-rc5/kernel/time/timekeeping.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/time/timekeeping.c 2007-06-17 08:52:01.000000000 +0200 +++ linux-2.6.22-rc5/kernel/time/timekeeping.c 2007-06-17 08:52:08.000000000 +0200 @@ -151,6 +151,9 @@ int do_settimeofday(struct timespec *tv) clock->error = 0; ntp_clear(); + if (clock->settimeofday) + clock->settimeofday(clock, tv); + update_vsyscall(&xtime, clock); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -391,7 +394,7 @@ static __always_inline int clocksource_b * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void clocksource_adjust(struct clocksource *clock, s64 offset) +static void clocksource_adjust(s64 offset) { s64 error, interval = clock->cycle_interval; int adj; @@ -466,7 +469,7 @@ void update_wall_time(void) } /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); + clocksource_adjust(offset); /* store full nanoseconds into xtime */ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; Index: linux-2.6.22-rc5/kernel/timer.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/timer.c 2007-06-17 08:52:01.000000000 +0200 +++ linux-2.6.22-rc5/kernel/timer.c 2007-06-17 08:52:07.000000000 +0200 @@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_bas static inline void timer_set_deferrable(struct timer_list *timer) { timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + TBASE_DEFERRABLE_FLAG)); } static inline void timer_set_base(struct timer_list *timer, tvec_base_t *new_base) { timer->base = (tvec_base_t *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); + tbase_get_deferrable(timer->base)); } /** @@ -431,10 +431,10 @@ EXPORT_SYMBOL(__mod_timer); void add_timer_on(struct timer_list *timer, int cpu) { tvec_base_t *base = per_cpu(tvec_bases, cpu); - unsigned long flags; + unsigned long flags; timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); + BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); internal_add_timer(base, timer); @@ -613,7 +613,7 @@ static inline void __run_timers(tvec_bas while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; + int index = base->timer_jiffies & TVR_MASK; /* * Cascade timers: @@ -630,8 +630,8 @@ static inline void __run_timers(tvec_bas unsigned long data; timer = list_first_entry(head, struct timer_list,entry); - fn = timer->function; - data = timer->data; + fn = timer->function; + data = timer->data; timer_stats_account_timer(timer); @@ -675,8 +675,8 @@ static unsigned long __next_timer_interr index = slot = timer_jiffies & TVR_MASK; do { list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; + if (tbase_get_deferrable(nte->base)) + continue; found = 1; expires = nte->expires; @@ -820,7 +820,7 @@ void update_process_times(int user_tick) if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); scheduler_tick(); - run_posix_cpu_timers(p); + run_posix_cpu_timers(p); } /* @@ -895,7 +895,7 @@ static inline void update_times(unsigned update_wall_time(); calc_load(ticks); } - + /* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. @@ -1091,7 +1091,7 @@ asmlinkage long sys_gettid(void) /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill - */ + */ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; Index: linux-2.6.22-rc5/arch/i386/kernel/apic.c =================================================================== --- linux-2.6.22-rc5.orig/arch/i386/kernel/apic.c 2007-06-17 08:52:01.000000000 +0200 +++ linux-2.6.22-rc5/arch/i386/kernel/apic.c 2007-06-17 08:52:07.000000000 +0200 @@ -263,6 +263,9 @@ static void lapic_timer_setup(enum clock v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write_around(APIC_LVTT, v); break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; } local_irq_restore(flags); @@ -521,6 +524,9 @@ void __init setup_boot_APIC_clock(void) */ if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); } /* Setup the lapic or request the broadcast */ Index: linux-2.6.22-rc5/arch/i386/kernel/hpet.c =================================================================== --- linux-2.6.22-rc5.orig/arch/i386/kernel/hpet.c 2007-06-17 08:52:01.000000000 +0200 +++ linux-2.6.22-rc5/arch/i386/kernel/hpet.c 2007-06-17 08:52:10.000000000 +0200 @@ -1,11 +1,14 @@ #include #include +#include #include #include #include +#include #include #include +#include #include #include @@ -21,8 +24,34 @@ extern struct clock_event_device *global * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; + static void __iomem * hpet_virt_address; +#ifdef CONFIG_X86_64 + +#include +#include + +static inline void hpet_set_mapping(void) +{ + set_fixmap_nocache(FIX_HPET_BASE, hpet_address); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); + +} + +static inline void __iomem *hpet_get_virt_address(void) +{ + return hpet_virt_address; +} + +static inline void hpet_clear_mapping(void) +{ + hpet_virt_address = NULL; +} + +#else + static inline unsigned long hpet_readl(unsigned long a) { return readl(hpet_virt_address + a); @@ -33,6 +62,23 @@ static inline void hpet_writel(unsigned writel(d, hpet_virt_address + a); } +static inline void hpet_set_mapping(void) +{ + hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); +} + +static inline void __iomem *hpet_get_virt_address(void) +{ + return hpet_virt_address; +} + +static inline void hpet_clear_mapping(void) +{ + iounmap(hpet_virt_address); + hpet_virt_address = NULL; +} +#endif + /* * HPET command line enable / disable */ @@ -48,6 +94,13 @@ static int __init hpet_setup(char* str) } __setup("hpet=", hpet_setup); +static int __init disable_hpet(char *str) +{ + boot_hpet_disable = 1; + return 1; +} +__setup("nohpet", disable_hpet); + static inline int is_hpet_capable(void) { return (!boot_hpet_disable && hpet_address); @@ -73,7 +126,7 @@ int is_hpet_enabled(void) #ifdef CONFIG_HPET static void hpet_reserve_platform_timers(unsigned long id) { - struct hpet __iomem *hpet = hpet_virt_address; + struct hpet __iomem *hpet = hpet_get_virt_address(); struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; unsigned int nrtimers, i; struct hpet_data hd; @@ -82,7 +135,7 @@ static void hpet_reserve_platform_timers memset(&hd, 0, sizeof (hd)); hd.hd_phys_address = hpet_address; - hd.hd_address = hpet_virt_address; + hd.hd_address = hpet; hd.hd_nirqs = nrtimers; hd.hd_flags = HPET_DATA_PLATFORM; hpet_reserve_timer(&hd, 0); @@ -110,9 +163,9 @@ static void hpet_reserve_platform_timers */ static unsigned long hpet_period; -static void hpet_set_mode(enum clock_event_mode mode, +static void hpet_legacy_set_mode(enum clock_event_mode mode, struct clock_event_device *evt); -static int hpet_next_event(unsigned long delta, +static int hpet_legacy_next_event(unsigned long delta, struct clock_event_device *evt); /* @@ -121,10 +174,11 @@ static int hpet_next_event(unsigned long static struct clock_event_device hpet_clockevent = { .name = "hpet", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = hpet_set_mode, - .set_next_event = hpet_next_event, + .set_mode = hpet_legacy_set_mode, + .set_next_event = hpet_legacy_next_event, .shift = 32, .irq = 0, + .rating = 50, }; static void hpet_start_counter(void) @@ -139,7 +193,18 @@ static void hpet_start_counter(void) hpet_writel(cfg, HPET_CFG); } -static void hpet_enable_int(void) +static void hpet_resume_device(void) +{ + force_hpet_resume(); +} + +static void hpet_restart_counter(void) +{ + hpet_resume_device(); + hpet_start_counter(); +} + +static void hpet_enable_legacy_int(void) { unsigned long cfg = hpet_readl(HPET_CFG); @@ -148,7 +213,39 @@ static void hpet_enable_int(void) hpet_legacy_int_enabled = 1; } -static void hpet_set_mode(enum clock_event_mode mode, +static void hpet_legacy_clockevent_register(void) +{ + uint64_t hpet_freq; + + /* Start HPET legacy interrupts */ + hpet_enable_legacy_int(); + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. + */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, 32); + /* Calculate the min / max delta */ + hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &hpet_clockevent); + hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, + &hpet_clockevent); + + /* + * Start hpet with the boot cpu mask and make it + * global after the IO_APIC has been initialized. + */ + hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + clockevents_register_device(&hpet_clockevent); + global_clock_event = &hpet_clockevent; + printk(KERN_DEBUG "hpet clockevent registered\n"); +} + +static void hpet_legacy_set_mode(enum clock_event_mode mode, struct clock_event_device *evt) { unsigned long cfg, cmp, now; @@ -187,10 +284,14 @@ static void hpet_set_mode(enum clock_eve cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T0_CFG); break; + + case CLOCK_EVT_MODE_RESUME: + hpet_enable_legacy_int(); + break; } } -static int hpet_next_event(unsigned long delta, +static int hpet_legacy_next_event(unsigned long delta, struct clock_event_device *evt) { unsigned long cnt; @@ -210,6 +311,13 @@ static cycle_t read_hpet(void) return (cycle_t)hpet_readl(HPET_COUNTER); } +#ifdef CONFIG_X86_64 +static cycle_t __vsyscall_fn vread_hpet(void) +{ + return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} +#endif + static struct clocksource clocksource_hpet = { .name = "hpet", .rating = 250, @@ -217,21 +325,77 @@ static struct clocksource clocksource_hp .mask = HPET_MASK, .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_restart_counter, +#ifdef CONFIG_X86_64 + .vread = vread_hpet, +#endif }; +static int hpet_clocksource_register(void) +{ + u64 tmp, start, now; + cycle_t t1, t2; + + /* Start the counter */ + hpet_start_counter(); + + /* Verify whether hpet counter works */ + t1 = read_hpet(); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + t2 = read_hpet(); + rdtscll(now); + } while (now - start < 200000UL); + + if (t1 == t2) { + printk(KERN_WARNING + "HPET counter not counting. HPET disabled\n"); + return -ENODEV; + } + + /* Initialize and register HPET clocksource + * + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * aproximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + clocksource_register(&clocksource_hpet); + + return 0; +} + /* * Try to setup the HPET timer */ int __init hpet_enable(void) { unsigned long id; - uint64_t hpet_freq; - u64 tmp; + + if (hpet_get_virt_address()) + return 0; if (!is_hpet_capable()) return 0; - hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_set_mapping(); /* * Read the period and check for a sane value: @@ -241,21 +405,6 @@ int __init hpet_enable(void) goto out_nohpet; /* - * The period is a femto seconds value. We need to calculate the - * scaled math multiplication factor for nanosecond to hpet tick - * conversion. - */ - hpet_freq = 1000000000000000ULL; - do_div(hpet_freq, hpet_period); - hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, - NSEC_PER_SEC, 32); - /* Calculate the min / max delta */ - hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, - &hpet_clockevent); - hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, - &hpet_clockevent); - - /* * Read the HPET ID register to retrieve the IRQ routing * information and the number of channels */ @@ -270,50 +419,43 @@ int __init hpet_enable(void) goto out_nohpet; #endif - /* Start the counter */ - hpet_start_counter(); - - /* Initialize and register HPET clocksource - * - * hpet period is in femto seconds per cycle - * so we need to convert this to ns/cyc units - * aproximated by mult/2^shift - * - * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift - * fsec/cyc * 1ns/1000000fsec * 2^shift = mult - * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult - * (fsec/cyc << shift)/1000000 = mult - * (hpet_period << shift)/FSEC_PER_NSEC = mult - */ - tmp = (u64)hpet_period << HPET_SHIFT; - do_div(tmp, FSEC_PER_NSEC); - clocksource_hpet.mult = (u32)tmp; - - clocksource_register(&clocksource_hpet); - + if (hpet_clocksource_register()) + goto out_nohpet; if (id & HPET_ID_LEGSUP) { - hpet_enable_int(); + hpet_legacy_clockevent_register(); hpet_reserve_platform_timers(id); - /* - * Start hpet with the boot cpu mask and make it - * global after the IO_APIC has been initialized. - */ - hpet_clockevent.cpumask =cpumask_of_cpu(0); - clockevents_register_device(&hpet_clockevent); - global_clock_event = &hpet_clockevent; return 1; } return 0; out_nohpet: - iounmap(hpet_virt_address); - hpet_virt_address = NULL; + hpet_clear_mapping(); boot_hpet_disable = 1; return 0; } +static int __init hpet_late_init(void) +{ + if (boot_hpet_disable) + return -ENODEV; + + if (!hpet_address) { + if (!force_hpet_address) + return -ENODEV; + + hpet_address = force_hpet_address; + hpet_enable(); + if (!hpet_get_virt_address()) + return -ENODEV; + } + + return 0; +} +fs_initcall(hpet_late_init); + + #ifdef CONFIG_HPET_EMULATE_RTC /* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET @@ -524,68 +666,3 @@ irqreturn_t hpet_rtc_interrupt(int irq, return IRQ_HANDLED; } #endif - - -/* - * Suspend/resume part - */ - -#ifdef CONFIG_PM - -static int hpet_suspend(struct sys_device *sys_device, pm_message_t state) -{ - unsigned long cfg = hpet_readl(HPET_CFG); - - cfg &= ~(HPET_CFG_ENABLE|HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_resume(struct sys_device *sys_device) -{ - unsigned int id; - - hpet_start_counter(); - - id = hpet_readl(HPET_ID); - - if (id & HPET_ID_LEGSUP) - hpet_enable_int(); - - return 0; -} - -static struct sysdev_class hpet_class = { - set_kset_name("hpet"), - .suspend = hpet_suspend, - .resume = hpet_resume, -}; - -static struct sys_device hpet_device = { - .id = 0, - .cls = &hpet_class, -}; - - -static __init int hpet_register_sysfs(void) -{ - int err; - - if (!is_hpet_capable()) - return 0; - - err = sysdev_class_register(&hpet_class); - - if (!err) { - err = sysdev_register(&hpet_device); - if (err) - sysdev_class_unregister(&hpet_class); - } - - return err; -} - -device_initcall(hpet_register_sysfs); - -#endif Index: linux-2.6.22-rc5/arch/i386/kernel/i8253.c =================================================================== --- linux-2.6.22-rc5.orig/arch/i386/kernel/i8253.c 2007-06-17 08:52:01.000000000 +0200 +++ linux-2.6.22-rc5/arch/i386/kernel/i8253.c 2007-06-17 08:52:09.000000000 +0200 @@ -3,19 +3,17 @@ * */ #include -#include +#include +#include #include -#include #include -#include +#include #include #include #include #include -#include "io_ports.h" - DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); @@ -41,26 +39,27 @@ static void init_pit_timer(enum clock_ev case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); - udelay(10); outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); outb(LATCH >> 8 , PIT_CH0); /* MSB */ break; - /* - * Avoid unnecessary state transitions, as it confuses - * Geode / Cyrix based boxen. - */ case CLOCK_EVT_MODE_SHUTDOWN: - if (evt->mode == CLOCK_EVT_MODE_UNUSED) - break; case CLOCK_EVT_MODE_UNUSED: - if (evt->mode == CLOCK_EVT_MODE_SHUTDOWN) - break; + if (evt->mode == CLOCK_EVT_MODE_PERIODIC || + evt->mode == CLOCK_EVT_MODE_ONESHOT) { + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } + break; + case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ outb_p(0x38, PIT_MODE); - udelay(10); + break; + + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ break; } spin_unlock_irqrestore(&i8253_lock, flags); @@ -120,6 +119,7 @@ void __init setup_pit_timer(void) global_clock_event = &pit_clockevent; } +#ifndef CONFIG_X86_64 /* * Since the PIT overflows every tick, its not very useful * to just read by itself. So use jiffies to emulate a free @@ -204,3 +204,5 @@ static int __init init_pit_clocksource(v return clocksource_register(&clocksource_pit); } arch_initcall(init_pit_clocksource); + +#endif Index: linux-2.6.22-rc5/kernel/time/tick-broadcast.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/time/tick-broadcast.c 2007-06-17 08:52:01.000000000 +0200 +++ linux-2.6.22-rc5/kernel/time/tick-broadcast.c 2007-06-17 08:52:10.000000000 +0200 @@ -31,6 +31,12 @@ struct tick_device tick_broadcast_device static cpumask_t tick_broadcast_mask; static DEFINE_SPINLOCK(tick_broadcast_lock); +#ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_clear_oneshot(int cpu); +#else +static inline void tick_broadcast_clear_oneshot(int cpu) { } +#endif + /* * Debugging: see timer_list.c */ @@ -49,7 +55,7 @@ cpumask_t *tick_get_broadcast_mask(void) */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { - if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (bc) tick_setup_periodic(bc, 1); } @@ -58,8 +64,9 @@ static void tick_broadcast_start_periodi */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if (tick_broadcast_device.evtdev || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if ((tick_broadcast_device.evtdev && + tick_broadcast_device.evtdev->rating >= dev->rating) || + (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; clockevents_exchange_device(NULL, dev); @@ -99,8 +106,19 @@ int tick_device_uses_broadcast(struct cl cpu_set(cpu, tick_broadcast_mask); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; - } + } else { + /* + * When the new devices is not affected by the stop + * feature and the cpu is marked in the broadcast mask + * then clear the broadcast bit. + */ + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { + int cpu = smp_processor_id(); + cpu_clear(cpu, tick_broadcast_mask); + tick_broadcast_clear_oneshot(cpu); + } + } spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } @@ -299,7 +317,7 @@ void tick_suspend_broadcast(void) spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + if (bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -316,6 +334,8 @@ int tick_resume_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) { + clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); + switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: if(!cpus_empty(tick_broadcast_mask)) @@ -485,16 +505,24 @@ out: spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Reset the one shot broadcast for a cpu + * + * Called with tick_broadcast_lock held + */ +static void tick_broadcast_clear_oneshot(int cpu) +{ + cpu_clear(cpu, tick_broadcast_oneshot_mask); +} + /** * tick_broadcast_setup_highres - setup the broadcast device for highres */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; - } + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; } /* Index: linux-2.6.22-rc5/kernel/time/tick-common.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/time/tick-common.c 2007-06-17 08:52:01.000000000 +0200 +++ linux-2.6.22-rc5/kernel/time/tick-common.c 2007-06-17 08:52:10.000000000 +0200 @@ -200,7 +200,7 @@ static int tick_check_new_device(struct cpu = smp_processor_id(); if (!cpu_isset(cpu, newdev->cpumask)) - goto out; + goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; @@ -265,7 +265,7 @@ out_bc: */ if (tick_check_broadcast_device(newdev)) ret = NOTIFY_STOP; -out: + spin_unlock_irqrestore(&tick_device_lock, flags); return ret; @@ -318,12 +318,17 @@ static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); unsigned long flags; + int broadcast = tick_resume_broadcast(); spin_lock_irqsave(&tick_device_lock, flags); - if (td->mode == TICKDEV_MODE_PERIODIC) - tick_setup_periodic(td->evtdev, 0); - else - tick_resume_oneshot(); + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); + + if (!broadcast) { + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(td->evtdev, 0); + else + tick_resume_oneshot(); + } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -360,8 +365,7 @@ static int tick_notify(struct notifier_b break; case CLOCK_EVT_NOTIFY_RESUME: - if (!tick_resume_broadcast()) - tick_resume(); + tick_resume(); break; default: Index: linux-2.6.22-rc5/kernel/hrtimer.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/hrtimer.c 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/kernel/hrtimer.c 2007-06-17 08:52:08.000000000 +0200 @@ -558,7 +558,8 @@ static inline int hrtimer_enqueue_reprog */ static int hrtimer_switch_to_hres(void) { - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + int cpu = smp_processor_id(); + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -568,6 +569,8 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); + printk(KERN_WARNING "Could not switch to high resolution " + "mode on CPU %d\n", cpu); return 0; } base->hres_active = 1; @@ -683,6 +686,7 @@ static void enqueue_hrtimer(struct hrtim struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; struct hrtimer *entry; + int leftmost = 1; /* * Find the right place in the rbtree: @@ -694,18 +698,19 @@ static void enqueue_hrtimer(struct hrtim * We dont care about collisions. Nodes with * the same expiry time stay together. */ - if (timer->expires.tv64 < entry->expires.tv64) + if (timer->expires.tv64 < entry->expires.tv64) { link = &(*link)->rb_left; - else + } else { link = &(*link)->rb_right; + leftmost = 0; + } } /* * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + if (leftmost) { /* * Reprogram the clock event device. When the timer is already * expired hrtimer_enqueue_reprogram has either called the Index: linux-2.6.22-rc5/kernel/time/tick-oneshot.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/time/tick-oneshot.c 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/kernel/time/tick-oneshot.c 2007-06-17 08:52:07.000000000 +0200 @@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handle struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || - !tick_device_is_functional(dev)) + !tick_device_is_functional(dev)) { + + printk(KERN_INFO "Clockevents: " + "could not switch to one-shot mode:"); + if (!dev) { + printk(" no tick device\n"); + } else { + if (!tick_device_is_functional(dev)) + printk(" %s is not functional.\n", dev->name); + else if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + printk(" %s does not support one-shot mode.\n", + dev->name); + } return -EINVAL; + } td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; Index: linux-2.6.22-rc5/arch/x86_64/kernel/time.c =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/kernel/time.c 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/arch/x86_64/kernel/time.c 2007-06-17 08:52:09.000000000 +0200 @@ -28,11 +28,12 @@ #include #include #include +#include #ifdef CONFIG_ACPI #include /* for PM timer frequency */ #include #endif -#include +#include #include #include #include @@ -44,12 +45,10 @@ #include #include #include - -static char *timename = NULL; +#include DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -79,8 +78,9 @@ EXPORT_SYMBOL(profile_pc); * sheet for details. */ -static void set_rtc_mmss(unsigned long nowtime) +static int set_rtc_mmss(unsigned long nowtime) { + int retval = 0; int real_seconds, real_minutes, cmos_minutes; unsigned char control, freq_select; @@ -120,6 +120,7 @@ static void set_rtc_mmss(unsigned long n if (abs(real_minutes - cmos_minutes) >= 30) { printk(KERN_WARNING "time.c: can't update CMOS clock " "from %d to %d\n", cmos_minutes, real_minutes); + retval = -1; } else { BIN_TO_BCD(real_seconds); BIN_TO_BCD(real_minutes); @@ -139,67 +140,23 @@ static void set_rtc_mmss(unsigned long n CMOS_WRITE(freq_select, RTC_FREQ_SELECT); spin_unlock(&rtc_lock); -} + return retval; +} -void main_timer_handler(void) +int update_persistent_clock(struct timespec now) { - static unsigned long rtc_update = 0; -/* - * Here we are in the timer irq handler. We have irqs locally disabled (so we - * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running - * on the other CPU, so we need a lock. We also need to lock the vsyscall - * variables, because both do_timer() and us change them -arca+vojtech - */ - - write_seqlock(&xtime_lock); - -/* - * Do the timer stuff. - */ - - do_timer(1); -#ifndef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. - */ - - if (!using_apic_timer) - smp_local_timer_interrupt(); - -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. - */ - - if (ntp_synced() && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - - write_sequnlock(&xtime_lock); + return set_rtc_mmss(now.tv_sec); } static irqreturn_t timer_interrupt(int irq, void *dev_id) { - if (apic_runs_main_timer > 1) - return IRQ_HANDLED; - main_timer_handler(); - if (using_apic_timer) - smp_send_timer_broadcast_ipi(); + global_clock_event->event_handler(global_clock_event); + return IRQ_HANDLED; } -static unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned int year, mon, day, hour, min, sec; unsigned long flags; @@ -226,7 +183,7 @@ static unsigned long get_cmos_time(void) /* * We know that x86-64 always uses BCD format, no need to check the * config register. - */ + */ BCD_TO_BIN(sec); BCD_TO_BIN(min); @@ -239,11 +196,11 @@ static unsigned long get_cmos_time(void) BCD_TO_BIN(century); year += century * 100; printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); - } else { + } else { /* * x86-64 systems only exists since 2002. * This will work up to Dec 31, 2100 - */ + */ year += 2000; } @@ -261,17 +218,17 @@ static unsigned int __init tsc_calibrate unsigned long flags; for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; no_ctr_free = (i == 4); if (no_ctr_free) { - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } local_irq_save(flags); /* start meauring cycles, incrementing from 0 */ @@ -279,119 +236,38 @@ static unsigned int __init tsc_calibrate wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); rdtscl(tsc_start); do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles_sync(); + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles_sync(); } while ((tsc_now - tsc_start) < TICK_COUNT); local_irq_restore(flags); if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } return pmc_now * tsc_khz / (tsc_now - tsc_start); } -/* - * pit_calibrate_tsc() uses the speaker output (channel 2) of - * the PIT. This is better than using the timer interrupt output, - * because we can read the value of the speaker with just one inb(), - * where we need three i/o operations for the interrupt channel. - * We count how many ticks the TSC does in 50 ms. - */ - -static unsigned int __init pit_calibrate_tsc(void) -{ - unsigned long start, end; - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - outb(0xb0, 0x43); - outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); - start = get_cycles_sync(); - while ((inb(0x61) & 0x20) == 0); - end = get_cycles_sync(); - - spin_unlock_irqrestore(&i8253_lock, flags); - - return (end - start) / 50; -} - -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 - -static void __pit_init(int val, u8 mode) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(mode, PIT_MODE); - outb_p(val & 0xff, PIT_CH0); /* LSB */ - outb_p(val >> 8, PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} - -void __init pit_init(void) -{ - __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ -} - -void pit_stop_interrupt(void) -{ - __pit_init(0, 0x30); /* mode 0 */ -} - -void stop_timer_interrupt(void) -{ - char *name; - if (hpet_address) { - name = "HPET"; - hpet_timer_stop_set_go(0); - } else { - name = "PIT"; - pit_stop_interrupt(); - } - printk(KERN_INFO "timer: %s interrupt stopped.\n", name); -} - static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, .mask = CPU_MASK_NONE, - .name = "timer" + .name = "timer" }; void __init time_init(void) { - if (nohpet) - hpet_address = 0; - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = 0; - - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - - if (hpet_arch_init()) - hpet_address = 0; - - if (hpet_use_timer) { - /* set tick_nsec to use the proper rate for HPET */ - tick_nsec = TICK_NSEC_HPET; - tsc_khz = hpet_calibrate_tsc(); - timename = "HPET"; - } else { - pit_init(); - tsc_khz = pit_calibrate_tsc(); - timename = "PIT"; - } + if (!hpet_enable()) + setup_pit_timer(); + + setup_irq(0, &irq0); + + tsc_calibrate(); cpu_khz = tsc_khz; if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && @@ -411,79 +287,4 @@ void __init time_init(void) printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); init_tsc_clocksource(); - - setup_irq(0, &irq0); -} - - -static long clock_cmos_diff; -static unsigned long sleep_start; - -/* - * sysfs support for the timer. - */ - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - long cmos_time = get_cmos_time(); - - clock_cmos_diff = -cmos_time; - clock_cmos_diff += get_seconds(); - sleep_start = cmos_time; - return 0; } - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - long sleep_length = (ctime - sleep_start) * HZ; - - if (sleep_length < 0) { - printk(KERN_WARNING "Time skew detected in timer resume!\n"); - /* The time after the resume must not be earlier than the time - * before the suspend or some nasty things will happen - */ - sleep_length = 0; - ctime = sleep_start; - } - if (hpet_address) - hpet_reenable(); - else - i8254_timer_resume(); - - sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock,flags); - touch_softlockup_watchdog(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - -/* XXX this sysfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); Index: linux-2.6.22-rc5/drivers/input/misc/pcspkr.c =================================================================== --- linux-2.6.22-rc5.orig/drivers/input/misc/pcspkr.c 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/drivers/input/misc/pcspkr.c 2007-06-17 08:52:08.000000000 +0200 @@ -24,7 +24,12 @@ MODULE_AUTHOR("Vojtech Pavlik +#else +static DEFINE_SPINLOCK(i8253_lock); +#endif static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { @@ -43,7 +48,7 @@ static int pcspkr_event(struct input_dev if (value > 20 && value < 32767) count = PIT_TICK_RATE / value; - spin_lock_irqsave(&i8253_beep_lock, flags); + spin_lock_irqsave(&i8253_lock, flags); if (count) { /* enable counter 2 */ @@ -58,7 +63,7 @@ static int pcspkr_event(struct input_dev outb(inb_p(0x61) & 0xFC, 0x61); } - spin_unlock_irqrestore(&i8253_beep_lock, flags); + spin_unlock_irqrestore(&i8253_lock, flags); return 0; } Index: linux-2.6.22-rc5/include/asm-x86_64/i8253.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-rc5/include/asm-x86_64/i8253.h 2007-06-17 08:52:09.000000000 +0200 @@ -0,0 +1,2 @@ +#include +#include Index: linux-2.6.22-rc5/include/linux/clocksource.h =================================================================== --- linux-2.6.22-rc5.orig/include/linux/clocksource.h 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/include/linux/clocksource.h 2007-06-17 08:52:08.000000000 +0200 @@ -50,6 +50,7 @@ struct clocksource; * @flags: flags describing special properties * @vread: vsyscall based read * @resume: resume function for the clocksource, if necessary + * @settimeofday: [Optional] Do arch specific work during do_settimeofday * @cycle_interval: Used internally by timekeeping core, please ignore. * @xtime_interval: Used internally by timekeeping core, please ignore. */ @@ -68,6 +69,8 @@ struct clocksource { cycle_t (*vread)(void); void (*resume)(void); + void (*settimeofday)(struct clocksource *cs, struct timespec *ts); + /* timekeeping specific data, ignore */ cycle_t cycle_interval; u64 xtime_interval; Index: linux-2.6.22-rc5/drivers/char/rtc.c =================================================================== --- linux-2.6.22-rc5.orig/drivers/char/rtc.c 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/drivers/char/rtc.c 2007-06-17 08:52:08.000000000 +0200 @@ -82,7 +82,7 @@ #include #include -#if defined(__i386__) +#if defined(__i386__) || defined(__x86_64__) #include #endif Index: linux-2.6.22-rc5/include/asm-x86_64/timex.h =================================================================== --- linux-2.6.22-rc5.orig/include/asm-x86_64/timex.h 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/include/asm-x86_64/timex.h 2007-06-17 08:52:08.000000000 +0200 @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include Index: linux-2.6.22-rc5/include/asm-x86_64/apic.h =================================================================== --- linux-2.6.22-rc5.orig/include/asm-x86_64/apic.h 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/include/asm-x86_64/apic.h 2007-06-17 08:52:09.000000000 +0200 @@ -79,12 +79,12 @@ extern void smp_local_timer_interrupt (v extern void setup_boot_APIC_clock (void); extern void setup_secondary_APIC_clock (void); extern int APIC_init_uniprocessor (void); -extern void disable_APIC_timer(void); -extern void enable_APIC_timer(void); extern void setup_apic_routing(void); -extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, - unsigned char msg_type, unsigned char mask); +extern void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask); + +extern int apic_is_clustered_box(void); #define K8_APIC_EXT_LVT_BASE 0x500 #define K8_APIC_EXT_INT_MSG_FIX 0x0 @@ -93,10 +93,6 @@ extern void setup_APIC_extened_lvt(unsig #define K8_APIC_EXT_INT_MSG_EXT 0x7 #define K8_APIC_EXT_LVT_ENTRY_THRESHOLD 0 -void smp_send_timer_broadcast_ipi(void); -void switch_APIC_timer_to_ipi(void *cpumask); -void switch_ipi_to_APIC_timer(void *cpumask); - #define ARCH_APICTIMER_STOPS_ON_C3 1 extern unsigned boot_cpu_id; Index: linux-2.6.22-rc5/include/asm-x86_64/hpet.h =================================================================== --- linux-2.6.22-rc5.orig/include/asm-x86_64/hpet.h 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/include/asm-x86_64/hpet.h 2007-06-17 08:52:09.000000000 +0200 @@ -1,78 +1,2 @@ -#ifndef _ASM_X8664_HPET_H -#define _ASM_X8664_HPET_H 1 -/* - * Documentation on HPET can be found at: - * http://www.intel.com/ial/home/sp/pcmmspec.htm - * ftp://download.intel.com/ial/home/sp/mmts098.pdf - */ - -#define HPET_MMAP_SIZE 1024 - -#define HPET_ID 0x000 -#define HPET_PERIOD 0x004 -#define HPET_CFG 0x010 -#define HPET_STATUS 0x020 -#define HPET_COUNTER 0x0f0 -#define HPET_Tn_OFFSET 0x20 -#define HPET_Tn_CFG(n) (0x100 + (n) * HPET_Tn_OFFSET) -#define HPET_Tn_ROUTE(n) (0x104 + (n) * HPET_Tn_OFFSET) -#define HPET_Tn_CMP(n) (0x108 + (n) * HPET_Tn_OFFSET) -#define HPET_T0_CFG HPET_Tn_CFG(0) -#define HPET_T0_CMP HPET_Tn_CMP(0) -#define HPET_T1_CFG HPET_Tn_CFG(1) -#define HPET_T1_CMP HPET_Tn_CMP(1) - -#define HPET_ID_VENDOR 0xffff0000 -#define HPET_ID_LEGSUP 0x00008000 -#define HPET_ID_64BIT 0x00002000 -#define HPET_ID_NUMBER 0x00001f00 -#define HPET_ID_REV 0x000000ff -#define HPET_ID_NUMBER_SHIFT 8 - -#define HPET_ID_VENDOR_SHIFT 16 -#define HPET_ID_VENDOR_8086 0x8086 - -#define HPET_CFG_ENABLE 0x001 -#define HPET_CFG_LEGACY 0x002 -#define HPET_LEGACY_8254 2 -#define HPET_LEGACY_RTC 8 - -#define HPET_TN_LEVEL 0x0002 -#define HPET_TN_ENABLE 0x0004 -#define HPET_TN_PERIODIC 0x0008 -#define HPET_TN_PERIODIC_CAP 0x0010 -#define HPET_TN_64BIT_CAP 0x0020 -#define HPET_TN_SETVAL 0x0040 -#define HPET_TN_32BIT 0x0100 -#define HPET_TN_ROUTE 0x3e00 -#define HPET_TN_FSB 0x4000 -#define HPET_TN_FSB_CAP 0x8000 - -#define HPET_TN_ROUTE_SHIFT 9 - -#define HPET_TICK_RATE (HZ * 100000UL) - -extern int is_hpet_enabled(void); -extern int hpet_rtc_timer_init(void); -extern int apic_is_clustered_box(void); -extern int hpet_arch_init(void); -extern int hpet_timer_stop_set_go(unsigned long tick); -extern int hpet_reenable(void); -extern unsigned int hpet_calibrate_tsc(void); - -extern int hpet_use_timer; -extern unsigned long hpet_address; -extern unsigned long hpet_period; -extern unsigned long hpet_tick; - -#ifdef CONFIG_HPET_EMULATE_RTC -extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask); -extern int hpet_set_rtc_irq_bit(unsigned long bit_mask); -extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec); -extern int hpet_set_periodic_freq(unsigned long freq); -extern int hpet_rtc_dropped_irq(void); -extern int hpet_rtc_timer_init(void); -#endif /* CONFIG_HPET_EMULATE_RTC */ - -#endif +#include Index: linux-2.6.22-rc5/arch/i386/kernel/time.c =================================================================== --- linux-2.6.22-rc5.orig/arch/i386/kernel/time.c 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/arch/i386/kernel/time.c 2007-06-17 08:52:08.000000000 +0200 @@ -207,55 +207,9 @@ unsigned long read_persistent_clock(void return retval; } -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); -int no_sync_cmos_clock; - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } extern void (*late_time_init)(void); Index: linux-2.6.22-rc5/arch/sparc64/kernel/time.c =================================================================== --- linux-2.6.22-rc5.orig/arch/sparc64/kernel/time.c 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/arch/sparc64/kernel/time.c 2007-06-17 08:52:08.000000000 +0200 @@ -403,58 +403,9 @@ static struct sparc64_tick_ops hbtick_op static unsigned long timer_ticks_per_nsec_quotient __read_mostly; -#define TICK_SIZE (tick_nsec / 1000) - -#define USEC_AFTER 500000 -#define USEC_BEFORE 500000 - -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } /* Kick start a stopped clock (procedure from the Sun NVRAM/hostid FAQ). */ Index: linux-2.6.22-rc5/include/linux/time.h =================================================================== --- linux-2.6.22-rc5.orig/include/linux/time.h 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/include/linux/time.h 2007-06-17 08:52:08.000000000 +0200 @@ -93,6 +93,8 @@ extern struct timespec wall_to_monotonic extern seqlock_t xtime_lock __attribute__((weak)); extern unsigned long read_persistent_clock(void); +extern int update_persistent_clock(struct timespec now); +extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); static inline unsigned long get_seconds(void) Index: linux-2.6.22-rc5/kernel/time/ntp.c =================================================================== --- linux-2.6.22-rc5.orig/kernel/time/ntp.c 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/kernel/time/ntp.c 2007-06-17 08:52:08.000000000 +0200 @@ -8,11 +8,12 @@ * changelogs. */ +#include +#include #include #include +#include #include -#include -#include #include #include @@ -187,12 +188,64 @@ u64 current_tick_length(void) return tick_length; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE + +/* Disable the cmos update - used by virtualization and embedded */ +int no_sync_cmos_clock __read_mostly; -void __attribute__ ((weak)) notify_arch_cmos_timer(void) +static void sync_cmos_clock(unsigned long dummy); + +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) { - return; + struct timespec now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + */ + if (!ntp_synced()) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). + */ + return; + + getnstimeofday(&now); + if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) + fail = update_persistent_clock(now); + + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); } +static void notify_cmos_timer(void) +{ + if (no_sync_cmos_clock) + mod_timer(&sync_cmos_timer, jiffies + 1); +} + +#else +static inline void notify_cmos_timer(void) { } +#endif + /* adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ @@ -357,6 +410,6 @@ leave: if ((time_status & (STA_UNSYNC|ST txc->stbcnt = 0; write_sequnlock_irq(&xtime_lock); do_gettimeofday(&txc->time); - notify_arch_cmos_timer(); + notify_cmos_timer(); return(result); } Index: linux-2.6.22-rc5/arch/sparc64/Kconfig =================================================================== --- linux-2.6.22-rc5.orig/arch/sparc64/Kconfig 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/arch/sparc64/Kconfig 2007-06-17 08:52:08.000000000 +0200 @@ -23,6 +23,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config GENERIC_CLOCKEVENTS bool default y Index: linux-2.6.22-rc5/include/asm-i386/timer.h =================================================================== --- linux-2.6.22-rc5.orig/include/asm-i386/timer.h 2007-06-17 08:52:00.000000000 +0200 +++ linux-2.6.22-rc5/include/asm-i386/timer.h 2007-06-17 08:52:08.000000000 +0200 @@ -5,13 +5,11 @@ #define TICK_SIZE (tick_nsec / 1000) -void setup_pit_timer(void); unsigned long long native_sched_clock(void); unsigned long native_calculate_cpu_khz(void); extern int timer_ack; extern int no_timer_check; -extern int no_sync_cmos_clock; extern int recalibrate_cpu_khz(void); #ifndef CONFIG_PARAVIRT Index: linux-2.6.22-rc5/include/asm-i386/i8253.h =================================================================== --- linux-2.6.22-rc5.orig/include/asm-i386/i8253.h 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/include/asm-i386/i8253.h 2007-06-17 08:52:09.000000000 +0200 @@ -18,4 +18,11 @@ static inline void pit_interrupt_hook(vo global_clock_event->event_handler(global_clock_event); } +extern void setup_pit_timer(void); + +/* i8253A PIT registers */ +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 +#define PIT_CH2 0x42 + #endif /* __ASM_I8253_H__ */ Index: linux-2.6.22-rc5/arch/x86_64/kernel/tsc.c =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/kernel/tsc.c 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/arch/x86_64/kernel/tsc.c 2007-06-17 08:52:09.000000000 +0200 @@ -6,7 +6,9 @@ #include #include #include +#include +#include #include static int notsc __initdata = 0; @@ -61,25 +63,9 @@ static inline int check_tsc_unstable(voi * first tick after the change will be slightly wrong. */ -#include - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long tsc_khz_ref = 0; +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -125,10 +111,8 @@ static struct notifier_block time_cpufre static int __init cpufreq_tsc(void) { - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); return 0; } @@ -136,7 +120,93 @@ core_initcall(cpufreq_tsc); #endif -static int tsc_unstable = 0; +#define MAX_RETRIES 5 +#define SMI_TRESHOLD 50000 + +/* + * Read TSC and the reference counters. Take care of SMI disturbance + */ +static unsigned long __init tsc_read_refs(unsigned long *pm, + unsigned long *hpet) +{ + unsigned long t1, t2; + int i; + + for (i = 0; i < MAX_RETRIES; i++) { + t1 = get_cycles_sync(); + if (hpet) + *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + else + *pm = acpi_pm_read_early(); + t2 = get_cycles_sync(); + if ((t2 - t1) < SMI_TRESHOLD) + return t2; + } + return ULONG_MAX; +} + +/** + * tsc_calibrate - calibrate the tsc on boot + */ +void __init tsc_calibrate(void) +{ + unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; + int hpet = is_hpet_enabled(); + + local_irq_save(flags); + + tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + outb(0xb0, 0x43); + outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); + outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); + tr1 = get_cycles_sync(); + while ((inb(0x61) & 0x20) == 0); + tr2 = get_cycles_sync(); + + tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); + + local_irq_restore(flags); + + /* + * Preset the result with the raw and inaccurate PIT + * calibration value + */ + tsc_khz = (tr2 - tr1) / 50; + + /* hpet or pmtimer available ? */ + if (!hpet && !pm1 && !pm2) { + printk(KERN_INFO "TSC calibrated against PIT\n"); + return; + } + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) { + printk(KERN_WARNING "TSC calibration disturbed by SMI, " + "using PIT calibration result\n"); + return; + } + + tsc2 = (tsc2 - tsc1) * 1000000L; + + if (hpet) { + printk(KERN_INFO "TSC calibrated against HPET\n"); + if (hpet2 < hpet1) + hpet2 += 0x100000000; + hpet2 -= hpet1; + tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000; + } else { + printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); + if (pm2 < pm1) + pm2 += ACPI_PM_OVRRUN; + pm2 -= pm1; + tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC; + } + + tsc_khz = tsc2 / tsc1; +} /* * Make an educated guess if the TSC is trustworthy and synchronized @@ -153,17 +223,18 @@ __cpuinit int unsynchronized_tsc(void) #endif /* Most intel systems have synchronized TSCs except for multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { #ifdef CONFIG_ACPI /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) + if (acpi_gbl_FADT.header.length > 0 && + acpi_gbl_FADT.C3latency < 1000) return 1; #endif - return 0; + return 0; } - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; } int __init notsc_setup(char *s) @@ -216,8 +287,9 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); void __init init_tsc_clocksource(void) { if (!notsc) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); + clocksource_tsc.mult = + clocksource_khz2mult(tsc_khz, clocksource_tsc.shift); + if (check_tsc_unstable()) clocksource_tsc.rating = 0; Index: linux-2.6.22-rc5/arch/x86_64/kernel/apic.c =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/kernel/apic.c 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/arch/x86_64/kernel/apic.c 2007-06-17 08:52:09.000000000 +0200 @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include @@ -38,10 +40,10 @@ #include #include #include +#include int apic_mapped; int apic_verbosity; -int apic_runs_main_timer; int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; @@ -50,20 +52,36 @@ int disable_apic_timer __initdata; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); +/* Local APIC timer verification ok */ +static int local_apic_timer_verify_ok; + static struct resource *ioapic_resources; static struct resource lapic_resource = { .name = "Local APIC", .flags = IORESOURCE_MEM | IORESOURCE_BUSY, }; -/* - * cpu_mask that denotes the CPUs that needs timer interrupt coming in as - * IPIs in place of local APIC timers - */ -static cpumask_t timer_interrupt_broadcast_ipi_mask; +static unsigned int calibration_result; -/* Using APIC to generate smp_local_timer_interrupt? */ -int using_apic_timer __read_mostly = 0; +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt); +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt); + +static void lapic_timer_broadcast(cpumask_t mask); + +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static void apic_pm_activate(void); @@ -92,8 +110,9 @@ unsigned int safe_apic_wait_icr_idle(voi void enable_NMI_through_LVT0 (void * dummy) { unsigned int v; - - v = APIC_DM_NMI; /* unmask and set to NMI */ + + /* unmask and set to NMI */ + v = APIC_DM_NMI; apic_write(APIC_LVT0, v); } @@ -120,7 +139,7 @@ void ack_bad_irq(unsigned int irq) * holds up an irq slot - in excessive cases (when multiple * unexpected vectors occur) that might lock up the APIC * completely. - * But don't ack when the APIC is disabled. -AK + * But don't ack when the APIC is disabled. -AK */ if (!disable_apic) ack_APIC_irq(); @@ -616,7 +635,7 @@ early_param("apic", apic_set_verbosity); * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) + * not correctly set up (usually the APIC timer won't work etc.) */ static int __init detect_init_APIC (void) @@ -759,14 +778,14 @@ void __init init_apic_mappings(void) #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { unsigned int lvtt_value, tmp_value; - int cpu = smp_processor_id(); - - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!irqen) lvtt_value |= APIC_LVT_MASKED; apic_write(APIC_LVTT, lvtt_value); @@ -779,227 +798,313 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write(APIC_TMICT, clocks/APIC_DIVISOR); } -static void setup_APIC_timer(unsigned int clocks) +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write(APIC_TMICT, delta); + return 0; +} + +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) { unsigned long flags; + unsigned int v; + + /* Lapic used for broadcast ? */ + if (!local_apic_timer_verify_ok) + return; local_irq_save(flags); - /* wait for irq slice */ - if (hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { - int c1, c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - do { - c1 = c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - } while (c2 - c1 < 300); - } - __setup_APIC_LVTT(clocks); - /* Turn off PIT interrupt if we use APIC timer as main timer. - Only works with the PM timer right now - TBD fix it for HPET too. */ - if ((pmtmr_ioport != 0) && - smp_processor_id() == boot_cpu_id && - apic_runs_main_timer == 1 && - !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { - stop_timer_interrupt(); - apic_runs_main_timer++; + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + __setup_APIC_LVTT(calibration_result, + mode != CLOCK_EVT_MODE_PERIODIC, 1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; } + local_irq_restore(flags); } /* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. + * Local APIC timer broadcast function */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) +static void lapic_timer_broadcast(cpumask_t mask) { - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - */ - __setup_APIC_LVTT(4000000000); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else +#ifdef CONFIG_SMP + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); #endif - { - rdtscll(tsc_start); +} - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); +static void __devinit setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - printk("result %d\n", result); + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); +} + +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ +#define LAPIC_CAL_LOOPS (HZ/10) - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); +static __initdata volatile int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; - return result * APIC_DIVISOR / HZ; +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) +{ + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); + + if (cpu_has_tsc) + rdtscll(tsc); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock (void) { - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - return; - } + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltapm; + int pm_referenced = 0; + + /* + * The local apic timer can be disabled via the kernel + * commandline or from the test above. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } printk(KERN_INFO "Using local APIC timer interrupts.\n"); - using_apic_timer = 1; - local_irq_disable(); + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; - calibration_result = calibrate_APIC_clock(); /* - * Now set up the timer for real. + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame */ - setup_APIC_timer(calibration_result); + __setup_APIC_LVTT(1000000000, 0, 0); + /* Let the interrupts run */ local_irq_enable(); -} -void __cpuinit setup_secondary_APIC_clock(void) -{ - local_irq_disable(); /* FIXME: Do we need this? --RR */ - setup_APIC_timer(calibration_result); - local_irq_enable(); -} + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); -void disable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; + local_irq_disable(); - v = apic_read(APIC_LVTT); - /* - * When an illegal vector value (0-15) is written to an LVT - * entry and delivery mode is Fixed, the APIC may signal an - * illegal vector error, with out regard to whether the mask - * bit is set or whether an interrupt is actually seen on input. - * - * Boot sequence might call this function when the LVTT has - * '0' vector value. So make sure vector field is set to - * valid value. - */ - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta ) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + pm_referenced = 1; } -} -void enable_APIC_timer(void) -{ - int cpu = smp_processor_id(); + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); - if (using_apic_timer && - !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { - unsigned long v; + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED); + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); } -} -void switch_APIC_timer_to_ipi(void *cpumask) -{ - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); - if (cpu_isset(cpu, mask) && - !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { - disable_APIC_timer(); - cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); - } -} -EXPORT_SYMBOL(switch_APIC_timer_to_ipi); + local_apic_timer_verify_ok = 1; -void smp_send_timer_broadcast_ipi(void) -{ - int cpu = smp_processor_id(); - cpumask_t mask; + /* We trust the pm timer based calibration */ + if (!pm_referenced) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); - cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask); + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; - if (cpu_isset(cpu, mask)) { - cpu_clear(cpu, mask); - add_pda(apic_timer_irqs, 1); - smp_local_timer_interrupt(); - } + /* Let the interrupts run */ + local_irq_enable(); + + while(lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + local_irq_enable(); - if (!cpus_empty(mask)) { - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + else + local_apic_timer_verify_ok = 0; + } else + local_irq_enable(); + + if (!local_apic_timer_verify_ok) { + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() == 1) + return; + } else { + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; } + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); } -void switch_ipi_to_APIC_timer(void *cpumask) +void __cpuinit setup_secondary_APIC_clock(void) { - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { - cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask); - enable_APIC_timer(); - } + setup_APIC_timer(); } -EXPORT_SYMBOL(switch_ipi_to_APIC_timer); int setup_profiling_timer(unsigned int multiplier) { return -EINVAL; } -void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, - unsigned char msg_type, unsigned char mask) +void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask) { unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; unsigned int v = (mask << 16) | (msg_type << 8) | vector; + apic_write(reg, v); } -#undef APIC_DIVISOR - /* * Local timer interrupt handler. It does both profiling and * process statistics/rescheduling. @@ -1010,24 +1115,36 @@ void setup_APIC_extened_lvt(unsigned cha * value into /proc/profile. */ -void smp_local_timer_interrupt(void) +void local_apic_timer_interrupt(void) { - profile_tick(CPU_PROFILING); -#ifdef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif - if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id) - main_timer_handler(); + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + /* - * We take the 'long' return path, and there every subsystem - * grabs the appropriate locks (kernel lock/ irq lock). - * - * We might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + /* + * the NMI deadlock-detector uses this. */ + add_pda(apic_timer_irqs, 1); + + evt->event_handler(evt); } /* @@ -1043,11 +1160,6 @@ void smp_apic_timer_interrupt(struct pt_ struct pt_regs *old_regs = set_irq_regs(regs); /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. */ @@ -1059,7 +1171,7 @@ void smp_apic_timer_interrupt(struct pt_ */ exit_idle(); irq_enter(); - smp_local_timer_interrupt(); + local_apic_timer_interrupt(); irq_exit(); set_irq_regs(old_regs); } @@ -1127,21 +1239,6 @@ asmlinkage void smp_spurious_interrupt(v v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); - -#if 0 - static unsigned long last_warning; - static unsigned long skipped; - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - if (time_before(last_warning+30*HZ,jiffies)) { - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", - smp_processor_id(), skipped); - last_warning = jiffies; - skipped = 0; - } else { - skipped++; - } -#endif irq_exit(); } @@ -1173,11 +1270,11 @@ asmlinkage void smp_error_interrupt(void 7: Illegal register address */ printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + smp_processor_id(), v , v1); irq_exit(); } -int disable_apic; +int disable_apic; /* * This initializes the IO-APIC and APIC hardware if this is @@ -1185,11 +1282,11 @@ int disable_apic; */ int __init APIC_init_uniprocessor (void) { - if (disable_apic) { + if (disable_apic) { printk(KERN_INFO "Apic disabled\n"); - return -1; + return -1; } - if (!cpu_has_apic) { + if (!cpu_has_apic) { disable_apic = 1; printk(KERN_INFO "Apic disabled by BIOS\n"); return -1; @@ -1211,8 +1308,8 @@ int __init APIC_init_uniprocessor (void) return 0; } -static __init int setup_disableapic(char *str) -{ +static __init int setup_disableapic(char *str) +{ disable_apic = 1; clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); return 0; @@ -1220,10 +1317,10 @@ static __init int setup_disableapic(char early_param("disableapic", setup_disableapic); /* same as disableapic, for compatibility */ -static __init int setup_nolapic(char *str) -{ +static __init int setup_nolapic(char *str) +{ return setup_disableapic(str); -} +} early_param("nolapic", setup_nolapic); static int __init parse_lapic_timer_c2_ok(char *arg) @@ -1233,36 +1330,12 @@ static int __init parse_lapic_timer_c2_o } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static __init int setup_noapictimer(char *str) -{ +static __init int setup_noapictimer(char *str) +{ if (str[0] != ' ' && str[0] != 0) return 0; disable_apic_timer = 1; return 1; -} - -static __init int setup_apicmaintimer(char *str) -{ - apic_runs_main_timer = 1; - nohpet = 1; - return 1; } -__setup("apicmaintimer", setup_apicmaintimer); - -static __init int setup_noapicmaintimer(char *str) -{ - apic_runs_main_timer = -1; - return 1; -} -__setup("noapicmaintimer", setup_noapicmaintimer); - -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return setup_apicmaintimer(NULL); -} -__setup("apicpmtimer", setup_apicpmtimer); - -__setup("noapictimer", setup_noapictimer); +__setup("noapictimer", setup_noapictimer); Index: linux-2.6.22-rc5/arch/x86_64/kernel/hpet.c =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/kernel/hpet.c 2007-06-17 08:51:59.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,493 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define HPET_MASK 0xFFFFFFFF -#define HPET_SHIFT 22 - -/* FSEC = 10^-15 NSEC = 10^-9 */ -#define FSEC_PER_NSEC 1000000 - -int nohpet __initdata; - -unsigned long hpet_address; -unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ - -int hpet_use_timer; /* Use counter of hpet for time keeping, - * otherwise PIT - */ - -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!hpet_address) - return 0; - - memset(&hd, 0, sizeof(hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. - */ - hd.hd_phys_address = hpet_address; - hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - timer = &hpet->hpet_timers[2]; - for (i = 2; i < ntimer; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - return 0; -} -fs_initcall(late_hpet_init); -#endif - -int hpet_timer_stop_set_go(unsigned long tick) -{ - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ - - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); - -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ - hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ - cfg |= HPET_CFG_LEGACY; - } -/* - * Go! - */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static cycle_t read_hpet(void) -{ - return (cycle_t)hpet_readl(HPET_COUNTER); -} - -static cycle_t __vsyscall_fn vread_hpet(void) -{ - return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); -} - -struct clocksource clocksource_hpet = { - .name = "hpet", - .rating = 250, - .read = read_hpet, - .mask = (cycle_t)HPET_MASK, - .mult = 0, /* set below */ - .shift = HPET_SHIFT, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .vread = vread_hpet, -}; - -int hpet_arch_init(void) -{ - unsigned int id; - u64 tmp; - - if (!hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, hpet_address); - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. - */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); - - /* - * hpet period is in femto seconds per cycle - * so we need to convert this to ns/cyc units - * aproximated by mult/2^shift - * - * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift - * fsec/cyc * 1ns/1000000fsec * 2^shift = mult - * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult - * (fsec/cyc << shift)/1000000 = mult - * (hpet_period << shift)/FSEC_PER_NSEC = mult - */ - tmp = (u64)hpet_period << HPET_SHIFT; - do_div(tmp, FSEC_PER_NSEC); - clocksource_hpet.mult = (u32)tmp; - clocksource_register(&clocksource_hpet); - - return hpet_timer_stop_set_go(hpet_tick); -} - -int hpet_reenable(void) -{ - return hpet_timer_stop_set_go(hpet_tick); -} - -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 -#define TICK_MIN 5000 -#define MAX_TRIES 5 - -/* - * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none - * occurs between the reads of the hpet & TSC. - */ -static void __init read_hpet_tsc(int *hpet, int *tsc) -{ - int tsc1, tsc2, hpet1, i; - - for (i = 0; i < MAX_TRIES; i++) { - tsc1 = get_cycles_sync(); - hpet1 = hpet_readl(HPET_COUNTER); - tsc2 = get_cycles_sync(); - if (tsc2 - tsc1 > TICK_MIN) - break; - } - *hpet = hpet1; - *tsc = tsc2; -} - -unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - - read_hpet_tsc(&hpet_start, &tsc_start); - - do { - local_irq_disable(); - read_hpet_tsc(&hpet_now, &tsc_now); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. - */ -#include - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -int is_hpet_enabled(void) -{ - return hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - local_irq_restore(flags); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt, ticks_per_int, lost_ints; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; - hpet_t1_cmp += ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - /* - * If the interrupt handler was delayed too long, the write above tries - * to schedule the next interrupt in the past and the hardware would - * not interrupt until the counter had wrapped around. - * So we have to check that the comparator wasn't set to a past time. - */ - cnt = hpet_readl(HPET_COUNTER); - if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { - lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; - /* Make sure that, even with the time needed to execute - * this code, the next scheduled interrupt has been moved - * back to the future: */ - lost_ints++; - - hpet_t1_cmp += lost_ints * ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - if (PIE_on) - PIE_count += lost_ints; - - if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); - } -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. - */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); - } - return IRQ_HANDLED; -} -#endif - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 1; -} - -__setup("nohpet", nohpet_setup); Index: linux-2.6.22-rc5/include/asm-i386/hpet.h =================================================================== --- linux-2.6.22-rc5.orig/include/asm-i386/hpet.h 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/include/asm-i386/hpet.h 2007-06-17 08:52:10.000000000 +0200 @@ -4,112 +4,90 @@ #ifdef CONFIG_HPET_TIMER -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - /* * Documentation on HPET can be found at: * http://www.intel.com/ial/home/sp/pcmmspec.htm * ftp://download.intel.com/ial/home/sp/mmts098.pdf */ -#define HPET_MMAP_SIZE 1024 +#define HPET_MMAP_SIZE 1024 -#define HPET_ID 0x000 -#define HPET_PERIOD 0x004 -#define HPET_CFG 0x010 -#define HPET_STATUS 0x020 -#define HPET_COUNTER 0x0f0 -#define HPET_T0_CFG 0x100 -#define HPET_T0_CMP 0x108 -#define HPET_T0_ROUTE 0x110 -#define HPET_T1_CFG 0x120 -#define HPET_T1_CMP 0x128 -#define HPET_T1_ROUTE 0x130 -#define HPET_T2_CFG 0x140 -#define HPET_T2_CMP 0x148 -#define HPET_T2_ROUTE 0x150 - -#define HPET_ID_LEGSUP 0x00008000 -#define HPET_ID_NUMBER 0x00001f00 -#define HPET_ID_REV 0x000000ff +#define HPET_ID 0x000 +#define HPET_PERIOD 0x004 +#define HPET_CFG 0x010 +#define HPET_STATUS 0x020 +#define HPET_COUNTER 0x0f0 +#define HPET_T0_CFG 0x100 +#define HPET_T0_CMP 0x108 +#define HPET_T0_ROUTE 0x110 +#define HPET_T1_CFG 0x120 +#define HPET_T1_CMP 0x128 +#define HPET_T1_ROUTE 0x130 +#define HPET_T2_CFG 0x140 +#define HPET_T2_CMP 0x148 +#define HPET_T2_ROUTE 0x150 + +#define HPET_ID_REV 0x000000ff +#define HPET_ID_NUMBER 0x00001f00 +#define HPET_ID_64BIT 0x00002000 +#define HPET_ID_LEGSUP 0x00008000 +#define HPET_ID_VENDOR 0xffff0000 #define HPET_ID_NUMBER_SHIFT 8 +#define HPET_ID_VENDOR_SHIFT 16 + +#define HPET_ID_VENDOR_8086 0x8086 -#define HPET_CFG_ENABLE 0x001 -#define HPET_CFG_LEGACY 0x002 +#define HPET_CFG_ENABLE 0x001 +#define HPET_CFG_LEGACY 0x002 #define HPET_LEGACY_8254 2 #define HPET_LEGACY_RTC 8 -#define HPET_TN_ENABLE 0x004 -#define HPET_TN_PERIODIC 0x008 -#define HPET_TN_PERIODIC_CAP 0x010 -#define HPET_TN_SETVAL 0x040 -#define HPET_TN_32BIT 0x100 - -/* Use our own asm for 64 bit multiply/divide */ -#define ASM_MUL64_REG(eax_out,edx_out,reg_in,eax_in) \ - __asm__ __volatile__("mull %2" \ - :"=a" (eax_out), "=d" (edx_out) \ - :"r" (reg_in), "0" (eax_in)) - -#define ASM_DIV64_REG(eax_out,edx_out,reg_in,eax_in,edx_in) \ - __asm__ __volatile__("divl %2" \ - :"=a" (eax_out), "=d" (edx_out) \ - :"r" (reg_in), "0" (eax_in), "1" (edx_in)) +#define HPET_TN_LEVEL 0x0002 +#define HPET_TN_ENABLE 0x0004 +#define HPET_TN_PERIODIC 0x0008 +#define HPET_TN_PERIODIC_CAP 0x0010 +#define HPET_TN_64BIT_CAP 0x0020 +#define HPET_TN_SETVAL 0x0040 +#define HPET_TN_32BIT 0x0100 +#define HPET_TN_ROUTE 0x3e00 +#define HPET_TN_FSB 0x4000 +#define HPET_TN_FSB_CAP 0x8000 +#define HPET_TN_ROUTE_SHIFT 9 -#define KERNEL_TICK_USEC (1000000UL/HZ) /* tick value in microsec */ /* Max HPET Period is 10^8 femto sec as in HPET spec */ -#define HPET_MAX_PERIOD (100000000UL) +#define HPET_MAX_PERIOD 100000000UL /* * Min HPET period is 10^5 femto sec just for safety. If it is less than this, * then 32 bit HPET counter wrapsaround in less than 0.5 sec. */ -#define HPET_MIN_PERIOD (100000UL) -#define HPET_TICK_RATE (HZ * 100000UL) +#define HPET_MIN_PERIOD 100000UL -extern unsigned long hpet_address; /* hpet memory map physical address */ +/* hpet memory map physical address */ +extern unsigned long hpet_address; +extern unsigned long force_hpet_address; extern int is_hpet_enabled(void); +extern int hpet_enable(void); #ifdef CONFIG_X86_64 -extern unsigned long hpet_tick; /* hpet clks count per tick */ -extern int hpet_use_timer; -extern int hpet_rtc_timer_init(void); -extern int hpet_enable(void); -extern int is_hpet_capable(void); -extern int hpet_readl(unsigned long a); -#else -extern int hpet_enable(void); +/* hpet_readl/writel defines */ +#include #endif +void force_hpet_resume(void); + #ifdef CONFIG_HPET_EMULATE_RTC + +#include + extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask); extern int hpet_set_rtc_irq_bit(unsigned long bit_mask); -extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec); +extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, + unsigned char sec); extern int hpet_set_periodic_freq(unsigned long freq); extern int hpet_rtc_dropped_irq(void); extern int hpet_rtc_timer_init(void); extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id); + #endif /* CONFIG_HPET_EMULATE_RTC */ #else Index: linux-2.6.22-rc5/arch/x86_64/kernel/Makefile =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/kernel/Makefile 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/arch/x86_64/kernel/Makefile 2007-06-17 08:52:09.000000000 +0200 @@ -9,7 +9,10 @@ obj-y := process.o signal.o entry.o trap x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \ - perfctr-watchdog.o + perfctr-watchdog.o i8253.o + +i8253-y += ../../i386/kernel/i8253.o +hpet-y += ../../i386/kernel/hpet.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o Index: linux-2.6.22-rc5/arch/x86_64/kernel/mce_amd.c =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/kernel/mce_amd.c 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/arch/x86_64/kernel/mce_amd.c 2007-06-17 08:52:09.000000000 +0200 @@ -157,9 +157,9 @@ void __cpuinit mce_amd_feature_init(stru high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; wrmsr(address, low, high); - setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, - THRESHOLD_APIC_VECTOR, - K8_APIC_EXT_INT_MSG_FIX, 0); + setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, + THRESHOLD_APIC_VECTOR, + K8_APIC_EXT_INT_MSG_FIX, 0); threshold_defaults.address = address; threshold_restart_bank(&threshold_defaults, 0, 0); Index: linux-2.6.22-rc5/arch/x86_64/kernel/smpboot.c =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/kernel/smpboot.c 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/arch/x86_64/kernel/smpboot.c 2007-06-17 08:52:09.000000000 +0200 @@ -223,8 +223,6 @@ void __cpuinit smp_callin(void) local_irq_disable(); Dprintk("Stack at about %p\n",&cpuid); - disable_APIC_timer(); - /* * Save our processor parameters */ @@ -348,8 +346,6 @@ void __cpuinit start_secondary(void) enable_8259A_irq(0); } - enable_APIC_timer(); - /* * The sibling maps must be set before turing the online map on for * this cpu Index: linux-2.6.22-rc5/include/asm-x86_64/proto.h =================================================================== --- linux-2.6.22-rc5.orig/include/asm-x86_64/proto.h 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/include/asm-x86_64/proto.h 2007-06-17 08:52:09.000000000 +0200 @@ -51,9 +51,6 @@ extern void reserve_bootmem_generic(unsi extern void load_gs_index(unsigned gs); -extern void stop_timer_interrupt(void); -extern void main_timer_handler(void); - extern unsigned long end_pfn_map; extern void show_trace(struct task_struct *, struct pt_regs *, unsigned long * rsp); Index: linux-2.6.22-rc5/include/asm-i386/tsc.h =================================================================== --- linux-2.6.22-rc5.orig/include/asm-i386/tsc.h 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/include/asm-i386/tsc.h 2007-06-17 08:52:09.000000000 +0200 @@ -71,4 +71,8 @@ extern void init_tsc_clocksource(void); extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); +#ifdef CONFIG_X86_64 +extern void tsc_calibrate(void); +#endif + #endif Index: linux-2.6.22-rc5/arch/x86_64/kernel/i8259.c =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/kernel/i8259.c 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/arch/x86_64/kernel/i8259.c 2007-06-17 08:52:09.000000000 +0200 @@ -460,47 +460,6 @@ void invalidate_interrupt6(void); void invalidate_interrupt7(void); void thermal_interrupt(void); void threshold_interrupt(void); -void i8254_timer_resume(void); - -static void setup_timer_hardware(void) -{ - outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , 0x40); /* LSB */ - udelay(10); - outb(LATCH >> 8 , 0x40); /* MSB */ -} - -static int timer_resume(struct sys_device *dev) -{ - setup_timer_hardware(); - return 0; -} - -void i8254_timer_resume(void) -{ - setup_timer_hardware(); -} - -static struct sysdev_class timer_sysclass = { - set_kset_name("timer_pit"), - .resume = timer_resume, -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int __init init_timer_sysfs(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(init_timer_sysfs); void __init init_IRQ(void) { @@ -551,12 +510,6 @@ void __init init_IRQ(void) set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_timer_hardware(); - if (!acpi_ioapic) setup_irq(2, &irq2); } Index: linux-2.6.22-rc5/arch/i386/kernel/vmiclock.c =================================================================== --- linux-2.6.22-rc5.orig/arch/i386/kernel/vmiclock.c 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/arch/i386/kernel/vmiclock.c 2007-06-17 08:52:09.000000000 +0200 @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "io_ports.h" Index: linux-2.6.22-rc5/include/asm-i386/mach-default/io_ports.h =================================================================== --- linux-2.6.22-rc5.orig/include/asm-i386/mach-default/io_ports.h 2007-06-17 08:51:59.000000000 +0200 +++ linux-2.6.22-rc5/include/asm-i386/mach-default/io_ports.h 2007-06-17 08:52:09.000000000 +0200 @@ -7,11 +7,6 @@ #ifndef _MACH_IO_PORTS_H #define _MACH_IO_PORTS_H -/* i8253A PIT registers */ -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 -#define PIT_CH2 0x42 - /* i8259A PIC registers */ #define PIC_MASTER_CMD 0x20 #define PIC_MASTER_IMR 0x21 Index: linux-2.6.22-rc5/arch/x86_64/kernel/process.c =================================================================== --- linux-2.6.22-rc5.orig/arch/x86_64/kernel/process.c 2007-06-17 08:51:58.000000000 +0200 +++ linux-2.6.22-rc5/arch/x86_64/kernel/process.c 2007-06-17 08:52:09.000000000 +0200 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -207,6 +208,8 @@ void cpu_idle (void) if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + tick_nohz_stop_sched_tick(); + rmb(); idle = pm_idle; if (!idle) @@ -227,6 +230,7 @@ void cpu_idle (void) __exit_idle(); } + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); Index: linux-2.6.22-rc5/arch/i386/kernel/quirks.c =================================================================== --- linux-2.6.22-rc5.orig/arch/i386/kernel/quirks.c 2007-06-17 08:51:58.000000000 +0200 +++ linux-2.6.22-rc5/arch/i386/kernel/quirks.c 2007-06-17 08:52:10.000000000 +0200 @@ -4,6 +4,8 @@ #include #include +#include + #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) @@ -48,3 +50,275 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_IN DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); #endif + +#if defined(CONFIG_HPET_TIMER) +unsigned long force_hpet_address; + +static enum { + NONE_FORCE_HPET_RESUME, + OLD_ICH_FORCE_HPET_RESUME, + ICH_FORCE_HPET_RESUME, + VT8237_FORCE_HPET_RESUME +} force_hpet_resume_type; + +static void __iomem *rcba_base; + +static void ich_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address) + return; + + if (rcba_base == NULL) + BUG(); + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + } + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) + BUG(); + else + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + + return; +} + +static void ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val, rcba; + int err = 0; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xF0, &rcba); + rcba &= 0xFFFFC000; + if (rcba == 0) { + printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n"); + return; + } + + /* use bits 31:14, 16 kB aligned */ + rcba_base = ioremap_nocache(rcba, 0x4000); + if (rcba_base == NULL) { + printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n"); + return; + } + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + + if (val & 0x80) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + iounmap(rcba_base); + return; + } + + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + err = 1; + } else { + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + } + + if (err) { + force_hpet_address = 0; + iounmap(rcba_base); + printk(KERN_DEBUG "Failed to force enable HPET\n"); + } else { + force_hpet_resume_type = ICH_FORCE_HPET_RESUME; + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + } +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, + ich_force_enable_hpet); + + +static struct pci_dev *cached_dev; + +static void old_ich_force_hpet_resume(void) +{ + u32 val, gen_cntl; + + if (!force_hpet_address || !cached_dev) + return; + + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + + pci_write_config_dword(cached_dev, 0xD0, gen_cntl); + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + val = gen_cntl >> 15; + val &= 0x7; + if (val == 0x4) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void old_ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val, gen_cntl; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + /* + * Bit 17 is HPET enable bit. + * Bit 16:15 control the HPET base address. + */ + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + printk(KERN_DEBUG "HPET at base address 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + pci_write_config_dword(dev, 0xD0, gen_cntl); + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; + return; + } + + printk(KERN_DEBUG "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, + old_ich_force_enable_hpet); + + +static void vt8237_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address || !cached_dev) + return; + + val = 0xfed00000 | 0x80; + pci_write_config_dword(cached_dev, 0x68, val); + + pci_read_config_dword(cached_dev, 0x68, &val); + if (val & 0x80) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void vt8237_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0x68, &val); + /* + * Bit 7 is HPET enable bit. + * Bit 31:10 is HPET base address (contrary to what datasheet claims) + */ + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + printk(KERN_DEBUG "HPET at base address 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + val = 0xfed00000 | 0x80; + pci_write_config_dword(dev, 0x68, val); + + pci_read_config_dword(dev, 0x68, &val); + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; + return; + } + + printk(KERN_DEBUG "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, + vt8237_force_enable_hpet); + +void force_hpet_resume(void) +{ + switch (force_hpet_resume_type) { + case ICH_FORCE_HPET_RESUME: + return ich_force_hpet_resume(); + + case OLD_ICH_FORCE_HPET_RESUME: + return old_ich_force_hpet_resume(); + + case VT8237_FORCE_HPET_RESUME: + return vt8237_force_hpet_resume(); + + default: + break; + } +} + +#endif Index: linux-2.6.22-rc5/include/linux/pci_ids.h =================================================================== --- linux-2.6.22-rc5.orig/include/linux/pci_ids.h 2007-06-17 08:51:58.000000000 +0200 +++ linux-2.6.22-rc5/include/linux/pci_ids.h 2007-06-17 08:52:10.000000000 +0200 @@ -2220,6 +2220,7 @@ #define PCI_DEVICE_ID_INTEL_82801EB_5 0x24d5 #define PCI_DEVICE_ID_INTEL_82801EB_6 0x24d6 #define PCI_DEVICE_ID_INTEL_82801EB_11 0x24db +#define PCI_DEVICE_ID_INTEL_82801EB_12 0x24dc #define PCI_DEVICE_ID_INTEL_82801EB_13 0x24dd #define PCI_DEVICE_ID_INTEL_ESB_1 0x25a1 #define PCI_DEVICE_ID_INTEL_ESB_2 0x25a2 Index: linux-2.6.22-rc5/Makefile =================================================================== --- linux-2.6.22-rc5.orig/Makefile 2007-06-17 08:51:57.000000000 +0200 +++ linux-2.6.22-rc5/Makefile 2007-06-17 08:52:10.000000000 +0200 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 22 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc5-hrt1 NAME = Holy Dancing Manatees, Batman! # *DOCUMENTATION*