From 99bcc0590806c4f7a4ecf1a11add335b56cde963 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 27 Jun 2005 13:59:43 +0100 Subject: [PATCH] ARM: Add missed AAEC2000 file My scripts missed committing this file. Signed-off-by: Russell King --- arch/arm/mach-aaec2000/Makefile.boot | 1 + 1 file changed, 1 insertion(+) create mode 100644 arch/arm/mach-aaec2000/Makefile.boot (limited to 'arch') diff --git a/arch/arm/mach-aaec2000/Makefile.boot b/arch/arm/mach-aaec2000/Makefile.boot new file mode 100644 index 000000000000..8f5a8b7c53c7 --- /dev/null +++ b/arch/arm/mach-aaec2000/Makefile.boot @@ -0,0 +1 @@ + zreladdr-y := 0xf0008000 -- cgit v1.2.3 From 2ea83398b75309d8fdc999c4bb252e72d7e4fd9d Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 27 Jun 2005 14:04:05 +0100 Subject: [PATCH] ARM: Add VST idle loop call This call allows the dynamic tick support to reprogram the timer immediately before the CPU idles. Signed-off-by: Russell King --- arch/arm/kernel/process.c | 5 ++++- arch/arm/kernel/time.c | 10 +++++++--- include/asm-arm/mach/time.h | 2 ++ 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 8f146a4b4752..bbea636ff687 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -32,6 +32,7 @@ #include #include #include +#include extern const char *processor_modes[]; extern void setup_mm_for_reboot(char mode); @@ -85,8 +86,10 @@ EXPORT_SYMBOL(pm_power_off); void default_idle(void) { local_irq_disable(); - if (!need_resched() && !hlt_counter) + if (!need_resched() && !hlt_counter) { + timer_dyn_reprogram(); arch_idle(); + } local_irq_enable(); } diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 06054c9ba074..1b7fcd50c3e2 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -424,15 +424,19 @@ static int timer_dyn_tick_disable(void) return ret; } +/* + * Reprogram the system timer for at least the calculated time interval. + * This function should be called from the idle thread with IRQs disabled, + * immediately before sleeping. + */ void timer_dyn_reprogram(void) { struct dyn_tick_timer *dyn_tick = system_timer->dyn_tick; - unsigned long flags; - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock(&xtime_lock); if (dyn_tick->state & DYN_TICK_ENABLED) dyn_tick->reprogram(next_timer_interrupt() - jiffies); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock(&xtime_lock); } static ssize_t timer_show_dyn_tick(struct sys_device *dev, char *buf) diff --git a/include/asm-arm/mach/time.h b/include/asm-arm/mach/time.h index 047980ad18d1..2cf279a44017 100644 --- a/include/asm-arm/mach/time.h +++ b/include/asm-arm/mach/time.h @@ -60,6 +60,8 @@ struct dyn_tick_timer { }; void timer_dyn_reprogram(void); +#else +#define timer_dyn_reprogram() do { } while (0) #endif extern struct sys_timer *system_timer; -- cgit v1.2.3 From a343e6075a396e07eeff52c0da5629c8fd396be2 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 27 Jun 2005 14:08:56 +0100 Subject: [PATCH] ARM: Move PGD kernel page table initialisation It doesn't make sense to have the PGD kernel pointers initialisation separate from the PGD user pointers, especially when we clean the data cache over the whole range. Signed-off-by: Russell King --- arch/arm/mm/mm-armv.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c index 2c2b93d77d43..d79864a0dfa6 100644 --- a/arch/arm/mm/mm-armv.c +++ b/arch/arm/mm/mm-armv.c @@ -169,7 +169,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) memzero(new_pgd, FIRST_KERNEL_PGD_NR * sizeof(pgd_t)); + /* + * Copy over the kernel and IO PGD entries + */ init_pgd = pgd_offset_k(0); + memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, + (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); + + clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t)); if (!vectors_high()) { /* @@ -198,14 +205,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } - /* - * Copy over the kernel and IO PGD entries - */ - memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, - (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); - - clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - return new_pgd; no_pte: -- cgit v1.2.3 From a013053d4965d9a45300938e713a4b512e0257d8 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 27 Jun 2005 14:16:47 +0100 Subject: [PATCH] ARM: Move memmap freeing into init.c It doesn't make sense for this to be in mm-armv.c now that 26-bit ARM support is no longer integrated into arch/arm. Signed-off-by: Russell King --- arch/arm/mm/init.c | 71 +++++++++++++++++++++++++++++++++++++++++++++----- arch/arm/mm/mm-armv.c | 72 --------------------------------------------------- 2 files changed, 65 insertions(+), 78 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index c08710b1ff02..6dcb23d64bf5 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -522,6 +522,69 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) printk(KERN_INFO "Freeing %s memory: %dK\n", s, size); } +static inline void +free_memmap(int node, unsigned long start_pfn, unsigned long end_pfn) +{ + struct page *start_pg, *end_pg; + unsigned long pg, pgend; + + /* + * Convert start_pfn/end_pfn to a struct page pointer. + */ + start_pg = pfn_to_page(start_pfn); + end_pg = pfn_to_page(end_pfn); + + /* + * Convert to physical addresses, and + * round start upwards and end downwards. + */ + pg = PAGE_ALIGN(__pa(start_pg)); + pgend = __pa(end_pg) & PAGE_MASK; + + /* + * If there are free pages between these, + * free the section of the memmap array. + */ + if (pg < pgend) + free_bootmem_node(NODE_DATA(node), pg, pgend - pg); +} + +/* + * The mem_map array can get very big. Free the unused area of the memory map. + */ +static void __init free_unused_memmap_node(int node, struct meminfo *mi) +{ + unsigned long bank_start, prev_bank_end = 0; + unsigned int i; + + /* + * [FIXME] This relies on each bank being in address order. This + * may not be the case, especially if the user has provided the + * information on the command line. + */ + for (i = 0; i < mi->nr_banks; i++) { + if (mi->bank[i].size == 0 || mi->bank[i].node != node) + continue; + + bank_start = mi->bank[i].start >> PAGE_SHIFT; + if (bank_start < prev_bank_end) { + printk(KERN_ERR "MEM: unordered memory banks. " + "Not freeing memmap.\n"); + break; + } + + /* + * If we had a previous bank, and there is a space + * between the current bank and the previous, free it. + */ + if (prev_bank_end && prev_bank_end != bank_start) + free_memmap(node, prev_bank_end, bank_start); + + prev_bank_end = (mi->bank[i].start + + mi->bank[i].size) >> PAGE_SHIFT; + } +} + /* * mem_init() marks the free areas in the mem_map and tells us how much * memory is free. This is done after various parts of the system have @@ -540,16 +603,12 @@ void __init mem_init(void) max_mapnr = virt_to_page(high_memory) - mem_map; #endif - /* - * We may have non-contiguous memory. - */ - if (meminfo.nr_banks != 1) - create_memmap_holes(&meminfo); - /* this will put all unused low memory onto the freelists */ for_each_online_node(node) { pg_data_t *pgdat = NODE_DATA(node); + free_unused_memmap_node(node, &meminfo); + if (pgdat->node_spanned_pages != 0) totalram_pages += free_all_bootmem_node(pgdat); } diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c index d79864a0dfa6..052ab443ec4e 100644 --- a/arch/arm/mm/mm-armv.c +++ b/arch/arm/mm/mm-armv.c @@ -697,75 +697,3 @@ void __init iotable_init(struct map_desc *io_desc, int nr) for (i = 0; i < nr; i++) create_mapping(io_desc + i); } - -static inline void -free_memmap(int node, unsigned long start_pfn, unsigned long end_pfn) -{ - struct page *start_pg, *end_pg; - unsigned long pg, pgend; - - /* - * Convert start_pfn/end_pfn to a struct page pointer. - */ - start_pg = pfn_to_page(start_pfn); - end_pg = pfn_to_page(end_pfn); - - /* - * Convert to physical addresses, and - * round start upwards and end downwards. - */ - pg = PAGE_ALIGN(__pa(start_pg)); - pgend = __pa(end_pg) & PAGE_MASK; - - /* - * If there are free pages between these, - * free the section of the memmap array. - */ - if (pg < pgend) - free_bootmem_node(NODE_DATA(node), pg, pgend - pg); -} - -static inline void free_unused_memmap_node(int node, struct meminfo *mi) -{ - unsigned long bank_start, prev_bank_end = 0; - unsigned int i; - - /* - * [FIXME] This relies on each bank being in address order. This - * may not be the case, especially if the user has provided the - * information on the command line. - */ - for (i = 0; i < mi->nr_banks; i++) { - if (mi->bank[i].size == 0 || mi->bank[i].node != node) - continue; - - bank_start = mi->bank[i].start >> PAGE_SHIFT; - if (bank_start < prev_bank_end) { - printk(KERN_ERR "MEM: unordered memory banks. " - "Not freeing memmap.\n"); - break; - } - - /* - * If we had a previous bank, and there is a space - * between the current bank and the previous, free it. - */ - if (prev_bank_end && prev_bank_end != bank_start) - free_memmap(node, prev_bank_end, bank_start); - - prev_bank_end = PAGE_ALIGN(mi->bank[i].start + - mi->bank[i].size) >> PAGE_SHIFT; - } -} - -/* - * The mem_map array can get very big. Free - * the unused area of the memory map. - */ -void __init create_memmap_holes(struct meminfo *mi) -{ - int node; - - for_each_online_node(node) - free_unused_memmap_node(node, mi); -} -- cgit v1.2.3 From f3bb742640338eca0d8c3fa8071df89168efbf0a Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 27 Jun 2005 14:49:10 +0100 Subject: [PATCH] ARM: Update mach-types Signed-off-by: Russell King --- arch/arm/tools/mach-types | 69 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types index 30c1dfbb052f..6d3a79e5fef8 100644 --- a/arch/arm/tools/mach-types +++ b/arch/arm/tools/mach-types @@ -6,7 +6,7 @@ # To add an entry into this database, please see Documentation/arm/README, # or contact rmk@arm.linux.org.uk # -# Last update: Thu Mar 24 14:34:50 2005 +# Last update: Thu Jun 23 20:19:33 2005 # # machine_is_xxx CONFIG_xxxx MACH_TYPE_xxx number # @@ -243,7 +243,7 @@ yoho ARCH_YOHO YOHO 231 jasper ARCH_JASPER JASPER 232 dsc25 ARCH_DSC25 DSC25 233 omap_innovator MACH_OMAP_INNOVATOR OMAP_INNOVATOR 234 -ramses ARCH_RAMSES RAMSES 235 +mnci ARCH_RAMSES RAMSES 235 s28x ARCH_S28X S28X 236 mport3 ARCH_MPORT3 MPORT3 237 pxa_eagle250 ARCH_PXA_EAGLE250 PXA_EAGLE250 238 @@ -323,7 +323,7 @@ nimbra29x ARCH_NIMBRA29X NIMBRA29X 311 nimbra210 ARCH_NIMBRA210 NIMBRA210 312 hhp_d95xx ARCH_HHP_D95XX HHP_D95XX 313 labarm ARCH_LABARM LABARM 314 -m825xx ARCH_M825XX M825XX 315 +comcerto ARCH_M825XX M825XX 315 m7100 SA1100_M7100 M7100 316 nipc2 ARCH_NIPC2 NIPC2 317 fu7202 ARCH_FU7202 FU7202 318 @@ -724,3 +724,66 @@ lpc22xx MACH_LPC22XX LPC22XX 715 omap_comet3 MACH_COMET3 COMET3 716 omap_comet4 MACH_COMET4 COMET4 717 csb625 MACH_CSB625 CSB625 718 +fortunet2 MACH_FORTUNET2 FORTUNET2 719 +s5h2200 MACH_S5H2200 S5H2200 720 +optorm920 MACH_OPTORM920 OPTORM920 721 +adsbitsyxb MACH_ADSBITSYXB ADSBITSYXB 722 +adssphere MACH_ADSSPHERE ADSSPHERE 723 +adsportal MACH_ADSPORTAL ADSPORTAL 724 +ln2410sbc MACH_LN2410SBC LN2410SBC 725 +cb3rufc MACH_CB3RUFC CB3RUFC 726 +mp2usb MACH_MP2USB MP2USB 727 +ntnp425c MACH_NTNP425C NTNP425C 728 +colibri MACH_COLIBRI COLIBRI 729 +pcm7220 MACH_PCM7220 PCM7220 730 +gateway7001 MACH_GATEWAY7001 GATEWAY7001 731 +pcm027 MACH_PCM027 PCM027 732 +cmpxa MACH_CMPXA CMPXA 733 +anubis MACH_ANUBIS ANUBIS 734 +ite8152 MACH_ITE8152 ITE8152 735 +lpc3xxx MACH_LPC3XXX LPC3XXX 736 +puppeteer MACH_PUPPETEER PUPPETEER 737 +vt001 MACH_MACH_VADATECH MACH_VADATECH 738 +e570 MACH_E570 E570 739 +x50 MACH_X50 X50 740 +recon MACH_RECON RECON 741 +xboardgp8 MACH_XBOARDGP8 XBOARDGP8 742 +fpic2 MACH_FPIC2 FPIC2 743 +akita MACH_AKITA AKITA 744 +a81 MACH_A81 A81 745 +svm_sc25x MACH_SVM_SC25X SVM_SC25X 746 +vt020 MACH_VADATECH020 VADATECH020 747 +tli MACH_TLI TLI 748 +edb9315lc MACH_EDB9315LC EDB9315LC 749 +passec MACH_PASSEC PASSEC 750 +ds_tiger MACH_DS_TIGER DS_TIGER 751 +e310 MACH_E310 E310 752 +e330 MACH_E330 E330 753 +rt3000 MACH_RT3000 RT3000 754 +nokia770 MACH_NOKIA770 NOKIA770 755 +pnx0106 MACH_PNX0106 PNX0106 756 +hx21xx MACH_HX21XX HX21XX 757 +faraday MACH_FARADAY FARADAY 758 +sbc9312 MACH_SBC9312 SBC9312 759 +batman MACH_BATMAN BATMAN 760 +jpd201 MACH_JPD201 JPD201 761 +mipsa MACH_MIPSA MIPSA 762 +kacom MACH_KACOM KACOM 763 +swarcocpu MACH_SWARCOCPU SWARCOCPU 764 +swarcodsl MACH_SWARCODSL SWARCODSL 765 +blueangel MACH_BLUEANGEL BLUEANGEL 766 +hairygrama MACH_HAIRYGRAMA HAIRYGRAMA 767 +banff MACH_BANFF BANFF 768 +carmeva MACH_CARMEVA CARMEVA 769 +sam255 MACH_SAM255 SAM255 770 +ppm10 MACH_PPM10 PPM10 771 +edb9315a MACH_EDB9315A EDB9315A 772 +sunset MACH_SUNSET SUNSET 773 +stargate2 MACH_STARGATE2 STARGATE2 774 +intelmote2 MACH_INTELMOTE2 INTELMOTE2 775 +trizeps4 MACH_TRIZEPS4 TRIZEPS4 776 +mainstone2 MACH_MAINSTONE2 MAINSTONE2 777 +ez_ixp42x MACH_EZ_IXP42X EZ_IXP42X 778 +tapwave_zodiac MACH_TAPWAVE_ZODIAC TAPWAVE_ZODIAC 779 +universalmeter MACH_UNIVERSALMETER UNIVERSALMETER 780 +hicoarm9 MACH_HICOARM9 HICOARM9 781 -- cgit v1.2.3 From 22e2c507c301c3dbbcf91b4948b88f78842ee6c9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Jun 2005 10:55:12 +0200 Subject: [PATCH] Update cfq io scheduler to time sliced design This updates the CFQ io scheduler to the new time sliced design (cfq v3). It provides full process fairness, while giving excellent aggregate system throughput even for many competing processes. It supports io priorities, either inherited from the cpu nice value or set directly with the ioprio_get/set syscalls. The latter closely mimic set/getpriority. This import is based on my latest from -mm. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- arch/i386/kernel/syscall_table.S | 2 + arch/ia64/kernel/entry.S | 4 +- arch/ppc/kernel/misc.S | 2 + drivers/block/as-iosched.c | 5 +- drivers/block/cfq-iosched.c | 1910 +++++++++++++++++++++++++------------- drivers/block/deadline-iosched.c | 3 +- drivers/block/elevator.c | 9 +- drivers/block/ll_rw_blk.c | 59 +- fs/Makefile | 1 + fs/ioprio.c | 172 ++++ fs/reiserfs/journal.c | 12 + include/asm-i386/unistd.h | 4 +- include/asm-ia64/unistd.h | 2 + include/asm-ppc/unistd.h | 4 +- include/asm-x86_64/unistd.h | 6 +- include/linux/bio.h | 14 + include/linux/blkdev.h | 25 +- include/linux/elevator.h | 8 +- include/linux/fs.h | 19 + include/linux/init_task.h | 2 + include/linux/ioprio.h | 87 ++ include/linux/sched.h | 6 +- include/linux/writeback.h | 6 +- kernel/exit.c | 2 + kernel/fork.c | 5 + kernel/sched.c | 8 - 26 files changed, 1685 insertions(+), 692 deletions(-) create mode 100644 fs/ioprio.c create mode 100644 include/linux/ioprio.h (limited to 'arch') diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 442a6e937b19..3db9a04aec6e 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -289,3 +289,5 @@ ENTRY(sys_call_table) .long sys_add_key .long sys_request_key .long sys_keyctl + .long sys_ioprio_set + .long sys_ioprio_get /* 290 */ diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index b1d5d3d5276c..785a51b0ad8e 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1577,8 +1577,8 @@ sys_call_table: data8 sys_add_key data8 sys_request_key data8 sys_keyctl - data8 sys_ni_syscall - data8 sys_ni_syscall // 1275 + data8 sys_ioprio_set + data8 sys_ioprio_get // 1275 data8 sys_set_zone_reclaim data8 sys_ni_syscall data8 sys_ni_syscall diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S index b6a63a49a232..191a8def3bdb 100644 --- a/arch/ppc/kernel/misc.S +++ b/arch/ppc/kernel/misc.S @@ -1449,3 +1449,5 @@ _GLOBAL(sys_call_table) .long sys_request_key /* 270 */ .long sys_keyctl .long sys_waitid + .long sys_ioprio_set + .long sys_ioprio_get diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index 3410b4d294b9..91aeb678135d 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -1806,7 +1806,8 @@ static void as_put_request(request_queue_t *q, struct request *rq) rq->elevator_private = NULL; } -static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +static int as_set_request(request_queue_t *q, struct request *rq, + struct bio *bio, int gfp_mask) { struct as_data *ad = q->elevator->elevator_data; struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); @@ -1827,7 +1828,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) return 1; } -static int as_may_queue(request_queue_t *q, int rw) +static int as_may_queue(request_queue_t *q, int rw, struct bio *bio) { int ret = ELV_MQUEUE_MAY; struct as_data *ad = q->elevator->elevator_data; diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 3ac47dde64da..35f6e569d5e5 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -21,22 +21,33 @@ #include #include #include - -static unsigned long max_elapsed_crq; -static unsigned long max_elapsed_dispatch; +#include +#include /* * tunables */ static int cfq_quantum = 4; /* max queue in one round of service */ static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ -static int cfq_service = HZ; /* period over which service is avg */ -static int cfq_fifo_expire_r = HZ / 2; /* fifo timeout for sync requests */ -static int cfq_fifo_expire_w = 5 * HZ; /* fifo timeout for async requests */ -static int cfq_fifo_rate = HZ / 8; /* fifo expiry rate */ +static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ +static int cfq_slice_sync = HZ / 10; +static int cfq_slice_async = HZ / 50; +static int cfq_slice_async_rq = 2; +static int cfq_slice_idle = HZ / 50; + +#define CFQ_IDLE_GRACE (HZ / 10) +#define CFQ_SLICE_SCALE (5) + +#define CFQ_KEY_ASYNC (0) + +/* + * disable queueing at the driver/hardware level + */ +static int cfq_max_depth = 1; + /* * for the hash of cfqq inside the cfqd */ @@ -55,6 +66,7 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ #define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) +#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define RQ_DATA(rq) (rq)->elevator_private @@ -75,78 +87,101 @@ static int cfq_back_penalty = 2; /* penalty of a backwards seek */ #define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) #define rq_rb_key(rq) (rq)->sector -/* - * threshold for switching off non-tag accounting - */ -#define CFQ_MAX_TAG (4) - -/* - * sort key types and names - */ -enum { - CFQ_KEY_PGID, - CFQ_KEY_TGID, - CFQ_KEY_UID, - CFQ_KEY_GID, - CFQ_KEY_LAST, -}; - -static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL }; - static kmem_cache_t *crq_pool; static kmem_cache_t *cfq_pool; static kmem_cache_t *cfq_ioc_pool; +#define CFQ_PRIO_LISTS IOPRIO_BE_NR +#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) +#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) +#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) + +#define cfq_cfqq_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) + +/* + * Per block device queue structure + */ struct cfq_data { - struct list_head rr_list; + atomic_t ref; + request_queue_t *queue; + + /* + * rr list of queues with requests and the count of them + */ + struct list_head rr_list[CFQ_PRIO_LISTS]; + struct list_head busy_rr; + struct list_head cur_rr; + struct list_head idle_rr; + unsigned int busy_queues; + + /* + * non-ordered list of empty cfqq's + */ struct list_head empty_list; + /* + * cfqq lookup hash + */ struct hlist_head *cfq_hash; - struct hlist_head *crq_hash; - /* queues on rr_list (ie they have pending requests */ - unsigned int busy_queues; + /* + * global crq hash for all queues + */ + struct hlist_head *crq_hash; unsigned int max_queued; - atomic_t ref; + mempool_t *crq_pool; - int key_type; + int rq_in_driver; - mempool_t *crq_pool; + /* + * schedule slice state info + */ + /* + * idle window management + */ + struct timer_list idle_slice_timer; + struct work_struct unplug_work; - request_queue_t *queue; + struct cfq_queue *active_queue; + struct cfq_io_context *active_cic; + int cur_prio, cur_end_prio; + unsigned int dispatch_slice; + + struct timer_list idle_class_timer; sector_t last_sector; + unsigned long last_end_request; - int rq_in_driver; + unsigned int rq_starved; /* * tunables, see top of file */ unsigned int cfq_quantum; unsigned int cfq_queued; - unsigned int cfq_fifo_expire_r; - unsigned int cfq_fifo_expire_w; - unsigned int cfq_fifo_batch_expire; + unsigned int cfq_fifo_expire[2]; unsigned int cfq_back_penalty; unsigned int cfq_back_max; - unsigned int find_best_crq; - - unsigned int cfq_tagged; + unsigned int cfq_slice[2]; + unsigned int cfq_slice_async_rq; + unsigned int cfq_slice_idle; + unsigned int cfq_max_depth; }; +/* + * Per process-grouping structure + */ struct cfq_queue { /* reference count */ atomic_t ref; /* parent cfq_data */ struct cfq_data *cfqd; - /* hash of mergeable requests */ + /* cfqq lookup hash */ struct hlist_node cfq_hash; /* hash key */ - unsigned long key; - /* whether queue is on rr (or empty) list */ - int on_rr; + unsigned int key; /* on either rr or empty list of cfqd */ struct list_head cfq_list; /* sorted list of pending requests */ @@ -158,21 +193,35 @@ struct cfq_queue { /* currently allocated requests */ int allocated[2]; /* fifo list of requests in sort_list */ - struct list_head fifo[2]; - /* last time fifo expired */ - unsigned long last_fifo_expire; - - int key_type; - - unsigned long service_start; - unsigned long service_used; + struct list_head fifo; - unsigned int max_rate; + unsigned long slice_start; + unsigned long slice_end; + unsigned long slice_left; + unsigned long service_last; /* number of requests that have been handed to the driver */ int in_flight; - /* number of currently allocated requests */ - int alloc_limit[2]; + + /* io prio of this group */ + unsigned short ioprio, org_ioprio; + unsigned short ioprio_class, org_ioprio_class; + + /* whether queue is on rr (or empty) list */ + unsigned on_rr : 1; + /* idle slice, waiting for new request submission */ + unsigned wait_request : 1; + /* set when wait_request gets set, reset on first rq alloc */ + unsigned must_alloc : 1; + /* only gets one must_alloc per slice */ + unsigned must_alloc_slice : 1; + /* idle slice, request added, now waiting to dispatch it */ + unsigned must_dispatch : 1; + /* fifo expire per-slice */ + unsigned fifo_expire : 1; + + unsigned idle_window : 1; + unsigned prio_changed : 1; }; struct cfq_rq { @@ -184,42 +233,17 @@ struct cfq_rq { struct cfq_queue *cfq_queue; struct cfq_io_context *io_context; - unsigned long service_start; - unsigned long queue_start; - - unsigned int in_flight : 1; - unsigned int accounted : 1; - unsigned int is_sync : 1; - unsigned int is_write : 1; + unsigned in_flight : 1; + unsigned accounted : 1; + unsigned is_sync : 1; + unsigned requeued : 1; }; -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long); +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int); static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); -static void cfq_update_next_crq(struct cfq_rq *); static void cfq_put_cfqd(struct cfq_data *cfqd); -/* - * what the fairness is based on (ie how processes are grouped and - * differentiated) - */ -static inline unsigned long -cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk) -{ - /* - * optimize this so that ->key_type is the offset into the struct - */ - switch (cfqd->key_type) { - case CFQ_KEY_PGID: - return process_group(tsk); - default: - case CFQ_KEY_TGID: - return tsk->tgid; - case CFQ_KEY_UID: - return tsk->uid; - case CFQ_KEY_GID: - return tsk->gid; - } -} +#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) /* * lots of deadline iosched dupes, can be abstracted later... @@ -235,16 +259,12 @@ static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) if (q->last_merge == crq->request) q->last_merge = NULL; - - cfq_update_next_crq(crq); } static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); - BUG_ON(!hlist_unhashed(&crq->hash)); - hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); } @@ -257,8 +277,6 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - BUG_ON(hlist_unhashed(&crq->hash)); - if (!rq_mergeable(__rq)) { cfq_del_crq_hash(crq); continue; @@ -287,36 +305,16 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2) return crq2; if (crq2 == NULL) return crq1; + if (crq1->requeued) + return crq1; + if (crq2->requeued) + return crq2; s1 = crq1->request->sector; s2 = crq2->request->sector; last = cfqd->last_sector; -#if 0 - if (!list_empty(&cfqd->queue->queue_head)) { - struct list_head *entry = &cfqd->queue->queue_head; - unsigned long distance = ~0UL; - struct request *rq; - - while ((entry = entry->prev) != &cfqd->queue->queue_head) { - rq = list_entry_rq(entry); - - if (blk_barrier_rq(rq)) - break; - - if (distance < abs(s1 - rq->sector + rq->nr_sectors)) { - distance = abs(s1 - rq->sector +rq->nr_sectors); - last = rq->sector + rq->nr_sectors; - } - if (distance < abs(s2 - rq->sector + rq->nr_sectors)) { - distance = abs(s2 - rq->sector +rq->nr_sectors); - last = rq->sector + rq->nr_sectors; - } - } - } -#endif - /* * by definition, 1KiB is 2 sectors */ @@ -377,11 +375,13 @@ cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq_next = NULL, *crq_prev = NULL; struct rb_node *rbnext, *rbprev; - if (!ON_RB(&last->rb_node)) - return NULL; - - if ((rbnext = rb_next(&last->rb_node)) == NULL) + if (ON_RB(&last->rb_node)) + rbnext = rb_next(&last->rb_node); + else { rbnext = rb_first(&cfqq->sort_list); + if (rbnext == &last->rb_node) + rbnext = NULL; + } rbprev = rb_prev(&last->rb_node); @@ -401,67 +401,53 @@ static void cfq_update_next_crq(struct cfq_rq *crq) cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); } -static int cfq_check_sort_rr_list(struct cfq_queue *cfqq) +static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) { - struct list_head *head = &cfqq->cfqd->rr_list; - struct list_head *next, *prev; - - /* - * list might still be ordered - */ - next = cfqq->cfq_list.next; - if (next != head) { - struct cfq_queue *cnext = list_entry_cfqq(next); + struct cfq_data *cfqd = cfqq->cfqd; + struct list_head *list, *entry; - if (cfqq->service_used > cnext->service_used) - return 1; - } + BUG_ON(!cfqq->on_rr); - prev = cfqq->cfq_list.prev; - if (prev != head) { - struct cfq_queue *cprev = list_entry_cfqq(prev); + list_del(&cfqq->cfq_list); - if (cfqq->service_used < cprev->service_used) - return 1; + if (cfq_class_rt(cfqq)) + list = &cfqd->cur_rr; + else if (cfq_class_idle(cfqq)) + list = &cfqd->idle_rr; + else { + /* + * if cfqq has requests in flight, don't allow it to be + * found in cfq_set_active_queue before it has finished them. + * this is done to increase fairness between a process that + * has lots of io pending vs one that only generates one + * sporadically or synchronously + */ + if (cfqq->in_flight) + list = &cfqd->busy_rr; + else + list = &cfqd->rr_list[cfqq->ioprio]; } - return 0; -} - -static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) -{ - struct list_head *entry = &cfqq->cfqd->rr_list; - - if (!cfqq->on_rr) - return; - if (!new_queue && !cfq_check_sort_rr_list(cfqq)) + /* + * if queue was preempted, just add to front to be fair. busy_rr + * isn't sorted. + */ + if (preempted || list == &cfqd->busy_rr) { + list_add(&cfqq->cfq_list, list); return; - - list_del(&cfqq->cfq_list); + } /* - * sort by our mean service_used, sub-sort by in-flight requests + * sort by when queue was last serviced */ - while ((entry = entry->prev) != &cfqq->cfqd->rr_list) { + entry = list; + while ((entry = entry->prev) != list) { struct cfq_queue *__cfqq = list_entry_cfqq(entry); - if (cfqq->service_used > __cfqq->service_used) + if (!__cfqq->service_last) + break; + if (time_before(__cfqq->service_last, cfqq->service_last)) break; - else if (cfqq->service_used == __cfqq->service_used) { - struct list_head *prv; - - while ((prv = entry->prev) != &cfqq->cfqd->rr_list) { - __cfqq = list_entry_cfqq(prv); - - WARN_ON(__cfqq->service_used > cfqq->service_used); - if (cfqq->service_used != __cfqq->service_used) - break; - if (cfqq->in_flight > __cfqq->in_flight) - break; - - entry = prv; - } - } } list_add(&cfqq->cfq_list, entry); @@ -469,28 +455,24 @@ static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) /* * add to busy list of queues for service, trying to be fair in ordering - * the pending list according to requests serviced + * the pending list according to last request service */ static inline void -cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue) { - /* - * it's currently on the empty list - */ + BUG_ON(cfqq->on_rr); cfqq->on_rr = 1; cfqd->busy_queues++; - if (time_after(jiffies, cfqq->service_start + cfq_service)) - cfqq->service_used >>= 3; - - cfq_sort_rr_list(cfqq, 1); + cfq_resort_rr_list(cfqq, requeue); } static inline void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - list_move(&cfqq->cfq_list, &cfqd->empty_list); + BUG_ON(!cfqq->on_rr); cfqq->on_rr = 0; + list_move(&cfqq->cfq_list, &cfqd->empty_list); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; @@ -505,16 +487,17 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq) if (ON_RB(&crq->rb_node)) { struct cfq_data *cfqd = cfqq->cfqd; + const int sync = crq->is_sync; - BUG_ON(!cfqq->queued[crq->is_sync]); + BUG_ON(!cfqq->queued[sync]); + cfqq->queued[sync]--; cfq_update_next_crq(crq); - cfqq->queued[crq->is_sync]--; rb_erase(&crq->rb_node, &cfqq->sort_list); RB_CLEAR_COLOR(&crq->rb_node); - if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr) + if (cfqq->on_rr && RB_EMPTY(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); } } @@ -562,7 +545,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq) rb_insert_color(&crq->rb_node, &cfqq->sort_list); if (!cfqq->on_rr) - cfq_add_cfqq_rr(cfqd, cfqq); + cfq_add_cfqq_rr(cfqd, cfqq, crq->requeued); /* * check if this request is a better next-serve candidate @@ -581,11 +564,10 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) cfq_add_crq_rb(crq); } -static struct request * -cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) +static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) + { - const unsigned long key = cfq_hash_key(cfqd, current); - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid); struct rb_node *n; if (!cfqq) @@ -609,20 +591,23 @@ out: static void cfq_deactivate_request(request_queue_t *q, struct request *rq) { + struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_rq *crq = RQ_DATA(rq); if (crq) { struct cfq_queue *cfqq = crq->cfq_queue; - if (cfqq->cfqd->cfq_tagged) { - cfqq->service_used--; - cfq_sort_rr_list(cfqq, 0); - } - if (crq->accounted) { crq->accounted = 0; - cfqq->cfqd->rq_in_driver--; + WARN_ON(!cfqd->rq_in_driver); + cfqd->rq_in_driver--; } + if (crq->in_flight) { + crq->in_flight = 0; + WARN_ON(!cfqq->in_flight); + cfqq->in_flight--; + } + crq->requeued = 1; } } @@ -640,11 +625,10 @@ static void cfq_remove_request(request_queue_t *q, struct request *rq) struct cfq_rq *crq = RQ_DATA(rq); if (crq) { - cfq_remove_merge_hints(q, crq); list_del_init(&rq->queuelist); + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); - if (crq->cfq_queue) - cfq_del_crq_rb(crq); } } @@ -662,21 +646,15 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) } __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); - if (__rq) { - BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } + if (__rq && elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_BACK_MERGE; + goto out; } __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); - if (__rq) { - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; - } + if (__rq && elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_FRONT_MERGE; + goto out; } return ELEVATOR_NO_MERGE; @@ -709,20 +687,194 @@ static void cfq_merged_requests(request_queue_t *q, struct request *rq, struct request *next) { - struct cfq_rq *crq = RQ_DATA(rq); - struct cfq_rq *cnext = RQ_DATA(next); - cfq_merged_request(q, rq); - if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) { - if (time_before(cnext->queue_start, crq->queue_start)) { - list_move(&rq->queuelist, &next->queuelist); - crq->queue_start = cnext->queue_start; + /* + * reposition in fifo if next is older than rq + */ + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + time_before(next->start_time, rq->start_time)) + list_move(&rq->queuelist, &next->queuelist); + + cfq_remove_request(q, next); +} + +static inline void +__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + if (cfqq) { + /* + * stop potential idle class queues waiting service + */ + del_timer(&cfqd->idle_class_timer); + + cfqq->slice_start = jiffies; + cfqq->slice_end = 0; + cfqq->slice_left = 0; + cfqq->must_alloc_slice = 0; + cfqq->fifo_expire = 0; + } + + cfqd->active_queue = cfqq; +} + +/* + * 0 + * 0,1 + * 0,1,2 + * 0,1,2,3 + * 0,1,2,3,4 + * 0,1,2,3,4,5 + * 0,1,2,3,4,5,6 + * 0,1,2,3,4,5,6,7 + */ +static int cfq_get_next_prio_level(struct cfq_data *cfqd) +{ + int prio, wrap; + + prio = -1; + wrap = 0; + do { + int p; + + for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) { + if (!list_empty(&cfqd->rr_list[p])) { + prio = p; + break; + } + } + + if (prio != -1) + break; + cfqd->cur_prio = 0; + if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) { + cfqd->cur_end_prio = 0; + if (wrap) + break; + wrap = 1; } + } while (1); + + if (unlikely(prio == -1)) + return -1; + + BUG_ON(prio >= CFQ_PRIO_LISTS); + + list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr); + + cfqd->cur_prio = prio + 1; + if (cfqd->cur_prio > cfqd->cur_end_prio) { + cfqd->cur_end_prio = cfqd->cur_prio; + cfqd->cur_prio = 0; + } + if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) { + cfqd->cur_prio = 0; + cfqd->cur_end_prio = 0; } - cfq_update_next_crq(cnext); - cfq_remove_request(q, next); + return prio; +} + +static void cfq_set_active_queue(struct cfq_data *cfqd) +{ + struct cfq_queue *cfqq = NULL; + + /* + * if current list is non-empty, grab first entry. if it is empty, + * get next prio level and grab first entry then if any are spliced + */ + if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) + cfqq = list_entry_cfqq(cfqd->cur_rr.next); + + /* + * if we have idle queues and no rt or be queues had pending + * requests, either allow immediate service if the grace period + * has passed or arm the idle grace timer + */ + if (!cfqq && !list_empty(&cfqd->idle_rr)) { + unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; + + if (time_after_eq(jiffies, end)) + cfqq = list_entry_cfqq(cfqd->idle_rr.next); + else + mod_timer(&cfqd->idle_class_timer, end); + } + + __cfq_set_active_queue(cfqd, cfqq); +} + +/* + * current cfqq expired its slice (or was too idle), select new one + */ +static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted) +{ + struct cfq_queue *cfqq = cfqd->active_queue; + + if (cfqq) { + unsigned long now = jiffies; + + if (cfqq->wait_request) + del_timer(&cfqd->idle_slice_timer); + + if (!preempted && !cfqq->in_flight) + cfqq->service_last = now; + + cfqq->must_dispatch = 0; + cfqq->wait_request = 0; + + /* + * store what was left of this slice, if the queue idled out + * or was preempted + */ + if (time_after(now, cfqq->slice_end)) + cfqq->slice_left = now - cfqq->slice_end; + else + cfqq->slice_left = 0; + + if (cfqq->on_rr) + cfq_resort_rr_list(cfqq, preempted); + + cfqd->active_queue = NULL; + + if (cfqd->active_cic) { + put_io_context(cfqd->active_cic->ioc); + cfqd->active_cic = NULL; + } + } + + cfqd->dispatch_slice = 0; +} + +static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) + +{ + WARN_ON(!RB_EMPTY(&cfqq->sort_list)); + WARN_ON(cfqq != cfqd->active_queue); + + /* + * idle is disabled, either manually or by past process history + */ + if (!cfqd->cfq_slice_idle) + return 0; + if (!cfqq->idle_window) + return 0; + /* + * task has exited, don't wait + */ + if (cfqd->active_cic && !cfqd->active_cic->ioc->task) + return 0; + + cfqq->wait_request = 1; + cfqq->must_alloc = 1; + + if (!timer_pending(&cfqd->idle_slice_timer)) { + unsigned long slice_left = cfqq->slice_end - 1; + + cfqd->idle_slice_timer.expires = min(jiffies + cfqd->cfq_slice_idle, slice_left); + add_timer(&cfqd->idle_slice_timer); + } + + return 1; } /* @@ -738,31 +890,39 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) struct request *__rq; sector_t last; - cfq_del_crq_rb(crq); - cfq_remove_merge_hints(q, crq); list_del(&crq->request->queuelist); last = cfqd->last_sector; - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); + list_for_each_entry_reverse(__rq, head, queuelist) { + struct cfq_rq *__crq = RQ_DATA(__rq); - if (blk_barrier_rq(crq->request)) + if (blk_barrier_rq(__rq)) + break; + if (!blk_fs_request(__rq)) break; - if (!blk_fs_request(crq->request)) + if (__crq->requeued) break; - if (crq->request->sector > __rq->sector) + if (__rq->sector <= crq->request->sector) break; if (__rq->sector > last && crq->request->sector < last) { - last = crq->request->sector; + last = crq->request->sector + crq->request->nr_sectors; break; } + entry = &__rq->queuelist; } cfqd->last_sector = last; + + cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq); + + cfq_del_crq_rb(crq); + cfq_remove_merge_hints(q, crq); + crq->in_flight = 1; + crq->requeued = 0; cfqq->in_flight++; - list_add(&crq->request->queuelist, entry); + list_add_tail(&crq->request->queuelist, entry); } /* @@ -771,105 +931,176 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; - const int reads = !list_empty(&cfqq->fifo[0]); - const int writes = !list_empty(&cfqq->fifo[1]); - unsigned long now = jiffies; + struct request *rq; struct cfq_rq *crq; - if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire)) + if (cfqq->fifo_expire) return NULL; - crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist)); - if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) { - cfqq->last_fifo_expire = now; - return crq; - } + if (!list_empty(&cfqq->fifo)) { + int fifo = cfq_cfqq_sync(cfqq); - crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist)); - if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) { - cfqq->last_fifo_expire = now; - return crq; + crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next)); + rq = crq->request; + if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { + cfqq->fifo_expire = 1; + return crq; + } } return NULL; } /* - * dispatch a single request from given queue + * Scale schedule slice based on io priority */ +static inline int +cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)]; + + WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); + + return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio)); +} + static inline void -cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) +cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_rq *crq; + cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; +} - /* - * follow expired path, else get first next available - */ - if ((crq = cfq_check_fifo(cfqq)) == NULL) { - if (cfqd->find_best_crq) - crq = cfqq->next_crq; - else - crq = rb_entry_crq(rb_first(&cfqq->sort_list)); - } +static inline int +cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + const int base_rq = cfqd->cfq_slice_async_rq; - cfqd->last_sector = crq->request->sector + crq->request->nr_sectors; + WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - /* - * finally, insert request into driver list - */ - cfq_dispatch_sort(q, crq); + return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); } -static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch) +/* + * get next queue for service + */ +static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force) { - struct cfq_data *cfqd = q->elevator->elevator_data; + unsigned long now = jiffies; struct cfq_queue *cfqq; - struct list_head *entry, *tmp; - int queued, busy_queues, first_round; - if (list_empty(&cfqd->rr_list)) - return 0; + cfqq = cfqd->active_queue; + if (!cfqq) + goto new_queue; - queued = 0; - first_round = 1; -restart: - busy_queues = 0; - list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(entry); + /* + * slice has expired + */ + if (!cfqq->must_dispatch && time_after(jiffies, cfqq->slice_end)) + goto new_queue; - BUG_ON(RB_EMPTY(&cfqq->sort_list)); + /* + * if queue has requests, dispatch one. if not, check if + * enough slice is left to wait for one + */ + if (!RB_EMPTY(&cfqq->sort_list)) + goto keep_queue; + else if (!force && cfq_cfqq_sync(cfqq) && + time_before(now, cfqq->slice_end)) { + if (cfq_arm_slice_timer(cfqd, cfqq)) + return NULL; + } + +new_queue: + cfq_slice_expired(cfqd, 0); + cfq_set_active_queue(cfqd); +keep_queue: + return cfqd->active_queue; +} + +static int +__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, + int max_dispatch) +{ + int dispatched = 0; + + BUG_ON(RB_EMPTY(&cfqq->sort_list)); + + do { + struct cfq_rq *crq; /* - * first round of queueing, only select from queues that - * don't already have io in-flight + * follow expired path, else get first next available */ - if (first_round && cfqq->in_flight) - continue; + if ((crq = cfq_check_fifo(cfqq)) == NULL) + crq = cfqq->next_crq; + + /* + * finally, insert request into driver dispatch list + */ + cfq_dispatch_sort(cfqd->queue, crq); - cfq_dispatch_request(q, cfqd, cfqq); + cfqd->dispatch_slice++; + dispatched++; - if (!RB_EMPTY(&cfqq->sort_list)) - busy_queues++; + if (!cfqd->active_cic) { + atomic_inc(&crq->io_context->ioc->refcount); + cfqd->active_cic = crq->io_context; + } - queued++; - } + if (RB_EMPTY(&cfqq->sort_list)) + break; + + } while (dispatched < max_dispatch); + + /* + * if slice end isn't set yet, set it. if at least one request was + * sync, use the sync time slice value + */ + if (!cfqq->slice_end) + cfq_set_prio_slice(cfqd, cfqq); + + /* + * expire an async queue immediately if it has used up its slice. idle + * queue always expire after 1 dispatch round. + */ + if ((!cfq_cfqq_sync(cfqq) && + cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) || + cfq_class_idle(cfqq)) + cfq_slice_expired(cfqd, 0); + + return dispatched; +} + +static int +cfq_dispatch_requests(request_queue_t *q, int max_dispatch, int force) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq; + + if (!cfqd->busy_queues) + return 0; + + cfqq = cfq_select_queue(cfqd, force); + if (cfqq) { + cfqq->wait_request = 0; + cfqq->must_dispatch = 0; + del_timer(&cfqd->idle_slice_timer); + + if (cfq_class_idle(cfqq)) + max_dispatch = 1; - if ((queued < max_dispatch) && (busy_queues || first_round)) { - first_round = 0; - goto restart; + return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); } - return queued; + return 0; } static inline void cfq_account_dispatch(struct cfq_rq *crq) { struct cfq_queue *cfqq = crq->cfq_queue; struct cfq_data *cfqd = cfqq->cfqd; - unsigned long now, elapsed; - if (!blk_fs_request(crq->request)) + if (unlikely(!blk_fs_request(crq->request))) return; /* @@ -879,65 +1110,34 @@ static inline void cfq_account_dispatch(struct cfq_rq *crq) if (crq->accounted) return; - now = jiffies; - if (cfqq->service_start == ~0UL) - cfqq->service_start = now; - - /* - * on drives with tagged command queueing, command turn-around time - * doesn't necessarily reflect the time spent processing this very - * command inside the drive. so do the accounting differently there, - * by just sorting on the number of requests - */ - if (cfqd->cfq_tagged) { - if (time_after(now, cfqq->service_start + cfq_service)) { - cfqq->service_start = now; - cfqq->service_used /= 10; - } - - cfqq->service_used++; - cfq_sort_rr_list(cfqq, 0); - } - - elapsed = now - crq->queue_start; - if (elapsed > max_elapsed_dispatch) - max_elapsed_dispatch = elapsed; - crq->accounted = 1; - crq->service_start = now; - - if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) { - cfqq->cfqd->cfq_tagged = 1; - printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG); - } + cfqd->rq_in_driver++; } static inline void cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) { struct cfq_data *cfqd = cfqq->cfqd; + unsigned long now; if (!crq->accounted) return; + now = jiffies; + WARN_ON(!cfqd->rq_in_driver); cfqd->rq_in_driver--; - if (!cfqd->cfq_tagged) { - unsigned long now = jiffies; - unsigned long duration = now - crq->service_start; + if (!cfq_class_idle(cfqq)) + cfqd->last_end_request = now; - if (time_after(now, cfqq->service_start + cfq_service)) { - cfqq->service_start = now; - cfqq->service_used >>= 3; - } - - cfqq->service_used += duration; - cfq_sort_rr_list(cfqq, 0); - - if (duration > max_elapsed_crq) - max_elapsed_crq = duration; + if (!cfqq->in_flight && cfqq->on_rr) { + cfqq->service_last = now; + cfq_resort_rr_list(cfqq, 0); } + + if (crq->is_sync) + crq->io_context->last_end_request = now; } static struct request *cfq_next_request(request_queue_t *q) @@ -950,7 +1150,15 @@ static struct request *cfq_next_request(request_queue_t *q) dispatch: rq = list_entry_rq(q->queue_head.next); - if ((crq = RQ_DATA(rq)) != NULL) { + crq = RQ_DATA(rq); + if (crq) { + /* + * if idle window is disabled, allow queue buildup + */ + if (!crq->in_flight && !crq->cfq_queue->idle_window && + cfqd->rq_in_driver >= cfqd->cfq_max_depth) + return NULL; + cfq_remove_merge_hints(q, crq); cfq_account_dispatch(crq); } @@ -958,7 +1166,7 @@ dispatch: return rq; } - if (cfq_dispatch_requests(q, cfqd->cfq_quantum)) + if (cfq_dispatch_requests(q, cfqd->cfq_quantum, 0)) goto dispatch; return NULL; @@ -972,14 +1180,22 @@ dispatch: */ static void cfq_put_queue(struct cfq_queue *cfqq) { - BUG_ON(!atomic_read(&cfqq->ref)); + struct cfq_data *cfqd = cfqq->cfqd; + + BUG_ON(atomic_read(&cfqq->ref) <= 0); if (!atomic_dec_and_test(&cfqq->ref)) return; BUG_ON(rb_first(&cfqq->sort_list)); + BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); BUG_ON(cfqq->on_rr); + if (unlikely(cfqd->active_queue == cfqq)) { + cfq_slice_expired(cfqd, 0); + kblockd_schedule_work(&cfqd->unplug_work); + } + cfq_put_cfqd(cfqq->cfqd); /* @@ -991,7 +1207,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) } static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, const int hashval) { struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; struct hlist_node *entry, *next; @@ -1007,94 +1223,220 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) } static struct cfq_queue * -cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key) +cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key) { return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); } -static inline void -cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq, - struct cfq_io_context *cic) +static void cfq_free_io_context(struct cfq_io_context *cic) { - unsigned long hashkey = cfq_hash_key(cfqd, current); - unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); - struct cfq_queue *__cfqq; - unsigned long flags; + struct cfq_io_context *__cic; + struct list_head *entry, *next; - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - - hlist_del(&(*cfqq)->cfq_hash); - - __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); - if (!__cfqq || __cfqq == *cfqq) { - __cfqq = *cfqq; - hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); - __cfqq->key_type = cfqd->key_type; - } else { - atomic_inc(&__cfqq->ref); - cic->cfqq = __cfqq; - cfq_put_queue(*cfqq); - *cfqq = __cfqq; + list_for_each_safe(entry, next, &cic->list) { + __cic = list_entry(entry, struct cfq_io_context, list); + kmem_cache_free(cfq_ioc_pool, __cic); } - cic->cfqq = __cfqq; - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); + kmem_cache_free(cfq_ioc_pool, cic); } -static void cfq_free_io_context(struct cfq_io_context *cic) +/* + * Called with interrupts disabled + */ +static void cfq_exit_single_io_context(struct cfq_io_context *cic) { - kmem_cache_free(cfq_ioc_pool, cic); + struct cfq_data *cfqd = cic->cfqq->cfqd; + request_queue_t *q = cfqd->queue; + + WARN_ON(!irqs_disabled()); + + spin_lock(q->queue_lock); + + if (unlikely(cic->cfqq == cfqd->active_queue)) { + cfq_slice_expired(cfqd, 0); + kblockd_schedule_work(&cfqd->unplug_work); + } + + cfq_put_queue(cic->cfqq); + cic->cfqq = NULL; + spin_unlock(q->queue_lock); } /* - * locking hierarchy is: io_context lock -> queue locks + * Another task may update the task cic list, if it is doing a queue lookup + * on its behalf. cfq_cic_lock excludes such concurrent updates */ static void cfq_exit_io_context(struct cfq_io_context *cic) { - struct cfq_queue *cfqq = cic->cfqq; - struct list_head *entry = &cic->list; - request_queue_t *q; + struct cfq_io_context *__cic; + struct list_head *entry; unsigned long flags; + local_irq_save(flags); + /* * put the reference this task is holding to the various queues */ - spin_lock_irqsave(&cic->ioc->lock, flags); - while ((entry = cic->list.next) != &cic->list) { - struct cfq_io_context *__cic; - + list_for_each(entry, &cic->list) { __cic = list_entry(entry, struct cfq_io_context, list); - list_del(entry); - - q = __cic->cfqq->cfqd->queue; - spin_lock(q->queue_lock); - cfq_put_queue(__cic->cfqq); - spin_unlock(q->queue_lock); + cfq_exit_single_io_context(__cic); } - q = cfqq->cfqd->queue; - spin_lock(q->queue_lock); - cfq_put_queue(cfqq); - spin_unlock(q->queue_lock); - - cic->cfqq = NULL; - spin_unlock_irqrestore(&cic->ioc->lock, flags); + cfq_exit_single_io_context(cic); + local_irq_restore(flags); } -static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) +static struct cfq_io_context * +cfq_alloc_io_context(struct cfq_data *cfqd, int gfp_mask) { - struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags); + struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); if (cic) { - cic->dtor = cfq_free_io_context; - cic->exit = cfq_exit_io_context; INIT_LIST_HEAD(&cic->list); cic->cfqq = NULL; + cic->key = NULL; + cic->last_end_request = jiffies; + cic->ttime_total = 0; + cic->ttime_samples = 0; + cic->ttime_mean = 0; + cic->dtor = cfq_free_io_context; + cic->exit = cfq_exit_io_context; } return cic; } +static void cfq_init_prio_data(struct cfq_queue *cfqq) +{ + struct task_struct *tsk = current; + int ioprio_class; + + if (!cfqq->prio_changed) + return; + + ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); + switch (ioprio_class) { + default: + printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * no prio set, place us in the middle of the BE classes + */ + cfqq->ioprio = task_nice_ioprio(tsk); + cfqq->ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_RT: + cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + cfqq->ioprio_class = IOPRIO_CLASS_IDLE; + cfqq->ioprio = 7; + cfqq->idle_window = 0; + break; + } + + /* + * keep track of original prio settings in case we have to temporarily + * elevate the priority of this queue + */ + cfqq->org_ioprio = cfqq->ioprio; + cfqq->org_ioprio_class = cfqq->ioprio_class; + + if (cfqq->on_rr) + cfq_resort_rr_list(cfqq, 0); + + cfqq->prio_changed = 0; +} + +static inline void changed_ioprio(struct cfq_queue *cfqq) +{ + if (cfqq) { + struct cfq_data *cfqd = cfqq->cfqd; + + spin_lock(cfqd->queue->queue_lock); + cfqq->prio_changed = 1; + cfq_init_prio_data(cfqq); + spin_unlock(cfqd->queue->queue_lock); + } +} + +/* + * callback from sys_ioprio_set, irqs are disabled + */ +static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) +{ + struct cfq_io_context *cic = ioc->cic; + + changed_ioprio(cic->cfqq); + + list_for_each_entry(cic, &cic->list, list) + changed_ioprio(cic->cfqq); + + return 0; +} + +static struct cfq_queue * +cfq_get_queue(struct cfq_data *cfqd, unsigned int key, int gfp_mask) +{ + const int hashval = hash_long(key, CFQ_QHASH_SHIFT); + struct cfq_queue *cfqq, *new_cfqq = NULL; + +retry: + cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); + + if (!cfqq) { + if (new_cfqq) { + cfqq = new_cfqq; + new_cfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(cfqd->queue->queue_lock); + new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + spin_lock_irq(cfqd->queue->queue_lock); + goto retry; + } else { + cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); + if (!cfqq) + goto out; + } + + memset(cfqq, 0, sizeof(*cfqq)); + + INIT_HLIST_NODE(&cfqq->cfq_hash); + INIT_LIST_HEAD(&cfqq->cfq_list); + RB_CLEAR_ROOT(&cfqq->sort_list); + INIT_LIST_HEAD(&cfqq->fifo); + + cfqq->key = key; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + atomic_set(&cfqq->ref, 0); + cfqq->cfqd = cfqd; + atomic_inc(&cfqd->ref); + cfqq->service_last = 0; + /* + * set ->slice_left to allow preemption for a new process + */ + cfqq->slice_left = 2 * cfqd->cfq_slice_idle; + cfqq->idle_window = 1; + cfqq->ioprio = -1; + cfqq->ioprio_class = -1; + cfqq->prio_changed = 1; + } + + if (new_cfqq) + kmem_cache_free(cfq_pool, new_cfqq); + + atomic_inc(&cfqq->ref); +out: + WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); + return cfqq; +} + /* * Setup general io context and cfq io context. There can be several cfq * io contexts per general io context, if this process is doing io to more @@ -1102,39 +1444,39 @@ static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) * cfqq, so we don't need to worry about it disappearing */ static struct cfq_io_context * -cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) +cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, int gfp_mask) { - struct cfq_data *cfqd = (*cfqq)->cfqd; - struct cfq_queue *__cfqq = *cfqq; + struct io_context *ioc = NULL; struct cfq_io_context *cic; - struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_WAIT); - ioc = get_io_context(gfp_flags); + ioc = get_io_context(gfp_mask); if (!ioc) return NULL; if ((cic = ioc->cic) == NULL) { - cic = cfq_alloc_io_context(gfp_flags); + cic = cfq_alloc_io_context(cfqd, gfp_mask); if (cic == NULL) goto err; + /* + * manually increment generic io_context usage count, it + * cannot go away since we are already holding one ref to it + */ ioc->cic = cic; + ioc->set_ioprio = cfq_ioc_set_ioprio; cic->ioc = ioc; - cic->cfqq = __cfqq; - atomic_inc(&__cfqq->ref); + cic->key = cfqd; + atomic_inc(&cfqd->ref); } else { struct cfq_io_context *__cic; - unsigned long flags; /* - * since the first cic on the list is actually the head - * itself, need to check this here or we'll duplicate an - * cic per ioc for no reason + * the first cic on the list is actually the head itself */ - if (cic->cfqq == __cfqq) + if (cic->key == cfqd) goto out; /* @@ -1142,152 +1484,259 @@ cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) * should be ok here, the list will usually not be more than * 1 or a few entries long */ - spin_lock_irqsave(&ioc->lock, flags); list_for_each_entry(__cic, &cic->list, list) { /* * this process is already holding a reference to * this queue, so no need to get one more */ - if (__cic->cfqq == __cfqq) { + if (__cic->key == cfqd) { cic = __cic; - spin_unlock_irqrestore(&ioc->lock, flags); goto out; } } - spin_unlock_irqrestore(&ioc->lock, flags); /* * nope, process doesn't have a cic assoicated with this * cfqq yet. get a new one and add to list */ - __cic = cfq_alloc_io_context(gfp_flags); + __cic = cfq_alloc_io_context(cfqd, gfp_mask); if (__cic == NULL) goto err; __cic->ioc = ioc; - __cic->cfqq = __cfqq; - atomic_inc(&__cfqq->ref); - spin_lock_irqsave(&ioc->lock, flags); + __cic->key = cfqd; + atomic_inc(&cfqd->ref); list_add(&__cic->list, &cic->list); - spin_unlock_irqrestore(&ioc->lock, flags); - cic = __cic; - *cfqq = __cfqq; } out: - /* - * if key_type has been changed on the fly, we lazily rehash - * each queue at lookup time - */ - if ((*cfqq)->key_type != cfqd->key_type) - cfq_rehash_cfqq(cfqd, cfqq, cic); - return cic; err: put_io_context(ioc); return NULL; } -static struct cfq_queue * -__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) +static void +cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) { - const int hashval = hash_long(key, CFQ_QHASH_SHIFT); - struct cfq_queue *cfqq, *new_cfqq = NULL; + unsigned long elapsed, ttime; -retry: - cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); + /* + * if this context already has stuff queued, thinktime is from + * last queue not last end + */ +#if 0 + if (time_after(cic->last_end_request, cic->last_queue)) + elapsed = jiffies - cic->last_end_request; + else + elapsed = jiffies - cic->last_queue; +#else + elapsed = jiffies - cic->last_end_request; +#endif - if (!cfqq) { - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else { - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); - spin_lock_irq(cfqd->queue->queue_lock); + ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); - if (!new_cfqq && !(gfp_mask & __GFP_WAIT)) - goto out; - - goto retry; - } + cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; + cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; + cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; +} - memset(cfqq, 0, sizeof(*cfqq)); +#define sample_valid(samples) ((samples) > 80) - INIT_HLIST_NODE(&cfqq->cfq_hash); - INIT_LIST_HEAD(&cfqq->cfq_list); - RB_CLEAR_ROOT(&cfqq->sort_list); - INIT_LIST_HEAD(&cfqq->fifo[0]); - INIT_LIST_HEAD(&cfqq->fifo[1]); +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter + */ +static void +cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_io_context *cic) +{ + int enable_idle = cfqq->idle_window; - cfqq->key = key; - hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); - atomic_set(&cfqq->ref, 0); - cfqq->cfqd = cfqd; - atomic_inc(&cfqd->ref); - cfqq->key_type = cfqd->key_type; - cfqq->service_start = ~0UL; + if (!cic->ioc->task || !cfqd->cfq_slice_idle) + enable_idle = 0; + else if (sample_valid(cic->ttime_samples)) { + if (cic->ttime_mean > cfqd->cfq_slice_idle) + enable_idle = 0; + else + enable_idle = 1; } - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); + cfqq->idle_window = enable_idle; +} - atomic_inc(&cfqq->ref); -out: - WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); - return cfqq; + +/* + * Check if new_cfqq should preempt the currently active queue. Return 0 for + * no or if we aren't sure, a 1 will cause a preempt. + */ +static int +cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, + struct cfq_rq *crq) +{ + struct cfq_queue *cfqq = cfqd->active_queue; + + if (cfq_class_idle(new_cfqq)) + return 0; + + if (!cfqq) + return 1; + + if (cfq_class_idle(cfqq)) + return 1; + if (!new_cfqq->wait_request) + return 0; + /* + * if it doesn't have slice left, forget it + */ + if (new_cfqq->slice_left < cfqd->cfq_slice_idle) + return 0; + if (crq->is_sync && !cfq_cfqq_sync(cfqq)) + return 1; + + return 0; +} + +/* + * cfqq preempts the active queue. if we allowed preempt with no slice left, + * let it have half of its nominal slice. + */ +static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + struct cfq_queue *__cfqq, *next; + + list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list) + cfq_resort_rr_list(__cfqq, 1); + + if (!cfqq->slice_left) + cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2; + + cfqq->slice_end = cfqq->slice_left + jiffies; + cfq_slice_expired(cfqd, 1); + __cfq_set_active_queue(cfqd, cfqq); +} + +/* + * should really be a ll_rw_blk.c helper + */ +static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + request_queue_t *q = cfqd->queue; + + if (!blk_queue_plugged(q)) + q->request_fn(q); + else + __generic_unplug_device(q); +} + +/* + * Called when a new fs request (crq) is added (to cfqq). Check if there's + * something we should do about it + */ +static void +cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_rq *crq) +{ + const int sync = crq->is_sync; + + cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); + + if (sync) { + struct cfq_io_context *cic = crq->io_context; + + cfq_update_io_thinktime(cfqd, cic); + cfq_update_idle_window(cfqd, cfqq, cic); + + cic->last_queue = jiffies; + } + + if (cfqq == cfqd->active_queue) { + /* + * if we are waiting for a request for this queue, let it rip + * immediately and flag that we must not expire this queue + * just now + */ + if (cfqq->wait_request) { + cfqq->must_dispatch = 1; + del_timer(&cfqd->idle_slice_timer); + cfq_start_queueing(cfqd, cfqq); + } + } else if (cfq_should_preempt(cfqd, cfqq, crq)) { + /* + * not the active queue - expire current slice if it is + * idle and has expired it's mean thinktime or this new queue + * has some old slice time left and is of higher priority + */ + cfq_preempt_queue(cfqd, cfqq); + cfqq->must_dispatch = 1; + cfq_start_queueing(cfqd, cfqq); + } } -static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) +static void cfq_enqueue(struct cfq_data *cfqd, struct request *rq) { - crq->is_sync = 0; - if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE) - crq->is_sync = 1; + struct cfq_rq *crq = RQ_DATA(rq); + struct cfq_queue *cfqq = crq->cfq_queue; + + cfq_init_prio_data(cfqq); cfq_add_crq_rb(crq); - crq->queue_start = jiffies; - list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]); + list_add_tail(&rq->queuelist, &cfqq->fifo); + + if (rq_mergeable(rq)) { + cfq_add_crq_hash(cfqd, crq); + + if (!cfqd->queue->last_merge) + cfqd->queue->last_merge = rq; + } + + cfq_crq_enqueued(cfqd, cfqq, crq); } static void cfq_insert_request(request_queue_t *q, struct request *rq, int where) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); switch (where) { case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd->cfq_quantum)) + while (cfq_dispatch_requests(q, INT_MAX, 1)) ; list_add_tail(&rq->queuelist, &q->queue_head); + /* + * If we were idling with pending requests on + * inactive cfqqs, force dispatching will + * remove the idle timer and the queue won't + * be kicked by __make_request() afterward. + * Kick it here. + */ + kblockd_schedule_work(&cfqd->unplug_work); break; case ELEVATOR_INSERT_FRONT: list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_SORT: BUG_ON(!blk_fs_request(rq)); - cfq_enqueue(cfqd, crq); + cfq_enqueue(cfqd, rq); break; default: printk("%s: bad insert point %d\n", __FUNCTION__,where); return; } +} - if (rq_mergeable(rq)) { - cfq_add_crq_hash(cfqd, crq); - - if (!q->last_merge) - q->last_merge = rq; - } +static inline int cfq_pending_requests(struct cfq_data *cfqd) +{ + return !list_empty(&cfqd->queue->queue_head) || cfqd->busy_queues; } static int cfq_queue_empty(request_queue_t *q) { struct cfq_data *cfqd = q->elevator->elevator_data; - return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list); + return !cfq_pending_requests(cfqd); } static void cfq_completed_request(request_queue_t *q, struct request *rq) @@ -1332,51 +1781,132 @@ cfq_latter_request(request_queue_t *q, struct request *rq) return NULL; } -static int cfq_may_queue(request_queue_t *q, int rw) +/* + * we temporarily boost lower priority queues if they are holding fs exclusive + * resources. they are boosted to normal prio (CLASS_BE/4) + */ +static void cfq_prio_boost(struct cfq_queue *cfqq) { - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq; - int ret = ELV_MQUEUE_MAY; + const int ioprio_class = cfqq->ioprio_class; + const int ioprio = cfqq->ioprio; - if (current->flags & PF_MEMALLOC) - return ELV_MQUEUE_MAY; + if (has_fs_excl()) { + /* + * boost idle prio on transactions that would lock out other + * users of the filesystem + */ + if (cfq_class_idle(cfqq)) + cfqq->ioprio_class = IOPRIO_CLASS_BE; + if (cfqq->ioprio > IOPRIO_NORM) + cfqq->ioprio = IOPRIO_NORM; + } else { + /* + * check if we need to unboost the queue + */ + if (cfqq->ioprio_class != cfqq->org_ioprio_class) + cfqq->ioprio_class = cfqq->org_ioprio_class; + if (cfqq->ioprio != cfqq->org_ioprio) + cfqq->ioprio = cfqq->org_ioprio; + } - cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current)); - if (cfqq) { - int limit = cfqd->max_queued; + /* + * refile between round-robin lists if we moved the priority class + */ + if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) && + cfqq->on_rr) + cfq_resort_rr_list(cfqq, 0); +} - if (cfqq->allocated[rw] < cfqd->cfq_queued) - return ELV_MQUEUE_MUST; +static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) +{ + if (rw == READ || process_sync(task)) + return task->pid; - if (cfqd->busy_queues) - limit = q->nr_requests / cfqd->busy_queues; + return CFQ_KEY_ASYNC; +} - if (limit < cfqd->cfq_queued) - limit = cfqd->cfq_queued; - else if (limit > cfqd->max_queued) - limit = cfqd->max_queued; +static inline int +__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct task_struct *task, int rw) +{ + if (cfqq->wait_request && cfqq->must_alloc) + return ELV_MQUEUE_MUST; - if (cfqq->allocated[rw] >= limit) { - if (limit > cfqq->alloc_limit[rw]) - cfqq->alloc_limit[rw] = limit; + return ELV_MQUEUE_MAY; +#if 0 + if (!cfqq || task->flags & PF_MEMALLOC) + return ELV_MQUEUE_MAY; + if (!cfqq->allocated[rw] || cfqq->must_alloc) { + if (cfqq->wait_request) + return ELV_MQUEUE_MUST; - ret = ELV_MQUEUE_NO; + /* + * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we + * can quickly flood the queue with writes from a single task + */ + if (rw == READ || !cfqq->must_alloc_slice) { + cfqq->must_alloc_slice = 1; + return ELV_MQUEUE_MUST; } + + return ELV_MQUEUE_MAY; } + if (cfq_class_idle(cfqq)) + return ELV_MQUEUE_NO; + if (cfqq->allocated[rw] >= cfqd->max_queued) { + struct io_context *ioc = get_io_context(GFP_ATOMIC); + int ret = ELV_MQUEUE_NO; - return ret; + if (ioc && ioc->nr_batch_requests) + ret = ELV_MQUEUE_MAY; + + put_io_context(ioc); + return ret; + } + + return ELV_MQUEUE_MAY; +#endif +} + +static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct cfq_queue *cfqq; + + /* + * don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be queued. + * so just lookup a possibly existing queue, or return 'may queue' + * if that fails + */ + cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw)); + if (cfqq) { + cfq_init_prio_data(cfqq); + cfq_prio_boost(cfqq); + + return __cfq_may_queue(cfqd, cfqq, tsk, rw); + } + + return ELV_MQUEUE_MAY; } static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) { + struct cfq_data *cfqd = q->elevator->elevator_data; struct request_list *rl = &q->rq; - const int write = waitqueue_active(&rl->wait[WRITE]); - const int read = waitqueue_active(&rl->wait[READ]); - if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ]) - wake_up(&rl->wait[READ]); - if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE]) - wake_up(&rl->wait[WRITE]); + if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) { + smp_mb(); + if (waitqueue_active(&rl->wait[READ])) + wake_up(&rl->wait[READ]); + } + + if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) { + smp_mb(); + if (waitqueue_active(&rl->wait[WRITE])) + wake_up(&rl->wait[WRITE]); + } } /* @@ -1389,69 +1919,59 @@ static void cfq_put_request(request_queue_t *q, struct request *rq) if (crq) { struct cfq_queue *cfqq = crq->cfq_queue; + const int rw = rq_data_dir(rq); - BUG_ON(q->last_merge == rq); - BUG_ON(!hlist_unhashed(&crq->hash)); - - if (crq->io_context) - put_io_context(crq->io_context->ioc); + BUG_ON(!cfqq->allocated[rw]); + cfqq->allocated[rw]--; - BUG_ON(!cfqq->allocated[crq->is_write]); - cfqq->allocated[crq->is_write]--; + put_io_context(crq->io_context->ioc); mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; - smp_mb(); cfq_check_waiters(q, cfqq); cfq_put_queue(cfqq); } } /* - * Allocate cfq data structures associated with this request. A queue and + * Allocate cfq data structures associated with this request. */ -static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +static int +cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, + int gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_io_context *cic; const int rw = rq_data_dir(rq); - struct cfq_queue *cfqq, *saved_cfqq; + struct cfq_queue *cfqq; struct cfq_rq *crq; unsigned long flags; might_sleep_if(gfp_mask & __GFP_WAIT); + cic = cfq_get_io_context(cfqd, cfq_queue_pid(current, rw), gfp_mask); + spin_lock_irqsave(q->queue_lock, flags); - cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask); - if (!cfqq) - goto out_lock; + if (!cic) + goto queue_fail; + + if (!cic->cfqq) { + cfqq = cfq_get_queue(cfqd, current->pid, gfp_mask); + if (!cfqq) + goto queue_fail; -repeat: - if (cfqq->allocated[rw] >= cfqd->max_queued) - goto out_lock; + cic->cfqq = cfqq; + } else + cfqq = cic->cfqq; cfqq->allocated[rw]++; + cfqq->must_alloc = 0; + cfqd->rq_starved = 0; + atomic_inc(&cfqq->ref); spin_unlock_irqrestore(q->queue_lock, flags); - /* - * if hashing type has changed, the cfq_queue might change here. - */ - saved_cfqq = cfqq; - cic = cfq_get_io_context(&cfqq, gfp_mask); - if (!cic) - goto err; - - /* - * repeat allocation checks on queue change - */ - if (unlikely(saved_cfqq != cfqq)) { - spin_lock_irqsave(q->queue_lock, flags); - saved_cfqq->allocated[rw]--; - goto repeat; - } - crq = mempool_alloc(cfqd->crq_pool, gfp_mask); if (crq) { RB_CLEAR(&crq->rb_node); @@ -1460,24 +1980,130 @@ repeat: INIT_HLIST_NODE(&crq->hash); crq->cfq_queue = cfqq; crq->io_context = cic; - crq->service_start = crq->queue_start = 0; - crq->in_flight = crq->accounted = crq->is_sync = 0; - crq->is_write = rw; + crq->in_flight = crq->accounted = 0; + crq->is_sync = (rw == READ || process_sync(current)); + crq->requeued = 0; rq->elevator_private = crq; - cfqq->alloc_limit[rw] = 0; return 0; } - put_io_context(cic->ioc); -err: spin_lock_irqsave(q->queue_lock, flags); cfqq->allocated[rw]--; + if (!(cfqq->allocated[0] + cfqq->allocated[1])) + cfqq->must_alloc = 1; cfq_put_queue(cfqq); -out_lock: +queue_fail: + if (cic) + put_io_context(cic->ioc); + /* + * mark us rq allocation starved. we need to kickstart the process + * ourselves if there are no pending requests that can do it for us. + * that would be an extremely rare OOM situation + */ + cfqd->rq_starved = 1; + kblockd_schedule_work(&cfqd->unplug_work); spin_unlock_irqrestore(q->queue_lock, flags); return 1; } +static void cfq_kick_queue(void *data) +{ + request_queue_t *q = data; + struct cfq_data *cfqd = q->elevator->elevator_data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + + if (cfqd->rq_starved) { + struct request_list *rl = &q->rq; + + /* + * we aren't guaranteed to get a request after this, but we + * have to be opportunistic + */ + smp_mb(); + if (waitqueue_active(&rl->wait[READ])) + wake_up(&rl->wait[READ]); + if (waitqueue_active(&rl->wait[WRITE])) + wake_up(&rl->wait[WRITE]); + } + + blk_remove_plug(q); + q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * Timer running if the active_queue is currently idling inside its time slice + */ +static void cfq_idle_slice_timer(unsigned long data) +{ + struct cfq_data *cfqd = (struct cfq_data *) data; + struct cfq_queue *cfqq; + unsigned long flags; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + + if ((cfqq = cfqd->active_queue) != NULL) { + unsigned long now = jiffies; + + /* + * expired + */ + if (time_after(now, cfqq->slice_end)) + goto expire; + + /* + * only expire and reinvoke request handler, if there are + * other queues with pending requests + */ + if (!cfq_pending_requests(cfqd)) { + cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end); + add_timer(&cfqd->idle_slice_timer); + goto out_cont; + } + + /* + * not expired and it has a request pending, let it dispatch + */ + if (!RB_EMPTY(&cfqq->sort_list)) { + cfqq->must_dispatch = 1; + goto out_kick; + } + } +expire: + cfq_slice_expired(cfqd, 0); +out_kick: + if (cfq_pending_requests(cfqd)) + kblockd_schedule_work(&cfqd->unplug_work); +out_cont: + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + +/* + * Timer running if an idle class queue is waiting for service + */ +static void cfq_idle_class_timer(unsigned long data) +{ + struct cfq_data *cfqd = (struct cfq_data *) data; + unsigned long flags, end; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + + /* + * race with a non-idle queue, reset timer + */ + end = cfqd->last_end_request + CFQ_IDLE_GRACE; + if (!time_after_eq(jiffies, end)) { + cfqd->idle_class_timer.expires = end; + add_timer(&cfqd->idle_class_timer); + } else + kblockd_schedule_work(&cfqd->unplug_work); + + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + + static void cfq_put_cfqd(struct cfq_data *cfqd) { request_queue_t *q = cfqd->queue; @@ -1485,6 +2111,8 @@ static void cfq_put_cfqd(struct cfq_data *cfqd) if (!atomic_dec_and_test(&cfqd->ref)) return; + blk_sync_queue(q); + blk_put_queue(q); mempool_destroy(cfqd->crq_pool); @@ -1495,7 +2123,11 @@ static void cfq_put_cfqd(struct cfq_data *cfqd) static void cfq_exit_queue(elevator_t *e) { - cfq_put_cfqd(e->elevator_data); + struct cfq_data *cfqd = e->elevator_data; + + del_timer_sync(&cfqd->idle_slice_timer); + del_timer_sync(&cfqd->idle_class_timer); + cfq_put_cfqd(cfqd); } static int cfq_init_queue(request_queue_t *q, elevator_t *e) @@ -1508,7 +2140,13 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) return -ENOMEM; memset(cfqd, 0, sizeof(*cfqd)); - INIT_LIST_HEAD(&cfqd->rr_list); + + for (i = 0; i < CFQ_PRIO_LISTS; i++) + INIT_LIST_HEAD(&cfqd->rr_list[i]); + + INIT_LIST_HEAD(&cfqd->busy_rr); + INIT_LIST_HEAD(&cfqd->cur_rr); + INIT_LIST_HEAD(&cfqd->idle_rr); INIT_LIST_HEAD(&cfqd->empty_list); cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); @@ -1533,25 +2171,32 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) cfqd->queue = q; atomic_inc(&q->refcnt); - /* - * just set it to some high value, we want anyone to be able to queue - * some requests. fairness is handled differently - */ - q->nr_requests = 1024; - cfqd->max_queued = q->nr_requests / 16; + cfqd->max_queued = q->nr_requests / 4; q->nr_batching = cfq_queued; - cfqd->key_type = CFQ_KEY_TGID; - cfqd->find_best_crq = 1; + + init_timer(&cfqd->idle_slice_timer); + cfqd->idle_slice_timer.function = cfq_idle_slice_timer; + cfqd->idle_slice_timer.data = (unsigned long) cfqd; + + init_timer(&cfqd->idle_class_timer); + cfqd->idle_class_timer.function = cfq_idle_class_timer; + cfqd->idle_class_timer.data = (unsigned long) cfqd; + + INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); + atomic_set(&cfqd->ref, 1); cfqd->cfq_queued = cfq_queued; cfqd->cfq_quantum = cfq_quantum; - cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r; - cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w; - cfqd->cfq_fifo_batch_expire = cfq_fifo_rate; + cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; + cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; cfqd->cfq_back_max = cfq_back_max; cfqd->cfq_back_penalty = cfq_back_penalty; - + cfqd->cfq_slice[0] = cfq_slice_async; + cfqd->cfq_slice[1] = cfq_slice_sync; + cfqd->cfq_slice_async_rq = cfq_slice_async_rq; + cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_max_depth = cfq_max_depth; return 0; out_crqpool: kfree(cfqd->cfq_hash); @@ -1595,7 +2240,6 @@ fail: return -ENOMEM; } - /* * sysfs parts below --> */ @@ -1620,45 +2264,6 @@ cfq_var_store(unsigned int *var, const char *page, size_t count) return count; } -static ssize_t -cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count) -{ - max_elapsed_dispatch = max_elapsed_crq = 0; - return count; -} - -static ssize_t -cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count) -{ - spin_lock_irq(cfqd->queue->queue_lock); - if (!strncmp(page, "pgid", 4)) - cfqd->key_type = CFQ_KEY_PGID; - else if (!strncmp(page, "tgid", 4)) - cfqd->key_type = CFQ_KEY_TGID; - else if (!strncmp(page, "uid", 3)) - cfqd->key_type = CFQ_KEY_UID; - else if (!strncmp(page, "gid", 3)) - cfqd->key_type = CFQ_KEY_GID; - spin_unlock_irq(cfqd->queue->queue_lock); - return count; -} - -static ssize_t -cfq_read_key_type(struct cfq_data *cfqd, char *page) -{ - ssize_t len = 0; - int i; - - for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) { - if (cfqd->key_type == i) - len += sprintf(page+len, "[%s] ", cfq_key_types[i]); - else - len += sprintf(page+len, "%s ", cfq_key_types[i]); - } - len += sprintf(page+len, "\n"); - return len; -} - #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ { \ @@ -1669,12 +2274,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); -SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1); -SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1); -SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1); -SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0); +SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); +SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0); +SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); +SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); +SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); +SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); +SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -1694,12 +2302,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ } STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); -STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1); -STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0); +STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); +STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); +STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0); #undef STORE_FUNCTION static struct cfq_fs_entry cfq_quantum_entry = { @@ -1712,25 +2323,15 @@ static struct cfq_fs_entry cfq_queued_entry = { .show = cfq_queued_show, .store = cfq_queued_store, }; -static struct cfq_fs_entry cfq_fifo_expire_r_entry = { +static struct cfq_fs_entry cfq_fifo_expire_sync_entry = { .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_expire_r_show, - .store = cfq_fifo_expire_r_store, + .show = cfq_fifo_expire_sync_show, + .store = cfq_fifo_expire_sync_store, }; -static struct cfq_fs_entry cfq_fifo_expire_w_entry = { +static struct cfq_fs_entry cfq_fifo_expire_async_entry = { .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_expire_w_show, - .store = cfq_fifo_expire_w_store, -}; -static struct cfq_fs_entry cfq_fifo_batch_expire_entry = { - .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_fifo_batch_expire_show, - .store = cfq_fifo_batch_expire_store, -}; -static struct cfq_fs_entry cfq_find_best_entry = { - .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_find_best_show, - .store = cfq_find_best_store, + .show = cfq_fifo_expire_async_show, + .store = cfq_fifo_expire_async_store, }; static struct cfq_fs_entry cfq_back_max_entry = { .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, @@ -1742,27 +2343,43 @@ static struct cfq_fs_entry cfq_back_penalty_entry = { .show = cfq_back_penalty_show, .store = cfq_back_penalty_store, }; -static struct cfq_fs_entry cfq_clear_elapsed_entry = { - .attr = {.name = "clear_elapsed", .mode = S_IWUSR }, - .store = cfq_clear_elapsed, +static struct cfq_fs_entry cfq_slice_sync_entry = { + .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_sync_show, + .store = cfq_slice_sync_store, }; -static struct cfq_fs_entry cfq_key_type_entry = { - .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_read_key_type, - .store = cfq_set_key_type, +static struct cfq_fs_entry cfq_slice_async_entry = { + .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_async_show, + .store = cfq_slice_async_store, +}; +static struct cfq_fs_entry cfq_slice_async_rq_entry = { + .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_async_rq_show, + .store = cfq_slice_async_rq_store, +}; +static struct cfq_fs_entry cfq_slice_idle_entry = { + .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_slice_idle_show, + .store = cfq_slice_idle_store, +}; +static struct cfq_fs_entry cfq_max_depth_entry = { + .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_max_depth_show, + .store = cfq_max_depth_store, }; - static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, &cfq_queued_entry.attr, - &cfq_fifo_expire_r_entry.attr, - &cfq_fifo_expire_w_entry.attr, - &cfq_fifo_batch_expire_entry.attr, - &cfq_key_type_entry.attr, - &cfq_find_best_entry.attr, + &cfq_fifo_expire_sync_entry.attr, + &cfq_fifo_expire_async_entry.attr, &cfq_back_max_entry.attr, &cfq_back_penalty_entry.attr, - &cfq_clear_elapsed_entry.attr, + &cfq_slice_sync_entry.attr, + &cfq_slice_async_entry.attr, + &cfq_slice_async_rq_entry.attr, + &cfq_slice_idle_entry.attr, + &cfq_max_depth_entry.attr, NULL, }; @@ -1832,21 +2449,46 @@ static int __init cfq_init(void) { int ret; + /* + * could be 0 on HZ < 1000 setups + */ + if (!cfq_slice_async) + cfq_slice_async = 1; + if (!cfq_slice_idle) + cfq_slice_idle = 1; + if (cfq_slab_setup()) return -ENOMEM; ret = elv_register(&iosched_cfq); - if (!ret) { - __module_get(THIS_MODULE); - return 0; - } + if (ret) + cfq_slab_kill(); - cfq_slab_kill(); return ret; } static void __exit cfq_exit(void) { + struct task_struct *g, *p; + unsigned long flags; + + read_lock_irqsave(&tasklist_lock, flags); + + /* + * iterate each process in the system, removing our io_context + */ + do_each_thread(g, p) { + struct io_context *ioc = p->io_context; + + if (ioc && ioc->cic) { + ioc->cic->exit(ioc->cic); + cfq_free_io_context(ioc->cic); + ioc->cic = NULL; + } + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); + cfq_slab_kill(); elv_unregister(&iosched_cfq); } diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c index 4bc2fea73273..ff5201e02153 100644 --- a/drivers/block/deadline-iosched.c +++ b/drivers/block/deadline-iosched.c @@ -760,7 +760,8 @@ static void deadline_put_request(request_queue_t *q, struct request *rq) } static int -deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio, + int gfp_mask) { struct deadline_data *dd = q->elevator->elevator_data; struct deadline_rq *drq; diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index f831f08f839c..98f0126a2deb 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -486,12 +486,13 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq) return NULL; } -int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio, + int gfp_mask) { elevator_t *e = q->elevator; if (e->ops->elevator_set_req_fn) - return e->ops->elevator_set_req_fn(q, rq, gfp_mask); + return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask); rq->elevator_private = NULL; return 0; @@ -505,12 +506,12 @@ void elv_put_request(request_queue_t *q, struct request *rq) e->ops->elevator_put_req_fn(q, rq); } -int elv_may_queue(request_queue_t *q, int rw) +int elv_may_queue(request_queue_t *q, int rw, struct bio *bio) { elevator_t *e = q->elevator; if (e->ops->elevator_may_queue_fn) - return e->ops->elevator_may_queue_fn(q, rw); + return e->ops->elevator_may_queue_fn(q, rw, bio); return ELV_MQUEUE_MAY; } diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 60e64091de1b..234fdcfbdf01 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -276,6 +276,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq) rq->errors = 0; rq->rq_status = RQ_ACTIVE; rq->bio = rq->biotail = NULL; + rq->ioprio = 0; rq->buffer = NULL; rq->ref_count = 1; rq->q = q; @@ -1442,11 +1443,7 @@ void __generic_unplug_device(request_queue_t *q) if (!blk_remove_plug(q)) return; - /* - * was plugged, fire request_fn if queue has stuff to do - */ - if (elv_next_request(q)) - q->request_fn(q); + q->request_fn(q); } EXPORT_SYMBOL(__generic_unplug_device); @@ -1776,8 +1773,8 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq) mempool_free(rq, q->rq.rq_pool); } -static inline struct request *blk_alloc_request(request_queue_t *q, int rw, - int gfp_mask) +static inline struct request * +blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -1790,7 +1787,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q, int rw, */ rq->flags = rw; - if (!elv_set_request(q, rq, gfp_mask)) + if (!elv_set_request(q, rq, bio, gfp_mask)) return rq; mempool_free(rq, q->rq.rq_pool); @@ -1872,7 +1869,8 @@ static void freed_request(request_queue_t *q, int rw) /* * Get a free request, queue_lock must not be held */ -static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) +static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, + int gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; @@ -1895,7 +1893,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) } } - switch (elv_may_queue(q, rw)) { + switch (elv_may_queue(q, rw, bio)) { case ELV_MQUEUE_NO: goto rq_starved; case ELV_MQUEUE_MAY: @@ -1920,7 +1918,7 @@ get_rq: set_queue_congested(q, rw); spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw, gfp_mask); + rq = blk_alloc_request(q, rw, bio, gfp_mask); if (!rq) { /* * Allocation failed presumably due to memory. Undo anything @@ -1961,7 +1959,8 @@ out: * No available requests for this queue, unplug the device and wait for some * requests to become available. */ -static struct request *get_request_wait(request_queue_t *q, int rw) +static struct request *get_request_wait(request_queue_t *q, int rw, + struct bio *bio) { DEFINE_WAIT(wait); struct request *rq; @@ -1972,7 +1971,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw) prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw, GFP_NOIO); + rq = get_request(q, rw, bio, GFP_NOIO); if (!rq) { struct io_context *ioc; @@ -2003,9 +2002,9 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) BUG_ON(rw != READ && rw != WRITE); if (gfp_mask & __GFP_WAIT) - rq = get_request_wait(q, rw); + rq = get_request_wait(q, rw, NULL); else - rq = get_request(q, rw, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); return rq; } @@ -2333,7 +2332,6 @@ static void __blk_put_request(request_queue_t *q, struct request *req) return; req->rq_status = RQ_INACTIVE; - req->q = NULL; req->rl = NULL; /* @@ -2462,6 +2460,8 @@ static int attempt_merge(request_queue_t *q, struct request *req, req->rq_disk->in_flight--; } + req->ioprio = ioprio_best(req->ioprio, next->ioprio); + __blk_put_request(q, next); return 1; } @@ -2514,11 +2514,13 @@ static int __make_request(request_queue_t *q, struct bio *bio) { struct request *req, *freereq = NULL; int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; + unsigned short prio; sector_t sector; sector = bio->bi_sector; nr_sectors = bio_sectors(bio); cur_nr_sectors = bio_cur_sectors(bio); + prio = bio_prio(bio); rw = bio_data_dir(bio); sync = bio_sync(bio); @@ -2559,6 +2561,7 @@ again: req->biotail->bi_next = bio; req->biotail = bio; req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); drive_stat_acct(req, nr_sectors, 0); if (!attempt_back_merge(q, req)) elv_merged_request(q, req); @@ -2583,6 +2586,7 @@ again: req->hard_cur_sectors = cur_nr_sectors; req->sector = req->hard_sector = sector; req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->ioprio = ioprio_best(req->ioprio, prio); drive_stat_acct(req, nr_sectors, 0); if (!attempt_front_merge(q, req)) elv_merged_request(q, req); @@ -2610,7 +2614,7 @@ get_rq: freereq = NULL; } else { spin_unlock_irq(q->queue_lock); - if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) { + if ((freereq = get_request(q, rw, bio, GFP_ATOMIC)) == NULL) { /* * READA bit set */ @@ -2618,7 +2622,7 @@ get_rq: if (bio_rw_ahead(bio)) goto end_io; - freereq = get_request_wait(q, rw); + freereq = get_request_wait(q, rw, bio); } goto again; } @@ -2646,6 +2650,7 @@ get_rq: req->buffer = bio_data(bio); /* see ->buffer comment above */ req->waiting = NULL; req->bio = req->biotail = bio; + req->ioprio = prio; req->rq_disk = bio->bi_bdev->bd_disk; req->start_time = jiffies; @@ -2674,7 +2679,7 @@ static inline void blk_partition_remap(struct bio *bio) if (bdev != bdev->bd_contains) { struct hd_struct *p = bdev->bd_part; - switch (bio->bi_rw) { + switch (bio_data_dir(bio)) { case READ: p->read_sectors += bio_sectors(bio); p->reads++; @@ -2693,6 +2698,7 @@ void blk_finish_queue_drain(request_queue_t *q) { struct request_list *rl = &q->rq; struct request *rq; + int requeued = 0; spin_lock_irq(q->queue_lock); clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); @@ -2701,9 +2707,13 @@ void blk_finish_queue_drain(request_queue_t *q) rq = list_entry_rq(q->drain_list.next); list_del_init(&rq->queuelist); - __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); + elv_requeue_request(q, rq); + requeued++; } + if (requeued) + q->request_fn(q); + spin_unlock_irq(q->queue_lock); wake_up(&rl->wait[0]); @@ -2900,7 +2910,7 @@ void submit_bio(int rw, struct bio *bio) BIO_BUG_ON(!bio->bi_size); BIO_BUG_ON(!bio->bi_io_vec); - bio->bi_rw = rw; + bio->bi_rw |= rw; if (rw & WRITE) mod_page_state(pgpgout, count); else @@ -3257,8 +3267,11 @@ void exit_io_context(void) struct io_context *ioc; local_irq_save(flags); + task_lock(current); ioc = current->io_context; current->io_context = NULL; + ioc->task = NULL; + task_unlock(current); local_irq_restore(flags); if (ioc->aic && ioc->aic->exit) @@ -3293,12 +3306,12 @@ struct io_context *get_io_context(int gfp_flags) ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); if (ret) { atomic_set(&ret->refcount, 1); - ret->pid = tsk->pid; + ret->task = current; + ret->set_ioprio = NULL; ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; ret->cic = NULL; - spin_lock_init(&ret->lock); local_irq_save(flags); diff --git a/fs/Makefile b/fs/Makefile index fc92e59e9faf..20edcf28bfd2 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,6 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ + ioprio.o obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_COMPAT) += compat.o diff --git a/fs/ioprio.c b/fs/ioprio.c new file mode 100644 index 000000000000..663e420636d6 --- /dev/null +++ b/fs/ioprio.c @@ -0,0 +1,172 @@ +/* + * fs/ioprio.c + * + * Copyright (C) 2004 Jens Axboe + * + * Helper functions for setting/querying io priorities of processes. The + * system calls closely mimmick getpriority/setpriority, see the man page for + * those. The prio argument is a composite of prio class and prio data, where + * the data argument has meaning within that class. The standard scheduling + * classes have 8 distinct prio levels, with 0 being the highest prio and 7 + * being the lowest. + * + * IOW, setting BE scheduling class with prio 2 is done ala: + * + * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; + * + * ioprio_set(PRIO_PROCESS, pid, prio); + * + * See also Documentation/block/ioprio.txt + * + */ +#include +#include +#include + +static int set_task_ioprio(struct task_struct *task, int ioprio) +{ + struct io_context *ioc; + + if (task->uid != current->euid && + task->uid != current->uid && !capable(CAP_SYS_NICE)) + return -EPERM; + + task_lock(task); + + task->ioprio = ioprio; + + ioc = task->io_context; + if (ioc && ioc->set_ioprio) + ioc->set_ioprio(ioc, ioprio); + + task_unlock(task); + return 0; +} + +asmlinkage int sys_ioprio_set(int which, int who, int ioprio) +{ + int class = IOPRIO_PRIO_CLASS(ioprio); + int data = IOPRIO_PRIO_DATA(ioprio); + struct task_struct *p, *g; + struct user_struct *user; + int ret; + + switch (class) { + case IOPRIO_CLASS_RT: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* fall through, rt has prio field too */ + case IOPRIO_CLASS_BE: + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + break; + case IOPRIO_CLASS_IDLE: + break; + default: + return -EINVAL; + } + + ret = -ESRCH; + read_lock_irq(&tasklist_lock); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_pid(who); + if (p) + ret = set_task_ioprio(p, ioprio); + break; + case IOPRIO_WHO_PGRP: + if (!who) + who = process_group(current); + do_each_task_pid(who, PIDTYPE_PGID, p) { + ret = set_task_ioprio(p, ioprio); + if (ret) + break; + } while_each_task_pid(who, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) + user = current->user; + else + user = find_user(who); + + if (!user) + break; + + do_each_thread(g, p) { + if (p->uid != who) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + break; + } while_each_thread(g, p); + + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + read_unlock_irq(&tasklist_lock); + return ret; +} + +asmlinkage int sys_ioprio_get(int which, int who) +{ + struct task_struct *g, *p; + struct user_struct *user; + int ret = -ESRCH; + + read_lock_irq(&tasklist_lock); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_pid(who); + if (p) + ret = p->ioprio; + break; + case IOPRIO_WHO_PGRP: + if (!who) + who = process_group(current); + do_each_task_pid(who, PIDTYPE_PGID, p) { + if (ret == -ESRCH) + ret = p->ioprio; + else + ret = ioprio_best(ret, p->ioprio); + } while_each_task_pid(who, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) + user = current->user; + else + user = find_user(who); + + if (!user) + break; + + do_each_thread(g, p) { + if (p->uid != user->uid) + continue; + if (ret == -ESRCH) + ret = p->ioprio; + else + ret = ioprio_best(ret, p->ioprio); + } while_each_thread(g, p); + + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + read_unlock_irq(&tasklist_lock); + return ret; +} + diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 7b87707acc36..d1bcf0da6728 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -645,18 +645,22 @@ struct buffer_chunk { static void write_chunk(struct buffer_chunk *chunk) { int i; + get_fs_excl(); for (i = 0; i < chunk->nr ; i++) { submit_logged_buffer(chunk->bh[i]) ; } chunk->nr = 0; + put_fs_excl(); } static void write_ordered_chunk(struct buffer_chunk *chunk) { int i; + get_fs_excl(); for (i = 0; i < chunk->nr ; i++) { submit_ordered_buffer(chunk->bh[i]) ; } chunk->nr = 0; + put_fs_excl(); } static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, @@ -918,6 +922,8 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list return 0 ; } + get_fs_excl(); + /* before we can put our commit blocks on disk, we have to make sure everyone older than ** us is on disk too */ @@ -1055,6 +1061,7 @@ put_jl: if (retval) reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__); + put_fs_excl(); return retval; } @@ -1251,6 +1258,8 @@ static int flush_journal_list(struct super_block *s, return 0 ; } + get_fs_excl(); + /* if all the work is already done, get out of here */ if (atomic_read(&(jl->j_nonzerolen)) <= 0 && atomic_read(&(jl->j_commit_left)) <= 0) { @@ -1450,6 +1459,7 @@ flush_older_and_return: put_journal_list(s, jl); if (flushall) up(&journal->j_flush_sem); + put_fs_excl(); return err ; } @@ -2719,6 +2729,7 @@ relock: th->t_trans_id = journal->j_trans_id ; unlock_journal(p_s_sb) ; INIT_LIST_HEAD (&th->t_list); + get_fs_excl(); return 0 ; out_fail: @@ -3526,6 +3537,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b BUG_ON (th->t_refcount > 1); BUG_ON (!th->t_trans_id); + put_fs_excl(); current->journal_info = th->t_handle_save; reiserfs_check_lock_depth(p_s_sb, "journal end"); if (journal->j_len == 0) { diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index 176413fb9ae3..e25e4c71a879 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -294,8 +294,10 @@ #define __NR_add_key 286 #define __NR_request_key 287 #define __NR_keyctl 288 +#define __NR_ioprio_set 289 +#define __NR_ioprio_get 290 -#define NR_syscalls 289 +#define NR_syscalls 291 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index f7f43ec2483a..517f1649ee64 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -263,6 +263,8 @@ #define __NR_add_key 1271 #define __NR_request_key 1272 #define __NR_keyctl 1273 +#define __NR_ioprio_set 1274 +#define __NR_ioprio_get 1275 #define __NR_set_zone_reclaim 1276 #ifdef __KERNEL__ diff --git a/include/asm-ppc/unistd.h b/include/asm-ppc/unistd.h index cc51e5c9acc2..e8b79220b29c 100644 --- a/include/asm-ppc/unistd.h +++ b/include/asm-ppc/unistd.h @@ -277,8 +277,10 @@ #define __NR_request_key 270 #define __NR_keyctl 271 #define __NR_waitid 272 +#define __NR_ioprio_set 273 +#define __NR_ioprio_get 274 -#define __NR_syscalls 273 +#define __NR_syscalls 275 #define __NR(n) #n diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index d767adcbf0ff..6560439a83e4 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -561,8 +561,12 @@ __SYSCALL(__NR_add_key, sys_add_key) __SYSCALL(__NR_request_key, sys_request_key) #define __NR_keyctl 250 __SYSCALL(__NR_keyctl, sys_keyctl) +#define __NR_ioprio_set 251 +__SYSCALL(__NR_ioprio_set, sys_ioprio_set) +#define __NR_ioprio_get 252 +__SYSCALL(__NR_ioprio_get, sys_ioprio_get) -#define __NR_syscall_max __NR_keyctl +#define __NR_syscall_max __NR_ioprio_get #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 038022763f09..36ef29fa0d8b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -22,6 +22,7 @@ #include #include +#include /* Platforms may set this to teach the BIO layer about IOMMU hardware. */ #include @@ -149,6 +150,19 @@ struct bio { #define BIO_RW_FAILFAST 3 #define BIO_RW_SYNC 4 +/* + * upper 16 bits of bi_rw define the io priority of this bio + */ +#define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS) +#define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT) +#define bio_prio_valid(bio) ioprio_valid(bio_prio(bio)) + +#define bio_set_prio(bio, prio) do { \ + WARN_ON(prio >= (1 << IOPRIO_BITS)); \ + (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \ + (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \ +} while (0) + /* * various member access, note that bio_data should of course not be used * on highmem page vectors diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b54a0348a890..21a8674cd149 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -54,16 +54,23 @@ struct as_io_context { struct cfq_queue; struct cfq_io_context { - void (*dtor)(struct cfq_io_context *); - void (*exit)(struct cfq_io_context *); - - struct io_context *ioc; - /* * circular list of cfq_io_contexts belonging to a process io context */ struct list_head list; struct cfq_queue *cfqq; + void *key; + + struct io_context *ioc; + + unsigned long last_end_request; + unsigned long last_queue; + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; + + void (*dtor)(struct cfq_io_context *); + void (*exit)(struct cfq_io_context *); }; /* @@ -73,7 +80,9 @@ struct cfq_io_context { */ struct io_context { atomic_t refcount; - pid_t pid; + struct task_struct *task; + + int (*set_ioprio)(struct io_context *, unsigned int); /* * For request batching @@ -81,8 +90,6 @@ struct io_context { unsigned long last_waited; /* Time last woken after wait for request */ int nr_batch_requests; /* Number of requests left in the batch */ - spinlock_t lock; - struct as_io_context *aic; struct cfq_io_context *cic; }; @@ -134,6 +141,8 @@ struct request { void *elevator_private; + unsigned short ioprio; + int rq_status; /* should split this into a few status bits */ struct gendisk *rq_disk; int errors; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ee54f81faad5..ea6bbc2d7407 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -16,9 +16,9 @@ typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *); typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *); typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *); typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *); -typedef int (elevator_may_queue_fn) (request_queue_t *, int); +typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *); -typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int); +typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, int); typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *); @@ -96,9 +96,9 @@ extern struct request *elv_former_request(request_queue_t *, struct request *); extern struct request *elv_latter_request(request_queue_t *, struct request *); extern int elv_register_queue(request_queue_t *q); extern void elv_unregister_queue(request_queue_t *q); -extern int elv_may_queue(request_queue_t *, int); +extern int elv_may_queue(request_queue_t *, int, struct bio *); extern void elv_completed_request(request_queue_t *, struct request *); -extern int elv_set_request(request_queue_t *, struct request *, int); +extern int elv_set_request(request_queue_t *, struct request *, struct bio *, int); extern void elv_put_request(request_queue_t *, struct request *); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 3ae8e37bdfc8..047bde30836a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -213,6 +213,7 @@ extern int dir_notify_enable; #include #include #include +#include #include #include @@ -822,16 +823,34 @@ enum { #define vfs_check_frozen(sb, level) \ wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) +static inline void get_fs_excl(void) +{ + atomic_inc(¤t->fs_excl); +} + +static inline void put_fs_excl(void) +{ + atomic_dec(¤t->fs_excl); +} + +static inline int has_fs_excl(void) +{ + return atomic_read(¤t->fs_excl); +} + + /* * Superblock locking. */ static inline void lock_super(struct super_block * sb) { + get_fs_excl(); down(&sb->s_lock); } static inline void unlock_super(struct super_block * sb) { + put_fs_excl(); up(&sb->s_lock); } diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 03206a425d7a..c727c195a91a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -81,6 +81,7 @@ extern struct group_info init_groups; .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .ioprio = 0, \ .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ @@ -110,6 +111,7 @@ extern struct group_info init_groups; .proc_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .fs_excl = ATOMIC_INIT(0), \ } diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h new file mode 100644 index 000000000000..7811300d88ee --- /dev/null +++ b/include/linux/ioprio.h @@ -0,0 +1,87 @@ +#ifndef IOPRIO_H +#define IOPRIO_H + +#include + +/* + * Gives us 8 prio classes with 13-bits of data for each class + */ +#define IOPRIO_BITS (16) +#define IOPRIO_CLASS_SHIFT (13) +#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) + +#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) +#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) + +#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) + +/* + * These are the io priority groups as implemented by CFQ. RT is the realtime + * class, it always gets premium service. BE is the best-effort scheduling + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ +enum { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, + IOPRIO_CLASS_IDLE, +}; + +/* + * 8 best effort priority levels are supported + */ +#define IOPRIO_BE_NR (8) + +asmlinkage int sys_ioprio_set(int, int, int); +asmlinkage int sys_ioprio_get(int, int); + +enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, +}; + +/* + * if process has set io priority explicitly, use that. if not, convert + * the cpu scheduler nice value to an io priority + */ +#define IOPRIO_NORM (4) +static inline int task_ioprio(struct task_struct *task) +{ + WARN_ON(!ioprio_valid(task->ioprio)); + return IOPRIO_PRIO_DATA(task->ioprio); +} + +static inline int task_nice_ioprio(struct task_struct *task) +{ + return (task_nice(task) + 20) / 5; +} + +/* + * For inheritance, return the highest of the two given priorities + */ +static inline int ioprio_best(unsigned short aprio, unsigned short bprio) +{ + unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); + unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); + + if (!ioprio_valid(aprio)) + return bprio; + if (!ioprio_valid(bprio)) + return aprio; + + if (aclass == IOPRIO_CLASS_NONE) + aclass = IOPRIO_CLASS_BE; + if (bclass == IOPRIO_CLASS_NONE) + bclass = IOPRIO_CLASS_BE; + + if (aclass == bclass) + return min(aprio, bprio); + if (aclass > bclass) + return bprio; + else + return aprio; +} + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 9530b1903160..ff48815bd3a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -608,6 +608,8 @@ struct task_struct { struct list_head run_list; prio_array_t *array; + unsigned short ioprio; + unsigned long sleep_avg; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ @@ -763,6 +765,7 @@ struct task_struct { nodemask_t mems_allowed; int cpuset_mems_generation; #endif + atomic_t fs_excl; /* holding fs exclusive resources */ }; static inline pid_t process_group(struct task_struct *tsk) @@ -1112,7 +1115,8 @@ extern void unhash_process(struct task_struct *p); /* * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring - * subscriptions and synchronises with wait4(). Also used in procfs. + * subscriptions and synchronises with wait4(). Also used in procfs. Also + * pins the final release of task.io_context. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1262cb43c3ab..d5c3fe1bf33d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -14,11 +14,13 @@ extern struct list_head inode_unused; * Yes, writeback.h requires sched.h * No, sched.h is not included from here. */ -static inline int current_is_pdflush(void) +static inline int task_is_pdflush(struct task_struct *task) { - return current->flags & PF_FLUSHER; + return task->flags & PF_FLUSHER; } +#define current_is_pdflush() task_is_pdflush(current) + /* * fs/fs-writeback.c */ diff --git a/kernel/exit.c b/kernel/exit.c index 3ebcd60a19c6..9d1b10ed0135 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -784,6 +784,8 @@ fastcall NORET_TYPE void do_exit(long code) profile_task_exit(tsk); + WARN_ON(atomic_read(&tsk->fs_excl)); + if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) diff --git a/kernel/fork.c b/kernel/fork.c index 2c7806873bfd..cdef6cea8900 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1090,6 +1090,11 @@ static task_t *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); } + /* + * inherit ioprio + */ + p->ioprio = current->ioprio; + SET_LINKS(p); if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); diff --git a/kernel/sched.c b/kernel/sched.c index a07cff90d849..e2b0d3e4dd06 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3448,15 +3448,7 @@ int task_nice(const task_t *p) { return TASK_NICE(p); } - -/* - * The only users of task_nice are binfmt_elf and binfmt_elf32. - * binfmt_elf is no longer modular, but binfmt_elf32 still is. - * Therefore, task_nice is needed if there is a compat_mode. - */ -#ifdef CONFIG_COMPAT EXPORT_SYMBOL_GPL(task_nice); -#endif /** * idle_cpu - is a given cpu idle currently? -- cgit v1.2.3 From 313980c92724cf42877a7bdafdef439ee9d68ccb Mon Sep 17 00:00:00 2001 From: David Brownell Date: Mon, 11 Apr 2005 15:38:25 -0700 Subject: [PATCH] USB: omap_udc updates (mostly cleanups) Various USB patches, mostly for portability: - Fifo mode 1 didn't work previously (oopsed), so now it's fixed and (why not) defines even more endpoints for composite devices. - OMAP 1710 doesn't have an internal transceiver. - Small PM update: if the USB link is suspended, don't disconnect on entry to deep sleep. - Be more correct about handling zero length control reads. OMAP seems to mis-handle that protocol peculiarity though; best avoided. - Platform device resources (for UDC and OTG controllers) now use physical addresses, so /proc/iomem is more consistent. - Minor cleanups, notably (by volume) for "sparse" NULL warnings. Signed-off-by: David Brownell --- arch/arm/mach-omap/usb.c | 8 +-- drivers/usb/gadget/omap_udc.c | 150 ++++++++++++++++++++++++++---------------- 2 files changed, 97 insertions(+), 61 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap/usb.c b/arch/arm/mach-omap/usb.c index 6e805d451d0e..7f37857b1a28 100644 --- a/arch/arm/mach-omap/usb.c +++ b/arch/arm/mach-omap/usb.c @@ -288,8 +288,8 @@ static void usb_release(struct device *dev) static struct resource udc_resources[] = { /* order is significant! */ { /* registers */ - .start = IO_ADDRESS(UDC_BASE), - .end = IO_ADDRESS(UDC_BASE + 0xff), + .start = UDC_BASE, + .end = UDC_BASE + 0xff, .flags = IORESOURCE_MEM, }, { /* general IRQ */ .start = IH2_BASE + 20, @@ -355,8 +355,8 @@ static struct platform_device ohci_device = { static struct resource otg_resources[] = { /* order is significant! */ { - .start = IO_ADDRESS(OTG_BASE), - .end = IO_ADDRESS(OTG_BASE + 0xff), + .start = OTG_BASE, + .end = OTG_BASE + 0xff, .flags = IORESOURCE_MEM, }, { .start = IH2_BASE + 8, diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c index 98cbcbc16cc1..4ec91a68f96c 100644 --- a/drivers/usb/gadget/omap_udc.c +++ b/drivers/usb/gadget/omap_udc.c @@ -52,7 +52,6 @@ #include #include -#include #include #include "omap_udc.h" @@ -253,7 +252,7 @@ static int omap_ep_disable(struct usb_ep *_ep) } spin_lock_irqsave(&ep->udc->lock, flags); - ep->desc = 0; + ep->desc = NULL; nuke (ep, -ESHUTDOWN); ep->ep.maxpacket = ep->maxpacket; ep->has_dma = 0; @@ -388,8 +387,8 @@ done(struct omap_ep *ep, struct omap_req *req, int status) /*-------------------------------------------------------------------------*/ -#define FIFO_FULL (UDC_NON_ISO_FIFO_FULL | UDC_ISO_FIFO_FULL) -#define FIFO_UNWRITABLE (UDC_EP_HALTED | FIFO_FULL) +#define UDC_FIFO_FULL (UDC_NON_ISO_FIFO_FULL | UDC_ISO_FIFO_FULL) +#define UDC_FIFO_UNWRITABLE (UDC_EP_HALTED | UDC_FIFO_FULL) #define FIFO_EMPTY (UDC_NON_ISO_FIFO_EMPTY | UDC_ISO_FIFO_EMPTY) #define FIFO_UNREADABLE (UDC_EP_HALTED | FIFO_EMPTY) @@ -433,7 +432,7 @@ static int write_fifo(struct omap_ep *ep, struct omap_req *req) /* PIO-IN isn't double buffered except for iso */ ep_stat = UDC_STAT_FLG_REG; - if (ep_stat & FIFO_UNWRITABLE) + if (ep_stat & UDC_FIFO_UNWRITABLE) return 0; count = ep->ep.maxpacket; @@ -504,7 +503,7 @@ static int read_fifo(struct omap_ep *ep, struct omap_req *req) if (ep_stat & UDC_EP_HALTED) break; - if (ep_stat & FIFO_FULL) + if (ep_stat & UDC_FIFO_FULL) avail = ep->ep.maxpacket; else { avail = UDC_RXFSTAT_REG; @@ -856,7 +855,7 @@ static void dma_channel_release(struct omap_ep *ep) if (!list_empty(&ep->queue)) req = container_of(ep->queue.next, struct omap_req, queue); else - req = 0; + req = NULL; active = ((1 << 7) & omap_readl(OMAP_DMA_CCR(ep->lch))) != 0; @@ -997,18 +996,19 @@ omap_ep_queue(struct usb_ep *_ep, struct usb_request *_req, int gfp_flags) UDC_IRQ_EN_REG = irq_en; } - /* STATUS is reverse direction */ - UDC_EP_NUM_REG = is_in - ? UDC_EP_SEL - : (UDC_EP_SEL|UDC_EP_DIR); + /* STATUS for zero length DATA stages is + * always an IN ... even for IN transfers, + * a wierd case which seem to stall OMAP. + */ + UDC_EP_NUM_REG = (UDC_EP_SEL|UDC_EP_DIR); UDC_CTRL_REG = UDC_CLR_EP; UDC_CTRL_REG = UDC_SET_FIFO_EN; - UDC_EP_NUM_REG = udc->ep0_in ? 0 : UDC_EP_DIR; + UDC_EP_NUM_REG = UDC_EP_DIR; /* cleanup */ udc->ep0_pending = 0; done(ep, req, 0); - req = 0; + req = NULL; /* non-empty DATA stage */ } else if (is_in) { @@ -1029,7 +1029,7 @@ omap_ep_queue(struct usb_ep *_ep, struct usb_request *_req, int gfp_flags) (is_in ? next_in_dma : next_out_dma)(ep, req); else if (req) { if ((is_in ? write_fifo : read_fifo)(ep, req) == 1) - req = 0; + req = NULL; deselect_ep(); if (!is_in) { UDC_CTRL_REG = UDC_SET_FIFO_EN; @@ -1041,7 +1041,7 @@ omap_ep_queue(struct usb_ep *_ep, struct usb_request *_req, int gfp_flags) irq_wait: /* irq handler advances the queue */ - if (req != 0) + if (req != NULL) list_add_tail(&req->queue, &ep->queue); spin_unlock_irqrestore(&udc->lock, flags); @@ -1238,6 +1238,8 @@ static int can_pullup(struct omap_udc *udc) static void pullup_enable(struct omap_udc *udc) { + udc->gadget.dev.parent->power.power_state = PMSG_ON; + udc->gadget.dev.power.power_state = PMSG_ON; UDC_SYSCON1_REG |= UDC_PULLUP_EN; #ifndef CONFIG_USB_OTG if (!cpu_is_omap15xx()) @@ -1382,7 +1384,7 @@ static void update_otg(struct omap_udc *udc) static void ep0_irq(struct omap_udc *udc, u16 irq_src) { struct omap_ep *ep0 = &udc->ep[0]; - struct omap_req *req = 0; + struct omap_req *req = NULL; ep0->irqs++; @@ -1438,7 +1440,7 @@ static void ep0_irq(struct omap_udc *udc, u16 irq_src) if (req) done(ep0, req, 0); } - req = 0; + req = NULL; } else if (stat & UDC_STALL) { UDC_CTRL_REG = UDC_CLR_HALT; UDC_EP_NUM_REG = UDC_EP_DIR; @@ -1511,9 +1513,6 @@ static void ep0_irq(struct omap_udc *udc, u16 irq_src) u.word[3] = UDC_DATA_REG; UDC_EP_NUM_REG = 0; } while (UDC_IRQ_SRC_REG & UDC_SETUP); - le16_to_cpus (&u.r.wValue); - le16_to_cpus (&u.r.wIndex); - le16_to_cpus (&u.r.wLength); /* Delegate almost all control requests to the gadget driver, * except for a handful of ch9 status/feature requests that @@ -1566,6 +1565,11 @@ static void ep0_irq(struct omap_udc *udc, u16 irq_src) UDC_CTRL_REG = UDC_SET_FIFO_EN; ep->ackwait = 1 + ep->double_buf; } + /* NOTE: assumes the host behaves sanely, + * only clearing real halts. Else we may + * need to kill pending transfers and then + * restart the queue... very messy for DMA! + */ } VDBG("%s halt cleared by host\n", ep->name); goto ep0out_status_stage; @@ -2013,7 +2017,7 @@ int usb_gadget_register_driver (struct usb_gadget_driver *driver) udc->softconnect = 1; /* hook up the driver */ - driver->driver.bus = 0; + driver->driver.bus = NULL; udc->driver = driver; udc->gadget.dev.driver = &driver->driver; spin_unlock_irqrestore(&udc->lock, flags); @@ -2021,8 +2025,8 @@ int usb_gadget_register_driver (struct usb_gadget_driver *driver) status = driver->bind (&udc->gadget); if (status) { DBG("bind to %s --> %d\n", driver->driver.name, status); - udc->gadget.dev.driver = 0; - udc->driver = 0; + udc->gadget.dev.driver = NULL; + udc->driver = NULL; goto done; } DBG("bound to driver %s\n", driver->driver.name); @@ -2035,8 +2039,8 @@ int usb_gadget_register_driver (struct usb_gadget_driver *driver) if (status < 0) { ERR("can't bind to transceiver\n"); driver->unbind (&udc->gadget); - udc->gadget.dev.driver = 0; - udc->driver = 0; + udc->gadget.dev.driver = NULL; + udc->driver = NULL; goto done; } } else { @@ -2071,7 +2075,7 @@ int usb_gadget_unregister_driver (struct usb_gadget_driver *driver) omap_vbus_session(&udc->gadget, 0); if (udc->transceiver) - (void) otg_set_peripheral(udc->transceiver, 0); + (void) otg_set_peripheral(udc->transceiver, NULL); else pullup_disable(udc); @@ -2080,9 +2084,8 @@ int usb_gadget_unregister_driver (struct usb_gadget_driver *driver) spin_unlock_irqrestore(&udc->lock, flags); driver->unbind(&udc->gadget); - udc->gadget.dev.driver = 0; - udc->driver = 0; - + udc->gadget.dev.driver = NULL; + udc->driver = NULL; DBG("unregistered driver '%s'\n", driver->driver.name); return status; @@ -2178,7 +2181,7 @@ static int proc_otg_show(struct seq_file *s) tmp = OTG_REV_REG; trans = USB_TRANSCEIVER_CTRL_REG; - seq_printf(s, "OTG rev %d.%d, transceiver_ctrl %03x\n", + seq_printf(s, "\nOTG rev %d.%d, transceiver_ctrl %03x\n", tmp >> 4, tmp & 0xf, trans); tmp = OTG_SYSCON_1_REG; seq_printf(s, "otg_syscon1 %08x usb2 %s, usb1 %s, usb0 %s," @@ -2235,6 +2238,7 @@ static int proc_otg_show(struct seq_file *s) seq_printf(s, "otg_outctrl %04x" "\n", tmp); tmp = OTG_TEST_REG; seq_printf(s, "otg_test %04x" "\n", tmp); + return 0; } static int proc_udc_show(struct seq_file *s, void *_) @@ -2378,7 +2382,7 @@ static int proc_udc_show(struct seq_file *s, void *_) static int proc_udc_open(struct inode *inode, struct file *file) { - return single_open(file, proc_udc_show, 0); + return single_open(file, proc_udc_show, NULL); } static struct file_operations proc_ops = { @@ -2399,7 +2403,7 @@ static void create_proc_file(void) static void remove_proc_file(void) { - remove_proc_entry(proc_filename, 0); + remove_proc_entry(proc_filename, NULL); } #else @@ -2505,7 +2509,7 @@ static void omap_udc_release(struct device *dev) { complete(udc->done); kfree (udc); - udc = 0; + udc = NULL; } static int __init @@ -2577,23 +2581,33 @@ omap_udc_setup(struct platform_device *odev, struct otg_transceiver *xceiv) case 1: OMAP_BULK_EP("ep1in", USB_DIR_IN | 1); OMAP_BULK_EP("ep2out", USB_DIR_OUT | 2); + OMAP_INT_EP("ep9in", USB_DIR_IN | 9, 16); + OMAP_BULK_EP("ep3in", USB_DIR_IN | 3); OMAP_BULK_EP("ep4out", USB_DIR_OUT | 4); + OMAP_INT_EP("ep10in", USB_DIR_IN | 10, 16); OMAP_BULK_EP("ep5in", USB_DIR_IN | 5); OMAP_BULK_EP("ep5out", USB_DIR_OUT | 5); + OMAP_INT_EP("ep11in", USB_DIR_IN | 11, 16); + OMAP_BULK_EP("ep6in", USB_DIR_IN | 6); OMAP_BULK_EP("ep6out", USB_DIR_OUT | 6); + OMAP_INT_EP("ep12in", USB_DIR_IN | 12, 16); OMAP_BULK_EP("ep7in", USB_DIR_IN | 7); OMAP_BULK_EP("ep7out", USB_DIR_OUT | 7); + OMAP_INT_EP("ep13in", USB_DIR_IN | 13, 16); + OMAP_INT_EP("ep13out", USB_DIR_OUT | 13, 16); + OMAP_BULK_EP("ep8in", USB_DIR_IN | 8); OMAP_BULK_EP("ep8out", USB_DIR_OUT | 8); + OMAP_INT_EP("ep14in", USB_DIR_IN | 14, 16); + OMAP_INT_EP("ep14out", USB_DIR_OUT | 14, 16); + + OMAP_BULK_EP("ep15in", USB_DIR_IN | 15); + OMAP_BULK_EP("ep15out", USB_DIR_OUT | 15); - OMAP_INT_EP("ep9in", USB_DIR_IN | 9, 16); - OMAP_INT_EP("ep10out", USB_DIR_IN | 10, 16); - OMAP_INT_EP("ep11in", USB_DIR_IN | 9, 16); - OMAP_INT_EP("ep12out", USB_DIR_IN | 10, 16); break; #ifdef USE_ISO @@ -2640,8 +2654,8 @@ static int __init omap_udc_probe(struct device *dev) struct platform_device *odev = to_platform_device(dev); int status = -ENODEV; int hmc; - struct otg_transceiver *xceiv = 0; - const char *type = 0; + struct otg_transceiver *xceiv = NULL; + const char *type = NULL; struct omap_usb_config *config = dev->platform_data; /* NOTE: "knows" the order of the resources! */ @@ -2678,6 +2692,15 @@ static int __init omap_udc_probe(struct device *dev) } else { hmc = HMC_1610; switch (hmc) { + case 0: /* POWERUP DEFAULT == 0 */ + case 4: + case 12: + case 20: + if (!cpu_is_omap1710()) { + type = "integrated"; + break; + } + /* FALL THROUGH */ case 3: case 11: case 16: @@ -2688,21 +2711,15 @@ static int __init omap_udc_probe(struct device *dev) DBG("external transceiver not registered!\n"); if (config->otg) goto cleanup0; - type = "(unknown external)"; + type = "unknown"; } else type = xceiv->label; break; - case 0: /* POWERUP DEFAULT == 0 */ - case 4: - case 12: - case 20: - type = "INTEGRATED"; - break; case 21: /* internal loopback */ - type = "(loopback)"; + type = "loopback"; break; case 14: /* transceiverless */ - type = "(none)"; + type = "no"; break; default: @@ -2710,14 +2727,14 @@ static int __init omap_udc_probe(struct device *dev) return -ENODEV; } } - INFO("hmc mode %d, transceiver %s\n", hmc, type); + INFO("hmc mode %d, %s transceiver\n", hmc, type); /* a "gadget" abstracts/virtualizes the controller */ status = omap_udc_setup(odev, xceiv); if (status) { goto cleanup0; } - xceiv = 0; + xceiv = NULL; // "udc" is now valid pullup_disable(udc); #if defined(CONFIG_USB_OHCI_HCD) || defined(CONFIG_USB_OHCI_HCD_MODULE) @@ -2765,7 +2782,7 @@ cleanup2: cleanup1: kfree (udc); - udc = 0; + udc = NULL; cleanup0: if (xceiv) @@ -2788,7 +2805,7 @@ static int __exit omap_udc_remove(struct device *dev) pullup_disable(udc); if (udc->transceiver) { put_device(udc->transceiver->dev); - udc->transceiver = 0; + udc->transceiver = NULL; } UDC_SYSCON1_REG = 0; @@ -2809,13 +2826,32 @@ static int __exit omap_udc_remove(struct device *dev) return 0; } -static int omap_udc_suspend(struct device *dev, pm_message_t state, u32 level) +/* suspend/resume/wakeup from sysfs (echo > power/state) or when the + * system is forced into deep sleep + * + * REVISIT we should probably reject suspend requests when there's a host + * session active, rather than disconnecting, at least on boards that can + * report VBUS irqs (UDC_DEVSTAT_REG.UDC_ATT). And in any case, we need to + * make host resumes and VBUS detection trigger OMAP wakeup events; that + * may involve talking to an external transceiver (e.g. isp1301). + */ +static int omap_udc_suspend(struct device *dev, pm_message_t message, u32 level) { - if (level != 0) + u32 devstat; + + if (level != SUSPEND_POWER_DOWN) return 0; + devstat = UDC_DEVSTAT_REG; + + /* we're requesting 48 MHz clock if the pullup is enabled + * (== we're attached to the host) and we're not suspended, + * which would prevent entry to deep sleep... + */ + if ((devstat & UDC_ATT) != 0 && (devstat & UDC_SUS) == 0) { + WARN("session active; suspend requires disconnect\n"); + omap_pullup(&udc->gadget, 0); + } - DBG("suspend, state %d\n", state); - omap_pullup(&udc->gadget, 0); udc->gadget.dev.power.power_state = PMSG_SUSPEND; udc->gadget.dev.parent->power.power_state = PMSG_SUSPEND; return 0; @@ -2823,7 +2859,7 @@ static int omap_udc_suspend(struct device *dev, pm_message_t state, u32 level) static int omap_udc_resume(struct device *dev, u32 level) { - if (level != 0) + if (level != RESUME_POWER_ON) return 0; DBG("resume + wakeup/SRP\n"); -- cgit v1.2.3 From 9c4142a133f7efee08238722b157656c3da7ca97 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Mon, 27 Jun 2005 14:36:16 -0700 Subject: [PATCH] ppc32: Fix compiling of sandpoint platform Lost a curly brace in translation. Everything is better now. Signed-off-by: Matt McClintock Signed-off-by: Kumar Gala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc/platforms/sandpoint.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/ppc/platforms/sandpoint.c b/arch/ppc/platforms/sandpoint.c index 70e58f43f2b8..8b149c2fc54f 100644 --- a/arch/ppc/platforms/sandpoint.c +++ b/arch/ppc/platforms/sandpoint.c @@ -324,6 +324,7 @@ sandpoint_setup_arch(void) pdata[1].irq = 0; pdata[1].mapbase = 0; } + } printk(KERN_INFO "Motorola SPS Sandpoint Test Platform\n"); printk(KERN_INFO "Port by MontaVista Software, Inc. (source@mvista.com)\n"); -- cgit v1.2.3 From d4b3a80e399c989028acd5185c792fab82eda035 Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Mon, 27 Jun 2005 14:36:30 -0700 Subject: [PATCH] mips: fixed try_to_freeze build error arch/mips/kernel/signal.c: In function 'do_signal': arch/mips/kernel/signal.c:460: error: too many arguments to function 'try_to_freeze' Signed-off-by: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 508026ae5842..65ee15396ffd 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -457,7 +457,7 @@ static int do_signal(sigset_t *oldset, struct pt_regs *regs) if (!user_mode(regs)) return 1; - if (try_to_freeze(0)) + if (try_to_freeze()) goto no_signal; if (!oldset) -- cgit v1.2.3 From bb1657468152c5e5232c7bf35cf0e9c41b5d9910 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 27 Jun 2005 13:09:00 -0300 Subject: [PATCH] 8xx: avoid "dcbst" misbehaviour with unpopulated TLB The proposed _tlbie call at update_mmu_cache() is safe because: Addresses for which update_mmu_cache() gets invocated are never inside the static kernel virtual mapping, meaning that there is no risk for the _tlbie() here to be thrashing the pinned entry, as Dan suspected. The intermediate TLB state in which this bug can be triggered is not visible by userspace or any other contexts, except the page fault handling path. So there is no need to worry about userspace dcbxxx users. The other solution to this is to avoid dcbst misbehaviour in the first place, which involves changing in-kernel "dcbst" callers to use 8xx specific SPR's. Summary: On 8xx, cache control instructions (particularly "dcbst" from flush_dcache_icache) fault as write operation if there is an unpopulated TLB entry for the address in question. To workaround that, we invalidate the TLB here, thus avoiding dcbst misbehaviour. Signed-off-by: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc/mm/init.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c index 334ef4150d92..6164a2b34733 100644 --- a/arch/ppc/mm/init.c +++ b/arch/ppc/mm/init.c @@ -606,9 +606,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, struct page *page = pfn_to_page(pfn); if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) { - if (vma->vm_mm == current->active_mm) + if (vma->vm_mm == current->active_mm) { +#ifdef CONFIG_8xx + /* On 8xx, cache control instructions (particularly + * "dcbst" from flush_dcache_icache) fault as write + * operation if there is an unpopulated TLB entry + * for the address in question. To workaround that, + * we invalidate the TLB here, thus avoiding dcbst + * misbehaviour. + */ + _tlbie(address); +#endif __flush_dcache_icache((void *) address); - else + } else flush_dcache_icache_page(page); set_bit(PG_arch_1, &page->flags); } -- cgit v1.2.3 From e4ee69c8c1e7ff9790fbce29c7be50db57323a6f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 27 Jun 2005 14:36:32 -0700 Subject: [PATCH] ppc32: Bump PMU interrupt priority The Power Management Unit on PowerMacs is very sensitive to timeouts during async message exchanges. It uses rather crude protocol based on a shift register with an interrupt and is almost continuously exchanging messages with the host CPU on laptops. This patch adds a routine to the open_pic driver to be able to select a PMU driver so that it bumps it's interrupt priority to above the normal level. This will allow PMU interrupts to occur while another interrupt is pending, and thus reduce the risk of machine beeing abruptly shutdown by the PMU due to a timeout in PMU communication caused by excessive interrupt latency. The problem is very rare, and usually just doesn't happen, but it is still useful to make things even more robust. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc/syslib/open_pic.c | 26 +++++++++++++++++++------- drivers/macintosh/via-pmu.c | 10 ++++++++++ include/asm-ppc/open_pic.h | 6 ++++++ 3 files changed, 35 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/ppc/syslib/open_pic.c b/arch/ppc/syslib/open_pic.c index b45d8268bf93..ad39b86ca92c 100644 --- a/arch/ppc/syslib/open_pic.c +++ b/arch/ppc/syslib/open_pic.c @@ -370,8 +370,9 @@ void __init openpic_init(int offset) /* Initialize IPI interrupts */ if ( ppc_md.progress ) ppc_md.progress("openpic: ipi",0x3bb); for (i = 0; i < OPENPIC_NUM_IPI; i++) { - /* Disabled, Priority 10..13 */ - openpic_initipi(i, 10+i, OPENPIC_VEC_IPI+i+offset); + /* Disabled, increased priorities 10..13 */ + openpic_initipi(i, OPENPIC_PRIORITY_IPI_BASE+i, + OPENPIC_VEC_IPI+i+offset); /* IPIs are per-CPU */ irq_desc[OPENPIC_VEC_IPI+i+offset].status |= IRQ_PER_CPU; irq_desc[OPENPIC_VEC_IPI+i+offset].handler = &open_pic_ipi; @@ -399,8 +400,9 @@ void __init openpic_init(int offset) if (sense & IRQ_SENSE_MASK) irq_desc[i+offset].status = IRQ_LEVEL; - /* Enabled, Priority 8 */ - openpic_initirq(i, 8, i+offset, (sense & IRQ_POLARITY_MASK), + /* Enabled, Default priority */ + openpic_initirq(i, OPENPIC_PRIORITY_DEFAULT, i+offset, + (sense & IRQ_POLARITY_MASK), (sense & IRQ_SENSE_MASK)); /* Processor 0 */ openpic_mapirq(i, CPU_MASK_CPU0, CPU_MASK_NONE); @@ -655,6 +657,18 @@ static void __init openpic_maptimer(u_int timer, cpumask_t cpumask) cpus_addr(phys)[0]); } +/* + * Change the priority of an interrupt + */ +void __init +openpic_set_irq_priority(u_int irq, u_int pri) +{ + check_arg_irq(irq); + openpic_safe_writefield(&ISR[irq - open_pic_irq_offset]->Vector_Priority, + OPENPIC_PRIORITY_MASK, + pri << OPENPIC_PRIORITY_SHIFT); +} + /* * Initalize the interrupt source which will generate an NMI. * This raises the interrupt's priority from 8 to 9. @@ -665,9 +679,7 @@ void __init openpic_init_nmi_irq(u_int irq) { check_arg_irq(irq); - openpic_safe_writefield(&ISR[irq - open_pic_irq_offset]->Vector_Priority, - OPENPIC_PRIORITY_MASK, - 9 << OPENPIC_PRIORITY_SHIFT); + openpic_set_irq_priority(irq, OPENPIC_PRIORITY_NMI); } /* diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index b941ee220997..5375df03c6f3 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -63,6 +63,10 @@ #include #endif +#ifdef CONFIG_PPC32 +#include +#endif + /* Some compile options */ #undef SUSPEND_USES_PMU #define DEBUG_SLEEP @@ -407,6 +411,12 @@ static int __init via_pmu_start(void) batt_req.complete = 1; #endif +#ifdef CONFIG_PPC32 + if (pmu_kind == PMU_KEYLARGO_BASED) + openpic_set_irq_priority(vias->intrs[0].line, + OPENPIC_PRIORITY_DEFAULT + 1); +#endif + if (request_irq(vias->intrs[0].line, via_pmu_interrupt, 0, "VIA-PMU", (void *)0)) { printk(KERN_ERR "VIA-PMU: can't get irq %d\n", diff --git a/include/asm-ppc/open_pic.h b/include/asm-ppc/open_pic.h index dbe853319741..7848aa610c05 100644 --- a/include/asm-ppc/open_pic.h +++ b/include/asm-ppc/open_pic.h @@ -25,6 +25,11 @@ #define OPENPIC_VEC_IPI 118 /* and up */ #define OPENPIC_VEC_SPURIOUS 255 +/* Priorities */ +#define OPENPIC_PRIORITY_IPI_BASE 10 +#define OPENPIC_PRIORITY_DEFAULT 4 +#define OPENPIC_PRIORITY_NMI 9 + /* OpenPIC IRQ controller structure */ extern struct hw_interrupt_type open_pic; @@ -42,6 +47,7 @@ extern int epic_serial_mode; extern void openpic_set_sources(int first_irq, int num_irqs, void __iomem *isr); extern void openpic_init(int linux_irq_offset); extern void openpic_init_nmi_irq(u_int irq); +extern void openpic_set_irq_priority(u_int irq, u_int pri); extern void openpic_hookup_cascade(u_int irq, char *name, int (*cascade_fn)(struct pt_regs *)); extern u_int openpic_irq(void); -- cgit v1.2.3 From 8c8709334cec803368a432a33e0f2e116d48fe07 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 27 Jun 2005 14:36:34 -0700 Subject: [PATCH] ppc32: Remove CONFIG_PMAC_PBOOK This patch removes CONFIG_PMAC_PBOOK (PowerBook support). This is now split into CONFIG_PMAC_MEDIABAY for the actual hotswap bay that some powerbooks have, CONFIG_PM for power management related code, and just left out of any CONFIG_* option for some generally useful stuff that can be used on non-laptops as well. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc/platforms/pmac_sleep.S | 4 +- arch/ppc/platforms/pmac_time.c | 8 +- drivers/block/swim3.c | 10 +- drivers/char/misc.c | 3 - drivers/ide/ppc/pmac.c | 8 +- drivers/ieee1394/ohci1394.c | 10 +- drivers/macintosh/Kconfig | 35 ++----- drivers/macintosh/Makefile | 2 +- drivers/macintosh/adb.c | 10 +- drivers/macintosh/via-pmu.c | 70 ++++++-------- drivers/usb/host/ohci-pci.c | 13 +-- drivers/video/aty/aty128fb.c | 14 --- drivers/video/chipsfb.c | 176 +++++++++++++++++++----------------- include/linux/pmu.h | 6 +- sound/oss/dmasound/dmasound_awacs.c | 14 +-- sound/ppc/awacs.c | 8 +- sound/ppc/daca.c | 6 +- sound/ppc/pmac.c | 11 ++- sound/ppc/pmac.h | 2 +- sound/ppc/tumbler.c | 4 +- 20 files changed, 193 insertions(+), 221 deletions(-) (limited to 'arch') diff --git a/arch/ppc/platforms/pmac_sleep.S b/arch/ppc/platforms/pmac_sleep.S index f459ade1bd63..016a74649155 100644 --- a/arch/ppc/platforms/pmac_sleep.S +++ b/arch/ppc/platforms/pmac_sleep.S @@ -46,7 +46,7 @@ .section .text .align 5 -#if defined(CONFIG_PMAC_PBOOK) || defined(CONFIG_CPU_FREQ_PMAC) +#if defined(CONFIG_PM) || defined(CONFIG_CPU_FREQ_PMAC) /* This gets called by via-pmu.c late during the sleep process. * The PMU was already send the sleep command and will shut us down @@ -382,7 +382,7 @@ turn_on_mmu: isync rfi -#endif /* defined(CONFIG_PMAC_PBOOK) || defined(CONFIG_CPU_FREQ) */ +#endif /* defined(CONFIG_PM) || defined(CONFIG_CPU_FREQ) */ .section .data .balign L1_CACHE_LINE_SIZE diff --git a/arch/ppc/platforms/pmac_time.c b/arch/ppc/platforms/pmac_time.c index de60ccc7db9f..778ce4fec368 100644 --- a/arch/ppc/platforms/pmac_time.c +++ b/arch/ppc/platforms/pmac_time.c @@ -206,7 +206,7 @@ via_calibrate_decr(void) return 1; } -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM /* * Reset the time after a sleep. */ @@ -238,7 +238,7 @@ time_sleep_notify(struct pmu_sleep_notifier *self, int when) static struct pmu_sleep_notifier time_sleep_notifier __pmacdata = { time_sleep_notify, SLEEP_LEVEL_MISC, }; -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ /* * Query the OF and get the decr frequency. @@ -251,9 +251,9 @@ pmac_calibrate_decr(void) struct device_node *cpu; unsigned int freq, *fp; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM pmu_register_sleep_notifier(&time_sleep_notifier); -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ /* We assume MacRISC2 machines have correct device-tree * calibration. That's better since the VIA itself seems diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 5b09cf154ac7..e5f7494c00ee 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -253,7 +253,7 @@ static int floppy_revalidate(struct gendisk *disk); static int swim3_add_device(struct device_node *swims); int swim3_init(void); -#ifndef CONFIG_PMAC_PBOOK +#ifndef CONFIG_PMAC_MEDIABAY #define check_media_bay(which, what) 1 #endif @@ -297,9 +297,11 @@ static void do_fd_request(request_queue_t * q) int i; for(i=0;imedia_bay && check_media_bay(fs->media_bay, MB_FD)) return -ENXIO; +#endif switch (cmd) { case FDEJECT: @@ -881,8 +885,10 @@ static int floppy_open(struct inode *inode, struct file *filp) int n, err = 0; if (fs->ref_count == 0) { +#ifdef CONFIG_PMAC_MEDIABAY if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) return -ENXIO; +#endif out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2); out_8(&sw->control_bic, 0xff); out_8(&sw->mode, 0x95); @@ -967,8 +973,10 @@ static int floppy_revalidate(struct gendisk *disk) struct swim3 __iomem *sw; int ret, n; +#ifdef CONFIG_PMAC_MEDIABAY if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD)) return -ENXIO; +#endif sw = fs->swim3; grab_drive(fs, revalidating, 0); diff --git a/drivers/char/misc.c b/drivers/char/misc.c index 31cf84d69026..931efd58f87a 100644 --- a/drivers/char/misc.c +++ b/drivers/char/misc.c @@ -308,9 +308,6 @@ static int __init misc_init(void) #endif #ifdef CONFIG_BVME6000 rtc_DP8570A_init(); -#endif -#ifdef CONFIG_PMAC_PBOOK - pmu_device_init(); #endif if (register_chrdev(MISC_MAJOR,"misc",&misc_fops)) { printk("unable to get major %d for misc devices\n", diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c index 569f16767442..818380b5fd27 100644 --- a/drivers/ide/ppc/pmac.c +++ b/drivers/ide/ppc/pmac.c @@ -1324,9 +1324,9 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif) /* XXX FIXME: Media bay stuff need re-organizing */ if (np->parent && np->parent->name && strcasecmp(np->parent->name, "media-bay") == 0) { -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PMAC_MEDIABAY media_bay_set_ide_infos(np->parent, pmif->regbase, pmif->irq, hwif->index); -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PMAC_MEDIABAY */ pmif->mediabay = 1; if (!bidp) pmif->aapl_bus_id = 1; @@ -1382,10 +1382,10 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif) hwif->index, model_name[pmif->kind], pmif->aapl_bus_id, pmif->mediabay ? " (mediabay)" : "", hwif->irq); -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PMAC_MEDIABAY if (pmif->mediabay && check_media_bay_by_base(pmif->regbase, MB_CD) == 0) hwif->noprobe = 0; -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PMAC_MEDIABAY */ hwif->sg_max_nents = MAX_DCMDS; diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c index 36e25ac823dc..b3d3d22fde64 100644 --- a/drivers/ieee1394/ohci1394.c +++ b/drivers/ieee1394/ohci1394.c @@ -3538,8 +3538,8 @@ static void ohci1394_pci_remove(struct pci_dev *pdev) static int ohci1394_pci_resume (struct pci_dev *pdev) { -#ifdef CONFIG_PMAC_PBOOK - { +#ifdef CONFIG_PPC_PMAC + if (_machine == _MACH_Pmac) { struct device_node *of_node; /* Re-enable 1394 */ @@ -3547,7 +3547,7 @@ static int ohci1394_pci_resume (struct pci_dev *pdev) if (of_node) pmac_call_feature (PMAC_FTR_1394_ENABLE, of_node, 0, 1); } -#endif +#endif /* CONFIG_PPC_PMAC */ pci_enable_device(pdev); @@ -3557,8 +3557,8 @@ static int ohci1394_pci_resume (struct pci_dev *pdev) static int ohci1394_pci_suspend (struct pci_dev *pdev, pm_message_t state) { -#ifdef CONFIG_PMAC_PBOOK - { +#ifdef CONFIG_PPC_PMAC + if (_machine == _MACH_Pmac) { struct device_node *of_node; /* Disable 1394 */ diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index b0ace5bc950c..91691a6c004e 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -86,33 +86,18 @@ config PMAC_SMU on the "SMU" system control chip which replaces the old PMU. If you don't know, say Y. -config PMAC_PBOOK - bool "Power management support for PowerBooks" - depends on ADB_PMU - ---help--- - This provides support for putting a PowerBook to sleep; it also - enables media bay support. Power management works on the - PB2400/3400/3500, Wallstreet, Lombard, and Bronze PowerBook G3 and - the Titanium Powerbook G4, as well as the iBooks. You should get - the power management daemon, pmud, to make it work and you must have - the /dev/pmu device (see the pmud README). - - Get pmud from . - - If you have a PowerBook, you should say Y here. - - You may also want to compile the dma sound driver as a module and - have it autoloaded. The act of removing the module shuts down the - sound hardware for more power savings. - -config PM - bool - depends on PPC_PMAC && ADB_PMU && PMAC_PBOOK - default y - config PMAC_APM_EMU tristate "APM emulation" - depends on PMAC_PBOOK + depends on PPC_PMAC && PPC32 && PM + +config PMAC_MEDIABAY + bool "Support PowerBook hotswap media bay" + depends on PPC_PMAC && PPC32 + help + This option adds support for older PowerBook's hotswap media bay + that can contains batteries, floppy drives, or IDE devices. PCI + devices are not fully supported in the bay as I never had one to + try with # made a separate option since backlight may end up beeing used # on non-powerbook machines (but only on PMU based ones AFAIK) diff --git a/drivers/macintosh/Makefile b/drivers/macintosh/Makefile index b3f88a4fcef7..f5ae171dbfef 100644 --- a/drivers/macintosh/Makefile +++ b/drivers/macintosh/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_PPC_PMAC) += macio_asic.o -obj-$(CONFIG_PMAC_PBOOK) += mediabay.o +obj-$(CONFIG_PMAC_MEDIABAY) += mediabay.o obj-$(CONFIG_MAC_EMUMOUSEBTN) += mac_hid.o obj-$(CONFIG_INPUT_ADBHID) += adbhid.o obj-$(CONFIG_ANSLCD) += ans-lcd.o diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 493e2afa191c..c0dc1e3fa58b 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -90,7 +90,7 @@ static int sleepy_trackpad; static int autopoll_devs; int __adb_probe_sync; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM static int adb_notify_sleep(struct pmu_sleep_notifier *self, int when); static struct pmu_sleep_notifier adb_sleep_notifier = { adb_notify_sleep, @@ -320,9 +320,9 @@ int __init adb_init(void) printk(KERN_WARNING "Warning: no ADB interface detected\n"); adb_controller = NULL; } else { -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM pmu_register_sleep_notifier(&adb_sleep_notifier); -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ #ifdef CONFIG_PPC if (machine_is_compatible("AAPL,PowerBook1998") || machine_is_compatible("PowerBook1,1")) @@ -337,7 +337,7 @@ int __init adb_init(void) __initcall(adb_init); -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM /* * notify clients before sleep and reset bus afterwards */ @@ -378,7 +378,7 @@ adb_notify_sleep(struct pmu_sleep_notifier *self, int when) } return PBOOK_SLEEP_OK; } -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ static int do_adb_reset_bus(void) diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 5375df03c6f3..4a0a0ad2d03c 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -155,10 +155,10 @@ static spinlock_t pmu_lock; static u8 pmu_intr_mask; static int pmu_version; static int drop_interrupts; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM static int option_lid_wakeup = 1; static int sleep_in_progress; -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ static unsigned long async_req_locks; static unsigned int pmu_irq_stats[11]; @@ -168,7 +168,6 @@ static struct proc_dir_entry *proc_pmu_irqstats; static struct proc_dir_entry *proc_pmu_options; static int option_server_mode; -#ifdef CONFIG_PMAC_PBOOK int pmu_battery_count; int pmu_cur_battery; unsigned int pmu_power_flags; @@ -176,7 +175,6 @@ struct pmu_battery_info pmu_batteries[PMU_MAX_BATTERIES]; static int query_batt_timer = BATTERY_POLLING_COUNT; static struct adb_request batt_req; static struct proc_dir_entry *proc_pmu_batt[PMU_MAX_BATTERIES]; -#endif /* CONFIG_PMAC_PBOOK */ #if defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT) extern int disable_kernel_backlight; @@ -210,11 +208,9 @@ static int proc_get_irqstats(char *page, char **start, off_t off, static int pmu_set_backlight_level(int level, void* data); static int pmu_set_backlight_enable(int on, int level, void* data); #endif /* CONFIG_PMAC_BACKLIGHT */ -#ifdef CONFIG_PMAC_PBOOK static void pmu_pass_intr(unsigned char *data, int len); static int proc_get_batt(char *page, char **start, off_t off, int count, int *eof, void *data); -#endif /* CONFIG_PMAC_PBOOK */ static int proc_read_options(char *page, char **start, off_t off, int count, int *eof, void *data); static int proc_write_options(struct file *file, const char __user *buffer, @@ -407,9 +403,7 @@ static int __init via_pmu_start(void) bright_req_1.complete = 1; bright_req_2.complete = 1; -#ifdef CONFIG_PMAC_PBOOK batt_req.complete = 1; -#endif #ifdef CONFIG_PPC32 if (pmu_kind == PMU_KEYLARGO_BASED) @@ -468,7 +462,7 @@ static int __init via_pmu_dev_init(void) register_backlight_controller(&pmu_backlight_controller, NULL, "pmu"); #endif /* CONFIG_PMAC_BACKLIGHT */ -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PPC32 if (machine_is_compatible("AAPL,3400/2400") || machine_is_compatible("AAPL,3500")) { int mb = pmac_call_feature(PMAC_FTR_GET_MB_INFO, @@ -496,20 +490,19 @@ static int __init via_pmu_dev_init(void) pmu_batteries[1].flags |= PMU_BATT_TYPE_SMART; } } -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PPC32 */ + /* Create /proc/pmu */ proc_pmu_root = proc_mkdir("pmu", NULL); if (proc_pmu_root) { -#ifdef CONFIG_PMAC_PBOOK - int i; + long i; for (i=0; i= 0) p += sprintf(p, "lid_wakeup=%d\n", option_lid_wakeup); -#endif /* CONFIG_PMAC_PBOOK */ +#endif if (pmu_kind == PMU_KEYLARGO_BASED) p += sprintf(p, "server_mode=%d\n", option_server_mode); @@ -932,12 +917,12 @@ proc_write_options(struct file *file, const char __user *buffer, *(val++) = 0; while(*val == ' ') val++; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM if (pmu_kind == PMU_KEYLARGO_BASED && pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) >= 0) if (!strcmp(label, "lid_wakeup")) option_lid_wakeup = ((*val) == '1'); -#endif /* CONFIG_PMAC_PBOOK */ +#endif if (pmu_kind == PMU_KEYLARGO_BASED && !strcmp(label, "server_mode")) { int new_value; new_value = ((*val) == '1'); @@ -1432,7 +1417,6 @@ next: } /* Tick interrupt */ else if ((1 << pirq) & PMU_INT_TICK) { -#ifdef CONFIG_PMAC_PBOOK /* Environement or tick interrupt, query batteries */ if (pmu_battery_count) { if ((--query_batt_timer) == 0) { @@ -1447,7 +1431,6 @@ next: pmu_pass_intr(data, len); } else { pmu_pass_intr(data, len); -#endif /* CONFIG_PMAC_PBOOK */ } goto next; } @@ -2062,7 +2045,7 @@ pmu_i2c_simple_write(int bus, int addr, u8* data, int len) return -1; } -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM static LIST_HEAD(sleep_notifiers); @@ -2715,6 +2698,8 @@ powerbook_sleep_3400(void) return 0; } +#endif /* CONFIG_PM */ + /* * Support for /dev/pmu device */ @@ -2894,11 +2879,11 @@ static int __pmac pmu_ioctl(struct inode * inode, struct file *filp, u_int cmd, u_long arg) { - struct pmu_private *pp = filp->private_data; __u32 __user *argp = (__u32 __user *)arg; - int error; + int error = -EINVAL; switch (cmd) { +#ifdef CONFIG_PM case PMU_IOC_SLEEP: if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -2920,12 +2905,13 @@ pmu_ioctl(struct inode * inode, struct file *filp, error = -ENOSYS; } sleep_in_progress = 0; - return error; + break; case PMU_IOC_CAN_SLEEP: if (pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) < 0) return put_user(0, argp); else return put_user(1, argp); +#endif /* CONFIG_PM */ #ifdef CONFIG_PMAC_BACKLIGHT /* Backlight should have its own device or go via @@ -2946,11 +2932,13 @@ pmu_ioctl(struct inode * inode, struct file *filp, error = get_user(value, argp); if (!error) error = set_backlight_level(value); - return error; + break; } #ifdef CONFIG_INPUT_ADBHID case PMU_IOC_GRAB_BACKLIGHT: { + struct pmu_private *pp = filp->private_data; unsigned long flags; + if (pp->backlight_locker) return 0; pp->backlight_locker = 1; @@ -2966,7 +2954,7 @@ pmu_ioctl(struct inode * inode, struct file *filp, case PMU_IOC_HAS_ADB: return put_user(pmu_has_adb, argp); } - return -EINVAL; + return error; } static struct file_operations pmu_device_fops __pmacdata = { @@ -2982,14 +2970,16 @@ static struct miscdevice pmu_device __pmacdata = { PMU_MINOR, "pmu", &pmu_device_fops }; -void pmu_device_init(void) +static int pmu_device_init(void) { if (!via) - return; + return 0; if (misc_register(&pmu_device) < 0) printk(KERN_ERR "via-pmu: cannot register misc device.\n"); + return 0; } -#endif /* CONFIG_PMAC_PBOOK */ +device_initcall(pmu_device_init); + #ifdef DEBUG_SLEEP static inline void __pmac @@ -3157,12 +3147,12 @@ EXPORT_SYMBOL(pmu_i2c_combined_read); EXPORT_SYMBOL(pmu_i2c_stdsub_write); EXPORT_SYMBOL(pmu_i2c_simple_read); EXPORT_SYMBOL(pmu_i2c_simple_write); -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM EXPORT_SYMBOL(pmu_register_sleep_notifier); EXPORT_SYMBOL(pmu_unregister_sleep_notifier); EXPORT_SYMBOL(pmu_enable_irled); EXPORT_SYMBOL(pmu_battery_count); EXPORT_SYMBOL(pmu_batteries); EXPORT_SYMBOL(pmu_power_flags); -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c index 57fd07d00549..eede6be098d2 100644 --- a/drivers/usb/host/ohci-pci.c +++ b/drivers/usb/host/ohci-pci.c @@ -14,14 +14,11 @@ * This file is licenced under the GPL. */ -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PPC_PMAC #include #include #include #include -#ifndef CONFIG_PM -# define CONFIG_PM -#endif #endif #ifndef CONFIG_PCI @@ -132,7 +129,7 @@ static int ohci_pci_suspend (struct usb_hcd *hcd, pm_message_t message) /* let things settle down a bit */ msleep (100); -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PPC_PMAC if (_machine == _MACH_Pmac) { struct device_node *of_node; @@ -141,7 +138,7 @@ static int ohci_pci_suspend (struct usb_hcd *hcd, pm_message_t message) if (of_node) pmac_call_feature(PMAC_FTR_USB_ENABLE, of_node, 0, 0); } -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PPC_PMAC */ return 0; } @@ -151,7 +148,7 @@ static int ohci_pci_resume (struct usb_hcd *hcd) struct ohci_hcd *ohci = hcd_to_ohci (hcd); int retval = 0; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PPC_PMAC if (_machine == _MACH_Pmac) { struct device_node *of_node; @@ -160,7 +157,7 @@ static int ohci_pci_resume (struct usb_hcd *hcd) if (of_node) pmac_call_feature (PMAC_FTR_USB_ENABLE, of_node, 0, 1); } -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PPC_PMAC */ /* resume root hub */ if (time_before (jiffies, ohci->next_statechange)) diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c index 9789115980a5..7bc1d44d8814 100644 --- a/drivers/video/aty/aty128fb.c +++ b/drivers/video/aty/aty128fb.c @@ -350,10 +350,8 @@ static int default_vmode __initdata = VMODE_1024_768_60; static int default_cmode __initdata = CMODE_8; #endif -#ifdef CONFIG_PMAC_PBOOK static int default_crt_on __initdata = 0; static int default_lcd_on __initdata = 1; -#endif #ifdef CONFIG_MTRR static int mtrr = 1; @@ -1249,7 +1247,6 @@ static int aty128_crtc_to_var(const struct aty128_crtc *crtc, return 0; } -#ifdef CONFIG_PMAC_PBOOK static void aty128_set_crt_enable(struct aty128fb_par *par, int on) { if (on) { @@ -1284,7 +1281,6 @@ static void aty128_set_lcd_enable(struct aty128fb_par *par, int on) aty_st_le32(LVDS_GEN_CNTL, reg); } } -#endif /* CONFIG_PMAC_PBOOK */ static void aty128_set_pll(struct aty128_pll *pll, const struct aty128fb_par *par) { @@ -1491,12 +1487,10 @@ static int aty128fb_set_par(struct fb_info *info) info->fix.visual = par->crtc.bpp == 8 ? FB_VISUAL_PSEUDOCOLOR : FB_VISUAL_DIRECTCOLOR; -#ifdef CONFIG_PMAC_PBOOK if (par->chip_gen == rage_M3) { aty128_set_crt_enable(par, par->crt_on); aty128_set_lcd_enable(par, par->lcd_on); } -#endif if (par->accel_flags & FB_ACCELF_TEXT) aty128_init_engine(par); @@ -1652,7 +1646,6 @@ static int __init aty128fb_setup(char *options) return 0; while ((this_opt = strsep(&options, ",")) != NULL) { -#ifdef CONFIG_PMAC_PBOOK if (!strncmp(this_opt, "lcd:", 4)) { default_lcd_on = simple_strtoul(this_opt+4, NULL, 0); continue; @@ -1660,7 +1653,6 @@ static int __init aty128fb_setup(char *options) default_crt_on = simple_strtoul(this_opt+4, NULL, 0); continue; } -#endif #ifdef CONFIG_MTRR if(!strncmp(this_opt, "nomtrr", 6)) { mtrr = 0; @@ -1752,10 +1744,8 @@ static int __init aty128_init(struct pci_dev *pdev, const struct pci_device_id * info->fbops = &aty128fb_ops; info->flags = FBINFO_FLAG_DEFAULT; -#ifdef CONFIG_PMAC_PBOOK par->lcd_on = default_lcd_on; par->crt_on = default_crt_on; -#endif var = default_var; #ifdef CONFIG_PPC_PMAC @@ -2035,12 +2025,10 @@ static int aty128fb_blank(int blank, struct fb_info *fb) aty_st_8(CRTC_EXT_CNTL+1, state); -#ifdef CONFIG_PMAC_PBOOK if (par->chip_gen == rage_M3) { aty128_set_crt_enable(par, par->crt_on && !blank); aty128_set_lcd_enable(par, par->lcd_on && !blank); } -#endif #ifdef CONFIG_PMAC_BACKLIGHT if ((_machine == _MACH_Pmac) && !blank) set_backlight_enable(1); @@ -2124,7 +2112,6 @@ static int aty128fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, static int aty128fb_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long arg, struct fb_info *info) { -#ifdef CONFIG_PMAC_PBOOK struct aty128fb_par *par = info->par; u32 value; int rc; @@ -2149,7 +2136,6 @@ static int aty128fb_ioctl(struct inode *inode, struct file *file, u_int cmd, value = (par->crt_on << 1) | par->lcd_on; return put_user(value, (__u32 __user *)arg); } -#endif return -EINVAL; } diff --git a/drivers/video/chipsfb.c b/drivers/video/chipsfb.c index 95e72550d43f..e75a965ec760 100644 --- a/drivers/video/chipsfb.c +++ b/drivers/video/chipsfb.c @@ -28,22 +28,17 @@ #include #include #include +#include #include #ifdef CONFIG_PMAC_BACKLIGHT #include #endif -#ifdef CONFIG_PMAC_PBOOK -#include -#include -#endif /* * Since we access the display with inb/outb to fixed port numbers, * we can only handle one 6555x chip. -- paulus */ -static struct fb_info chipsfb_info; - #define write_ind(num, val, ap, dp) do { \ outb((num), (ap)); outb((val), (dp)); \ } while (0) @@ -74,14 +69,6 @@ static struct fb_info chipsfb_info; inb(0x3da); read_ind(num, var, 0x3c0, 0x3c1); \ } while (0) -#ifdef CONFIG_PMAC_PBOOK -static unsigned char *save_framebuffer; -int chips_sleep_notify(struct pmu_sleep_notifier *self, int when); -static struct pmu_sleep_notifier chips_sleep_notifier = { - chips_sleep_notify, SLEEP_LEVEL_VIDEO, -}; -#endif - /* * Exported functions */ @@ -356,6 +343,8 @@ static struct fb_var_screeninfo chipsfb_var __initdata = { static void __init init_chips(struct fb_info *p, unsigned long addr) { + memset(p->screen_base, 0, 0x100000); + p->fix = chipsfb_fix; p->fix.smem_start = addr; @@ -366,34 +355,41 @@ static void __init init_chips(struct fb_info *p, unsigned long addr) fb_alloc_cmap(&p->cmap, 256, 0); - if (register_framebuffer(p) < 0) { - printk(KERN_ERR "C&T 65550 framebuffer failed to register\n"); - return; - } - - printk(KERN_INFO "fb%d: Chips 65550 frame buffer (%dK RAM detected)\n", - p->node, p->fix.smem_len / 1024); - chips_hw_init(); } static int __devinit chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent) { - struct fb_info *p = &chipsfb_info; + struct fb_info *p; unsigned long addr, size; unsigned short cmd; + int rc = -ENODEV; + + if (pci_enable_device(dp) < 0) { + dev_err(&dp->dev, "Cannot enable PCI device\n"); + goto err_out; + } if ((dp->resource[0].flags & IORESOURCE_MEM) == 0) - return -ENODEV; + goto err_disable; addr = pci_resource_start(dp, 0); size = pci_resource_len(dp, 0); if (addr == 0) - return -ENODEV; - if (p->screen_base != 0) - return -EBUSY; - if (!request_mem_region(addr, size, "chipsfb")) - return -EBUSY; + goto err_disable; + + p = framebuffer_alloc(0, &dp->dev); + if (p == NULL) { + dev_err(&dp->dev, "Cannot allocate framebuffer structure\n"); + rc = -ENOMEM; + goto err_disable; + } + + if (pci_request_region(dp, 0, "chipsfb") != 0) { + dev_err(&dp->dev, "Cannot request framebuffer\n"); + rc = -EBUSY; + goto err_release_fb; + } #ifdef __BIG_ENDIAN addr += 0x800000; // Use big-endian aperture @@ -411,37 +407,89 @@ chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent) set_backlight_enable(1); #endif /* CONFIG_PMAC_BACKLIGHT */ +#ifdef CONFIG_PPC p->screen_base = __ioremap(addr, 0x200000, _PAGE_NO_CACHE); +#else + p->screen_base = ioremap(addr, 0x200000); +#endif if (p->screen_base == NULL) { - release_mem_region(addr, size); - return -ENOMEM; + dev_err(&dp->dev, "Cannot map framebuffer\n"); + rc = -ENOMEM; + goto err_release_pci; } + + pci_set_drvdata(dp, p); p->device = &dp->dev; + init_chips(p, addr); -#ifdef CONFIG_PMAC_PBOOK - pmu_register_sleep_notifier(&chips_sleep_notifier); -#endif /* CONFIG_PMAC_PBOOK */ + if (register_framebuffer(p) < 0) { + dev_err(&dp->dev,"C&T 65550 framebuffer failed to register\n"); + goto err_unmap; + } + + dev_info(&dp->dev,"fb%d: Chips 65550 frame buffer" + " (%dK RAM detected)\n", + p->node, p->fix.smem_len / 1024); - pci_set_drvdata(dp, p); return 0; + + err_unmap: + iounmap(p->screen_base); + err_release_pci: + pci_release_region(dp, 0); + err_release_fb: + framebuffer_release(p); + err_disable: + err_out: + return rc; } static void __devexit chipsfb_remove(struct pci_dev *dp) { struct fb_info *p = pci_get_drvdata(dp); - if (p != &chipsfb_info || p->screen_base == NULL) + if (p->screen_base == NULL) return; unregister_framebuffer(p); iounmap(p->screen_base); p->screen_base = NULL; - release_mem_region(pci_resource_start(dp, 0), pci_resource_len(dp, 0)); + pci_release_region(dp, 0); +} + +#ifdef CONFIG_PM +static int chipsfb_pci_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct fb_info *p = pci_get_drvdata(pdev); + + if (state == pdev->dev.power.power_state) + return 0; + if (state != PM_SUSPEND_MEM) + goto done; + + acquire_console_sem(); + chipsfb_blank(1, p); + fb_set_suspend(p, 1); + release_console_sem(); + done: + pdev->dev.power.power_state = state; + return 0; +} + +static int chipsfb_pci_resume(struct pci_dev *pdev) +{ + struct fb_info *p = pci_get_drvdata(pdev); -#ifdef CONFIG_PMAC_PBOOK - pmu_unregister_sleep_notifier(&chips_sleep_notifier); -#endif /* CONFIG_PMAC_PBOOK */ + acquire_console_sem(); + fb_set_suspend(p, 0); + chipsfb_blank(0, p); + release_console_sem(); + + pdev->dev.power.power_state = PMSG_ON; + return 0; } +#endif /* CONFIG_PM */ + static struct pci_device_id chipsfb_pci_tbl[] = { { PCI_VENDOR_ID_CT, PCI_DEVICE_ID_CT_65550, PCI_ANY_ID, PCI_ANY_ID }, @@ -455,6 +503,10 @@ static struct pci_driver chipsfb_driver = { .id_table = chipsfb_pci_tbl, .probe = chipsfb_pci_init, .remove = __devexit_p(chipsfb_remove), +#ifdef CONFIG_PM + .suspend = chipsfb_pci_suspend, + .resume = chipsfb_pci_resume, +#endif }; int __init chips_init(void) @@ -472,48 +524,4 @@ static void __exit chipsfb_exit(void) pci_unregister_driver(&chipsfb_driver); } -#ifdef CONFIG_PMAC_PBOOK -/* - * Save the contents of the frame buffer when we go to sleep, - * and restore it when we wake up again. - */ -int -chips_sleep_notify(struct pmu_sleep_notifier *self, int when) -{ - struct fb_info *p = &chipsfb_info; - int nb = p->var.yres * p->fix.line_length; - - if (p->screen_base == NULL) - return PBOOK_SLEEP_OK; - - switch (when) { - case PBOOK_SLEEP_REQUEST: - save_framebuffer = vmalloc(nb); - if (save_framebuffer == NULL) - return PBOOK_SLEEP_REFUSE; - break; - case PBOOK_SLEEP_REJECT: - if (save_framebuffer) { - vfree(save_framebuffer); - save_framebuffer = NULL; - } - break; - case PBOOK_SLEEP_NOW: - chipsfb_blank(1, p); - if (save_framebuffer) - memcpy(save_framebuffer, p->screen_base, nb); - break; - case PBOOK_WAKE: - if (save_framebuffer) { - memcpy(p->screen_base, save_framebuffer, nb); - vfree(save_framebuffer); - save_framebuffer = NULL; - } - chipsfb_blank(0, p); - break; - } - return PBOOK_SLEEP_OK; -} -#endif /* CONFIG_PMAC_PBOOK */ - MODULE_LICENSE("GPL"); diff --git a/include/linux/pmu.h b/include/linux/pmu.h index 6d73eada277e..373bd3b9b330 100644 --- a/include/linux/pmu.h +++ b/include/linux/pmu.h @@ -166,7 +166,7 @@ extern int pmu_i2c_simple_read(int bus, int addr, u8* data, int len); extern int pmu_i2c_simple_write(int bus, int addr, u8* data, int len); -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM /* * Stuff for putting the powerbook to sleep and waking it again. * @@ -208,6 +208,8 @@ struct pmu_sleep_notifier int pmu_register_sleep_notifier(struct pmu_sleep_notifier* notifier); int pmu_unregister_sleep_notifier(struct pmu_sleep_notifier* notifier); +#endif /* CONFIG_PM */ + #define PMU_MAX_BATTERIES 2 /* values for pmu_power_flags */ @@ -235,6 +237,4 @@ extern int pmu_battery_count; extern struct pmu_battery_info pmu_batteries[PMU_MAX_BATTERIES]; extern unsigned int pmu_power_flags; -#endif /* CONFIG_PMAC_PBOOK */ - #endif /* __KERNEL__ */ diff --git a/sound/oss/dmasound/dmasound_awacs.c b/sound/oss/dmasound/dmasound_awacs.c index 33108661e671..2704e1598add 100644 --- a/sound/oss/dmasound/dmasound_awacs.c +++ b/sound/oss/dmasound/dmasound_awacs.c @@ -255,7 +255,7 @@ static int awacs_burgundy_read_mvolume(unsigned address); static volatile struct dbdma_cmd *emergency_dbdma_cmd; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM /* * Stuff for restoring after a sleep. */ @@ -263,7 +263,7 @@ static int awacs_sleep_notify(struct pmu_sleep_notifier *self, int when); struct pmu_sleep_notifier awacs_sleep_notifier = { awacs_sleep_notify, SLEEP_LEVEL_SOUND, }; -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ /* for (soft) sample rate translations */ int expand_bal; /* Balance factor for expanding (not volume!) */ @@ -675,7 +675,7 @@ static void PMacIrqCleanup(void) kfree(awacs_rx_cmd_space); kfree(beep_dbdma_cmd_space); kfree(beep_buf); -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM pmu_unregister_sleep_notifier(&awacs_sleep_notifier); #endif } @@ -1415,7 +1415,7 @@ load_awacs(void) } } -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM /* * Save state when going to sleep, restore it afterwards. */ @@ -1551,7 +1551,7 @@ static int awacs_sleep_notify(struct pmu_sleep_notifier *self, int when) } return PBOOK_SLEEP_OK; } -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ /* All the burgundy functions: */ @@ -3053,9 +3053,9 @@ printk("dmasound_pmac: Awacs/Screamer Codec Mfct: %d Rev %d\n", mfg, rev); if ((res=setup_beep())) return res ; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM pmu_register_sleep_notifier(&awacs_sleep_notifier); -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ /* Powerbooks have odd ways of enabling inputs such as an expansion-bay CD or sound from an internal modem diff --git a/sound/ppc/awacs.c b/sound/ppc/awacs.c index e052bd071e5b..061e52d3d771 100644 --- a/sound/ppc/awacs.c +++ b/sound/ppc/awacs.c @@ -90,7 +90,7 @@ snd_pmac_awacs_write_noreg(pmac_t *chip, int reg, int val) snd_pmac_awacs_write(chip, val | (reg << 12)); } -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM /* Recalibrate chip */ static void screamer_recalibrate(pmac_t *chip) { @@ -642,7 +642,7 @@ static void awacs_restore_all_regs(pmac_t *chip) } } -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM static void snd_pmac_awacs_suspend(pmac_t *chip) { snd_pmac_awacs_write_noreg(chip, 1, (chip->awacs_reg[1] @@ -676,7 +676,7 @@ static void snd_pmac_awacs_resume(pmac_t *chip) } #endif } -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ #ifdef PMAC_SUPPORT_AUTOMUTE /* @@ -883,7 +883,7 @@ snd_pmac_awacs_init(pmac_t *chip) * set lowlevel callbacks */ chip->set_format = snd_pmac_awacs_set_format; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM chip->suspend = snd_pmac_awacs_suspend; chip->resume = snd_pmac_awacs_resume; #endif diff --git a/sound/ppc/daca.c b/sound/ppc/daca.c index f24a91693616..a737f298e77d 100644 --- a/sound/ppc/daca.c +++ b/sound/ppc/daca.c @@ -218,7 +218,7 @@ static snd_kcontrol_new_t daca_mixers[] = { }; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM static void daca_resume(pmac_t *chip) { pmac_daca_t *mix = chip->mixer_data; @@ -227,7 +227,7 @@ static void daca_resume(pmac_t *chip) mix->amp_on ? 0x05 : 0x04); daca_set_volume(mix); } -#endif /* CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ static void daca_cleanup(pmac_t *chip) @@ -275,7 +275,7 @@ int __init snd_pmac_daca_init(pmac_t *chip) return err; } -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM chip->resume = daca_resume; #endif diff --git a/sound/ppc/pmac.c b/sound/ppc/pmac.c index 080ef3928465..75b8b7423036 100644 --- a/sound/ppc/pmac.c +++ b/sound/ppc/pmac.c @@ -36,7 +36,7 @@ #include -#if defined(CONFIG_PM) && defined(CONFIG_PMAC_PBOOK) +#ifdef CONFIG_PM static int snd_pmac_register_sleep_notifier(pmac_t *chip); static int snd_pmac_unregister_sleep_notifier(pmac_t *chip); static int snd_pmac_suspend(snd_card_t *card, pm_message_t state); @@ -782,7 +782,7 @@ static int snd_pmac_free(pmac_t *chip) } snd_pmac_sound_feature(chip, 0); -#if defined(CONFIG_PM) && defined(CONFIG_PMAC_PBOOK) +#ifdef CONFIG_PM snd_pmac_unregister_sleep_notifier(chip); #endif @@ -1292,7 +1292,7 @@ int __init snd_pmac_new(snd_card_t *card, pmac_t **chip_return) /* Reset dbdma channels */ snd_pmac_dbdma_reset(chip); -#if defined(CONFIG_PM) && defined(CONFIG_PMAC_PBOOK) +#ifdef CONFIG_PM /* add sleep notifier */ if (! snd_pmac_register_sleep_notifier(chip)) snd_card_set_pm_callback(chip->card, snd_pmac_suspend, snd_pmac_resume, chip); @@ -1316,7 +1316,7 @@ int __init snd_pmac_new(snd_card_t *card, pmac_t **chip_return) * sleep notify for powerbook */ -#if defined(CONFIG_PM) && defined(CONFIG_PMAC_PBOOK) +#ifdef CONFIG_PM /* * Save state when going to sleep, restore it afterwards. @@ -1414,4 +1414,5 @@ static int snd_pmac_unregister_sleep_notifier(pmac_t *chip) return 0; } -#endif /* CONFIG_PM && CONFIG_PMAC_PBOOK */ +#endif /* CONFIG_PM */ + diff --git a/sound/ppc/pmac.h b/sound/ppc/pmac.h index 0a84c05f714b..582db5220119 100644 --- a/sound/ppc/pmac.h +++ b/sound/ppc/pmac.h @@ -167,7 +167,7 @@ struct snd_pmac { void (*set_format)(pmac_t *chip); void (*update_automute)(pmac_t *chip, int do_notify); int (*detect_headphone)(pmac_t *chip); -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM void (*suspend)(pmac_t *chip); void (*resume)(pmac_t *chip); #endif diff --git a/sound/ppc/tumbler.c b/sound/ppc/tumbler.c index 9332237cb6a4..36c5d5d45bb1 100644 --- a/sound/ppc/tumbler.c +++ b/sound/ppc/tumbler.c @@ -1128,7 +1128,7 @@ static void tumbler_reset_audio(pmac_t *chip) } } -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM /* suspend mixer */ static void tumbler_suspend(pmac_t *chip) { @@ -1370,7 +1370,7 @@ int __init snd_pmac_tumbler_init(pmac_t *chip) if ((err = snd_ctl_add(chip->card, chip->drc_sw_ctl)) < 0) return err; -#ifdef CONFIG_PMAC_PBOOK +#ifdef CONFIG_PM chip->suspend = tumbler_suspend; chip->resume = tumbler_resume; #endif -- cgit v1.2.3 From 6ae3db110e62b0846aae1b5c6e661484ee3a5ed1 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 27 Jun 2005 14:36:35 -0700 Subject: [PATCH] ppc64: Add missing exports This patch adds a couple of missing symbol exports. flush_dcache_page is used by the AGP driver and rtc_lock by the RTC driver. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/ppc_ksyms.c | 1 + arch/ppc64/kernel/time.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/ppc64/kernel/ppc_ksyms.c b/arch/ppc64/kernel/ppc_ksyms.c index b230a63fe4c8..705742f4eec6 100644 --- a/arch/ppc64/kernel/ppc_ksyms.c +++ b/arch/ppc64/kernel/ppc_ksyms.c @@ -75,6 +75,7 @@ EXPORT_SYMBOL(giveup_fpu); EXPORT_SYMBOL(giveup_altivec); #endif EXPORT_SYMBOL(__flush_icache_range); +EXPORT_SYMBOL(flush_dcache_range); #ifdef CONFIG_SMP #ifdef CONFIG_PPC_ISERIES diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c index 2348a75e050d..2a532db9138a 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -91,6 +91,7 @@ unsigned long tb_to_xs; unsigned tb_to_us; unsigned long processor_freq; DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL_GPL(rtc_lock); unsigned long tb_to_ns_scale; unsigned long tb_to_ns_shift; -- cgit v1.2.3 From ffaa8bd6c904d1ab79b677905067349a5ff51d84 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 27 Jun 2005 14:36:36 -0700 Subject: [PATCH] seccomp: tsc disable I believe at least for seccomp it's worth to turn off the tsc, not just for HT but for the L2 cache too. So it's up to you, either you turn it off completely (which isn't very nice IMHO) or I recommend to apply this below patch. This has been tested successfully on x86-64 against current cogito repository (i686 compiles so I didn't bother testing ;). People selling the cpu through cpushare may appreciate this bit for a peace of mind. There's no way to get any timing info anymore with this applied (gettimeofday is forbidden of course). The seccomp environment is completely deterministic so it can't be allowed to get timing info, it has to be deterministic so in the future I can enable a computing mode that does a parallel computing for each task with server side transparent checkpointing and verification that the output is the same from all the 2/3 seller computers for each task, without the buyer even noticing (for now the verification is left to the buyer client side and there's no checkpointing, since that would require more kernel changes to track the dirty bits but it'll be easy to extend once the basic mode is finished). Eliminating a cold-cache read of the cr4 global variable will save one cacheline during the tlb flush while making the code per-cpu-safe at the same time. Thanks to Mikael Pettersson for noticing the tlb flush wasn't per-cpu-safe. The global tlb flush can run from irq (IPI calling do_flush_tlb_all) but it'll be transparent to the switch_to code since the IPI won't make any change to the cr4 contents from the point of view of the interrupted code and since it's now all per-cpu stuff, it will not race. So no need to disable irqs in switch_to slow path. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/process.c | 29 +++++++++++++++++++++++++++++ arch/x86_64/kernel/process.c | 29 +++++++++++++++++++++++++++++ include/asm-i386/tlbflush.h | 12 +++++++----- include/asm-x86_64/tlbflush.h | 12 +++++++----- include/linux/seccomp.h | 10 ++++++++++ 5 files changed, 82 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 5f8cfa6b7940..ba243a4cc119 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -616,6 +616,33 @@ handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; } +/* + * This function selects if the context switch from prev to next + * has to tweak the TSC disable bit in the cr4. + */ +static inline void disable_tsc(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_info *prev, *next; + + /* + * gcc should eliminate the ->thread_info dereference if + * has_secure_computing returns 0 at compile time (SECCOMP=n). + */ + prev = prev_p->thread_info; + next = next_p->thread_info; + + if (has_secure_computing(prev) || has_secure_computing(next)) { + /* slow path here */ + if (has_secure_computing(prev) && + !has_secure_computing(next)) { + write_cr4(read_cr4() & ~X86_CR4_TSD); + } else if (!has_secure_computing(prev) && + has_secure_computing(next)) + write_cr4(read_cr4() | X86_CR4_TSD); + } +} + /* * switch_to(x,yn) should switch tasks from x to y. * @@ -695,6 +722,8 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) handle_io_bitmap(next, tss); + disable_tsc(prev_p, next_p); + return prev_p; } diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 1d91271796e5..7577f9d7a75d 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -481,6 +481,33 @@ out: return err; } +/* + * This function selects if the context switch from prev to next + * has to tweak the TSC disable bit in the cr4. + */ +static inline void disable_tsc(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_info *prev, *next; + + /* + * gcc should eliminate the ->thread_info dereference if + * has_secure_computing returns 0 at compile time (SECCOMP=n). + */ + prev = prev_p->thread_info; + next = next_p->thread_info; + + if (has_secure_computing(prev) || has_secure_computing(next)) { + /* slow path here */ + if (has_secure_computing(prev) && + !has_secure_computing(next)) { + write_cr4(read_cr4() & ~X86_CR4_TSD); + } else if (!has_secure_computing(prev) && + has_secure_computing(next)) + write_cr4(read_cr4() | X86_CR4_TSD); + } +} + /* * This special macro can be used to load a debugging register */ @@ -599,6 +626,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct * } } + disable_tsc(prev_p, next_p); + return prev_p; } diff --git a/include/asm-i386/tlbflush.h b/include/asm-i386/tlbflush.h index f22fab0cea26..ab216e1370ef 100644 --- a/include/asm-i386/tlbflush.h +++ b/include/asm-i386/tlbflush.h @@ -22,16 +22,18 @@ */ #define __flush_tlb_global() \ do { \ - unsigned int tmpreg; \ + unsigned int tmpreg, cr4, cr4_orig; \ \ __asm__ __volatile__( \ - "movl %1, %%cr4; # turn off PGE \n" \ + "movl %%cr4, %2; # turn off PGE \n" \ + "movl %2, %1; \n" \ + "andl %3, %1; \n" \ + "movl %1, %%cr4; \n" \ "movl %%cr3, %0; \n" \ "movl %0, %%cr3; # flush TLB \n" \ "movl %2, %%cr4; # turn PGE back on \n" \ - : "=&r" (tmpreg) \ - : "r" (mmu_cr4_features & ~X86_CR4_PGE), \ - "r" (mmu_cr4_features) \ + : "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig) \ + : "i" (~X86_CR4_PGE) \ : "memory"); \ } while (0) diff --git a/include/asm-x86_64/tlbflush.h b/include/asm-x86_64/tlbflush.h index 2e811ac262af..061742382520 100644 --- a/include/asm-x86_64/tlbflush.h +++ b/include/asm-x86_64/tlbflush.h @@ -22,16 +22,18 @@ */ #define __flush_tlb_global() \ do { \ - unsigned long tmpreg; \ + unsigned long tmpreg, cr4, cr4_orig; \ \ __asm__ __volatile__( \ - "movq %1, %%cr4; # turn off PGE \n" \ + "movq %%cr4, %2; # turn off PGE \n" \ + "movq %2, %1; \n" \ + "andq %3, %1; \n" \ + "movq %1, %%cr4; \n" \ "movq %%cr3, %0; # flush TLB \n" \ "movq %0, %%cr3; \n" \ "movq %2, %%cr4; # turn PGE back on \n" \ - : "=&r" (tmpreg) \ - : "r" (mmu_cr4_features & ~X86_CR4_PGE), \ - "r" (mmu_cr4_features) \ + : "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig) \ + : "i" (~X86_CR4_PGE) \ : "memory"); \ } while (0) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 3a2702bbb1d6..dc89116bb1ca 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -19,6 +19,11 @@ static inline void secure_computing(int this_syscall) __secure_computing(this_syscall); } +static inline int has_secure_computing(struct thread_info *ti) +{ + return unlikely(test_ti_thread_flag(ti, TIF_SECCOMP)); +} + #else /* CONFIG_SECCOMP */ #if (__GNUC__ > 2) @@ -28,6 +33,11 @@ static inline void secure_computing(int this_syscall) #endif #define secure_computing(x) do { } while (0) +/* static inline to preserve typechecking */ +static inline int has_secure_computing(struct thread_info *ti) +{ + return 0; +} #endif /* CONFIG_SECCOMP */ -- cgit v1.2.3 From 9ec4b1f356b3bad928ae8e2aa9caebfa737d52df Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Mon, 27 Jun 2005 15:17:01 -0700 Subject: [PATCH] kprobes: fix single-step out of line - take2 Now that PPC64 has no-execute support, here is a second try to fix the single step out of line during kprobe execution. Kprobes on x86_64 already solved this problem by allocating an executable page and using it as the scratch area for stepping out of line. Reuse that. Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/kprobes.c | 26 ++++++++-- arch/x86_64/kernel/kprobes.c | 113 +------------------------------------------ include/asm-ia64/kprobes.h | 1 + include/asm-ppc64/kprobes.h | 2 +- include/linux/kprobes.h | 2 + kernel/kprobes.c | 101 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 128 insertions(+), 117 deletions(-) (limited to 'arch') diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c index 782ce3efa2c1..86cc5496db9f 100644 --- a/arch/ppc64/kernel/kprobes.c +++ b/arch/ppc64/kernel/kprobes.c @@ -36,6 +36,8 @@ #include #include +static DECLARE_MUTEX(kprobe_mutex); + static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_saved_msr; static struct kprobe *kprobe_prev; @@ -54,6 +56,15 @@ int arch_prepare_kprobe(struct kprobe *p) printk("Cannot register a kprobe on rfid or mtmsrd\n"); ret = -EINVAL; } + + /* insn must be on a special executable page on ppc64 */ + if (!ret) { + up(&kprobe_mutex); + p->ainsn.insn = get_insn_slot(); + down(&kprobe_mutex); + if (!p->ainsn.insn) + ret = -ENOMEM; + } return ret; } @@ -79,16 +90,22 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { + up(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + down(&kprobe_mutex); } static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { + kprobe_opcode_t insn = *p->ainsn.insn; + regs->msr |= MSR_SE; - /*single step inline if it a breakpoint instruction*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) + + /* single step inline if it is a trap variant */ + if (IS_TW(insn) || IS_TD(insn) || IS_TWI(insn) || IS_TDI(insn)) regs->nip = (unsigned long)p->addr; else - regs->nip = (unsigned long)&p->ainsn.insn; + regs->nip = (unsigned long)p->ainsn.insn; } static inline void save_previous_kprobe(void) @@ -205,9 +222,10 @@ no_kprobe: static void resume_execution(struct kprobe *p, struct pt_regs *regs) { int ret; + unsigned int insn = *p->ainsn.insn; regs->nip = (unsigned long)p->addr; - ret = emulate_step(regs, p->ainsn.insn[0]); + ret = emulate_step(regs, insn); if (ret == 0) regs->nip = (unsigned long)p->addr + 4; } diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index 4e680f87a75f..6a1c88376bef 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -38,7 +38,7 @@ #include #include #include -#include + #include #include #include @@ -51,8 +51,6 @@ static struct kprobe *kprobe_prev; static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev; static struct pt_regs jprobe_saved_regs; static long *jprobe_saved_rsp; -static kprobe_opcode_t *get_insn_slot(void); -static void free_insn_slot(kprobe_opcode_t *slot); void jprobe_return_end(void); /* copy of the kernel stack at the probe fire time */ @@ -681,112 +679,3 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } - -/* - * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped. - * By default on x86_64, pages we get from kmalloc or vmalloc are not - * executable. Single-stepping an instruction on such a page yields an - * oops. So instead of storing the instruction copies in their respective - * kprobe objects, we allocate a page, map it executable, and store all the - * instruction copies there. (We can allocate additional pages if somebody - * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE - * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t) - * bytes. - */ -#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t))) -struct kprobe_insn_page { - struct hlist_node hlist; - kprobe_opcode_t *insns; /* page of instruction slots */ - char slot_used[INSNS_PER_PAGE]; - int nused; -}; - -static struct hlist_head kprobe_insn_pages; - -/** - * get_insn_slot() - Find a slot on an executable page for an instruction. - * We allocate an executable page if there's no room on existing ones. - */ -static kprobe_opcode_t *get_insn_slot(void) -{ - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); - if (kip->nused < INSNS_PER_PAGE) { - int i; - for (i = 0; i < INSNS_PER_PAGE; i++) { - if (!kip->slot_used[i]) { - kip->slot_used[i] = 1; - kip->nused++; - return kip->insns + (i*MAX_INSN_SIZE); - } - } - /* Surprise! No unused slots. Fix kip->nused. */ - kip->nused = INSNS_PER_PAGE; - } - } - - /* All out of space. Need to allocate a new page. Use slot 0.*/ - kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); - if (!kip) { - return NULL; - } - - /* - * For the %rip-relative displacement fixups to be doable, we - * need our instruction copy to be within +/- 2GB of any data it - * might access via %rip. That is, within 2GB of where the - * kernel image and loaded module images reside. So we allocate - * a page in the module loading area. - */ - kip->insns = module_alloc(PAGE_SIZE); - if (!kip->insns) { - kfree(kip); - return NULL; - } - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, &kprobe_insn_pages); - memset(kip->slot_used, 0, INSNS_PER_PAGE); - kip->slot_used[0] = 1; - kip->nused = 1; - return kip->insns; -} - -/** - * free_insn_slot() - Free instruction slot obtained from get_insn_slot(). - */ -static void free_insn_slot(kprobe_opcode_t *slot) -{ - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); - if (kip->insns <= slot - && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) { - int i = (slot - kip->insns) / MAX_INSN_SIZE; - kip->slot_used[i] = 0; - kip->nused--; - if (kip->nused == 0) { - /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. - */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { - module_free(NULL, kip->insns); - kfree(kip); - } - } - return; - } - } -} diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index 7b700035e36d..25d8b1edfcba 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -28,6 +28,7 @@ #include #include +#define MAX_INSN_SIZE 16 #define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) typedef union cmp_inst { diff --git a/include/asm-ppc64/kprobes.h b/include/asm-ppc64/kprobes.h index 19b468bed059..790cf7c52774 100644 --- a/include/asm-ppc64/kprobes.h +++ b/include/asm-ppc64/kprobes.h @@ -45,7 +45,7 @@ typedef unsigned int kprobe_opcode_t; /* Architecture specific copy of original instruction */ struct arch_specific_insn { /* copy of original instruction */ - kprobe_opcode_t insn[MAX_INSN_SIZE]; + kprobe_opcode_t *insn; }; #ifdef CONFIG_KPROBES diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 5e1a7b0d7b3f..d304d4579856 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -177,6 +177,8 @@ extern void arch_arm_kprobe(struct kprobe *p); extern void arch_disarm_kprobe(struct kprobe *p); extern void arch_remove_kprobe(struct kprobe *p); extern void show_registers(struct pt_regs *regs); +extern kprobe_opcode_t *get_insn_slot(void); +extern void free_insn_slot(kprobe_opcode_t *slot); /* Get the kprobe at this addr (if any). Must have called lock_kprobes */ struct kprobe *get_kprobe(void *addr); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 334f37472c56..65242529a75f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,106 @@ unsigned int kprobe_cpu = NR_CPUS; static DEFINE_SPINLOCK(kprobe_lock); static struct kprobe *curr_kprobe; +/* + * kprobe->ainsn.insn points to the copy of the instruction to be + * single-stepped. x86_64, POWER4 and above have no-exec support and + * stepping on the instruction on a vmalloced/kmalloced/data page + * is a recipe for disaster + */ +#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + +struct kprobe_insn_page { + struct hlist_node hlist; + kprobe_opcode_t *insns; /* Page of instruction slots */ + char slot_used[INSNS_PER_PAGE]; + int nused; +}; + +static struct hlist_head kprobe_insn_pages; + +/** + * get_insn_slot() - Find a slot on an executable page for an instruction. + * We allocate an executable page if there's no room on existing ones. + */ +kprobe_opcode_t *get_insn_slot(void) +{ + struct kprobe_insn_page *kip; + struct hlist_node *pos; + + hlist_for_each(pos, &kprobe_insn_pages) { + kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + if (kip->nused < INSNS_PER_PAGE) { + int i; + for (i = 0; i < INSNS_PER_PAGE; i++) { + if (!kip->slot_used[i]) { + kip->slot_used[i] = 1; + kip->nused++; + return kip->insns + (i * MAX_INSN_SIZE); + } + } + /* Surprise! No unused slots. Fix kip->nused. */ + kip->nused = INSNS_PER_PAGE; + } + } + + /* All out of space. Need to allocate a new page. Use slot 0.*/ + kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); + if (!kip) { + return NULL; + } + + /* + * Use module_alloc so this page is within +/- 2GB of where the + * kernel image and loaded module images reside. This is required + * so x86_64 can correctly handle the %rip-relative fixups. + */ + kip->insns = module_alloc(PAGE_SIZE); + if (!kip->insns) { + kfree(kip); + return NULL; + } + INIT_HLIST_NODE(&kip->hlist); + hlist_add_head(&kip->hlist, &kprobe_insn_pages); + memset(kip->slot_used, 0, INSNS_PER_PAGE); + kip->slot_used[0] = 1; + kip->nused = 1; + return kip->insns; +} + +void free_insn_slot(kprobe_opcode_t *slot) +{ + struct kprobe_insn_page *kip; + struct hlist_node *pos; + + hlist_for_each(pos, &kprobe_insn_pages) { + kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + if (kip->insns <= slot && + slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { + int i = (slot - kip->insns) / MAX_INSN_SIZE; + kip->slot_used[i] = 0; + kip->nused--; + if (kip->nused == 0) { + /* + * Page is no longer in use. Free it unless + * it's the last one. We keep the last one + * so as not to have to set it up again the + * next time somebody inserts a probe. + */ + hlist_del(&kip->hlist); + if (hlist_empty(&kprobe_insn_pages)) { + INIT_HLIST_NODE(&kip->hlist); + hlist_add_head(&kip->hlist, + &kprobe_insn_pages); + } else { + module_free(NULL, kip->insns); + kfree(kip); + } + } + return; + } + } +} + /* Locks kprobe: irqs must be disabled */ void lock_kprobes(void) { -- cgit v1.2.3 From 4bdbd37f6d01abc4c002bb8de90ea2c3bc7abe7e Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Mon, 27 Jun 2005 15:17:09 -0700 Subject: [PATCH] Return probe redesign: i386 specific changes The following patch contains the i386 specific changes for the new return probe design. Changes include: * Removing the architecture specific functions for querying a return probe instance off a stack address * Complete rework onf arch_prepare_kretprobe() and trampoline_probe_handler() * Removing trampoline_post_handler() * Adding arch_init() so that now we handle registering the return probe trampoline instead of kernel/kprobes.c doing it Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 133 ++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 63 deletions(-) (limited to 'arch') diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 3762f6b35ab2..fc8b17521761 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -127,48 +127,23 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->eip = (unsigned long)&p->ainsn.insn; } -struct task_struct *arch_get_kprobe_task(void *ptr) -{ - return ((struct thread_info *) (((unsigned long) ptr) & - (~(THREAD_SIZE -1))))->task; -} - void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { unsigned long *sara = (unsigned long *)®s->esp; - struct kretprobe_instance *ri; - static void *orig_ret_addr; + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *) *sara; - /* - * Save the return address when the return probe hits - * the first time, and use it to populate the (krprobe - * instance)->ret_addr for subsequent return probes at - * the same addrress since stack address would have - * the kretprobe_trampoline by then. - */ - if (((void*) *sara) != kretprobe_trampoline) - orig_ret_addr = (void*) *sara; - - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->stack_addr = sara; - ri->ret_addr = orig_ret_addr; - add_rp_inst(ri); /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; - } else { - rp->nmissed++; - } -} -void arch_kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - while ((ri = get_rp_inst_tsk(tk)) != NULL) { - *((unsigned long *)(ri->stack_addr)) = - (unsigned long) ri->ret_addr; - recycle_rp_inst(ri); - } + add_rp_inst(ri); + } else { + rp->nmissed++; + } } /* @@ -286,36 +261,59 @@ no_kprobe: */ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - struct task_struct *tsk; - struct kretprobe_instance *ri; - struct hlist_head *head; - struct hlist_node *node; - unsigned long *sara = ((unsigned long *) ®s->esp) - 1; - - tsk = arch_get_kprobe_task(sara); - head = kretprobe_inst_table_head(tsk); - - hlist_for_each_entry(ri, node, head, hlist) { - if (ri->stack_addr == sara && ri->rp) { - if (ri->rp->handler) - ri->rp->handler(ri, regs); - } - } - return 0; -} + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; -void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - struct kretprobe_instance *ri; - /* RA already popped */ - unsigned long *sara = ((unsigned long *)®s->esp) - 1; + head = kretprobe_inst_table_head(current); - while ((ri = get_rp_inst(sara))) { - regs->eip = (unsigned long)ri->ret_addr; + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; } - regs->eflags &= ~TF_MASK; + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->eip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; } /* @@ -403,8 +401,7 @@ static inline int post_kprobe_handler(struct pt_regs *regs) current_kprobe->post_handler(current_kprobe, regs, 0); } - if (current_kprobe->post_handler != trampoline_post_handler) - resume_execution(current_kprobe, regs); + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_eflags; /*Restore back the original saved kprobes variables and continue. */ @@ -534,3 +531,13 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init(void) +{ + return register_kprobe(&trampoline_p); +} -- cgit v1.2.3 From ba8af12f432c4f00ddb0bc1068b57b20aac93ecf Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Mon, 27 Jun 2005 15:17:10 -0700 Subject: [PATCH] Return probe redesign: x86_64 specific changes The following patch contains the x86_64 specific changes for the new return probe design. Changes include: * Removing the architecture specific functions for querying a return probe instance off a stack address * Complete rework onf arch_prepare_kretprobe() and trampoline_probe_handler() * Removing trampoline_post_handler() * Adding arch_init() so that now we handle registering the return probe trampoline instead of kernel/kprobes.c doing it NOTE: Note that with this new design, the dependency on calculating a pointer to the task off the stack pointer no longer exist (resolving the problem of interruption stacks as pointed out in the original feedback to this port.) Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/kprobes.c | 133 +++++++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 63 deletions(-) (limited to 'arch') diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index 6a1c88376bef..acd2a778ebe6 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -272,48 +272,23 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->rip = (unsigned long)p->ainsn.insn; } -struct task_struct *arch_get_kprobe_task(void *ptr) -{ - return ((struct thread_info *) (((unsigned long) ptr) & - (~(THREAD_SIZE -1))))->task; -} - void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { unsigned long *sara = (unsigned long *)regs->rsp; - struct kretprobe_instance *ri; - static void *orig_ret_addr; + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *) *sara; - /* - * Save the return address when the return probe hits - * the first time, and use it to populate the (krprobe - * instance)->ret_addr for subsequent return probes at - * the same addrress since stack address would have - * the kretprobe_trampoline by then. - */ - if (((void*) *sara) != kretprobe_trampoline) - orig_ret_addr = (void*) *sara; - - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->stack_addr = sara; - ri->ret_addr = orig_ret_addr; - add_rp_inst(ri); /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; - } else { - rp->nmissed++; - } -} -void arch_kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - while ((ri = get_rp_inst_tsk(tk)) != NULL) { - *((unsigned long *)(ri->stack_addr)) = - (unsigned long) ri->ret_addr; - recycle_rp_inst(ri); - } + add_rp_inst(ri); + } else { + rp->nmissed++; + } } /* @@ -426,36 +401,59 @@ no_kprobe: */ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - struct task_struct *tsk; - struct kretprobe_instance *ri; - struct hlist_head *head; - struct hlist_node *node; - unsigned long *sara = (unsigned long *)regs->rsp - 1; - - tsk = arch_get_kprobe_task(sara); - head = kretprobe_inst_table_head(tsk); - - hlist_for_each_entry(ri, node, head, hlist) { - if (ri->stack_addr == sara && ri->rp) { - if (ri->rp->handler) - ri->rp->handler(ri, regs); - } - } - return 0; -} + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; -void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - struct kretprobe_instance *ri; - /* RA already popped */ - unsigned long *sara = ((unsigned long *)regs->rsp) - 1; + head = kretprobe_inst_table_head(current); - while ((ri = get_rp_inst(sara))) { - regs->rip = (unsigned long)ri->ret_addr; + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; } - regs->eflags &= ~TF_MASK; + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->rip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; } /* @@ -548,8 +546,7 @@ int post_kprobe_handler(struct pt_regs *regs) current_kprobe->post_handler(current_kprobe, regs, 0); } - if (current_kprobe->post_handler != trampoline_post_handler) - resume_execution(current_kprobe, regs); + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; /* Restore the original saved kprobes variables and continue. */ @@ -679,3 +676,13 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init(void) +{ + return register_kprobe(&trampoline_p); +} -- cgit v1.2.3 From 9508dbfe39112813612085c00d55bacd398eddc6 Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Mon, 27 Jun 2005 15:17:12 -0700 Subject: [PATCH] Return probe redesign: ia64 specific implementation The following patch implements function return probes for ia64 using the revised design. With this new design we no longer need to do some of the odd hacks previous required on the last ia64 return probe port that I sent out for comments. Note that this new implementation still does not resolve the problem noted by Keith Owens where backtrace data is lost after a return probe is hit. Changes include: * Addition of kretprobe_trampoline to act as a dummy function for instrumented functions to return to, and for the return probe infrastructure to place a kprobe on on, gaining control so that the return probe handler can be called, and so that the instruction pointer can be moved back to the original return address. * Addition of arch_init(), allowing a kprobe to be registered on kretprobe_trampoline * Addition of trampoline_probe_handler() which is used as the pre_handler for the kprobe inserted on kretprobe_implementation. This is the function that handles the details for calling the return probe handler function and returning control back at the original return address * Addition of arch_prepare_kretprobe() which is setup as the pre_handler for a kprobe registered at the beginning of the target function by kernel/kprobes.c so that a return probe instance can be setup when a caller enters the target function. (A return probe instance contains all the needed information for trampoline_probe_handler to do it's job.) * Hooks added to the exit path of a task so that we can cleanup any left-over return probe instances (i.e. if a task dies while inside a targeted function then the return probe instance was reserved at the beginning of the function but the function never returns so we need to mark the instance as unused.) Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 103 ++++++++++++++++++++++++++++++++++++++++++++- arch/ia64/kernel/process.c | 16 +++++++ include/asm-ia64/kprobes.h | 13 +++--- 3 files changed, 125 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 5978823d5c63..c97e18e634ca 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -290,6 +290,94 @@ static inline void set_current_kprobe(struct kprobe *p) current_kprobe = p; } +static void kretprobe_trampoline(void) +{ +} + +/* + * At this point the target function has been tricked into + * returning into our trampoline. Lookup the associated instance + * and then: + * - call the handler function + * - cleanup by marking the instance as unused + * - long jump back to the original return address + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address = + ((struct fnptr *)kretprobe_trampoline)->ip; + + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->cr_iip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; +} + +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *)regs->b0; + + /* Replace the return addr with trampoline addr */ + regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; + + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + int arch_prepare_kprobe(struct kprobe *p) { unsigned long addr = (unsigned long) p->addr; @@ -492,8 +580,8 @@ static int pre_kprobes_handler(struct die_args *args) if (p->pre_handler && p->pre_handler(p, regs)) /* * Our pre-handler is specifically requesting that we just - * do a return. This is handling the case where the - * pre-handler is really our special jprobe pre-handler. + * do a return. This is used for both the jprobe pre-handler + * and the kretprobe trampoline */ return 1; @@ -599,3 +687,14 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) *regs = jprobe_saved_regs; return 1; } + +static struct kprobe trampoline_p = { + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init(void) +{ + trampoline_p.addr = + (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip; + return register_kprobe(&trampoline_p); +} diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index ebb71f3d6d19..6e35bff05d59 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -707,6 +708,13 @@ kernel_thread_helper (int (*fn)(void *), void *arg) void flush_thread (void) { + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(current); + /* drop floating-point and debug-register state if it exists: */ current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); ia64_drop_fpu(current); @@ -721,6 +729,14 @@ flush_thread (void) void exit_thread (void) { + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(current); + ia64_drop_fpu(current); #ifdef CONFIG_PERFMON /* if needed, stop monitoring and flush state to perfmon context */ diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index 25d8b1edfcba..bf36a32e37e4 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -64,6 +64,8 @@ typedef struct _bundle { #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry +#define ARCH_SUPPORTS_KRETPROBES + #define SLOT0_OPCODE_SHIFT (37) #define SLOT1_p1_OPCODE_SHIFT (37 - (64-46)) #define SLOT2_OPCODE_SHIFT (37) @@ -94,11 +96,6 @@ struct arch_specific_insn { unsigned short target_br_reg; }; -/* ia64 does not need this */ -static inline void jprobe_return(void) -{ -} - /* ia64 does not need this */ static inline void arch_copy_kprobe(struct kprobe *p) { @@ -107,6 +104,12 @@ static inline void arch_copy_kprobe(struct kprobe *p) #ifdef CONFIG_KPROBES extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); + +/* ia64 does not need this */ +static inline void jprobe_return(void) +{ +} + #else /* !CONFIG_KPROBES */ static inline int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) -- cgit v1.2.3 From 97f7943d70ff0e1e92ea627c44cfacfdae65dbc4 Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Mon, 27 Jun 2005 15:17:15 -0700 Subject: [PATCH] Return probe redesign: ppc64 specific implementation The following is a patch provided by Ananth Mavinakayanahalli that implements the new PPC64 specific parts of the new function return probe design. NOTE: Since getting Ananth's patch, I changed trampoline_probe_handler() to consume each of the outstanding return probem instances (feedback on my original RFC after Ananth cut a patch), and also added the arch_init() function (adding arch specific initialization.) I have cross compiled but have not testing this on a PPC64 machine. Changes include: * Addition of kretprobe_trampoline to act as a dummy function for instrumented functions to return to, and for the return probe infrastructure to place a kprobe on on, gaining control so that the return probe handler can be called, and so that the instruction pointer can be moved back to the original return address. * Addition of arch_init(), allowing a kprobe to be registered on kretprobe_trampoline * Addition of trampoline_probe_handler() which is used as the pre_handler for the kprobe inserted on kretprobe_implementation. This is the function that handles the details for calling the return probe handler function and returning control back at the original return address * Addition of arch_prepare_kretprobe() which is setup as the pre_handler for a kprobe registered at the beginning of the target function by kernel/kprobes.c so that a return probe instance can be setup when a caller enters the target function. (A return probe instance contains all the needed information for trampoline_probe_handler to do it's job.) * Hooks added to the exit path of a task so that we can cleanup any left-over return probe instances (i.e. if a task dies while inside a targeted function then the return probe instance was reserved at the beginning of the function but the function never returns so we need to mark the instance as unused.) Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/kprobes.c | 99 +++++++++++++++++++++++++++++++++++++++++++++ arch/ppc64/kernel/process.c | 4 ++ include/asm-ppc64/kprobes.h | 3 ++ 3 files changed, 106 insertions(+) (limited to 'arch') diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c index 86cc5496db9f..1d2ff6d6b0b3 100644 --- a/arch/ppc64/kernel/kprobes.c +++ b/arch/ppc64/kernel/kprobes.c @@ -122,6 +122,23 @@ static inline void restore_previous_kprobe(void) kprobe_saved_msr = kprobe_saved_msr_prev; } +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *)regs->link; + + /* Replace the return addr with trampoline addr */ + regs->link = (unsigned long)kretprobe_trampoline; + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + static inline int kprobe_handler(struct pt_regs *regs) { struct kprobe *p; @@ -211,6 +228,78 @@ no_kprobe: return ret; } +/* + * Function return probe trampoline: + * - init_kprobes() establishes a probepoint here + * - When the probed function returns, this probe + * causes the handlers to fire + */ +void kretprobe_trampoline_holder(void) +{ + asm volatile(".global kretprobe_trampoline\n" + "kretprobe_trampoline:\n" + "nop\n"); +} + +/* + * Called when the probe at kretprobe trampoline is hit + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->nip = orig_ret_address; + + unlock_kprobes(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; +} + /* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" @@ -349,3 +438,13 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) memcpy(regs, &jprobe_saved_regs, sizeof(struct pt_regs)); return 1; } + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init(void) +{ + return register_kprobe(&trampoline_p); +} diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index aba89554d89d..f7cae05e40fb 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -307,6 +308,8 @@ void show_regs(struct pt_regs * regs) void exit_thread(void) { + kprobe_flush_task(current); + #ifndef CONFIG_SMP if (last_task_used_math == current) last_task_used_math = NULL; @@ -321,6 +324,7 @@ void flush_thread(void) { struct thread_info *t = current_thread_info(); + kprobe_flush_task(current); if (t->flags & _TIF_ABI_PENDING) t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT); diff --git a/include/asm-ppc64/kprobes.h b/include/asm-ppc64/kprobes.h index 790cf7c52774..0802919c3235 100644 --- a/include/asm-ppc64/kprobes.h +++ b/include/asm-ppc64/kprobes.h @@ -42,6 +42,9 @@ typedef unsigned int kprobe_opcode_t; #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)((func_descr_t *)pentry) +#define ARCH_SUPPORTS_KRETPROBES +void kretprobe_trampoline(void); + /* Architecture specific copy of original instruction */ struct arch_specific_insn { /* copy of original instruction */ -- cgit v1.2.3 From a528e21c235862cc1ae50e7809eb9116dc40ea0c Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Mon, 27 Jun 2005 15:17:15 -0700 Subject: [PATCH] kprobes/ia64: refuse inserting kprobe on slot 1 Without the ability to atomically write 16 bytes, we can not update the middle slot of a bundle, slot 1, unless we stop the machine first. This patch will ensure the ability to robustly insert and remove a kprobe by refusing to insert a kprobe on slot 1 until a mechanism is in place to safely handle this case. Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index c97e18e634ca..ec2ceade12b0 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -270,6 +270,13 @@ static int valid_kprobe_addr(int template, int slot, unsigned long addr) addr); return -EINVAL; } + + if (slot == 1 && bundle_encoding[template][1] != L) { + printk(KERN_WARNING "Inserting kprobes on slot #1 " + "is not supported\n"); + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From c7b645f934e52a54af58142d91fb51f881f8ce26 Mon Sep 17 00:00:00 2001 From: Keshavamurthy Anil S Date: Mon, 27 Jun 2005 15:17:16 -0700 Subject: [PATCH] kprobes/ia64: refuse kprobe on ivt code Not safe to insert kprobes on IVT code. This patch checks to see if the address on which Kprobes is being inserted is in ivt code and if it is in ivt code then refuse to register kprobe. Signed-off-by: Anil S Keshavamurthy Acked-by: David Mosberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 18 ++++++++++++++++-- arch/ia64/kernel/vmlinux.lds.S | 7 ++++++- include/asm-ia64/sections.h | 1 + 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index ec2ceade12b0..3aa3167edbec 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -34,6 +34,7 @@ #include #include +#include extern void jprobe_inst_return(void); @@ -263,14 +264,27 @@ static inline void get_kprobe_inst(bundle_t *bundle, uint slot, } } +/* Returns non-zero if the addr is in the Interrupt Vector Table */ +static inline int in_ivt_functions(unsigned long addr) +{ + return (addr >= (unsigned long)__start_ivt_text + && addr < (unsigned long)__end_ivt_text); +} + static int valid_kprobe_addr(int template, int slot, unsigned long addr) { if ((slot > 2) || ((bundle_encoding[template][1] == L) && slot > 1)) { - printk(KERN_WARNING "Attempting to insert unaligned kprobe at 0x%lx\n", - addr); + printk(KERN_WARNING "Attempting to insert unaligned kprobe " + "at 0x%lx\n", addr); return -EINVAL; } + if (in_ivt_functions(addr)) { + printk(KERN_WARNING "Kprobes can't be inserted inside " + "IVT functions at 0x%lx\n", addr); + return -EINVAL; + } + if (slot == 1 && bundle_encoding[template][1] != L) { printk(KERN_WARNING "Inserting kprobes on slot #1 " "is not supported\n"); diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index b9f0db4c1b04..a676e79e0681 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -8,6 +8,11 @@ #define LOAD_OFFSET (KERNEL_START - KERNEL_TR_PAGE_SIZE) #include +#define IVT_TEXT \ + VMLINUX_SYMBOL(__start_ivt_text) = .; \ + *(.text.ivt) \ + VMLINUX_SYMBOL(__end_ivt_text) = .; + OUTPUT_FORMAT("elf64-ia64-little") OUTPUT_ARCH(ia64) ENTRY(phys_start) @@ -39,7 +44,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { - *(.text.ivt) + IVT_TEXT *(.text) SCHED_TEXT LOCK_TEXT diff --git a/include/asm-ia64/sections.h b/include/asm-ia64/sections.h index 8e3dbde1b429..e9eb7f62d32b 100644 --- a/include/asm-ia64/sections.h +++ b/include/asm-ia64/sections.h @@ -17,6 +17,7 @@ extern char __start_gate_vtop_patchlist[], __end_gate_vtop_patchlist[]; extern char __start_gate_fsyscall_patchlist[], __end_gate_fsyscall_patchlist[]; extern char __start_gate_brl_fsys_bubble_down_patchlist[], __end_gate_brl_fsys_bubble_down_patchlist[]; extern char __start_unwind[], __end_unwind[]; +extern char __start_ivt_text[], __end_ivt_text[]; #endif /* _ASM_IA64_SECTIONS_H */ -- cgit v1.2.3 From b445e26cbf784cdba10f2b6c3e2cd3ee7bab360a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 27 Jun 2005 15:42:04 -0700 Subject: [SPARC64]: Avoid membar instructions in delay slots. In particular, avoid membar instructions in the delay slot of a jmpl instruction. UltraSPARC-I, II, IIi, and IIe have a bug, documented in the UltraSPARC-IIi User's Manual, Appendix K, Erratum 51 The long and short of it is that if the IMU unit misses on a branch or jmpl, and there is a store buffer synchronizing membar in the delay slot, the chip can stop fetching instructions. If interrupts are enabled or some other trap is enabled, the chip will unwedge itself, but performance will suffer. We already had a workaround for this bug in a few spots, but it's better to have the entire tree sanitized for this rule. Signed-off-by: David S. Miller --- arch/sparc64/kernel/entry.S | 6 ++- arch/sparc64/kernel/semaphore.c | 12 +++-- arch/sparc64/kernel/trampoline.S | 3 +- arch/sparc64/lib/U1memcpy.S | 103 ++++++++++++++++++++------------------- arch/sparc64/lib/VISsave.S | 15 +++++- arch/sparc64/lib/atomic.S | 42 ++++++++++------ arch/sparc64/lib/bitops.S | 31 +++++++----- arch/sparc64/lib/debuglocks.c | 6 ++- arch/sparc64/lib/dec_and_lock.S | 6 ++- arch/sparc64/lib/rwsem.S | 15 ++++-- arch/sparc64/mm/init.c | 6 ++- arch/sparc64/mm/ultra.S | 3 +- include/asm-sparc64/rwsem.h | 3 +- include/asm-sparc64/spinlock.h | 29 +++++++---- include/asm-sparc64/spitfire.h | 1 - 15 files changed, 171 insertions(+), 110 deletions(-) (limited to 'arch') diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S index a47f2d0b1a29..ffe717ab7f83 100644 --- a/arch/sparc64/kernel/entry.S +++ b/arch/sparc64/kernel/entry.S @@ -271,8 +271,9 @@ cplus_fptrap_insn_1: fmuld %f0, %f2, %f26 faddd %f0, %f2, %f28 fmuld %f0, %f2, %f30 + membar #Sync b,pt %xcc, fpdis_exit - membar #Sync + nop 2: andcc %g5, FPRS_DU, %g0 bne,pt %icc, 3f fzero %f32 @@ -301,8 +302,9 @@ cplus_fptrap_insn_2: fmuld %f32, %f34, %f58 faddd %f32, %f34, %f60 fmuld %f32, %f34, %f62 + membar #Sync ba,pt %xcc, fpdis_exit - membar #Sync + nop 3: mov SECONDARY_CONTEXT, %g3 add %g6, TI_FPREGS, %g1 ldxa [%g3] ASI_DMMU, %g5 diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c index 63496c43fe17..a809e63f03ef 100644 --- a/arch/sparc64/kernel/semaphore.c +++ b/arch/sparc64/kernel/semaphore.c @@ -32,8 +32,9 @@ static __inline__ int __sem_update_count(struct semaphore *sem, int incr) " add %1, %4, %1\n" " cas [%3], %0, %1\n" " cmp %0, %1\n" +" membar #StoreLoad | #StoreStore\n" " bne,pn %%icc, 1b\n" -" membar #StoreLoad | #StoreStore\n" +" nop\n" : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (&sem->count), "r" (incr), "m" (sem->count) : "cc"); @@ -71,8 +72,9 @@ void up(struct semaphore *sem) " cmp %%g1, %%g7\n" " bne,pn %%icc, 1b\n" " addcc %%g7, 1, %%g0\n" +" membar #StoreLoad | #StoreStore\n" " ble,pn %%icc, 3f\n" -" membar #StoreLoad | #StoreStore\n" +" nop\n" "2:\n" " .subsection 2\n" "3: mov %0, %%g1\n" @@ -128,8 +130,9 @@ void __sched down(struct semaphore *sem) " cmp %%g1, %%g7\n" " bne,pn %%icc, 1b\n" " cmp %%g7, 1\n" +" membar #StoreLoad | #StoreStore\n" " bl,pn %%icc, 3f\n" -" membar #StoreLoad | #StoreStore\n" +" nop\n" "2:\n" " .subsection 2\n" "3: mov %0, %%g1\n" @@ -233,8 +236,9 @@ int __sched down_interruptible(struct semaphore *sem) " cmp %%g1, %%g7\n" " bne,pn %%icc, 1b\n" " cmp %%g7, 1\n" +" membar #StoreLoad | #StoreStore\n" " bl,pn %%icc, 3f\n" -" membar #StoreLoad | #StoreStore\n" +" nop\n" "2:\n" " .subsection 2\n" "3: mov %2, %%g1\n" diff --git a/arch/sparc64/kernel/trampoline.S b/arch/sparc64/kernel/trampoline.S index 2c8f9344b4ee..3a145fc39cf2 100644 --- a/arch/sparc64/kernel/trampoline.S +++ b/arch/sparc64/kernel/trampoline.S @@ -98,8 +98,9 @@ startup_continue: sethi %hi(prom_entry_lock), %g2 1: ldstub [%g2 + %lo(prom_entry_lock)], %g1 + membar #StoreLoad | #StoreStore brnz,pn %g1, 1b - membar #StoreLoad | #StoreStore + nop sethi %hi(p1275buf), %g2 or %g2, %lo(p1275buf), %g2 diff --git a/arch/sparc64/lib/U1memcpy.S b/arch/sparc64/lib/U1memcpy.S index da9b520c7189..bafd2fc07acb 100644 --- a/arch/sparc64/lib/U1memcpy.S +++ b/arch/sparc64/lib/U1memcpy.S @@ -87,14 +87,17 @@ #define LOOP_CHUNK3(src, dest, len, branch_dest) \ MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest) +#define DO_SYNC membar #Sync; #define STORE_SYNC(dest, fsrc) \ EX_ST(STORE_BLK(%fsrc, %dest)); \ - add %dest, 0x40, %dest; + add %dest, 0x40, %dest; \ + DO_SYNC #define STORE_JUMP(dest, fsrc, target) \ EX_ST(STORE_BLK(%fsrc, %dest)); \ add %dest, 0x40, %dest; \ - ba,pt %xcc, target; + ba,pt %xcc, target; \ + nop; #define FINISH_VISCHUNK(dest, f0, f1, left) \ subcc %left, 8, %left;\ @@ -239,17 +242,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ ba,pt %xcc, 1b+4 faligndata %f0, %f2, %f48 1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) - STORE_JUMP(o0, f48, 40f) membar #Sync + STORE_JUMP(o0, f48, 40f) 2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) - STORE_JUMP(o0, f48, 48f) membar #Sync + STORE_JUMP(o0, f48, 48f) 3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) - STORE_JUMP(o0, f48, 56f) membar #Sync + STORE_JUMP(o0, f48, 56f) 1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) @@ -260,17 +263,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ ba,pt %xcc, 1b+4 faligndata %f2, %f4, %f48 1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) - STORE_JUMP(o0, f48, 41f) membar #Sync + STORE_JUMP(o0, f48, 41f) 2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) - STORE_JUMP(o0, f48, 49f) membar #Sync + STORE_JUMP(o0, f48, 49f) 3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) - STORE_JUMP(o0, f48, 57f) membar #Sync + STORE_JUMP(o0, f48, 57f) 1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) @@ -281,17 +284,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ ba,pt %xcc, 1b+4 faligndata %f4, %f6, %f48 1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) - STORE_JUMP(o0, f48, 42f) membar #Sync + STORE_JUMP(o0, f48, 42f) 2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) - STORE_JUMP(o0, f48, 50f) membar #Sync + STORE_JUMP(o0, f48, 50f) 3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) - STORE_JUMP(o0, f48, 58f) membar #Sync + STORE_JUMP(o0, f48, 58f) 1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) @@ -302,17 +305,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ ba,pt %xcc, 1b+4 faligndata %f6, %f8, %f48 1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) - STORE_JUMP(o0, f48, 43f) membar #Sync + STORE_JUMP(o0, f48, 43f) 2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) - STORE_JUMP(o0, f48, 51f) membar #Sync + STORE_JUMP(o0, f48, 51f) 3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) - STORE_JUMP(o0, f48, 59f) membar #Sync + STORE_JUMP(o0, f48, 59f) 1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) @@ -323,17 +326,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ ba,pt %xcc, 1b+4 faligndata %f8, %f10, %f48 1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) - STORE_JUMP(o0, f48, 44f) membar #Sync + STORE_JUMP(o0, f48, 44f) 2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) - STORE_JUMP(o0, f48, 52f) membar #Sync + STORE_JUMP(o0, f48, 52f) 3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) - STORE_JUMP(o0, f48, 60f) membar #Sync + STORE_JUMP(o0, f48, 60f) 1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) @@ -344,17 +347,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ ba,pt %xcc, 1b+4 faligndata %f10, %f12, %f48 1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) - STORE_JUMP(o0, f48, 45f) membar #Sync + STORE_JUMP(o0, f48, 45f) 2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) - STORE_JUMP(o0, f48, 53f) membar #Sync + STORE_JUMP(o0, f48, 53f) 3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) - STORE_JUMP(o0, f48, 61f) membar #Sync + STORE_JUMP(o0, f48, 61f) 1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) @@ -365,17 +368,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ ba,pt %xcc, 1b+4 faligndata %f12, %f14, %f48 1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) - STORE_JUMP(o0, f48, 46f) membar #Sync + STORE_JUMP(o0, f48, 46f) 2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) - STORE_JUMP(o0, f48, 54f) membar #Sync + STORE_JUMP(o0, f48, 54f) 3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) - STORE_JUMP(o0, f48, 62f) membar #Sync + STORE_JUMP(o0, f48, 62f) 1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) @@ -386,17 +389,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ ba,pt %xcc, 1b+4 faligndata %f14, %f16, %f48 1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) - STORE_JUMP(o0, f48, 47f) membar #Sync + STORE_JUMP(o0, f48, 47f) 2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) - STORE_JUMP(o0, f48, 55f) membar #Sync + STORE_JUMP(o0, f48, 55f) 3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) - STORE_SYNC(o0, f48) membar #Sync + STORE_SYNC(o0, f48) FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) - STORE_JUMP(o0, f48, 63f) membar #Sync + STORE_JUMP(o0, f48, 63f) 40: FINISH_VISCHUNK(o0, f0, f2, g3) 41: FINISH_VISCHUNK(o0, f2, f4, g3) diff --git a/arch/sparc64/lib/VISsave.S b/arch/sparc64/lib/VISsave.S index 65e328d600a8..4e18989bd602 100644 --- a/arch/sparc64/lib/VISsave.S +++ b/arch/sparc64/lib/VISsave.S @@ -72,7 +72,11 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3 stda %f48, [%g3 + %g1] ASI_BLK_P 5: membar #Sync - jmpl %g7 + %g0, %g0 + ba,pt %xcc, 80f + nop + + .align 32 +80: jmpl %g7 + %g0, %g0 nop 6: ldub [%g3 + TI_FPSAVED], %o5 @@ -87,8 +91,11 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3 stda %f32, [%g2 + %g1] ASI_BLK_P stda %f48, [%g3 + %g1] ASI_BLK_P membar #Sync - jmpl %g7 + %g0, %g0 + ba,pt %xcc, 80f + nop + .align 32 +80: jmpl %g7 + %g0, %g0 nop .align 32 @@ -126,6 +133,10 @@ VISenterhalf: stda %f0, [%g2 + %g1] ASI_BLK_P stda %f16, [%g3 + %g1] ASI_BLK_P membar #Sync + ba,pt %xcc, 4f + nop + + .align 32 4: and %o5, FPRS_DU, %o5 jmpl %g7 + %g0, %g0 wr %o5, FPRS_FEF, %fprs diff --git a/arch/sparc64/lib/atomic.S b/arch/sparc64/lib/atomic.S index e528b8d1a3e6..faf87c31598b 100644 --- a/arch/sparc64/lib/atomic.S +++ b/arch/sparc64/lib/atomic.S @@ -7,18 +7,6 @@ #include #include - /* On SMP we need to use memory barriers to ensure - * correct memory operation ordering, nop these out - * for uniprocessor. - */ -#ifdef CONFIG_SMP -#define ATOMIC_PRE_BARRIER membar #StoreLoad | #LoadLoad -#define ATOMIC_POST_BARRIER membar #StoreLoad | #StoreStore -#else -#define ATOMIC_PRE_BARRIER nop -#define ATOMIC_POST_BARRIER nop -#endif - .text /* Two versions of the atomic routines, one that @@ -52,6 +40,24 @@ atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */ nop .size atomic_sub, .-atomic_sub + /* On SMP we need to use memory barriers to ensure + * correct memory operation ordering, nop these out + * for uniprocessor. + */ +#ifdef CONFIG_SMP + +#define ATOMIC_PRE_BARRIER membar #StoreLoad | #LoadLoad; +#define ATOMIC_POST_BARRIER \ + ba,pt %xcc, 80b; \ + membar #StoreLoad | #StoreStore + +80: retl + nop +#else +#define ATOMIC_PRE_BARRIER +#define ATOMIC_POST_BARRIER +#endif + .globl atomic_add_ret .type atomic_add_ret,#function atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ @@ -62,9 +68,10 @@ atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ cmp %g1, %g7 bne,pn %icc, 1b add %g7, %o0, %g7 + sra %g7, 0, %o0 ATOMIC_POST_BARRIER retl - sra %g7, 0, %o0 + nop .size atomic_add_ret, .-atomic_add_ret .globl atomic_sub_ret @@ -77,9 +84,10 @@ atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ cmp %g1, %g7 bne,pn %icc, 1b sub %g7, %o0, %g7 + sra %g7, 0, %o0 ATOMIC_POST_BARRIER retl - sra %g7, 0, %o0 + nop .size atomic_sub_ret, .-atomic_sub_ret .globl atomic64_add @@ -118,9 +126,10 @@ atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ cmp %g1, %g7 bne,pn %xcc, 1b add %g7, %o0, %g7 + mov %g7, %o0 ATOMIC_POST_BARRIER retl - mov %g7, %o0 + nop .size atomic64_add_ret, .-atomic64_add_ret .globl atomic64_sub_ret @@ -133,7 +142,8 @@ atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ cmp %g1, %g7 bne,pn %xcc, 1b sub %g7, %o0, %g7 + mov %g7, %o0 ATOMIC_POST_BARRIER retl - mov %g7, %o0 + nop .size atomic64_sub_ret, .-atomic64_sub_ret diff --git a/arch/sparc64/lib/bitops.S b/arch/sparc64/lib/bitops.S index 886dcd2b376a..31afbfe6c1e8 100644 --- a/arch/sparc64/lib/bitops.S +++ b/arch/sparc64/lib/bitops.S @@ -7,20 +7,26 @@ #include #include + .text + /* On SMP we need to use memory barriers to ensure * correct memory operation ordering, nop these out * for uniprocessor. */ + #ifdef CONFIG_SMP #define BITOP_PRE_BARRIER membar #StoreLoad | #LoadLoad -#define BITOP_POST_BARRIER membar #StoreLoad | #StoreStore +#define BITOP_POST_BARRIER \ + ba,pt %xcc, 80b; \ + membar #StoreLoad | #StoreStore + +80: retl + nop #else -#define BITOP_PRE_BARRIER nop -#define BITOP_POST_BARRIER nop +#define BITOP_PRE_BARRIER +#define BITOP_POST_BARRIER #endif - .text - .globl test_and_set_bit .type test_and_set_bit,#function test_and_set_bit: /* %o0=nr, %o1=addr */ @@ -37,10 +43,11 @@ test_and_set_bit: /* %o0=nr, %o1=addr */ cmp %g7, %g1 bne,pn %xcc, 1b and %g7, %o2, %g2 - BITOP_POST_BARRIER clr %o0 + movrne %g2, 1, %o0 + BITOP_POST_BARRIER retl - movrne %g2, 1, %o0 + nop .size test_and_set_bit, .-test_and_set_bit .globl test_and_clear_bit @@ -59,10 +66,11 @@ test_and_clear_bit: /* %o0=nr, %o1=addr */ cmp %g7, %g1 bne,pn %xcc, 1b and %g7, %o2, %g2 - BITOP_POST_BARRIER clr %o0 + movrne %g2, 1, %o0 + BITOP_POST_BARRIER retl - movrne %g2, 1, %o0 + nop .size test_and_clear_bit, .-test_and_clear_bit .globl test_and_change_bit @@ -81,10 +89,11 @@ test_and_change_bit: /* %o0=nr, %o1=addr */ cmp %g7, %g1 bne,pn %xcc, 1b and %g7, %o2, %g2 - BITOP_POST_BARRIER clr %o0 + movrne %g2, 1, %o0 + BITOP_POST_BARRIER retl - movrne %g2, 1, %o0 + nop .size test_and_change_bit, .-test_and_change_bit .globl set_bit diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c index c421e0c65325..f03344cf784e 100644 --- a/arch/sparc64/lib/debuglocks.c +++ b/arch/sparc64/lib/debuglocks.c @@ -252,8 +252,9 @@ wlock_again: " andn %%g1, %%g3, %%g7\n" " casx [%0], %%g1, %%g7\n" " cmp %%g1, %%g7\n" +" membar #StoreLoad | #StoreStore\n" " bne,pn %%xcc, 1b\n" -" membar #StoreLoad | #StoreStore" +" nop" : /* no outputs */ : "r" (&(rw->lock)) : "g3", "g1", "g7", "cc", "memory"); @@ -351,8 +352,9 @@ int _do_write_trylock (rwlock_t *rw, char *str) " andn %%g1, %%g3, %%g7\n" " casx [%0], %%g1, %%g7\n" " cmp %%g1, %%g7\n" +" membar #StoreLoad | #StoreStore\n" " bne,pn %%xcc, 1b\n" -" membar #StoreLoad | #StoreStore" +" nop" : /* no outputs */ : "r" (&(rw->lock)) : "g3", "g1", "g7", "cc", "memory"); diff --git a/arch/sparc64/lib/dec_and_lock.S b/arch/sparc64/lib/dec_and_lock.S index 7e6fdaebedba..8ee288dd0afc 100644 --- a/arch/sparc64/lib/dec_and_lock.S +++ b/arch/sparc64/lib/dec_and_lock.S @@ -48,8 +48,9 @@ start_to_zero: #endif to_zero: ldstub [%o1], %g3 + membar #StoreLoad | #StoreStore brnz,pn %g3, spin_on_lock - membar #StoreLoad | #StoreStore + nop loop2: cas [%o0], %g2, %g7 /* ASSERT(g7 == 0) */ cmp %g2, %g7 @@ -71,8 +72,9 @@ loop2: cas [%o0], %g2, %g7 /* ASSERT(g7 == 0) */ nop spin_on_lock: ldub [%o1], %g3 + membar #LoadLoad brnz,pt %g3, spin_on_lock - membar #LoadLoad + nop ba,pt %xcc, to_zero nop nop diff --git a/arch/sparc64/lib/rwsem.S b/arch/sparc64/lib/rwsem.S index 174ff7b9164c..75f0e6b951d6 100644 --- a/arch/sparc64/lib/rwsem.S +++ b/arch/sparc64/lib/rwsem.S @@ -17,8 +17,9 @@ __down_read: bne,pn %icc, 1b add %g7, 1, %g7 cmp %g7, 0 + membar #StoreLoad | #StoreStore bl,pn %icc, 3f - membar #StoreLoad | #StoreStore + nop 2: retl nop @@ -57,8 +58,9 @@ __down_write: cmp %g3, %g7 bne,pn %icc, 1b cmp %g7, 0 + membar #StoreLoad | #StoreStore bne,pn %icc, 3f - membar #StoreLoad | #StoreStore + nop 2: retl nop 3: @@ -97,8 +99,9 @@ __up_read: cmp %g1, %g7 bne,pn %icc, 1b cmp %g7, 0 + membar #StoreLoad | #StoreStore bl,pn %icc, 3f - membar #StoreLoad | #StoreStore + nop 2: retl nop 3: sethi %hi(RWSEM_ACTIVE_MASK), %g1 @@ -126,8 +129,9 @@ __up_write: bne,pn %icc, 1b sub %g7, %g1, %g7 cmp %g7, 0 + membar #StoreLoad | #StoreStore bl,pn %icc, 3f - membar #StoreLoad | #StoreStore + nop 2: retl nop @@ -151,8 +155,9 @@ __downgrade_write: bne,pn %icc, 1b sub %g7, %g1, %g7 cmp %g7, 0 + membar #StoreLoad | #StoreStore bl,pn %icc, 3f - membar #StoreLoad | #StoreStore + nop 2: retl nop diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 9c5222075da9..8fc413cb6acd 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -136,8 +136,9 @@ static __inline__ void set_dcache_dirty(struct page *page, int this_cpu) "or %%g1, %0, %%g1\n\t" "casx [%2], %%g7, %%g1\n\t" "cmp %%g7, %%g1\n\t" + "membar #StoreLoad | #StoreStore\n\t" "bne,pn %%xcc, 1b\n\t" - " membar #StoreLoad | #StoreStore" + " nop" : /* no outputs */ : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags) : "g1", "g7"); @@ -157,8 +158,9 @@ static __inline__ void clear_dcache_dirty_cpu(struct page *page, unsigned long c " andn %%g7, %1, %%g1\n\t" "casx [%2], %%g7, %%g1\n\t" "cmp %%g7, %%g1\n\t" + "membar #StoreLoad | #StoreStore\n\t" "bne,pn %%xcc, 1b\n\t" - " membar #StoreLoad | #StoreStore\n" + " nop\n" "2:" : /* no outputs */ : "r" (cpu), "r" (mask), "r" (&page->flags), diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S index 7a0934321010..7a2431d3abc7 100644 --- a/arch/sparc64/mm/ultra.S +++ b/arch/sparc64/mm/ultra.S @@ -266,8 +266,9 @@ __cheetah_flush_tlb_pending: /* 22 insns */ andn %o3, 1, %o3 stxa %g0, [%o3] ASI_IMMU_DEMAP 2: stxa %g0, [%o3] ASI_DMMU_DEMAP + membar #Sync brnz,pt %o1, 1b - membar #Sync + nop stxa %g2, [%o4] ASI_DMMU flush %g6 wrpr %g0, 0, %tl diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h index bf2ae90ed3df..a1cc94f95984 100644 --- a/include/asm-sparc64/rwsem.h +++ b/include/asm-sparc64/rwsem.h @@ -55,8 +55,9 @@ static __inline__ int rwsem_atomic_update(int delta, struct rw_semaphore *sem) "add %%g1, %1, %%g7\n\t" "cas [%2], %%g1, %%g7\n\t" "cmp %%g1, %%g7\n\t" + "membar #StoreLoad | #StoreStore\n\t" "bne,pn %%icc, 1b\n\t" - " membar #StoreLoad | #StoreStore\n\t" + " nop\n\t" "mov %%g7, %0\n\t" : "=&r" (tmp) : "0" (tmp), "r" (sem) diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h index db7581bdb531..9cb93a5c2b4f 100644 --- a/include/asm-sparc64/spinlock.h +++ b/include/asm-sparc64/spinlock.h @@ -52,12 +52,14 @@ static inline void _raw_spin_lock(spinlock_t *lock) __asm__ __volatile__( "1: ldstub [%1], %0\n" +" membar #StoreLoad | #StoreStore\n" " brnz,pn %0, 2f\n" -" membar #StoreLoad | #StoreStore\n" +" nop\n" " .subsection 2\n" "2: ldub [%1], %0\n" +" membar #LoadLoad\n" " brnz,pt %0, 2b\n" -" membar #LoadLoad\n" +" nop\n" " ba,a,pt %%xcc, 1b\n" " .previous" : "=&r" (tmp) @@ -95,16 +97,18 @@ static inline void _raw_spin_lock_flags(spinlock_t *lock, unsigned long flags) __asm__ __volatile__( "1: ldstub [%2], %0\n" -" brnz,pn %0, 2f\n" " membar #StoreLoad | #StoreStore\n" +" brnz,pn %0, 2f\n" +" nop\n" " .subsection 2\n" "2: rdpr %%pil, %1\n" " wrpr %3, %%pil\n" "3: ldub [%2], %0\n" -" brnz,pt %0, 3b\n" " membar #LoadLoad\n" +" brnz,pt %0, 3b\n" +" nop\n" " ba,pt %%xcc, 1b\n" -" wrpr %1, %%pil\n" +" wrpr %1, %%pil\n" " .previous" : "=&r" (tmp1), "=&r" (tmp2) : "r"(lock), "r"(flags) @@ -162,12 +166,14 @@ static void inline __read_lock(rwlock_t *lock) "4: add %0, 1, %1\n" " cas [%2], %0, %1\n" " cmp %0, %1\n" +" membar #StoreLoad | #StoreStore\n" " bne,pn %%icc, 1b\n" -" membar #StoreLoad | #StoreStore\n" +" nop\n" " .subsection 2\n" "2: ldsw [%2], %0\n" +" membar #LoadLoad\n" " brlz,pt %0, 2b\n" -" membar #LoadLoad\n" +" nop\n" " ba,a,pt %%xcc, 4b\n" " .previous" : "=&r" (tmp1), "=&r" (tmp2) @@ -204,12 +210,14 @@ static void inline __write_lock(rwlock_t *lock) "4: or %0, %3, %1\n" " cas [%2], %0, %1\n" " cmp %0, %1\n" +" membar #StoreLoad | #StoreStore\n" " bne,pn %%icc, 1b\n" -" membar #StoreLoad | #StoreStore\n" +" nop\n" " .subsection 2\n" "2: lduw [%2], %0\n" +" membar #LoadLoad\n" " brnz,pt %0, 2b\n" -" membar #LoadLoad\n" +" nop\n" " ba,a,pt %%xcc, 4b\n" " .previous" : "=&r" (tmp1), "=&r" (tmp2) @@ -240,8 +248,9 @@ static int inline __write_trylock(rwlock_t *lock) " or %0, %4, %1\n" " cas [%3], %0, %1\n" " cmp %0, %1\n" +" membar #StoreLoad | #StoreStore\n" " bne,pn %%icc, 1b\n" -" membar #StoreLoad | #StoreStore\n" +" nop\n" " mov 1, %2\n" "2:" : "=&r" (tmp1), "=&r" (tmp2), "=&r" (result) diff --git a/include/asm-sparc64/spitfire.h b/include/asm-sparc64/spitfire.h index 9d7613eea812..1aa932773af8 100644 --- a/include/asm-sparc64/spitfire.h +++ b/include/asm-sparc64/spitfire.h @@ -111,7 +111,6 @@ static __inline__ void spitfire_put_dcache_tag(unsigned long addr, unsigned long "membar #Sync" : /* No outputs */ : "r" (tag), "r" (addr), "i" (ASI_DCACHE_TAG)); - __asm__ __volatile__ ("membar #Sync" : : : "memory"); } /* The instruction cache lines are flushed with this, but note that -- cgit v1.2.3 From 63b614522cba5a015923c0e8f284be6e01c13f1a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 27 Jun 2005 17:04:45 -0700 Subject: [SPARC64]: Get rid of fast IRQ feature. The only real user was the assembler floppy interrupt handler, which does not need to be in assembly. This makes it so that there are less pieces of code which know about the internal layout of ivector_table[] and friends. Signed-off-by: David S. Miller --- arch/sparc64/kernel/auxio.c | 2 +- arch/sparc64/kernel/entry.S | 110 ----------------------- arch/sparc64/kernel/irq.c | 171 +++++++++++------------------------- arch/sparc64/kernel/sparc64_ksyms.c | 1 - include/asm-sparc64/auxio.h | 2 + include/asm-sparc64/floppy.h | 16 ++-- include/asm-sparc64/irq.h | 7 +- 7 files changed, 62 insertions(+), 247 deletions(-) (limited to 'arch') diff --git a/arch/sparc64/kernel/auxio.c b/arch/sparc64/kernel/auxio.c index a0716ccc2f4a..8852c20c8d99 100644 --- a/arch/sparc64/kernel/auxio.c +++ b/arch/sparc64/kernel/auxio.c @@ -16,7 +16,7 @@ #include #include -/* This cannot be static, as it is referenced in entry.S */ +/* This cannot be static, as it is referenced in irq.c */ void __iomem *auxio_register = NULL; enum auxio_type { diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S index ffe717ab7f83..eee516a71c14 100644 --- a/arch/sparc64/kernel/entry.S +++ b/arch/sparc64/kernel/entry.S @@ -701,116 +701,6 @@ utrap_ill: ba,pt %xcc, rtrap clr %l6 -#ifdef CONFIG_BLK_DEV_FD - .globl floppy_hardint -floppy_hardint: - wr %g0, (1 << 11), %clear_softint - sethi %hi(doing_pdma), %g1 - ld [%g1 + %lo(doing_pdma)], %g2 - brz,pn %g2, floppy_dosoftint - sethi %hi(fdc_status), %g3 - ldx [%g3 + %lo(fdc_status)], %g3 - sethi %hi(pdma_vaddr), %g5 - ldx [%g5 + %lo(pdma_vaddr)], %g4 - sethi %hi(pdma_size), %g5 - ldx [%g5 + %lo(pdma_size)], %g5 - -next_byte: - lduba [%g3] ASI_PHYS_BYPASS_EC_E, %g7 - andcc %g7, 0x80, %g0 - be,pn %icc, floppy_fifo_emptied - andcc %g7, 0x20, %g0 - be,pn %icc, floppy_overrun - andcc %g7, 0x40, %g0 - be,pn %icc, floppy_write - sub %g5, 1, %g5 - - inc %g3 - lduba [%g3] ASI_PHYS_BYPASS_EC_E, %g7 - dec %g3 - orcc %g0, %g5, %g0 - stb %g7, [%g4] - bne,pn %xcc, next_byte - add %g4, 1, %g4 - - b,pt %xcc, floppy_tdone - nop - -floppy_write: - ldub [%g4], %g7 - orcc %g0, %g5, %g0 - inc %g3 - stba %g7, [%g3] ASI_PHYS_BYPASS_EC_E - dec %g3 - bne,pn %xcc, next_byte - add %g4, 1, %g4 - -floppy_tdone: - sethi %hi(pdma_vaddr), %g1 - stx %g4, [%g1 + %lo(pdma_vaddr)] - sethi %hi(pdma_size), %g1 - stx %g5, [%g1 + %lo(pdma_size)] - sethi %hi(auxio_register), %g1 - ldx [%g1 + %lo(auxio_register)], %g7 - lduba [%g7] ASI_PHYS_BYPASS_EC_E, %g5 - or %g5, AUXIO_AUX1_FTCNT, %g5 -/* andn %g5, AUXIO_AUX1_MASK, %g5 */ - stba %g5, [%g7] ASI_PHYS_BYPASS_EC_E - andn %g5, AUXIO_AUX1_FTCNT, %g5 -/* andn %g5, AUXIO_AUX1_MASK, %g5 */ - - nop; nop; nop; nop; nop; nop; - nop; nop; nop; nop; nop; nop; - - stba %g5, [%g7] ASI_PHYS_BYPASS_EC_E - sethi %hi(doing_pdma), %g1 - b,pt %xcc, floppy_dosoftint - st %g0, [%g1 + %lo(doing_pdma)] - -floppy_fifo_emptied: - sethi %hi(pdma_vaddr), %g1 - stx %g4, [%g1 + %lo(pdma_vaddr)] - sethi %hi(pdma_size), %g1 - stx %g5, [%g1 + %lo(pdma_size)] - sethi %hi(irq_action), %g1 - or %g1, %lo(irq_action), %g1 - ldx [%g1 + (11 << 3)], %g3 ! irqaction[floppy_irq] - ldx [%g3 + 0x08], %g4 ! action->flags>>48==ino - sethi %hi(ivector_table), %g3 - srlx %g4, 48, %g4 - or %g3, %lo(ivector_table), %g3 - sllx %g4, 5, %g4 - ldx [%g3 + %g4], %g4 ! &ivector_table[ino] - ldx [%g4 + 0x10], %g4 ! bucket->iclr - stwa %g0, [%g4] ASI_PHYS_BYPASS_EC_E ! ICLR_IDLE - membar #Sync ! probably not needed... - retry - -floppy_overrun: - sethi %hi(pdma_vaddr), %g1 - stx %g4, [%g1 + %lo(pdma_vaddr)] - sethi %hi(pdma_size), %g1 - stx %g5, [%g1 + %lo(pdma_size)] - sethi %hi(doing_pdma), %g1 - st %g0, [%g1 + %lo(doing_pdma)] - -floppy_dosoftint: - rdpr %pil, %g2 - wrpr %g0, 15, %pil - sethi %hi(109f), %g7 - b,pt %xcc, etrap_irq -109: or %g7, %lo(109b), %g7 - - mov 11, %o0 - mov 0, %o1 - call sparc_floppy_irq - add %sp, PTREGS_OFF, %o2 - - b,pt %xcc, rtrap_irq - nop - -#endif /* CONFIG_BLK_DEV_FD */ - /* XXX Here is stuff we still need to write... -DaveM XXX */ .globl netbsd_syscall netbsd_syscall: diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index 4dcb8af94090..424712577307 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -37,6 +37,7 @@ #include #include #include +#include #ifdef CONFIG_SMP static void distribute_irqs(void); @@ -834,137 +835,65 @@ void handler_irq(int irq, struct pt_regs *regs) } #ifdef CONFIG_BLK_DEV_FD -extern void floppy_interrupt(int irq, void *dev_cookie, struct pt_regs *regs); +extern irqreturn_t floppy_interrupt(int, void *, struct pt_regs *);; -void sparc_floppy_irq(int irq, void *dev_cookie, struct pt_regs *regs) -{ - struct irqaction *action = *(irq + irq_action); - struct ino_bucket *bucket; - int cpu = smp_processor_id(); - - irq_enter(); - kstat_this_cpu.irqs[irq]++; - - *(irq_work(cpu, irq)) = 0; - bucket = get_ino_in_irqaction(action) + ivector_table; - - bucket->flags |= IBF_INPROGRESS; - - floppy_interrupt(irq, dev_cookie, regs); - upa_writel(ICLR_IDLE, bucket->iclr); - - bucket->flags &= ~IBF_INPROGRESS; - - irq_exit(); -} -#endif - -/* The following assumes that the branch lies before the place we - * are branching to. This is the case for a trap vector... - * You have been warned. - */ -#define SPARC_BRANCH(dest_addr, inst_addr) \ - (0x10800000 | ((((dest_addr)-(inst_addr))>>2)&0x3fffff)) - -#define SPARC_NOP (0x01000000) +/* XXX No easy way to include asm/floppy.h XXX */ +extern unsigned char *pdma_vaddr; +extern unsigned long pdma_size; +extern volatile int doing_pdma; +extern unsigned long fdc_status; -static void install_fast_irq(unsigned int cpu_irq, - irqreturn_t (*handler)(int, void *, struct pt_regs *)) +irqreturn_t sparc_floppy_irq(int irq, void *dev_cookie, struct pt_regs *regs) { - extern unsigned long sparc64_ttable_tl0; - unsigned long ttent = (unsigned long) &sparc64_ttable_tl0; - unsigned int *insns; - - ttent += 0x820; - ttent += (cpu_irq - 1) << 5; - insns = (unsigned int *) ttent; - insns[0] = SPARC_BRANCH(((unsigned long) handler), - ((unsigned long)&insns[0])); - insns[1] = SPARC_NOP; - __asm__ __volatile__("membar #StoreStore; flush %0" : : "r" (ttent)); -} - -int request_fast_irq(unsigned int irq, - irqreturn_t (*handler)(int, void *, struct pt_regs *), - unsigned long irqflags, const char *name, void *dev_id) -{ - struct irqaction *action; - struct ino_bucket *bucket = __bucket(irq); - unsigned long flags; - - /* No pil0 dummy buckets allowed here. */ - if (bucket < &ivector_table[0] || - bucket >= &ivector_table[NUM_IVECS]) { - unsigned int *caller; - - __asm__ __volatile__("mov %%i7, %0" : "=r" (caller)); - printk(KERN_CRIT "request_fast_irq: Old style IRQ registry attempt " - "from %p, irq %08x.\n", caller, irq); - return -EINVAL; - } - - if (!handler) - return -EINVAL; + if (likely(doing_pdma)) { + void __iomem *stat = (void __iomem *) fdc_status; + unsigned char *vaddr = pdma_vaddr; + unsigned long size = pdma_size; + u8 val; + + while (size) { + val = readb(stat); + if (unlikely(!(val & 0x80))) { + pdma_vaddr = vaddr; + pdma_size = size; + return IRQ_HANDLED; + } + if (unlikely(!(val & 0x20))) { + pdma_vaddr = vaddr; + pdma_size = size; + doing_pdma = 0; + goto main_interrupt; + } + if (val & 0x40) { + /* read */ + *vaddr++ = readb(stat + 1); + } else { + unsigned char data = *vaddr++; - if ((bucket->pil == 0) || (bucket->pil == 14)) { - printk("request_fast_irq: Trying to register shared IRQ 0 or 14.\n"); - return -EBUSY; - } + /* write */ + writeb(data, stat + 1); + } + size--; + } - spin_lock_irqsave(&irq_action_lock, flags); + pdma_vaddr = vaddr; + pdma_size = size; - action = *(bucket->pil + irq_action); - if (action) { - if (action->flags & SA_SHIRQ) - panic("Trying to register fast irq when already shared.\n"); - if (irqflags & SA_SHIRQ) - panic("Trying to register fast irq as shared.\n"); - printk("request_fast_irq: Trying to register yet already owned.\n"); - spin_unlock_irqrestore(&irq_action_lock, flags); - return -EBUSY; - } + /* Send Terminal Count pulse to floppy controller. */ + val = readb(auxio_register); + val |= AUXIO_AUX1_FTCNT; + writeb(val, auxio_register); + val &= AUXIO_AUX1_FTCNT; + writeb(val, auxio_register); - /* - * We do not check for SA_SAMPLE_RANDOM in this path. Neither do we - * support smp intr affinity in this path. - */ - if (irqflags & SA_STATIC_ALLOC) { - if (static_irq_count < MAX_STATIC_ALLOC) - action = &static_irqaction[static_irq_count++]; - else - printk("Request for IRQ%d (%s) SA_STATIC_ALLOC failed " - "using kmalloc\n", bucket->pil, name); - } - if (action == NULL) - action = (struct irqaction *)kmalloc(sizeof(struct irqaction), - GFP_ATOMIC); - if (!action) { - spin_unlock_irqrestore(&irq_action_lock, flags); - return -ENOMEM; + doing_pdma = 0; } - install_fast_irq(bucket->pil, handler); - bucket->irq_info = action; - bucket->flags |= IBF_ACTIVE; - - action->handler = handler; - action->flags = irqflags; - action->dev_id = NULL; - action->name = name; - action->next = NULL; - put_ino_in_irqaction(action, irq); - put_smpaff_in_irqaction(action, CPU_MASK_NONE); - - *(bucket->pil + irq_action) = action; - enable_irq(irq); - - spin_unlock_irqrestore(&irq_action_lock, flags); - -#ifdef CONFIG_SMP - distribute_irqs(); -#endif - return 0; +main_interrupt: + return floppy_interrupt(irq, dev_cookie, regs); } +EXPORT_SYMBOL(sparc_floppy_irq); +#endif /* We really don't need these at all on the Sparc. We only have * stubs here because they are exported to modules. diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index e78cc53594fa..56cd96f4a5cd 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -227,7 +227,6 @@ EXPORT_SYMBOL(__flush_dcache_range); EXPORT_SYMBOL(mostek_lock); EXPORT_SYMBOL(mstk48t02_regs); -EXPORT_SYMBOL(request_fast_irq); #ifdef CONFIG_SUN_AUXIO EXPORT_SYMBOL(auxio_set_led); EXPORT_SYMBOL(auxio_set_lte); diff --git a/include/asm-sparc64/auxio.h b/include/asm-sparc64/auxio.h index 5eb01dd47150..81a590a50a1f 100644 --- a/include/asm-sparc64/auxio.h +++ b/include/asm-sparc64/auxio.h @@ -75,6 +75,8 @@ #ifndef __ASSEMBLY__ +extern void __iomem *auxio_register; + #define AUXIO_LTE_ON 1 #define AUXIO_LTE_OFF 0 diff --git a/include/asm-sparc64/floppy.h b/include/asm-sparc64/floppy.h index e071b4b4edfd..49d49a285943 100644 --- a/include/asm-sparc64/floppy.h +++ b/include/asm-sparc64/floppy.h @@ -159,7 +159,7 @@ static void sun_82077_fd_outb(unsigned char value, unsigned long port) * underruns. If non-zero, doing_pdma encodes the direction of * the transfer for debugging. 1=read 2=write */ -char *pdma_vaddr; +unsigned char *pdma_vaddr; unsigned long pdma_size; volatile int doing_pdma = 0; @@ -209,8 +209,7 @@ static void sun_fd_enable_dma(void) pdma_areasize = pdma_size; } -/* Our low-level entry point in arch/sparc/kernel/entry.S */ -extern irqreturn_t floppy_hardint(int irq, void *unused, struct pt_regs *regs); +extern irqreturn_t sparc_floppy_irq(int, void *, struct pt_regs *); static int sun_fd_request_irq(void) { @@ -220,8 +219,8 @@ static int sun_fd_request_irq(void) if(!once) { once = 1; - error = request_fast_irq(FLOPPY_IRQ, floppy_hardint, - SA_INTERRUPT, "floppy", NULL); + error = request_irq(FLOPPY_IRQ, sparc_floppy_irq, + SA_INTERRUPT, "floppy", NULL); return ((error == 0) ? 0 : -1); } @@ -615,7 +614,7 @@ static unsigned long __init sun_floppy_init(void) struct linux_ebus *ebus; struct linux_ebus_device *edev = NULL; unsigned long config = 0; - unsigned long auxio_reg; + void __iomem *auxio_reg; for_each_ebus(ebus) { for_each_ebusdev(edev, ebus) { @@ -642,7 +641,7 @@ static unsigned long __init sun_floppy_init(void) /* Make sure the high density bit is set, some systems * (most notably Ultra5/Ultra10) come up with it clear. */ - auxio_reg = edev->resource[2].start; + auxio_reg = (void __iomem *) edev->resource[2].start; writel(readl(auxio_reg)|0x2, auxio_reg); sun_pci_ebus_dev = ebus->self; @@ -650,7 +649,8 @@ static unsigned long __init sun_floppy_init(void) spin_lock_init(&sun_pci_fd_ebus_dma.lock); /* XXX ioremap */ - sun_pci_fd_ebus_dma.regs = edev->resource[1].start; + sun_pci_fd_ebus_dma.regs = (void __iomem *) + edev->resource[1].start; if (!sun_pci_fd_ebus_dma.regs) return 0; diff --git a/include/asm-sparc64/irq.h b/include/asm-sparc64/irq.h index 3aef0ca67750..018e2e46082b 100644 --- a/include/asm-sparc64/irq.h +++ b/include/asm-sparc64/irq.h @@ -19,7 +19,7 @@ /* You should not mess with this directly. That's the job of irq.c. * * If you make changes here, please update hand coded assembler of - * SBUS/floppy interrupt handler in entry.S -DaveM + * the vectored interrupt trap handler in entry.S -DaveM * * This is currently one DCACHE line, two buckets per L2 cache * line. Keep this in mind please. @@ -122,11 +122,6 @@ extern void enable_irq(unsigned int); extern unsigned int build_irq(int pil, int inofixup, unsigned long iclr, unsigned long imap); extern unsigned int sbus_build_irq(void *sbus, unsigned int ino); -extern int request_fast_irq(unsigned int irq, - irqreturn_t (*handler)(int, void *, struct pt_regs *), - unsigned long flags, __const__ char *devname, - void *dev_id); - static __inline__ void set_softint(unsigned long bits) { __asm__ __volatile__("wr %0, 0x0, %%set_softint" -- cgit v1.2.3