X-Git-Url: http://pilppa.org/gitweb/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=206920796f5f68cb176afa138238bd3584106a5a;hb=763f356cd8de9e158836d236b3fd9dd149d696f9;hp=2019c1b19254e4b2085064c911b40b3f962a32bb;hpb=1e7e5a9048b30c57ba1ddaa6cdf59b21b65cde99;p=linux-2.6-omap-h63xx.git diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2019c1b1925..206920796f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page) printk(KERN_EMERG "Backtrace:\n"); dump_stack(); printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); - page->flags &= ~(1 << PG_private | + page->flags &= ~(1 << PG_lru | + 1 << PG_private | 1 << PG_locked | - 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); @@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order) */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapcount(page) || - (page->flags & ( + if ( page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | 1 << PG_private | 1 << PG_locked | - 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); @@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, return allocated; } +#ifdef CONFIG_NUMA +/* Called from the slab reaper to drain remote pagesets */ +void drain_remote_pages(void) +{ + struct zone *zone; + int i; + unsigned long flags; + + local_irq_save(flags); + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + /* Do not drain local pagesets */ + if (zone->zone_pgdat->node_id == numa_node_id()) + continue; + + pset = zone->pageset[smp_processor_id()]; + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + if (pcp->count) + pcp->count -= free_pages_bulk(zone, pcp->count, + &pcp->list, 0); + } + } + local_irq_restore(flags); +} +#endif + #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { @@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu) for_each_zone(zone) { struct per_cpu_pageset *pset; - pset = &zone->pageset[cpu]; + pset = zone_pcp(zone, cpu); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) local_irq_save(flags); cpu = smp_processor_id(); - p = &z->pageset[cpu]; + p = zone_pcp(z,cpu); if (pg == orig) { - z->pageset[cpu].numa_hit++; + p->numa_hit++; } else { p->numa_miss++; - zonelist->zones[0]->pageset[cpu].numa_foreign++; + zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; } if (pg == NODE_DATA(numa_node_id())) p->local_node++; @@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; free_pages_check(__FUNCTION__, page); - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); list_add(&page->lru, &pcp->list); pcp->count++; + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); local_irq_restore(flags); put_cpu(); } @@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) if (order == 0) { struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, @@ -854,7 +889,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zones, gfp_mask, order); + did_some_progress = try_to_free_pages(zones, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -930,6 +965,7 @@ nopage: " order:%d, mode:0x%x\n", p->comm, order, gfp_mask); dump_stack(); + show_mem(); } return NULL; got_pg: @@ -1139,7 +1175,7 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); } -unsigned long __read_page_state(unsigned offset) +unsigned long __read_page_state(unsigned long offset) { unsigned long ret = 0; int cpu; @@ -1153,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset) return ret; } -void __mod_page_state(unsigned offset, unsigned long delta) +void __mod_page_state(unsigned long offset, unsigned long delta) { unsigned long flags; void* ptr; @@ -1262,22 +1298,23 @@ void show_free_areas(void) if (!cpu_possible(cpu)) continue; - pageset = zone->pageset + cpu; + pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d\n", + printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", cpu, temperature ? "cold" : "hot", pageset->pcp[temperature].low, pageset->pcp[temperature].high, - pageset->pcp[temperature].batch); + pageset->pcp[temperature].batch, + pageset->pcp[temperature].count); } } get_page_state(&ps); get_zone_counts(&active, &inactive, &free); - printk("\nFree pages: %11ukB (%ukB HighMem)\n", + printk("Free pages: %11ukB (%ukB HighMem)\n", K(nr_free_pages()), K(nr_free_highpages())); @@ -1645,6 +1682,155 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif +static int __devinit zone_batchsize(struct zone *zone) +{ + int batch; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. + */ + batch = (1 << fls(batch + batch/2)) - 1; + return batch; +} + +inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); + + pcp = &p->pcp[1]; /* cold*/ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); +} + +#ifdef CONFIG_NUMA +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * Some NUMA counter updates may also be caught by the boot pagesets. + * These will be discarded when bootup is complete. + */ +static struct per_cpu_pageset + boot_pageset[NR_CPUS] __initdata; + +/* + * Dynamically allocate memory for the + * per cpu pageset array in struct zone. + */ +static int __devinit process_zones(int cpu) +{ + struct zone *zone, *dzone; + + for_each_zone(zone) { + + zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), + GFP_KERNEL, cpu_to_node(cpu)); + if (!zone->pageset[cpu]) + goto bad; + + setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + } + + return 0; +bad: + for_each_zone(dzone) { + if (dzone == zone) + break; + kfree(dzone->pageset[cpu]); + dzone->pageset[cpu] = NULL; + } + return -ENOMEM; +} + +static inline void free_zone_pagesets(int cpu) +{ +#ifdef CONFIG_NUMA + struct zone *zone; + + for_each_zone(zone) { + struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + + zone_pcp(zone, cpu) = NULL; + kfree(pset); + } +#endif +} + +static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + free_zone_pagesets(cpu); + break; +#endif + default: + break; + } + return ret; +} + +static struct notifier_block pageset_notifier = + { &pageset_cpuup_callback, NULL, 0 }; + +void __init setup_per_cpu_pageset() +{ + int err; + + /* Initialize per_cpu_pageset for cpu 0. + * A cpuup callback will do this for every cpu + * as it comes online + */ + err = process_zones(smp_processor_id()); + BUG_ON(err); + register_cpu_notifier(&pageset_notifier); +} + +#endif + /* * Set up the zone data structures: * - mark all pages reserved @@ -1687,48 +1873,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. - * - * OK, so we don't know how big the cache is. So guess. - */ - batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; - - /* - * Clamp the batch to a 2^n - 1 value. Having a power - * of 2 value was found to be more likely to have - * suboptimal cache aliasing properties in some cases. - * - * For example if 2 tasks are alternately allocating - * batches of pages, one task can end up with a lot - * of pages of one half of the possible page colors - * and the other with pages of the other colors. - */ - batch = (1 << fls(batch + batch/2)) - 1; + batch = zone_batchsize(zone); for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[cpu].pcp[0]; /* hot */ - pcp->count = 0; - pcp->low = 2 * batch; - pcp->high = 6 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - - pcp = &zone->pageset[cpu].pcp[1]; /* cold */ - pcp->count = 0; - pcp->low = 0; - pcp->high = 2 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); @@ -1929,7 +2083,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) struct per_cpu_pageset *pageset; int j; - pageset = &zone->pageset[i]; + pageset = zone_pcp(zone, i); for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { if (pageset->pcp[j].count) break;