X-Git-Url: http://pilppa.org/gitweb/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=9f86191bb632955a224d94ddf6c53980c64e9244;hb=ba262e4a4d4c23b5e6c15dbb3a99696b562e8035;hp=97d6827c7d669529fb7e5607cad4ba838d1847bf;hpb=63d39fe88ffabbd82d9db42e9b603c58532fc918;p=linux-2.6-omap-h63xx.git diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97d6827c7d6..9f86191bb63 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -37,8 +37,10 @@ #include #include #include +#include #include +#include #include "internal.h" /* @@ -82,8 +84,8 @@ EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; int min_free_kbytes = 1024; -unsigned long __initdata nr_kernel_pages; -unsigned long __initdata nr_all_pages; +unsigned long __meminitdata nr_kernel_pages; +unsigned long __meminitdata nr_all_pages; #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) @@ -232,11 +234,13 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_order(struct page *page) +{ return page_private(page); } -static inline void set_page_order(struct page *page, int order) { +static inline void set_page_order(struct page *page, int order) +{ set_page_private(page, order); __SetPageBuddy(page); } @@ -262,7 +266,7 @@ static inline void rmv_page_order(struct page *page) * satisfies the following equation: * P = B & ~(1 << O) * - * Assumption: *_mem_map is contigious at least up to MAX_ORDER + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ static inline struct page * __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) @@ -283,25 +287,30 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && * (b) the buddy is in the buddy system && - * (c) a page and its buddy have the same order. + * (c) a page and its buddy have the same order && + * (d) a page and its buddy are in the same zone. * * For recording whether a page is in the buddy system, we use PG_buddy. * Setting, clearing, and testing PG_buddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ -static inline int page_is_buddy(struct page *page, int order) +static inline int page_is_buddy(struct page *page, struct page *buddy, + int order) { #ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) + if (!pfn_valid(page_to_pfn(buddy))) return 0; #endif - if (PageBuddy(page) && page_order(page) == order) { - BUG_ON(page_count(page) != 0); - return 1; + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + + if (PageBuddy(buddy) && page_order(buddy) == order) { + BUG_ON(page_count(buddy) != 0); + return 1; } - return 0; + return 0; } /* @@ -349,7 +358,7 @@ static inline void __free_one_page(struct page *page, struct page *buddy; buddy = __page_find_buddy(page, page_idx, order); - if (!page_is_buddy(buddy, order)) + if (!page_is_buddy(page, buddy, order)) break; /* Move the buddy up one level. */ list_del(&buddy->lru); @@ -948,8 +957,7 @@ restart: goto got_pg; do { - if (cpuset_zone_allowed(*z, gfp_mask)) - wakeup_kswapd(*z, order); + wakeup_kswapd(*z, order); } while (*(++z)); /* @@ -967,7 +975,8 @@ restart: alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - alloc_flags |= ALLOC_CPUSET; + if (wait) + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations @@ -1481,7 +1490,7 @@ void show_free_areas(void) } for_each_zone(zone) { - unsigned long nr, flags, order, total = 0; + unsigned long nr[MAX_ORDER], flags, order, total = 0; show_node(zone); printk("%s: ", zone->name); @@ -1492,11 +1501,12 @@ void show_free_areas(void) spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - nr = zone->free_area[order].nr_free; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); + nr[order] = zone->free_area[order].nr_free; + total += nr[order] << order; } spin_unlock_irqrestore(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", nr[order], K(1UL) << order); printk("= %lukB\n", K(total)); } @@ -1508,7 +1518,7 @@ void show_free_areas(void) * * Add all populated zones of a node to the zonelist. */ -static int __init build_zonelists_node(pg_data_t *pgdat, +static int __meminit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int nr_zones, int zone_type) { struct zone *zone; @@ -1544,7 +1554,7 @@ static inline int highest_zone(int zone_bits) #ifdef CONFIG_NUMA #define MAX_NODE_LOAD (num_online_nodes()) -static int __initdata node_load[MAX_NUMNODES]; +static int __meminitdata node_load[MAX_NUMNODES]; /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending @@ -1559,7 +1569,7 @@ static int __initdata node_load[MAX_NUMNODES]; * on them otherwise. * It returns -1 if no node is found. */ -static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; @@ -1605,7 +1615,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask) return best_node; } -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; int prev_node, load; @@ -1657,7 +1667,7 @@ static void __init build_zonelists(pg_data_t *pgdat) #else /* CONFIG_NUMA */ -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; @@ -1695,14 +1705,29 @@ static void __init build_zonelists(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ -void __init build_all_zonelists(void) +/* return values int ....just for stop_machine_run() */ +static int __meminit __build_all_zonelists(void *dummy) { - int i; + int nid; + for_each_online_node(nid) + build_zonelists(NODE_DATA(nid)); + return 0; +} - for_each_online_node(i) - build_zonelists(NODE_DATA(i)); - printk("Built %i zonelists\n", num_online_nodes()); - cpuset_init_current_mems_allowed(); +void __meminit build_all_zonelists(void) +{ + if (system_state == SYSTEM_BOOTING) { + __build_all_zonelists(0); + cpuset_init_current_mems_allowed(); + } else { + /* we have to stop all cpus to guaranntee there is no user + of zonelist */ + stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); + /* cpuset refresh routine should be here */ + } + vm_total_pages = nr_free_pagecache_pages(); + printk("Built %i zonelists. Total pages: %ld\n", + num_online_nodes(), vm_total_pages); } /* @@ -1718,7 +1743,8 @@ void __init build_all_zonelists(void) */ #define PAGES_PER_WAITQUEUE 256 -static inline unsigned long wait_table_size(unsigned long pages) +#ifndef CONFIG_MEMORY_HOTPLUG +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) { unsigned long size = 1; @@ -1736,6 +1762,29 @@ static inline unsigned long wait_table_size(unsigned long pages) return max(size, 4UL); } +#else +/* + * A zone's size might be changed by hot-add, so it is not possible to determine + * a suitable size for its wait_table. So we use the maximum size now. + * + * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: + * + * i386 (preemption config) : 4096 x 16 = 64Kbyte. + * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. + * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. + * + * The maximum entries are prepared when a zone's memory is (512K + 256) pages + * or more by the traditional way. (See above). It equals: + * + * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. + * ia64(16K page size) : = ( 8G + 4M)byte. + * powerpc (64K page size) : = (32G +16M)byte. + */ +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) +{ + return 4096UL; +} +#endif /* * This is an integer logarithm so that shifts can be used later @@ -1960,7 +2009,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, +static int pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -2001,23 +2050,46 @@ void __init setup_per_cpu_pageset(void) #endif static __meminit -void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; struct pglist_data *pgdat = zone->zone_pgdat; + size_t alloc_size; /* * The per-page waitqueue mechanism uses hashed waitqueues * per zone. */ - zone->wait_table_size = wait_table_size(zone_size_pages); - zone->wait_table_bits = wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); + zone->wait_table_hash_nr_entries = + wait_table_hash_nr_entries(zone_size_pages); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_hash_nr_entries); + alloc_size = zone->wait_table_hash_nr_entries + * sizeof(wait_queue_head_t); + + if (system_state == SYSTEM_BOOTING) { + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, alloc_size); + } else { + /* + * This case means that a zone whose size was 0 gets new memory + * via memory hot-add. + * But it may be the case that a new node was hot-added. In + * this case vmalloc() will not be able to use this new node's + * memory - this wait_table must be initialized to use this new + * node itself as well. + * To use this new node's memory, further consideration will be + * necessary. + */ + zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); + } + if (!zone->wait_table) + return -ENOMEM; - for(i = 0; i < zone->wait_table_size; ++i) + for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); + + return 0; } static __meminit void zone_pcp_init(struct zone *zone) @@ -2039,12 +2111,15 @@ static __meminit void zone_pcp_init(struct zone *zone) zone->name, zone->present_pages, batch); } -static __meminit void init_currently_empty_zone(struct zone *zone, - unsigned long zone_start_pfn, unsigned long size) +__meminit int init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - - zone_wait_table_init(zone, size); + int ret; + ret = zone_wait_table_init(zone, size); + if (ret) + return ret; pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -2052,6 +2127,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone, memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); zone_init_free_lists(pgdat, zone, zone->spanned_pages); + + return 0; } /* @@ -2060,12 +2137,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone, * - mark all memory queues empty * - clear the memory bitmaps */ -static void __init free_area_init_core(struct pglist_data *pgdat, +static void __meminit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long j; int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + int ret; pgdat_resize_init(pgdat); pgdat->nr_zones = 0; @@ -2107,7 +2185,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat, continue; zonetable_add(zone, nid, j, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); + BUG_ON(ret); zone_start_pfn += size; } } @@ -2121,14 +2200,22 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #ifdef CONFIG_FLAT_NODE_MEM_MAP /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { - unsigned long size; + unsigned long size, start, end; struct page *map; - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + /* + * The zone's endpoints aren't required to be MAX_ORDER + * aligned but the node_mem_map endpoints must be in order + * for the buddy allocator to function correctly. + */ + start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + end = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end = ALIGN(end, MAX_ORDER_NR_PAGES); + size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) map = alloc_bootmem_node(pgdat, size); - pgdat->node_mem_map = map; + pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifdef CONFIG_FLATMEM /* @@ -2140,7 +2227,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } -void __init free_area_init_node(int nid, struct pglist_data *pgdat, +void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { @@ -2564,9 +2651,11 @@ void setup_per_zone_pages_min(void) } for_each_zone(zone) { - unsigned long tmp; + u64 tmp; + spin_lock_irqsave(&zone->lru_lock, flags); - tmp = (pages_min * zone->present_pages) / lowmem_pages; + tmp = (u64)pages_min * zone->present_pages; + do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -2593,8 +2682,8 @@ void setup_per_zone_pages_min(void) zone->pages_min = tmp; } - zone->pages_low = zone->pages_min + tmp / 4; - zone->pages_high = zone->pages_min + tmp / 2; + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -2790,42 +2879,14 @@ void *__init alloc_large_system_hash(const char *tablename, } #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE -/* - * pfn <-> page translation. out-of-line version. - * (see asm-generic/memory_model.h) - */ -#if defined(CONFIG_FLATMEM) struct page *pfn_to_page(unsigned long pfn) { - return mem_map + (pfn - ARCH_PFN_OFFSET); + return __pfn_to_page(pfn); } unsigned long page_to_pfn(struct page *page) { - return (page - mem_map) + ARCH_PFN_OFFSET; -} -#elif defined(CONFIG_DISCONTIGMEM) -struct page *pfn_to_page(unsigned long pfn) -{ - int nid = arch_pfn_to_nid(pfn); - return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); -} -unsigned long page_to_pfn(struct page *page) -{ - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); - return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; -} -#elif defined(CONFIG_SPARSEMEM) -struct page *pfn_to_page(unsigned long pfn) -{ - return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; -} - -unsigned long page_to_pfn(struct page *page) -{ - long section_id = page_to_section(page); - return page - __section_mem_map_addr(__nr_to_section(section_id)); + return __page_to_pfn(page); } -#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ EXPORT_SYMBOL(pfn_to_page); EXPORT_SYMBOL(page_to_pfn); #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */