*/
 void __init setup_per_cpu_areas(void)
 {
-       ssize_t size, old_size;
+       ssize_t size, old_size, da_size;
        char *ptr;
        int cpu;
+       unsigned long align = 1;
 
        /* Setup cpu_pda map */
        setup_cpu_pda_map();
 
        /* Copy section for each CPU (we discard the original) */
        old_size = PERCPU_ENOUGH_ROOM;
-       size = old_size + per_cpu_dyn_array_size();
+       da_size = per_cpu_dyn_array_size(&align);
+       align = max_t(unsigned long, PAGE_SIZE, align);
+       size = roundup(old_size + da_size, align);
        printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
                          size);
 
        for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-               ptr = alloc_bootmem_pages(size);
+               ptr = __alloc_bootmem(size, align,
+                                __pa(MAX_DMA_ADDRESS));
 #else
                int node = early_cpu_to_node(cpu);
                if (!node_online(node) || !NODE_DATA(node)) {
-                       ptr = alloc_bootmem_pages(size);
+                       ptr = __alloc_bootmem(size, align,
+                                        __pa(MAX_DMA_ADDRESS));
                        printk(KERN_INFO
                               "cpu %d has no node %d or node-local memory\n",
                                cpu, node);
                                         cpu, __pa(ptr));
                }
                else {
-                       ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+                       ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
+                                                       __pa(MAX_DMA_ADDRESS));
                        if (ptr)
                                printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
                                         cpu, node, __pa(ptr));
 
        unsigned long size, i, old_size;
        char *ptr;
        unsigned long nr_possible_cpus = num_possible_cpus();
+       unsigned long align = 1;
+       unsigned da_size;
 
        /* Copy section for each CPU (we discard the original) */
        old_size = PERCPU_ENOUGH_ROOM;
-       size = ALIGN(old_size + per_cpu_dyn_array_size(), PAGE_SIZE);
+       da_size = per_cpu_dyn_array_size(&align);
+       align = max_t(unsigned long, PAGE_SIZE, align);
+       size = ALIGN(old_size + da_size, align);
        ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
        for_each_possible_cpu(i) {
+/*
+ * Reserve one contiguous bootmem region for all entries listed between
+ * __dyn_array_start and __dyn_array_end, point each entry's *da->name at
+ * its slice of that region, and call the entry's init_work hook when set.
+ * Does nothing when CONFIG_HAVE_DYN_ARRAY is not defined or when the
+ * entries need no space; panics if the allocation fails.
+ */
 void pre_alloc_dyn_array(void)
 {
 #ifdef CONFIG_HAVE_DYN_ARRAY
-       unsigned long size, phys = 0;
+       unsigned long total_size = 0, size, phys;
+       unsigned long max_align = 1;
        struct dyn_array **daa;
+       char *ptr;
 
+       /*
+        * First pass: work out the total size and the largest alignment.
+        *
+        * The total is computed with the same rule the placement pass
+        * below uses (round the running offset up to the entry's
+        * alignment, then add the entry's size).  Summing
+        * roundup(size, da->align) instead under-counts whenever an entry
+        * needs more leading padding than the previous entries' trailing
+        * padding provides, e.g. a 1-byte/align-1 entry followed by a
+        * 4096-byte/align-4096 entry.
+        *
+        * NOTE(review): this relies on the region start being aligned to
+        * max_align and on every da->align dividing max_align (true for
+        * power-of-two alignments) -- confirm for all users.
+        */
        for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) {
                struct dyn_array *da = *daa;
 
                size = da->size * (*da->nr);
-               print_fn_descriptor_symbol("dyna_array %s ", da->name);
-               printk(KERN_CONT "size:%#lx nr:%d align:%#lx",
+               print_fn_descriptor_symbol("dyn_array %s ", da->name);
+               printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n",
                        da->size, *da->nr, da->align);
-               *da->name = __alloc_bootmem(size, da->align, phys);
-               phys = virt_to_phys(*da->name);
+               total_size = roundup(total_size, da->align) + size;
+               if (da->align > max_align)
+                       max_align = da->align;
+       }
+       if (total_size)
+               printk(KERN_DEBUG "dyn_array total_size: %#lx\n",
+                        total_size);
+       else
+               return;
+
+       /* allocate them all together, aligned to at least one page */
+       max_align = max_t(unsigned long, max_align, PAGE_SIZE);
+       ptr = __alloc_bootmem_nopanic(total_size, max_align, 0);
+       if (!ptr)
+               panic("Can not alloc dyn_alloc\n");
+
+       /* Second pass: carve the region up, one aligned slice per entry */
+       phys = virt_to_phys(ptr);
+       for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) {
+               struct dyn_array *da = *daa;
+
+               size = da->size * (*da->nr);
+               print_fn_descriptor_symbol("dyn_array %s ", da->name);
+
+               /* must stay in step with the sizing rule in the first pass */
+               phys = roundup(phys, da->align);
+               *da->name = phys_to_virt(phys);
                printk(KERN_CONT " ==> [%#lx - %#lx]\n", phys, phys + size);
 
+               phys += size;
+
                if (da->init_work)
                        da->init_work(da);
        }
 #endif
 }
 
-unsigned long per_cpu_dyn_array_size(void)
+/*
+ * Return the space accounted for the entries listed between
+ * __per_cpu_dyn_array_start and __per_cpu_dyn_array_end: the sum of each
+ * entry's size (da->size * *da->nr) rounded up to its own da->align.
+ *
+ * @align: when the returned total is non-zero, *align is set to the
+ *         largest da->align seen; otherwise *align is left untouched, so
+ *         callers must initialise it.
+ *
+ * Returns 0 (and leaves *align alone) without CONFIG_HAVE_DYN_ARRAY.
+ *
+ * NOTE(review): the placement code in this patch positions entries with
+ * phys = roundup(phys, da->align); confirm that the total returned here
+ * is large enough for that layout given the start offset callers use.
+ */
+unsigned long per_cpu_dyn_array_size(unsigned long *align)
 {
        unsigned long total_size = 0;
 #ifdef CONFIG_HAVE_DYN_ARRAY
        unsigned long size;
        struct dyn_array **daa;
+       /* same width as da->align and *align, so no implicit narrowing */
+       unsigned long max_align = 1;
 
        for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
                struct dyn_array *da = *daa;
 
                size = da->size * (*da->nr);
-               print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name);
+               print_fn_descriptor_symbol("per_cpu_dyn_array %s ", da->name);
                printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n",
                        da->size, *da->nr, da->align);
                total_size += roundup(size, da->align);
+               if (da->align > max_align)
+                       max_align = da->align;
        }
-       if (total_size)
-               printk(KERN_DEBUG "per_cpu_dyna_array total_size: %#lx\n",
+       if (total_size) {
+               printk(KERN_DEBUG "per_cpu_dyn_array total_size: %#lx\n",
                         total_size);
+               *align = max_align;
+       }
 #endif
        return total_size;
 }
        void **array;
 
        phys = virt_to_phys(ptr);
-
        for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
                struct dyn_array *da = *daa;
 
                size = da->size * (*da->nr);
-               print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name);
-               printk(KERN_CONT "size:%#lx nr:%d align:%#lx",
-                       da->size, *da->nr, da->align);
+               print_fn_descriptor_symbol("per_cpu_dyn_array %s ", da->name);
 
                phys = roundup(phys, da->align);
                addr = (unsigned long)da->name;
                array = (void **)addr;
                *array = phys_to_virt(phys);
                *da->name = *array; /* so init_work could use it directly */
-               printk(KERN_CONT " %p ==> [%#lx - %#lx]\n", array, phys, phys + size);
+               printk(KERN_CONT " ==> [%#lx - %#lx]\n", phys, phys + size);
+
                phys += size;
 
                if (da->init_work) {