linux/mm/nobootmem.c
Tony Luck fc6daaf931 mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute
Some high end Intel Xeon systems report uncorrectable memory errors as a
recoverable machine check.  Linux has included code for some time to
process these and just signal the affected processes (or even recover
completely if the error was in a read only page that can be replaced by
reading from disk).

But we have no recovery path for errors encountered during kernel code
execution.  Except for some very specific cases we are unlikely to ever
be able to recover.

Enter memory mirroring. Actually 3rd generation of memory mirroring.

Gen1: All memory is mirrored
	Pro: No s/w enabling - h/w just gets good data from other side of the
	     mirror
	Con: Halves effective memory capacity available to OS/applications

Gen2: Partial memory mirror - just mirror memory behind some memory controllers
	Pro: Keep more of the capacity
	Con: Nightmare to enable. Have to choose between allocating from
	     mirrored memory for safety vs. NUMA local memory for performance

Gen3: Address range partial memory mirror - some mirror on each memory
      controller
	Pro: Can tune the amount of mirror and keep NUMA performance
	Con: I have to write memory management code to implement

The current plan is just to use mirrored memory for kernel allocations.
This has been broken into two phases:

1) This patch series - find the mirrored memory, use it for boot time
   allocations

2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the
   unused mirrored memory from mm/memblock.c and only give it out to
   select kernel allocations (this is still being scoped because
   page_alloc.c is scary).

This patch (of 3):

Add extra "flags" to memblock to allow selection of memory based on
attribute.  No functional changes

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-06-24 17:49:44 -07:00

441 lines
11 KiB
C

/*
* bootmem - A boot-time physical memory allocator and configurator
*
* Copyright (C) 1999 Ingo Molnar
* 1999 Kanoj Sarcar, SGI
* 2008 Johannes Weiner
*
* Access to this subsystem has to be serialized externally (which is true
* for the boot process anyway).
*/
#include <linux/init.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
#include <linux/memblock.h>
#include <asm/bug.h>
#include <asm/io.h>
#include <asm/processor.h>
#include "internal.h"
/*
 * contig_page_data is only defined and exported from this file when
 * CONFIG_NEED_MULTIPLE_NODES is not set.
 * NOTE(review): presumably the sole node descriptor on such builds - confirm.
 */
#ifndef CONFIG_NEED_MULTIPLE_NODES
struct pglist_data __refdata contig_page_data;
EXPORT_SYMBOL(contig_page_data);
#endif
/*
 * Global PFN bounds.  Within this file only max_low_pfn is read:
 * __free_memory_core() uses it to cap the end of the PFN range it frees.
 * NOTE(review): the values are presumably filled in by architecture setup
 * code outside this file - confirm against callers.
 */
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
/*
 * __alloc_memory_core_early - allocate zeroed boot memory from memblock
 * @nid: node id handed to memblock_find_in_range_node()
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: lower bound handed to the memblock search
 * @limit: upper bound handed to the memblock search; clamped to
 *         memblock.current_limit when it exceeds it
 *
 * Finds a suitable range with MEMBLOCK_NONE flags, reserves it, clears it
 * and registers it with kmemleak.
 *
 * Returns the virtual address of the region, or NULL if no range was found
 * or the reservation failed.
 */
static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
					u64 goal, u64 limit)
{
	u64 phys;
	void *virt;

	/* Never search above what memblock currently permits. */
	if (memblock.current_limit < limit)
		limit = memblock.current_limit;

	phys = memblock_find_in_range_node(size, align, goal, limit, nid,
					   MEMBLOCK_NONE);
	/* Reservation is only attempted when a range was actually found. */
	if (!phys || memblock_reserve(phys, size))
		return NULL;

	virt = phys_to_virt(phys);
	memset(virt, 0, size);

	/*
	 * A min_count of 0 keeps kmemleak from ever reporting bootmem
	 * allocated blocks as leaks.
	 */
	kmemleak_alloc(virt, size, 0, 0);

	return virt;
}
/*
 * free_bootmem_late - free bootmem pages directly to page allocator
 * @addr: starting address of the range
 * @size: size of the range in bytes
 *
 * Meant for the window in which the bootmem allocator is already gone but
 * system initialization is still in progress.  No bootmem metadata is
 * touched; each page frame between PFN_UP(@addr) and PFN_DOWN(@addr + @size)
 * is released individually (order 0) and accounted in totalram_pages.
 */
void __init free_bootmem_late(unsigned long addr, unsigned long size)
{
	unsigned long pfn;
	unsigned long end_pfn;

	kmemleak_free_part(__va(addr), size);

	end_pfn = PFN_DOWN(addr + size);
	for (pfn = PFN_UP(addr); pfn < end_pfn; pfn++) {
		__free_pages_bootmem(pfn_to_page(pfn), 0);
		totalram_pages++;
	}
}
/*
 * __free_pages_memory - release a PFN range in the largest possible chunks
 * @start: first PFN of the range
 * @end: PFN one past the last page of the range
 *
 * Walks [@start, @end) and passes it to __free_pages_bootmem() piece by
 * piece.  Each piece uses the largest order that is both permitted by the
 * alignment of the current PFN (capped at MAX_ORDER - 1) and does not run
 * past @end.
 */
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
	int order;

	while (start < end) {
		/*
		 * The lowest set bit of the PFN bounds the order.  __ffs() is
		 * undefined for 0, and PFN 0 is aligned to every order, so
		 * take the maximum order directly in that case.
		 */
		if (start)
			order = min(MAX_ORDER - 1UL, __ffs(start));
		else
			order = MAX_ORDER - 1;

		/* Shrink the chunk until it fits inside the range. */
		while (start + (1UL << order) > end)
			order--;

		__free_pages_bootmem(pfn_to_page(start), order);

		start += (1UL << order);
	}
}
/*
 * __free_memory_core - release the whole pages of a physical range
 * @start: physical start address of the range
 * @end: physical end address of the range
 *
 * The range is converted to page frames (start via PFN_UP, end via
 * PFN_DOWN) and the end is capped at max_low_pfn before the frames are
 * handed to __free_pages_memory().
 *
 * Returns the number of page frames in the resulting span, or 0 when the
 * first frame lies beyond the last one.
 */
static unsigned long __init __free_memory_core(phys_addr_t start,
				 phys_addr_t end)
{
	unsigned long first_pfn = PFN_UP(start);
	unsigned long last_pfn = min_t(unsigned long,
				       PFN_DOWN(end), max_low_pfn);

	if (last_pfn < first_pfn)
		return 0;

	__free_pages_memory(first_pfn, last_pfn);

	return last_pfn - first_pfn;
}
/*
 * free_low_memory_core_early - release every free memblock range
 *
 * Calls memblock_clear_hotplug() over the range (0, -1), then walks all free
 * memory ranges (any node, MEMBLOCK_NONE flags) and passes each one to
 * __free_memory_core().  With CONFIG_ARCH_DISCARD_MEMBLOCK the memblock
 * region arrays themselves are released too, if they were allocated.
 *
 * Returns the accumulated page count reported by __free_memory_core().
 */
static unsigned long __init free_low_memory_core_early(void)
{
	unsigned long freed = 0;
	phys_addr_t start, end;
	u64 idx;

	memblock_clear_hotplug(0, -1);

	for_each_free_mem_range(idx, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
				NULL) {
		freed += __free_memory_core(start, end);
	}

#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
	{
		phys_addr_t len;

		/* The memblock.reserved array, when it was allocated. */
		len = get_allocated_memblock_reserved_regions_info(&start);
		if (len)
			freed += __free_memory_core(start, start + len);

		/* The memblock.memory array, when it was allocated. */
		len = get_allocated_memblock_memory_regions_info(&start);
		if (len)
			freed += __free_memory_core(start, start + len);
	}
#endif

	return freed;
}
/*
 * Set to 1 by reset_all_zones_managed_pages() after its first full pass so
 * that any later call returns without resetting the counters again.
 */
static int reset_managed_pages_done __initdata;
/*
 * reset_node_managed_pages - zero the managed_pages counter of each zone
 * @pgdat: node whose MAX_NR_ZONES zones are reset
 */
void reset_node_managed_pages(pg_data_t *pgdat)
{
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		pgdat->node_zones[i].managed_pages = 0;
}
/*
 * reset_all_zones_managed_pages - reset managed_pages on every online node
 *
 * The work is done on the first call only; reset_managed_pages_done records
 * that it happened and turns subsequent calls into no-ops.
 */
void __init reset_all_zones_managed_pages(void)
{
	struct pglist_data *node;

	if (!reset_managed_pages_done) {
		for_each_online_pgdat(node)
			reset_node_managed_pages(node);

		reset_managed_pages_done = 1;
	}
}
/**
 * free_all_bootmem - release free pages to the buddy allocator
 *
 * Resets the per-zone managed page counters (once), frees all free memblock
 * ranges and adds the freed page count to totalram_pages.
 *
 * Returns the number of pages actually released.
 */
unsigned long __init free_all_bootmem(void)
{
	unsigned long released;

	reset_all_zones_managed_pages();

	/*
	 * The walk in free_low_memory_core_early() uses NUMA_NO_NODE rather
	 * than NODE_DATA(0)->node_id: when e.g. Node0 has no RAM installed,
	 * low RAM lives on Node1.
	 */
	released = free_low_memory_core_early();
	totalram_pages += released;

	return released;
}
/**
 * free_bootmem_node - mark a page range as usable
 * @pgdat: node the range resides on (not used by this implementation)
 * @physaddr: starting address of the range
 * @size: size of the range in bytes
 *
 * Partial pages will be considered reserved and left as they are.
 *
 * The range must reside completely on the specified node.
 *
 * This implementation simply passes @physaddr and @size to memblock_free().
 */
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
memblock_free(physaddr, size);
}
/**
 * free_bootmem - mark a page range as usable
 * @addr: starting address of the range
 * @size: size of the range in bytes
 *
 * Partial pages will be considered reserved and left as they are.
 *
 * The range must be contiguous but may span node boundaries.
 *
 * This implementation simply passes @addr and @size to memblock_free().
 */
void __init free_bootmem(unsigned long addr, unsigned long size)
{
memblock_free(addr, size);
}
/*
 * ___alloc_bootmem_nopanic - core of the node-agnostic boot allocators
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 * @limit: upper bound passed on to __alloc_memory_core_early()
 *
 * If slab is already available this warns once and allocates with
 * kzalloc(GFP_NOWAIT) instead.  Otherwise the request is tried on any node
 * with @goal, and retried once with a goal of 0 if that fails.
 *
 * Returns NULL on failure.
 */
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
					unsigned long align,
					unsigned long goal,
					unsigned long limit)
{
	void *mem;

	if (WARN_ON_ONCE(slab_is_available()))
		return kzalloc(size, GFP_NOWAIT);

	for (;;) {
		mem = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
						goal, limit);
		/* Done on success, or once the goal-less retry failed too. */
		if (mem || !goal)
			return mem;

		goal = 0;
	}
}
/**
 * __alloc_bootmem_nopanic - allocate boot memory without panicking
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * When @goal cannot be honoured it is dropped and the request is retried
 * with a goal of 0, i.e. memory below @goal becomes eligible.
 *
 * Allocation may happen on any node in the system.
 *
 * Returns NULL on failure.
 */
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
					unsigned long goal)
{
	/* -1UL as the limit: the largest value an unsigned long can hold. */
	return ___alloc_bootmem_nopanic(size, align, goal, -1UL);
}
/*
 * ___alloc_bootmem - panicking wrapper around ___alloc_bootmem_nopanic()
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 * @limit: upper bound passed on to the allocator
 *
 * Logs an alert and panics when the request cannot be satisfied.
 */
static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
					unsigned long goal, unsigned long limit)
{
	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);

	if (!mem) {
		/* Whoops, we cannot satisfy the allocation request. */
		printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
		panic("Out of memory");
	}

	return mem;
}
/**
 * __alloc_bootmem - allocate boot memory
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * When @goal cannot be honoured it is dropped and the request is retried
 * with a goal of 0, i.e. memory below @goal becomes eligible.
 *
 * Allocation may happen on any node in the system.
 *
 * The function panics if the request can not be satisfied.
 */
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
			      unsigned long goal)
{
	/* -1UL as the limit: the largest value an unsigned long can hold. */
	return ___alloc_bootmem(size, align, goal, -1UL);
}
/*
 * ___alloc_bootmem_node_nopanic - node-preferring boot allocation
 * @pgdat: node to try first
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 * @limit: upper bound passed on to __alloc_memory_core_early()
 *
 * Each round tries @pgdat's node first and then any node.  If both attempts
 * fail and @goal is non-zero, one more round is made with a goal of 0.
 *
 * Returns NULL on failure.
 */
void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
						   unsigned long size,
						   unsigned long align,
						   unsigned long goal,
						   unsigned long limit)
{
	void *mem;

	for (;;) {
		mem = __alloc_memory_core_early(pgdat->node_id, size, align,
						goal, limit);
		if (mem)
			return mem;

		mem = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
						goal, limit);
		/* Done on success, or once the goal-less round failed too. */
		if (mem || !goal)
			return mem;

		goal = 0;
	}
}
/*
 * __alloc_bootmem_node_nopanic - allocate boot memory from a specific node
 * without panicking
 * @pgdat: node to allocate from
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * Forwards to ___alloc_bootmem_node_nopanic() with a limit argument of 0;
 * the return value is whatever that helper returns (NULL on failure).
 */
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
/* Called after slab is up: warn once and use the slab allocator instead. */
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
/* NOTE(review): presumably a limit of 0 means "no explicit limit" - confirm. */
return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
}
/*
 * ___alloc_bootmem_node - panicking wrapper around
 * ___alloc_bootmem_node_nopanic()
 * @pgdat: node to try first
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 * @limit: upper bound passed on to the allocator
 *
 * Logs an alert and panics when the request cannot be satisfied.
 */
static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
				    unsigned long align, unsigned long goal,
				    unsigned long limit)
{
	void *mem = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
						  limit);

	if (!mem) {
		printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
		panic("Out of memory");
	}

	return mem;
}
/**
 * __alloc_bootmem_node - allocate boot memory from a specific node
 * @pgdat: node to allocate from
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * The goal is dropped if it can not be satisfied and the allocation will
 * fall back to memory below @goal.
 *
 * Allocation may fall back to any node in the system if the specified node
 * can not hold the requested memory.
 *
 * The function panics if the request can not be satisfied.
 */
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
/* Called after slab is up: warn once and use the slab allocator instead. */
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
/* NOTE(review): presumably a limit of 0 means "no explicit limit" - confirm. */
return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
}
/*
 * __alloc_bootmem_node_high - allocate boot memory from a specific node
 * @pgdat: node to allocate from
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * In this implementation the call is forwarded unchanged to
 * __alloc_bootmem_node(), so it behaves exactly like that function.
 */
void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
return __alloc_bootmem_node(pgdat, size, align, goal);
}
/*
 * Limit passed by the __alloc_bootmem_low*() helpers below.  This default
 * (0xffffffff) only applies when ARCH_LOW_ADDRESS_LIMIT has not already been
 * defined before this point.
 */
#ifndef ARCH_LOW_ADDRESS_LIMIT
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
#endif
/**
 * __alloc_bootmem_low - allocate low boot memory
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * The goal is dropped if it can not be satisfied and the allocation will
 * fall back to memory below @goal.
 *
 * Allocation may happen on any node in the system.
 *
 * The function panics if the request can not be satisfied.
 *
 * ARCH_LOW_ADDRESS_LIMIT is passed as the limit of the allocation.
 */
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
unsigned long goal)
{
return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
}
/*
 * __alloc_bootmem_low_nopanic - allocate low boot memory without panicking
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * Forwards to ___alloc_bootmem_nopanic() with ARCH_LOW_ADDRESS_LIMIT as the
 * limit; returns NULL on failure instead of panicking.
 */
void * __init __alloc_bootmem_low_nopanic(unsigned long size,
unsigned long align,
unsigned long goal)
{
return ___alloc_bootmem_nopanic(size, align, goal,
ARCH_LOW_ADDRESS_LIMIT);
}
/**
 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
 * @pgdat: node to allocate from
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * The goal is dropped if it can not be satisfied and the allocation will
 * fall back to memory below @goal.
 *
 * Allocation may fall back to any node in the system if the specified node
 * can not hold the requested memory.
 *
 * The function panics if the request can not be satisfied.
 *
 * ARCH_LOW_ADDRESS_LIMIT is passed as the limit of the allocation.
 */
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
/* Called after slab is up: warn once and use the slab allocator instead. */
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
return ___alloc_bootmem_node(pgdat, size, align, goal,
ARCH_LOW_ADDRESS_LIMIT);
}