mirror of
https://github.com/torvalds/linux
synced 2024-11-05 18:23:50 +00:00
3bf5ee9564
ia64 and ppc64 had hugetlb_free_pgtables functions which were no longer being called, and it wasn't obvious what to do about them. The ppc64 case turns out to be easy: the associated tables are noted elsewhere and freed later, safe to either skip its hugetlb areas or go through the motions of freeing nothing. Since ia64 does need a special case, restore to ppc64 the special case of skipping them. The ia64 hugetlb case has been broken since pgd_addr_end went in, though it probably appeared to work okay if you just had one such area; in fact it's been broken much longer if you consider a long munmap spanning from another region into the hugetlb region. In the ia64 hugetlb region, more virtual address bits are available than in the other regions, yet the page tables are structured the same way: the page at the bottom is larger. Here we need to scale down each addr before passing it to the standard free_pgd_range. Was about to write a hugely_scaled_down macro, but found htlbpage_to_page already exists for just this purpose. Fixed off-by-one in ia64 is_hugepage_only_range. Uninline free_pgd_range to make it available to ia64. Make sure the vma-gathering loop in free_pgtables cannot join a hugepage_only_range to any other (safe to join huges? probably but don't bother). Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
341 lines
8 KiB
C
341 lines
8 KiB
C
/*
 * IA-64 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002-2004 Rohit Seth <rohit.seth@intel.com>
 * Copyright (C) 2003-2004 Ken Chen <kenneth.w.chen@intel.com>
 *
 * Sep, 2003: add numa support
 * Feb, 2004: dynamic hugetlb page size via boot parameter
 */
|
|
|
|
#include <linux/config.h>
|
|
#include <linux/init.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/smp_lock.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/sysctl.h>
|
|
#include <asm/mman.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/tlb.h>
|
|
#include <asm/tlbflush.h>
|
|
|
|
/*
 * Huge page shift currently in effect.  Starts out as HPAGE_SHIFT_DEFAULT
 * and is overwritten by the "hugepagesz=" boot parameter handler in this
 * file with the bit index of the (power-of-two) size given on the command
 * line.
 */
unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;
|
|
|
|
static pte_t *
|
|
huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
|
|
{
|
|
unsigned long taddr = htlbpage_to_page(addr);
|
|
pgd_t *pgd;
|
|
pud_t *pud;
|
|
pmd_t *pmd;
|
|
pte_t *pte = NULL;
|
|
|
|
pgd = pgd_offset(mm, taddr);
|
|
pud = pud_alloc(mm, pgd, taddr);
|
|
if (pud) {
|
|
pmd = pmd_alloc(mm, pud, taddr);
|
|
if (pmd)
|
|
pte = pte_alloc_map(mm, pmd, taddr);
|
|
}
|
|
return pte;
|
|
}
|
|
|
|
static pte_t *
|
|
huge_pte_offset (struct mm_struct *mm, unsigned long addr)
|
|
{
|
|
unsigned long taddr = htlbpage_to_page(addr);
|
|
pgd_t *pgd;
|
|
pud_t *pud;
|
|
pmd_t *pmd;
|
|
pte_t *pte = NULL;
|
|
|
|
pgd = pgd_offset(mm, taddr);
|
|
if (pgd_present(*pgd)) {
|
|
pud = pud_offset(pgd, taddr);
|
|
if (pud_present(*pud)) {
|
|
pmd = pmd_offset(pud, taddr);
|
|
if (pmd_present(*pmd))
|
|
pte = pte_offset_map(pmd, taddr);
|
|
}
|
|
}
|
|
|
|
return pte;
|
|
}
|
|
|
|
/*
 * Set the _PAGE_P bit in @entry, which must be a pte_t lvalue.
 *
 * The body is wrapped in do { } while (0) so that "mk_pte_huge(x);" is
 * exactly one statement and remains valid as the unbraced body of an
 * if/else.
 */
#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_P; } while (0)
|
|
|
|
static void
|
|
set_huge_pte (struct mm_struct *mm, struct vm_area_struct *vma,
|
|
struct page *page, pte_t * page_table, int write_access)
|
|
{
|
|
pte_t entry;
|
|
|
|
add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
|
|
if (write_access) {
|
|
entry =
|
|
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
|
|
} else
|
|
entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
|
|
entry = pte_mkyoung(entry);
|
|
mk_pte_huge(entry);
|
|
set_pte(page_table, entry);
|
|
return;
|
|
}
|
|
/*
|
|
* This function checks for proper alignment of input addr and len parameters.
|
|
*/
|
|
int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
|
|
{
|
|
if (len & ~HPAGE_MASK)
|
|
return -EINVAL;
|
|
if (addr & ~HPAGE_MASK)
|
|
return -EINVAL;
|
|
if (REGION_NUMBER(addr) != REGION_HPAGE)
|
|
return -EINVAL;
|
|
|
|
return 0;
|
|
}
|
|
|
|
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
|
|
struct vm_area_struct *vma)
|
|
{
|
|
pte_t *src_pte, *dst_pte, entry;
|
|
struct page *ptepage;
|
|
unsigned long addr = vma->vm_start;
|
|
unsigned long end = vma->vm_end;
|
|
|
|
while (addr < end) {
|
|
dst_pte = huge_pte_alloc(dst, addr);
|
|
if (!dst_pte)
|
|
goto nomem;
|
|
src_pte = huge_pte_offset(src, addr);
|
|
entry = *src_pte;
|
|
ptepage = pte_page(entry);
|
|
get_page(ptepage);
|
|
set_pte(dst_pte, entry);
|
|
add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
|
|
addr += HPAGE_SIZE;
|
|
}
|
|
return 0;
|
|
nomem:
|
|
return -ENOMEM;
|
|
}
|
|
|
|
int
|
|
follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
struct page **pages, struct vm_area_struct **vmas,
|
|
unsigned long *st, int *length, int i)
|
|
{
|
|
pte_t *ptep, pte;
|
|
unsigned long start = *st;
|
|
unsigned long pstart;
|
|
int len = *length;
|
|
struct page *page;
|
|
|
|
do {
|
|
pstart = start & HPAGE_MASK;
|
|
ptep = huge_pte_offset(mm, start);
|
|
pte = *ptep;
|
|
|
|
back1:
|
|
page = pte_page(pte);
|
|
if (pages) {
|
|
page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
|
|
get_page(page);
|
|
pages[i] = page;
|
|
}
|
|
if (vmas)
|
|
vmas[i] = vma;
|
|
i++;
|
|
len--;
|
|
start += PAGE_SIZE;
|
|
if (((start & HPAGE_MASK) == pstart) && len &&
|
|
(start < vma->vm_end))
|
|
goto back1;
|
|
} while (len && start < vma->vm_end);
|
|
*length = len;
|
|
*st = start;
|
|
return i;
|
|
}
|
|
|
|
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
|
|
{
|
|
struct page *page;
|
|
pte_t *ptep;
|
|
|
|
if (REGION_NUMBER(addr) != REGION_HPAGE)
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
ptep = huge_pte_offset(mm, addr);
|
|
if (!ptep || pte_none(*ptep))
|
|
return NULL;
|
|
page = pte_page(*ptep);
|
|
page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
|
|
return page;
|
|
}
|
|
/*
 * Report whether @pmd maps a huge page.  This implementation always
 * answers 0, regardless of @pmd.
 */
int pmd_huge(pmd_t pmd)
{
	return 0;
}
|
|
struct page *
|
|
follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
void hugetlb_free_pgd_range(struct mmu_gather **tlb,
|
|
unsigned long addr, unsigned long end,
|
|
unsigned long floor, unsigned long ceiling)
|
|
{
|
|
/*
|
|
* This is called only when is_hugepage_only_range(addr,),
|
|
* and it follows that is_hugepage_only_range(end,) also.
|
|
*
|
|
* The offset of these addresses from the base of the hugetlb
|
|
* region must be scaled down by HPAGE_SIZE/PAGE_SIZE so that
|
|
* the standard free_pgd_range will free the right page tables.
|
|
*
|
|
* If floor and ceiling are also in the hugetlb region, they
|
|
* must likewise be scaled down; but if outside, left unchanged.
|
|
*/
|
|
|
|
addr = htlbpage_to_page(addr);
|
|
end = htlbpage_to_page(end);
|
|
if (is_hugepage_only_range(tlb->mm, floor, HPAGE_SIZE))
|
|
floor = htlbpage_to_page(floor);
|
|
if (is_hugepage_only_range(tlb->mm, ceiling, HPAGE_SIZE))
|
|
ceiling = htlbpage_to_page(ceiling);
|
|
|
|
free_pgd_range(tlb, addr, end, floor, ceiling);
|
|
}
|
|
|
|
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
|
|
{
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
unsigned long address;
|
|
pte_t *pte;
|
|
struct page *page;
|
|
|
|
BUG_ON(start & (HPAGE_SIZE - 1));
|
|
BUG_ON(end & (HPAGE_SIZE - 1));
|
|
|
|
for (address = start; address < end; address += HPAGE_SIZE) {
|
|
pte = huge_pte_offset(mm, address);
|
|
if (pte_none(*pte))
|
|
continue;
|
|
page = pte_page(*pte);
|
|
put_page(page);
|
|
pte_clear(mm, address, pte);
|
|
}
|
|
add_mm_counter(mm, rss, - ((end - start) >> PAGE_SHIFT));
|
|
flush_tlb_range(vma, start, end);
|
|
}
|
|
|
|
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
|
|
{
|
|
struct mm_struct *mm = current->mm;
|
|
unsigned long addr;
|
|
int ret = 0;
|
|
|
|
BUG_ON(vma->vm_start & ~HPAGE_MASK);
|
|
BUG_ON(vma->vm_end & ~HPAGE_MASK);
|
|
|
|
spin_lock(&mm->page_table_lock);
|
|
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
|
|
unsigned long idx;
|
|
pte_t *pte = huge_pte_alloc(mm, addr);
|
|
struct page *page;
|
|
|
|
if (!pte) {
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
if (!pte_none(*pte))
|
|
continue;
|
|
|
|
idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
|
|
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
|
|
page = find_get_page(mapping, idx);
|
|
if (!page) {
|
|
/* charge the fs quota first */
|
|
if (hugetlb_get_quota(mapping)) {
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
page = alloc_huge_page();
|
|
if (!page) {
|
|
hugetlb_put_quota(mapping);
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
|
|
if (! ret) {
|
|
unlock_page(page);
|
|
} else {
|
|
hugetlb_put_quota(mapping);
|
|
page_cache_release(page);
|
|
goto out;
|
|
}
|
|
}
|
|
set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
|
|
}
|
|
out:
|
|
spin_unlock(&mm->page_table_lock);
|
|
return ret;
|
|
}
|
|
|
|
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
|
|
unsigned long pgoff, unsigned long flags)
|
|
{
|
|
struct vm_area_struct *vmm;
|
|
|
|
if (len > RGN_MAP_LIMIT)
|
|
return -ENOMEM;
|
|
if (len & ~HPAGE_MASK)
|
|
return -EINVAL;
|
|
/* This code assumes that REGION_HPAGE != 0. */
|
|
if ((REGION_NUMBER(addr) != REGION_HPAGE) || (addr & (HPAGE_SIZE - 1)))
|
|
addr = HPAGE_REGION_BASE;
|
|
else
|
|
addr = ALIGN(addr, HPAGE_SIZE);
|
|
for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
|
|
/* At this point: (!vmm || addr < vmm->vm_end). */
|
|
if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT)
|
|
return -ENOMEM;
|
|
if (!vmm || (addr + len) <= vmm->vm_start)
|
|
return addr;
|
|
addr = ALIGN(vmm->vm_end, HPAGE_SIZE);
|
|
}
|
|
}
|
|
|
|
static int __init hugetlb_setup_sz(char *str)
|
|
{
|
|
u64 tr_pages;
|
|
unsigned long long size;
|
|
|
|
if (ia64_pal_vm_page_size(&tr_pages, NULL) != 0)
|
|
/*
|
|
* shouldn't happen, but just in case.
|
|
*/
|
|
tr_pages = 0x15557000UL;
|
|
|
|
size = memparse(str, &str);
|
|
if (*str || (size & (size-1)) || !(tr_pages & size) ||
|
|
size <= PAGE_SIZE ||
|
|
size >= (1UL << PAGE_SHIFT << MAX_ORDER)) {
|
|
printk(KERN_WARNING "Invalid huge page size specified\n");
|
|
return 1;
|
|
}
|
|
|
|
hpage_shift = __ffs(size);
|
|
/*
|
|
* boot cpu already executed ia64_mmu_init, and has HPAGE_SHIFT_DEFAULT
|
|
* override here with new page shift.
|
|
*/
|
|
ia64_set_rr(HPAGE_REGION_BASE, hpage_shift << 2);
|
|
return 1;
|
|
}
|
|
__setup("hugepagesz=", hugetlb_setup_sz);
|