Creation of Cybook 2416 (actually Gen4) repository

This commit is contained in:
mlt
2009-12-18 17:10:00 +00:00
committed by godzil
commit 76f20f4d40
13791 changed files with 6812321 additions and 0 deletions

165
mm/Kconfig Normal file
View File

@@ -0,0 +1,165 @@
config SELECT_MEMORY_MODEL
def_bool y
depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
choice
prompt "Memory model"
depends on SELECT_MEMORY_MODEL
default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
config FLATMEM_MANUAL
bool "Flat Memory"
depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
help
This option allows you to change some of the ways that
Linux manages its memory internally. Most users will
only have one option here: FLATMEM. This is normal
and a correct option.
Some users of more advanced features like NUMA and
memory hotplug may have different options here.
DISCONTIGMEM is an more mature, better tested system,
but is incompatible with memory hotplug and may suffer
decreased performance over SPARSEMEM. If unsure between
"Sparse Memory" and "Discontiguous Memory", choose
"Discontiguous Memory".
If unsure, choose this option (Flat Memory) over any other.
config DISCONTIGMEM_MANUAL
bool "Discontiguous Memory"
depends on ARCH_DISCONTIGMEM_ENABLE
help
This option provides enhanced support for discontiguous
memory systems, over FLATMEM. These systems have holes
in their physical address spaces, and this option provides
more efficient handling of these holes. However, the vast
majority of hardware has quite flat address spaces, and
can have degraded performance from extra overhead that
this option imposes.
Many NUMA configurations will have this as the only option.
If unsure, choose "Flat Memory" over this option.
config SPARSEMEM_MANUAL
bool "Sparse Memory"
depends on ARCH_SPARSEMEM_ENABLE
help
This will be the only option for some systems, including
memory hotplug systems. This is normal.
For many other systems, this will be an alternative to
"Discontiguous Memory". This option provides some potential
performance benefits, along with decreased code complexity,
but it is newer, and more experimental.
If unsure, choose "Discontiguous Memory" or "Flat Memory"
over this option.
endchoice
config DISCONTIGMEM
def_bool y
depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
config SPARSEMEM
def_bool y
depends on SPARSEMEM_MANUAL
config FLATMEM
def_bool y
depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
config FLAT_NODE_MEM_MAP
def_bool y
depends on !SPARSEMEM
#
# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
# to represent different areas of memory. This variable allows
# those dependencies to exist individually.
#
config NEED_MULTIPLE_NODES
def_bool y
depends on DISCONTIGMEM || NUMA
config HAVE_MEMORY_PRESENT
def_bool y
depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
#
# SPARSEMEM_EXTREME (which is the default) does some bootmem
# allocations when memory_present() is called. If this cannot
# be done on your architecture, select this option. However,
# statically allocating the mem_section[] array can potentially
# consume vast quantities of .bss, so be careful.
#
# This option will also potentially produce smaller runtime code
# with gcc 3.4 and later.
#
config SPARSEMEM_STATIC
def_bool n
#
# Architecture platforms which require a two level mem_section in SPARSEMEM
# must select this option. This is usually for architecture platforms with
# an extremely sparse physical address space.
#
config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC64)
comment "Memory hotplug is currently incompatible with Software Suspend"
depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
config MEMORY_HOTPLUG_SPARSE
def_bool y
depends on SPARSEMEM && MEMORY_HOTPLUG
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
# Default to 4 for wider testing, though 8 might be more appropriate.
# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
#
config SPLIT_PTLOCK_CPUS
int
default "4096" if ARM && !CPU_CACHE_VIPT
default "4096" if PARISC && !PA20
default "4"
#
# support for page migration
#
config MIGRATION
bool "Page migration"
def_bool y
depends on NUMA
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful for
example on NUMA systems to put pages nearer to the processors accessing
the page.
config RESOURCES_64BIT
bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
default 64BIT
help
This option allows memory and IO resources to be 64 bit.
config ZONE_DMA_FLAG
int
default "0" if !ZONE_DMA
default "1"

31
mm/Makefile Normal file
View File

@@ -0,0 +1,31 @@
#
# Makefile for the linux memory manager.
#
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
readahead.o swap.o truncate.o vmscan.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
$(mmu-y)
ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
obj-y += bounce.o
endif
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o

130
mm/allocpercpu.c Normal file
View File

@@ -0,0 +1,130 @@
/*
* linux/mm/allocpercpu.c
*
* Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
*/
#include <linux/mm.h>
#include <linux/module.h>
/**
* percpu_depopulate - depopulate per-cpu data for given cpu
* @__pdata: per-cpu data to depopulate
* @cpu: depopulate per-cpu data for this cpu
*
* Depopulating per-cpu data for a cpu going offline would be a typical
* use case. You need to register a cpu hotplug handler for that purpose.
*/
void percpu_depopulate(void *__pdata, int cpu)
{
struct percpu_data *pdata = __percpu_disguise(__pdata);
kfree(pdata->ptrs[cpu]);
pdata->ptrs[cpu] = NULL;
}
EXPORT_SYMBOL_GPL(percpu_depopulate);
/**
* percpu_depopulate_mask - depopulate per-cpu data for some cpu's
* @__pdata: per-cpu data to depopulate
* @mask: depopulate per-cpu data for cpu's selected through mask bits
*/
void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
{
int cpu;
for_each_cpu_mask(cpu, *mask)
percpu_depopulate(__pdata, cpu);
}
EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
/**
* percpu_populate - populate per-cpu data for given cpu
* @__pdata: per-cpu data to populate further
* @size: size of per-cpu object
* @gfp: may sleep or not etc.
* @cpu: populate per-data for this cpu
*
* Populating per-cpu data for a cpu coming online would be a typical
* use case. You need to register a cpu hotplug handler for that purpose.
* Per-cpu object is populated with zeroed buffer.
*/
void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
{
struct percpu_data *pdata = __percpu_disguise(__pdata);
int node = cpu_to_node(cpu);
BUG_ON(pdata->ptrs[cpu]);
if (node_online(node)) {
/* FIXME: kzalloc_node(size, gfp, node) */
pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
if (pdata->ptrs[cpu])
memset(pdata->ptrs[cpu], 0, size);
} else
pdata->ptrs[cpu] = kzalloc(size, gfp);
return pdata->ptrs[cpu];
}
EXPORT_SYMBOL_GPL(percpu_populate);
/**
* percpu_populate_mask - populate per-cpu data for more cpu's
* @__pdata: per-cpu data to populate further
* @size: size of per-cpu object
* @gfp: may sleep or not etc.
* @mask: populate per-cpu data for cpu's selected through mask bits
*
* Per-cpu objects are populated with zeroed buffers.
*/
int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
cpumask_t *mask)
{
cpumask_t populated = CPU_MASK_NONE;
int cpu;
for_each_cpu_mask(cpu, *mask)
if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
__percpu_depopulate_mask(__pdata, &populated);
return -ENOMEM;
} else
cpu_set(cpu, populated);
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_populate_mask);
/**
* percpu_alloc_mask - initial setup of per-cpu data
* @size: size of per-cpu object
* @gfp: may sleep or not etc.
* @mask: populate per-data for cpu's selected through mask bits
*
* Populating per-cpu data for all online cpu's would be a typical use case,
* which is simplified by the percpu_alloc() wrapper.
* Per-cpu objects are populated with zeroed buffers.
*/
void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
{
void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
void *__pdata = __percpu_disguise(pdata);
if (unlikely(!pdata))
return NULL;
if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
return __pdata;
kfree(pdata);
return NULL;
}
EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
/**
* percpu_free - final cleanup of per-cpu data
* @__pdata: object to clean up
*
* We simply clean up any per-cpu object left. No need for the client to
* track and specify through a bis mask which per-cpu objects are to free.
*/
void percpu_free(void *__pdata)
{
if (unlikely(!__pdata))
return;
__percpu_depopulate_mask(__pdata, &cpu_possible_map);
kfree(__percpu_disguise(__pdata));
}
EXPORT_SYMBOL_GPL(percpu_free);

85
mm/backing-dev.c Normal file
View File

@@ -0,0 +1,85 @@
#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/module.h>
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
{
enum bdi_state bit;
wait_queue_head_t *wqh = &congestion_wqh[rw];
bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
clear_bit(bit, &bdi->state);
smp_mb__after_clear_bit();
if (waitqueue_active(wqh))
wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);
void set_bdi_congested(struct backing_dev_info *bdi, int rw)
{
enum bdi_state bit;
bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
set_bit(bit, &bdi->state);
}
EXPORT_SYMBOL(set_bdi_congested);
/**
* congestion_wait - wait for a backing_dev to become uncongested
* @rw: READ or WRITE
* @timeout: timeout in jiffies
*
* Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
* write congestion. If no backing_devs are congested then just wait for the
* next write to be completed.
*/
long congestion_wait(int rw, long timeout)
{
long ret;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[rw];
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
return ret;
}
EXPORT_SYMBOL(congestion_wait);
long congestion_wait_interruptible(int rw, long timeout)
{
long ret;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[rw];
prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
if (signal_pending(current))
ret = -ERESTARTSYS;
else
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
return ret;
}
EXPORT_SYMBOL(congestion_wait_interruptible);
/**
* congestion_end - wake up sleepers on a congested backing_dev_info
* @rw: READ or WRITE
*/
void congestion_end(int rw)
{
wait_queue_head_t *wqh = &congestion_wqh[rw];
if (waitqueue_active(wqh))
wake_up(wqh);
}
EXPORT_SYMBOL(congestion_end);

489
mm/bootmem.c Normal file
View File

@@ -0,0 +1,489 @@
/*
* linux/mm/bootmem.c
*
* Copyright (C) 1999 Ingo Molnar
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
*
* simple boot-time physical memory area allocator and
* free memory collector. It's used to deal with reserved
* system memory and memory holes as well.
*/
#include <linux/init.h>
#include <linux/pfn.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <asm/bug.h>
#include <asm/io.h>
#include <asm/processor.h>
#include "internal.h"
/*
* Access to this subsystem has to be serialized externally. (this is
* true for the boot process anyway)
*/
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
static LIST_HEAD(bdata_list);
#ifdef CONFIG_CRASH_DUMP
/*
* If we have booted due to a crash, max_pfn will be a very low value. We need
* to know the amount of memory that the previous kernel used.
*/
unsigned long saved_max_pfn;
#endif
/* return the number of _pages_ that will be allocated for the boot bitmap */
unsigned long __init bootmem_bootmap_pages(unsigned long pages)
{
unsigned long mapsize;
mapsize = (pages+7)/8;
mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
mapsize >>= PAGE_SHIFT;
return mapsize;
}
/*
* link bdata in order
*/
static void __init link_bootmem(bootmem_data_t *bdata)
{
bootmem_data_t *ent;
if (list_empty(&bdata_list)) {
list_add(&bdata->list, &bdata_list);
return;
}
/* insert in order */
list_for_each_entry(ent, &bdata_list, list) {
if (bdata->node_boot_start < ent->node_boot_start) {
list_add_tail(&bdata->list, &ent->list);
return;
}
}
list_add_tail(&bdata->list, &bdata_list);
}
/*
* Given an initialised bdata, it returns the size of the boot bitmap
*/
static unsigned long __init get_mapsize(bootmem_data_t *bdata)
{
unsigned long mapsize;
unsigned long start = PFN_DOWN(bdata->node_boot_start);
unsigned long end = bdata->node_low_pfn;
mapsize = ((end - start) + 7) / 8;
return ALIGN(mapsize, sizeof(long));
}
/*
* Called once to set up the allocator itself.
*/
static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
unsigned long mapstart, unsigned long start, unsigned long end)
{
bootmem_data_t *bdata = pgdat->bdata;
unsigned long mapsize;
bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
bdata->node_boot_start = PFN_PHYS(start);
bdata->node_low_pfn = end;
link_bootmem(bdata);
/*
* Initially all pages are reserved - setup_arch() has to
* register free RAM areas explicitly.
*/
mapsize = get_mapsize(bdata);
memset(bdata->node_bootmem_map, 0xff, mapsize);
return mapsize;
}
/*
* Marks a particular physical memory range as unallocatable. Usable RAM
* might be used for boot-time allocations - or it might get added
* to the free page pool later on.
*/
static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
unsigned long size)
{
unsigned long sidx, eidx;
unsigned long i;
/*
* round up, partially reserved pages are considered
* fully reserved.
*/
BUG_ON(!size);
BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
sidx = PFN_DOWN(addr - bdata->node_boot_start);
eidx = PFN_UP(addr + size - bdata->node_boot_start);
for (i = sidx; i < eidx; i++)
if (test_and_set_bit(i, bdata->node_bootmem_map)) {
#ifdef CONFIG_DEBUG_BOOTMEM
printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
#endif
}
}
static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
unsigned long size)
{
unsigned long sidx, eidx;
unsigned long i;
/*
* round down end of usable mem, partially free pages are
* considered reserved.
*/
BUG_ON(!size);
BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
if (addr < bdata->last_success)
bdata->last_success = addr;
/*
* Round up the beginning of the address.
*/
sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
for (i = sidx; i < eidx; i++) {
if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
BUG();
}
}
/*
* We 'merge' subsequent allocations to save space. We might 'lose'
* some fraction of a page if allocations cannot be satisfied due to
* size constraints on boxes where there is physical RAM space
* fragmentation - in these cases (mostly large memory boxes) this
* is not a problem.
*
* On low memory boxes we get it right in 100% of the cases.
*
* alignment has to be a power of 2 value.
*
* NOTE: This function is _not_ reentrant.
*/
void * __init
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
unsigned long align, unsigned long goal, unsigned long limit)
{
unsigned long offset, remaining_size, areasize, preferred;
unsigned long i, start = 0, incr, eidx, end_pfn;
void *ret;
if (!size) {
printk("__alloc_bootmem_core(): zero-sized request\n");
BUG();
}
BUG_ON(align & (align-1));
if (limit && bdata->node_boot_start >= limit)
return NULL;
/* on nodes without memory - bootmem_map is NULL */
if (!bdata->node_bootmem_map)
return NULL;
end_pfn = bdata->node_low_pfn;
limit = PFN_DOWN(limit);
if (limit && end_pfn > limit)
end_pfn = limit;
eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
offset = 0;
if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
offset = align - (bdata->node_boot_start & (align - 1UL));
offset = PFN_DOWN(offset);
/*
* We try to allocate bootmem pages above 'goal'
* first, then we try to allocate lower pages.
*/
if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
preferred = goal - bdata->node_boot_start;
if (bdata->last_success >= preferred)
if (!limit || (limit && limit > bdata->last_success))
preferred = bdata->last_success;
} else
preferred = 0;
preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
incr = align >> PAGE_SHIFT ? : 1;
restart_scan:
for (i = preferred; i < eidx; i += incr) {
unsigned long j;
i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
i = ALIGN(i, incr);
if (i >= eidx)
break;
if (test_bit(i, bdata->node_bootmem_map))
continue;
for (j = i + 1; j < i + areasize; ++j) {
if (j >= eidx)
goto fail_block;
if (test_bit(j, bdata->node_bootmem_map))
goto fail_block;
}
start = i;
goto found;
fail_block:
i = ALIGN(j, incr);
}
if (preferred > offset) {
preferred = offset;
goto restart_scan;
}
return NULL;
found:
bdata->last_success = PFN_PHYS(start);
BUG_ON(start >= eidx);
/*
* Is the next page of the previous allocation-end the start
* of this allocation's buffer? If yes then we can 'merge'
* the previous partial page with this allocation.
*/
if (align < PAGE_SIZE &&
bdata->last_offset && bdata->last_pos+1 == start) {
offset = ALIGN(bdata->last_offset, align);
BUG_ON(offset > PAGE_SIZE);
remaining_size = PAGE_SIZE - offset;
if (size < remaining_size) {
areasize = 0;
/* last_pos unchanged */
bdata->last_offset = offset + size;
ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
offset +
bdata->node_boot_start);
} else {
remaining_size = size - remaining_size;
areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
offset +
bdata->node_boot_start);
bdata->last_pos = start + areasize - 1;
bdata->last_offset = remaining_size;
}
bdata->last_offset &= ~PAGE_MASK;
} else {
bdata->last_pos = start + areasize - 1;
bdata->last_offset = size & ~PAGE_MASK;
ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
}
/*
* Reserve the area now:
*/
for (i = start; i < start + areasize; i++)
if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
BUG();
memset(ret, 0, size);
return ret;
}
static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
{
struct page *page;
unsigned long pfn;
bootmem_data_t *bdata = pgdat->bdata;
unsigned long i, count, total = 0;
unsigned long idx;
unsigned long *map;
int gofast = 0;
BUG_ON(!bdata->node_bootmem_map);
count = 0;
/* first extant page of the node */
pfn = PFN_DOWN(bdata->node_boot_start);
idx = bdata->node_low_pfn - pfn;
map = bdata->node_bootmem_map;
/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
if (bdata->node_boot_start == 0 ||
ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
gofast = 1;
for (i = 0; i < idx; ) {
unsigned long v = ~map[i / BITS_PER_LONG];
if (gofast && v == ~0UL) {
int order;
page = pfn_to_page(pfn);
count += BITS_PER_LONG;
order = ffs(BITS_PER_LONG) - 1;
__free_pages_bootmem(page, order);
i += BITS_PER_LONG;
page += BITS_PER_LONG;
} else if (v) {
unsigned long m;
page = pfn_to_page(pfn);
for (m = 1; m && i < idx; m<<=1, page++, i++) {
if (v & m) {
count++;
__free_pages_bootmem(page, 0);
}
}
} else {
i += BITS_PER_LONG;
}
pfn += BITS_PER_LONG;
}
total += count;
/*
* Now free the allocator bitmap itself, it's not
* needed anymore:
*/
page = virt_to_page(bdata->node_bootmem_map);
count = 0;
idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
for (i = 0; i < idx; i++, page++) {
__free_pages_bootmem(page, 0);
count++;
}
total += count;
bdata->node_bootmem_map = NULL;
return total;
}
unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
unsigned long startpfn, unsigned long endpfn)
{
return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
}
void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
reserve_bootmem_core(pgdat->bdata, physaddr, size);
}
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
free_bootmem_core(pgdat->bdata, physaddr, size);
}
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
return free_all_bootmem_core(pgdat);
}
unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
{
max_low_pfn = pages;
min_low_pfn = start;
return init_bootmem_core(NODE_DATA(0), start, 0, pages);
}
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
void __init reserve_bootmem(unsigned long addr, unsigned long size)
{
reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
}
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
void __init free_bootmem(unsigned long addr, unsigned long size)
{
free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
}
unsigned long __init free_all_bootmem(void)
{
return free_all_bootmem_core(NODE_DATA(0));
}
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
unsigned long goal)
{
bootmem_data_t *bdata;
void *ptr;
list_for_each_entry(bdata, &bdata_list, list) {
ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
if (ptr)
return ptr;
}
return NULL;
}
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
unsigned long goal)
{
void *mem = __alloc_bootmem_nopanic(size,align,goal);
if (mem)
return mem;
/*
* Whoops, we cannot satisfy the allocation request.
*/
printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
void *ptr;
ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
return __alloc_bootmem(size, align, goal);
}
#ifndef ARCH_LOW_ADDRESS_LIMIT
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
#endif
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
unsigned long goal)
{
bootmem_data_t *bdata;
void *ptr;
list_for_each_entry(bdata, &bdata_list, list) {
ptr = __alloc_bootmem_core(bdata, size, align, goal,
ARCH_LOW_ADDRESS_LIMIT);
if (ptr)
return ptr;
}
/*
* Whoops, we cannot satisfy the allocation request.
*/
printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
panic("Out of low memory");
return NULL;
}
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
ARCH_LOW_ADDRESS_LIMIT);
}

302
mm/bounce.c Normal file
View File

@@ -0,0 +1,302 @@
/* bounce buffer handling for block devices
*
* - Split from highmem.c
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/blktrace_api.h>
#include <asm/tlbflush.h>
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
static mempool_t *page_pool, *isa_page_pool;
#ifdef CONFIG_HIGHMEM
static __init int init_emergency_pool(void)
{
struct sysinfo i;
si_meminfo(&i);
si_swapinfo(&i);
if (!i.totalhigh)
return 0;
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
BUG_ON(!page_pool);
printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
return 0;
}
__initcall(init_emergency_pool);
/*
* highmem version, map in to vec
*/
static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
{
unsigned long flags;
unsigned char *vto;
local_irq_save(flags);
vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
memcpy(vto + to->bv_offset, vfrom, to->bv_len);
kunmap_atomic(vto, KM_BOUNCE_READ);
local_irq_restore(flags);
}
#else /* CONFIG_HIGHMEM */
#define bounce_copy_vec(to, vfrom) \
memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
#endif /* CONFIG_HIGHMEM */
/*
* allocate pages in the DMA region for the ISA pool
*/
static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
{
return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
}
/*
* gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
* as the max address, so check if the pool has already been created.
*/
int init_emergency_isa_pool(void)
{
if (isa_page_pool)
return 0;
isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
mempool_free_pages, (void *) 0);
BUG_ON(!isa_page_pool);
printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
return 0;
}
/*
* Simple bounce buffer support for highmem pages. Depending on the
* queue gfp mask set, *to may or may not be a highmem page. kmap it
* always, it will do the Right Thing
*/
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
unsigned char *vfrom;
struct bio_vec *tovec, *fromvec;
int i;
__bio_for_each_segment(tovec, to, i, 0) {
fromvec = from->bi_io_vec + i;
/*
* not bounced
*/
if (tovec->bv_page == fromvec->bv_page)
continue;
/*
* fromvec->bv_offset and fromvec->bv_len might have been
* modified by the block layer, so use the original copy,
* bounce_copy_vec already uses tovec->bv_len
*/
vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
flush_dcache_page(tovec->bv_page);
bounce_copy_vec(tovec, vfrom);
}
}
static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
{
struct bio *bio_orig = bio->bi_private;
struct bio_vec *bvec, *org_vec;
int i;
if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
/*
* free up bounce indirect pages used
*/
__bio_for_each_segment(bvec, bio, i, 0) {
org_vec = bio_orig->bi_io_vec + i;
if (bvec->bv_page == org_vec->bv_page)
continue;
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
mempool_free(bvec->bv_page, pool);
}
bio_endio(bio_orig, bio_orig->bi_size, err);
bio_put(bio);
}
static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
{
if (bio->bi_size)
return 1;
bounce_end_io(bio, page_pool, err);
return 0;
}
static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
{
if (bio->bi_size)
return 1;
bounce_end_io(bio, isa_page_pool, err);
return 0;
}
static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
{
struct bio *bio_orig = bio->bi_private;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
copy_to_high_bio_irq(bio_orig, bio);
bounce_end_io(bio, pool, err);
}
static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
{
if (bio->bi_size)
return 1;
__bounce_end_io_read(bio, page_pool, err);
return 0;
}
static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
{
if (bio->bi_size)
return 1;
__bounce_end_io_read(bio, isa_page_pool, err);
return 0;
}
static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
mempool_t *pool)
{
struct page *page;
struct bio *bio = NULL;
int i, rw = bio_data_dir(*bio_orig);
struct bio_vec *to, *from;
bio_for_each_segment(from, *bio_orig, i) {
page = from->bv_page;
/*
* is destination page below bounce pfn?
*/
if (page_to_pfn(page) <= q->bounce_pfn)
continue;
/*
* irk, bounce it
*/
if (!bio)
bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
to = bio->bi_io_vec + i;
to->bv_page = mempool_alloc(pool, q->bounce_gfp);
to->bv_len = from->bv_len;
to->bv_offset = from->bv_offset;
inc_zone_page_state(to->bv_page, NR_BOUNCE);
if (rw == WRITE) {
char *vto, *vfrom;
flush_dcache_page(from->bv_page);
vto = page_address(to->bv_page) + to->bv_offset;
vfrom = kmap(from->bv_page) + from->bv_offset;
memcpy(vto, vfrom, to->bv_len);
kunmap(from->bv_page);
}
}
/*
* no pages bounced
*/
if (!bio)
return;
blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
/*
* at least one page was bounced, fill in possible non-highmem
* pages
*/
__bio_for_each_segment(from, *bio_orig, i, 0) {
to = bio_iovec_idx(bio, i);
if (!to->bv_page) {
to->bv_page = from->bv_page;
to->bv_len = from->bv_len;
to->bv_offset = from->bv_offset;
}
}
bio->bi_bdev = (*bio_orig)->bi_bdev;
bio->bi_flags |= (1 << BIO_BOUNCED);
bio->bi_sector = (*bio_orig)->bi_sector;
bio->bi_rw = (*bio_orig)->bi_rw;
bio->bi_vcnt = (*bio_orig)->bi_vcnt;
bio->bi_idx = (*bio_orig)->bi_idx;
bio->bi_size = (*bio_orig)->bi_size;
if (pool == page_pool) {
bio->bi_end_io = bounce_end_io_write;
if (rw == READ)
bio->bi_end_io = bounce_end_io_read;
} else {
bio->bi_end_io = bounce_end_io_write_isa;
if (rw == READ)
bio->bi_end_io = bounce_end_io_read_isa;
}
bio->bi_private = *bio_orig;
*bio_orig = bio;
}
void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
{
mempool_t *pool;
/*
* for non-isa bounce case, just check if the bounce pfn is equal
* to or bigger than the highest pfn in the system -- in that case,
* don't waste time iterating over bio segments
*/
if (!(q->bounce_gfp & GFP_DMA)) {
if (q->bounce_pfn >= blk_max_pfn)
return;
pool = page_pool;
} else {
BUG_ON(!isa_page_pool);
pool = isa_page_pool;
}
/*
* slow path
*/
__blk_queue_bounce(q, bio_orig, pool);
}
EXPORT_SYMBOL(blk_queue_bounce);

125
mm/fadvise.c Normal file
View File

@@ -0,0 +1,125 @@
/*
* mm/fadvise.c
*
* Copyright (C) 2002, Linus Torvalds
*
* 11Jan2003 akpm@digeo.com
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <asm/unistd.h>
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
*/
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
struct file *file = fget(fd);
struct address_space *mapping;
struct backing_dev_info *bdi;
loff_t endbyte; /* inclusive */
pgoff_t start_index;
pgoff_t end_index;
unsigned long nrpages;
int ret = 0;
if (!file)
return -EBADF;
if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
mapping = file->f_mapping;
if (!mapping || len < 0) {
ret = -EINVAL;
goto out;
}
if (mapping->a_ops->get_xip_page)
/* no bad return value, but ignore advice */
goto out;
/* Careful about overflows. Len == 0 means "as much as possible" */
endbyte = offset + len;
if (!len || endbyte < len)
endbyte = -1;
else
endbyte--; /* inclusive */
bdi = mapping->backing_dev_info;
switch (advice) {
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
break;
case POSIX_FADV_RANDOM:
file->f_ra.ra_pages = 0;
break;
case POSIX_FADV_SEQUENTIAL:
file->f_ra.ra_pages = bdi->ra_pages * 2;
break;
case POSIX_FADV_WILLNEED:
if (!mapping->a_ops->readpage) {
ret = -EINVAL;
break;
}
/* First and last PARTIAL page! */
start_index = offset >> PAGE_CACHE_SHIFT;
end_index = endbyte >> PAGE_CACHE_SHIFT;
/* Careful about overflow on the "+1" */
nrpages = end_index - start_index + 1;
if (!nrpages)
nrpages = ~0UL;
ret = force_page_cache_readahead(mapping, file,
start_index,
max_sane_readahead(nrpages));
if (ret > 0)
ret = 0;
break;
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
if (!bdi_write_congested(mapping->backing_dev_info))
filemap_flush(mapping);
/* First and last FULL page! */
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
if (end_index >= start_index)
invalidate_mapping_pages(mapping, start_index,
end_index);
break;
default:
ret = -EINVAL;
}
out:
fput(file);
return ret;
}
#ifdef __ARCH_WANT_SYS_FADVISE64
asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
{
return sys_fadvise64_64(fd, offset, len, advice);
}
#endif

2463
mm/filemap.c Normal file

File diff suppressed because it is too large Load Diff

103
mm/filemap.h Normal file
View File

@@ -0,0 +1,103 @@
/*
* linux/mm/filemap.h
*
* Copyright (C) 1994-1999 Linus Torvalds
*/
#ifndef __FILEMAP_H
#define __FILEMAP_H
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
size_t
__filemap_copy_from_user_iovec_inatomic(char *vaddr,
const struct iovec *iov,
size_t base,
size_t bytes);
/*
* Copy as much as we can into the page and return the number of bytes which
* were sucessfully copied. If a fault is encountered then clear the page
* out to (offset+bytes) and return the number of bytes which were copied.
*
* NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
* to *NOT* zero any tail of the buffer that it failed to copy. If it does,
* and if the following non-atomic copy succeeds, then there is a small window
* where the target page contains neither the data before the write, nor the
* data after the write (it contains zero). A read at this time will see
* data that is inconsistent with any ordering of the read and the write.
* (This has been detected in practice).
*/
static inline size_t
filemap_copy_from_user(struct page *page, unsigned long offset,
const char __user *buf, unsigned bytes)
{
char *kaddr;
int left;
kaddr = kmap_atomic(page, KM_USER0);
left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
kunmap_atomic(kaddr, KM_USER0);
if (left != 0) {
/* Do it the slow way */
kaddr = kmap(page);
left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
kunmap(page);
}
return bytes - left;
}
/*
* This has the same sideeffects and return value as filemap_copy_from_user().
* The difference is that on a fault we need to memset the remainder of the
* page (out to offset+bytes), to emulate filemap_copy_from_user()'s
* single-segment behaviour.
*/
static inline size_t
filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
const struct iovec *iov, size_t base, size_t bytes)
{
char *kaddr;
size_t copied;
kaddr = kmap_atomic(page, KM_USER0);
copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
base, bytes);
kunmap_atomic(kaddr, KM_USER0);
if (copied != bytes) {
kaddr = kmap(page);
copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
base, bytes);
if (bytes - copied)
memset(kaddr + offset + copied, 0, bytes - copied);
kunmap(page);
}
return copied;
}
static inline void
filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
const struct iovec *iov = *iovp;
size_t base = *basep;
do {
int copy = min(bytes, iov->iov_len - base);
bytes -= copy;
base += copy;
if (iov->iov_len == base) {
iov++;
base = 0;
}
} while (bytes);
*iovp = iov;
*basep = base;
}
#endif

468
mm/filemap_xip.c Normal file
View File

@@ -0,0 +1,468 @@
/*
* linux/mm/filemap_xip.c
*
* Copyright (C) 2005 IBM Corporation
* Author: Carsten Otte <cotte@de.ibm.com>
*
* derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
*
*/
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/uio.h>
#include <linux/rmap.h>
#include <asm/tlbflush.h>
#include "filemap.h"
/*
* We do use our own empty page to avoid interference with other users
* of ZERO_PAGE(), such as /dev/zero
*/
static struct page *__xip_sparse_page;
static struct page *xip_sparse_page(void)
{
if (!__xip_sparse_page) {
unsigned long zeroes = get_zeroed_page(GFP_HIGHUSER);
if (zeroes) {
static DEFINE_SPINLOCK(xip_alloc_lock);
spin_lock(&xip_alloc_lock);
if (!__xip_sparse_page)
__xip_sparse_page = virt_to_page(zeroes);
else
free_page(zeroes);
spin_unlock(&xip_alloc_lock);
}
}
return __xip_sparse_page;
}
/*
* This is a file read routine for execute in place files, and uses
* the mapping->a_ops->get_xip_page() function for the actual low-level
* stuff.
*
* Note the struct file* is not used at all. It may be NULL.
*/
static void
do_xip_mapping_read(struct address_space *mapping,
struct file_ra_state *_ra,
struct file *filp,
loff_t *ppos,
read_descriptor_t *desc,
read_actor_t actor)
{
struct inode *inode = mapping->host;
unsigned long index, end_index, offset;
loff_t isize;
BUG_ON(!mapping->a_ops->get_xip_page);
index = *ppos >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
isize = i_size_read(inode);
if (!isize)
goto out;
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
unsigned long nr, ret;
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
if (index >= end_index) {
if (index > end_index)
goto out;
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (nr <= offset) {
goto out;
}
}
nr = nr - offset;
page = mapping->a_ops->get_xip_page(mapping,
index*(PAGE_SIZE/512), 0);
if (!page)
goto no_xip_page;
if (unlikely(IS_ERR(page))) {
if (PTR_ERR(page) == -ENODATA) {
/* sparse */
page = ZERO_PAGE(0);
} else {
desc->error = PTR_ERR(page);
goto out;
}
}
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
/*
* Ok, we have the page, so now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
ret = actor(desc, page, offset, nr);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
if (ret == nr && desc->count)
continue;
goto out;
no_xip_page:
/* Did not get the page. Report it */
desc->error = -EIO;
goto out;
}
out:
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
if (filp)
file_accessed(filp);
}
ssize_t
xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
read_descriptor_t desc;
if (!access_ok(VERIFY_WRITE, buf, len))
return -EFAULT;
desc.written = 0;
desc.arg.buf = buf;
desc.count = len;
desc.error = 0;
do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
ppos, &desc, file_read_actor);
if (desc.written)
return desc.written;
else
return desc.error;
}
EXPORT_SYMBOL_GPL(xip_file_read);
ssize_t
xip_file_sendfile(struct file *in_file, loff_t *ppos,
size_t count, read_actor_t actor, void *target)
{
read_descriptor_t desc;
if (!count)
return 0;
desc.written = 0;
desc.count = count;
desc.arg.data = target;
desc.error = 0;
do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
ppos, &desc, actor);
if (desc.written)
return desc.written;
return desc.error;
}
EXPORT_SYMBOL_GPL(xip_file_sendfile);
/*
* __xip_unmap is invoked from xip_unmap and
* xip_write
*
* This function walks all vmas of the address_space and unmaps the
* __xip_sparse_page when found at pgoff.
*/
static void
__xip_unmap (struct address_space * mapping,
unsigned long pgoff)
{
struct vm_area_struct *vma;
struct mm_struct *mm;
struct prio_tree_iter iter;
unsigned long address;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
struct page *page;
page = __xip_sparse_page;
if (!page)
return;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
pte = page_check_address(page, mm, address, &ptl);
if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
page_remove_rmap(page, vma);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
page_cache_release(page);
}
}
spin_unlock(&mapping->i_mmap_lock);
}
/*
* xip_nopage() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
* This function is derived from filemap_nopage, but used for execute in place
*/
static struct page *
xip_file_nopage(struct vm_area_struct * area,
unsigned long address,
int *type)
{
struct file *file = area->vm_file;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct page *page;
unsigned long size, pgoff, endoff;
pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
+ area->vm_pgoff;
endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
+ area->vm_pgoff;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (pgoff >= size)
return NOPAGE_SIGBUS;
page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
if (!IS_ERR(page))
goto out;
if (PTR_ERR(page) != -ENODATA)
return NOPAGE_SIGBUS;
/* sparse block */
if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
(area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
(!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
/* maybe shared writable, allocate new block */
page = mapping->a_ops->get_xip_page (mapping,
pgoff*(PAGE_SIZE/512), 1);
if (IS_ERR(page))
return NOPAGE_SIGBUS;
/* unmap page at pgoff from all other vmas */
__xip_unmap(mapping, pgoff);
} else {
/* not shared and writable, use xip_sparse_page() */
page = xip_sparse_page();
if (!page)
return NOPAGE_OOM;
}
out:
page_cache_get(page);
return page;
}
static struct vm_operations_struct xip_file_vm_ops = {
.nopage = xip_file_nopage,
};
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
{
BUG_ON(!file->f_mapping->a_ops->get_xip_page);
file_accessed(file);
vma->vm_ops = &xip_file_vm_ops;
return 0;
}
EXPORT_SYMBOL_GPL(xip_file_mmap);
static ssize_t
__xip_file_write(struct file *filp, const char __user *buf,
size_t count, loff_t pos, loff_t *ppos)
{
struct address_space * mapping = filp->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
struct inode *inode = mapping->host;
long status = 0;
struct page *page;
size_t bytes;
ssize_t written = 0;
BUG_ON(!mapping->a_ops->get_xip_page);
do {
unsigned long index;
unsigned long offset;
size_t copied;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
index = pos >> PAGE_CACHE_SHIFT;
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count)
bytes = count;
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
fault_in_pages_readable(buf, bytes);
page = a_ops->get_xip_page(mapping,
index*(PAGE_SIZE/512), 0);
if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
/* we allocate a new page unmap it */
page = a_ops->get_xip_page(mapping,
index*(PAGE_SIZE/512), 1);
if (!IS_ERR(page))
/* unmap page at pgoff from all other vmas */
__xip_unmap(mapping, index);
}
if (IS_ERR(page)) {
status = PTR_ERR(page);
break;
}
copied = filemap_copy_from_user(page, offset, buf, bytes);
flush_dcache_page(page);
if (likely(copied > 0)) {
status = copied;
if (status >= 0) {
written += status;
count -= status;
pos += status;
buf += status;
}
}
if (unlikely(copied != bytes))
if (status >= 0)
status = -EFAULT;
if (status < 0)
break;
} while (count);
*ppos = pos;
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold i_mutex.
*/
if (pos > inode->i_size) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
return written ? written : status;
}
ssize_t
xip_file_write(struct file *filp, const char __user *buf, size_t len,
loff_t *ppos)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
size_t count;
loff_t pos;
ssize_t ret;
mutex_lock(&inode->i_mutex);
if (!access_ok(VERIFY_READ, buf, len)) {
ret=-EFAULT;
goto out_up;
}
pos = *ppos;
count = len;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
if (ret)
goto out_backing;
if (count == 0)
goto out_backing;
ret = remove_suid(filp->f_path.dentry);
if (ret)
goto out_backing;
file_update_time(filp);
ret = __xip_file_write (filp, buf, count, pos, ppos);
out_backing:
current->backing_dev_info = NULL;
out_up:
mutex_unlock(&inode->i_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(xip_file_write);
/*
* truncate a page used for execute in place
* functionality is analog to block_truncate_page but does use get_xip_page
* to get the page instead of page cache
*/
int
xip_truncate_page(struct address_space *mapping, loff_t from)
{
pgoff_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned blocksize;
unsigned length;
struct page *page;
void *kaddr;
BUG_ON(!mapping->a_ops->get_xip_page);
blocksize = 1 << mapping->host->i_blkbits;
length = offset & (blocksize - 1);
/* Block boundary? Nothing to do */
if (!length)
return 0;
length = blocksize - length;
page = mapping->a_ops->get_xip_page(mapping,
index*(PAGE_SIZE/512), 0);
if (!page)
return -ENOMEM;
if (unlikely(IS_ERR(page))) {
if (PTR_ERR(page) == -ENODATA)
/* Hole? No need to truncate */
return 0;
else
return PTR_ERR(page);
}
kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr + offset, 0, length);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(page);
return 0;
}
EXPORT_SYMBOL_GPL(xip_truncate_page);

228
mm/fremap.c Normal file
View File

@@ -0,0 +1,228 @@
/*
* linux/mm/fremap.c
*
* Explicit pagetable population and nonlinear (random) mappings support.
*
* started by Ingo Molnar, Copyright (C) 2002, 2003
*/
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
struct page *page = NULL;
if (pte_present(pte)) {
flush_cache_page(vma, addr, pte_pfn(pte));
pte = ptep_clear_flush(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
page_remove_rmap(page, vma);
page_cache_release(page);
}
} else {
if (!pte_file(pte))
free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear_not_present_full(mm, addr, ptep, 0);
}
return !!page;
}
/*
* Install a file page to a given virtual memory address, release any
* previously existing mapping.
*/
int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, struct page *page, pgprot_t prot)
{
struct inode *inode;
pgoff_t size;
int err = -ENOMEM;
pte_t *pte;
pte_t pte_val;
spinlock_t *ptl;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
/*
* This page may have been truncated. Tell the
* caller about it.
*/
err = -EINVAL;
inode = vma->vm_file->f_mapping->host;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (!page->mapping || page->index >= size)
goto unlock;
err = -ENOMEM;
if (page_mapcount(page) > INT_MAX/2)
goto unlock;
if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
inc_mm_counter(mm, file_rss);
flush_icache_page(vma, page);
pte_val = mk_pte(page, prot);
set_pte_at(mm, addr, pte, pte_val);
page_add_file_rmap(page);
update_mmu_cache(vma, addr, pte_val);
lazy_mmu_prot_update(pte_val);
err = 0;
unlock:
pte_unmap_unlock(pte, ptl);
out:
return err;
}
EXPORT_SYMBOL(install_page);
/*
* Install a file pte to a given virtual memory address, release any
* previously existing mapping.
*/
int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long pgoff, pgprot_t prot)
{
int err = -ENOMEM;
pte_t *pte;
spinlock_t *ptl;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
update_hiwater_rss(mm);
dec_mm_counter(mm, file_rss);
}
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
/*
* We don't need to run update_mmu_cache() here because the "file pte"
* being installed by install_file_pte() is not a real pte - it's a
* non-present entry (like a swap entry), noting what file offset should
* be mapped there when there's a fault (in a non-linear vma where
* that's not obvious).
*/
pte_unmap_unlock(pte, ptl);
err = 0;
out:
return err;
}
/***
* sys_remap_file_pages - remap arbitrary pages of a shared backing store
* file within an existing vma.
* @start: start of the remapped virtual memory range
* @size: size of the remapped virtual memory range
* @prot: new protection bits of the range
* @pgoff: to be mapped page of the backing store file
* @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
*
* this syscall works purely via pagetables, so it's the most efficient
* way to map the same (large) file into a given virtual window. Unlike
* mmap()/mremap() it does not create any new vmas. The new mappings are
* also safe across swapout.
*
* NOTE: the 'prot' parameter right now is ignored, and the vma's default
* protection is used. Arbitrary protections might be implemented in the
* future.
*/
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
unsigned long __prot, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct address_space *mapping;
unsigned long end = start + size;
struct vm_area_struct *vma;
int err = -EINVAL;
int has_write_lock = 0;
if (__prot)
return err;
/*
* Sanitize the syscall parameters:
*/
start = start & PAGE_MASK;
size = size & PAGE_MASK;
/* Does the address range wrap, or is the span zero-sized? */
if (start + size <= start)
return err;
/* Can we represent this offset inside this architecture's pte's? */
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
return err;
#endif
/* We need down_write() to change vma->vm_flags. */
down_read(&mm->mmap_sem);
retry:
vma = find_vma(mm, start);
/*
* Make sure the vma is shared, that it supports prefaulting,
* and that the remapped range is valid and fully within
* the single existing vma. vm_private_data is used as a
* swapout cursor in a VM_NONLINEAR vma.
*/
if (vma && (vma->vm_flags & VM_SHARED) &&
(!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
vma->vm_ops && vma->vm_ops->populate &&
end > start && start >= vma->vm_start &&
end <= vma->vm_end) {
/* Must set VM_NONLINEAR before any pages are populated. */
if (pgoff != linear_page_index(vma, start) &&
!(vma->vm_flags & VM_NONLINEAR)) {
if (!has_write_lock) {
up_read(&mm->mmap_sem);
down_write(&mm->mmap_sem);
has_write_lock = 1;
goto retry;
}
mapping = vma->vm_file->f_mapping;
spin_lock(&mapping->i_mmap_lock);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
vma_prio_tree_remove(vma, &mapping->i_mmap);
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
flush_dcache_mmap_unlock(mapping);
spin_unlock(&mapping->i_mmap_lock);
}
err = vma->vm_ops->populate(vma, start, size,
vma->vm_page_prot,
pgoff, flags & MAP_NONBLOCK);
/*
* We can't clear VM_NONLINEAR because we'd have to do
* it after ->populate completes, and that would prevent
* downgrading the lock. (Locks can't be upgraded).
*/
}
if (likely(!has_write_lock))
up_read(&mm->mmap_sem);
else
up_write(&mm->mmap_sem);
return err;
}

335
mm/highmem.c Normal file
View File

@@ -0,0 +1,335 @@
/*
* High memory handling common code and variables.
*
* (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
* Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
*
*
* Redesigned the x86 32-bit VM architecture to deal with
* 64-bit physical space. With current x86 CPUs this
* means up to 64 Gigabytes physical RAM.
*
* Rewrote high memory support to move the page cache into
* high memory. Implemented permanent (schedulable) kmaps
* based on Linus' idea.
*
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/blktrace_api.h>
#include <asm/tlbflush.h>
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
* since a TLB flush - it is usable.
* 1 means that there are no users, but it has been mapped
* since the last TLB flush - so we can't use it.
* n means that there are (n-1) current users of it.
*/
#ifdef CONFIG_HIGHMEM
unsigned long totalhigh_pages __read_mostly;
unsigned int nr_free_highpages (void)
{
pg_data_t *pgdat;
unsigned int pages = 0;
for_each_online_pgdat(pgdat)
pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
NR_FREE_PAGES);
return pages;
}
static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
pte_t * pkmap_page_table;
static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
static void flush_all_zero_pkmaps(void)
{
int i;
flush_cache_kmaps();
for (i = 0; i < LAST_PKMAP; i++) {
struct page *page;
/*
* zero means we don't have anything to do,
* >1 means that it is still in use. Only
* a count of 1 means that it is free but
* needs to be unmapped
*/
if (pkmap_count[i] != 1)
continue;
pkmap_count[i] = 0;
/* sanity check */
BUG_ON(pte_none(pkmap_page_table[i]));
/*
* Don't need an atomic fetch-and-clear op here;
* no-one has the page mapped, and cannot get at
* its virtual address (and hence PTE) without first
* getting the kmap_lock (which is held here).
* So no dangers, even with speculative execution.
*/
page = pte_page(pkmap_page_table[i]);
pte_clear(&init_mm, (unsigned long)page_address(page),
&pkmap_page_table[i]);
set_page_address(page, NULL);
}
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
static inline unsigned long map_new_virtual(struct page *page)
{
unsigned long vaddr;
int count;
start:
count = LAST_PKMAP;
/* Find an empty entry */
for (;;) {
last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
if (!last_pkmap_nr) {
flush_all_zero_pkmaps();
count = LAST_PKMAP;
}
if (!pkmap_count[last_pkmap_nr])
break; /* Found a usable entry */
if (--count)
continue;
/*
* Sleep for somebody else to unmap their entries
*/
{
DECLARE_WAITQUEUE(wait, current);
__set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&pkmap_map_wait, &wait);
spin_unlock(&kmap_lock);
schedule();
remove_wait_queue(&pkmap_map_wait, &wait);
spin_lock(&kmap_lock);
/* Somebody else might have mapped it while we slept */
if (page_address(page))
return (unsigned long)page_address(page);
/* Re-start */
goto start;
}
}
vaddr = PKMAP_ADDR(last_pkmap_nr);
set_pte_at(&init_mm, vaddr,
&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
pkmap_count[last_pkmap_nr] = 1;
set_page_address(page, (void *)vaddr);
return vaddr;
}
void fastcall *kmap_high(struct page *page)
{
unsigned long vaddr;
/*
* For highmem pages, we can't trust "virtual" until
* after we have the lock.
*
* We cannot call this from interrupts, as it may block
*/
spin_lock(&kmap_lock);
vaddr = (unsigned long)page_address(page);
if (!vaddr)
vaddr = map_new_virtual(page);
pkmap_count[PKMAP_NR(vaddr)]++;
BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
spin_unlock(&kmap_lock);
return (void*) vaddr;
}
EXPORT_SYMBOL(kmap_high);
void fastcall kunmap_high(struct page *page)
{
unsigned long vaddr;
unsigned long nr;
int need_wakeup;
spin_lock(&kmap_lock);
vaddr = (unsigned long)page_address(page);
BUG_ON(!vaddr);
nr = PKMAP_NR(vaddr);
/*
* A count must never go down to zero
* without a TLB flush!
*/
need_wakeup = 0;
switch (--pkmap_count[nr]) {
case 0:
BUG();
case 1:
/*
* Avoid an unnecessary wake_up() function call.
* The common case is pkmap_count[] == 1, but
* no waiters.
* The tasks queued in the wait-queue are guarded
* by both the lock in the wait-queue-head and by
* the kmap_lock. As the kmap_lock is held here,
* no need for the wait-queue-head's lock. Simply
* test if the queue is empty.
*/
need_wakeup = waitqueue_active(&pkmap_map_wait);
}
spin_unlock(&kmap_lock);
/* do wake-up, if needed, race-free outside of the spin lock */
if (need_wakeup)
wake_up(&pkmap_map_wait);
}
EXPORT_SYMBOL(kunmap_high);
#endif
#if defined(HASHED_PAGE_VIRTUAL)
#define PA_HASH_ORDER 7
/*
* Describes one page->virtual association
*/
struct page_address_map {
struct page *page;
void *virtual;
struct list_head list;
};
/*
* page_address_map freelist, allocated from page_address_maps.
*/
static struct list_head page_address_pool; /* freelist */
static spinlock_t pool_lock; /* protects page_address_pool */
/*
* Hash table bucket
*/
static struct page_address_slot {
struct list_head lh; /* List of page_address_maps */
spinlock_t lock; /* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
static struct page_address_slot *page_slot(struct page *page)
{
return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}
void *page_address(struct page *page)
{
unsigned long flags;
void *ret;
struct page_address_slot *pas;
if (!PageHighMem(page))
return lowmem_page_address(page);
pas = page_slot(page);
ret = NULL;
spin_lock_irqsave(&pas->lock, flags);
if (!list_empty(&pas->lh)) {
struct page_address_map *pam;
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
ret = pam->virtual;
goto done;
}
}
}
done:
spin_unlock_irqrestore(&pas->lock, flags);
return ret;
}
EXPORT_SYMBOL(page_address);
void set_page_address(struct page *page, void *virtual)
{
unsigned long flags;
struct page_address_slot *pas;
struct page_address_map *pam;
BUG_ON(!PageHighMem(page));
pas = page_slot(page);
if (virtual) { /* Add */
BUG_ON(list_empty(&page_address_pool));
spin_lock_irqsave(&pool_lock, flags);
pam = list_entry(page_address_pool.next,
struct page_address_map, list);
list_del(&pam->list);
spin_unlock_irqrestore(&pool_lock, flags);
pam->page = page;
pam->virtual = virtual;
spin_lock_irqsave(&pas->lock, flags);
list_add_tail(&pam->list, &pas->lh);
spin_unlock_irqrestore(&pas->lock, flags);
} else { /* Remove */
spin_lock_irqsave(&pas->lock, flags);
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
list_del(&pam->list);
spin_unlock_irqrestore(&pas->lock, flags);
spin_lock_irqsave(&pool_lock, flags);
list_add_tail(&pam->list, &page_address_pool);
spin_unlock_irqrestore(&pool_lock, flags);
goto done;
}
}
spin_unlock_irqrestore(&pas->lock, flags);
}
done:
return;
}
static struct page_address_map page_address_maps[LAST_PKMAP];
void __init page_address_init(void)
{
int i;
INIT_LIST_HEAD(&page_address_pool);
for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
list_add(&page_address_maps[i].list, &page_address_pool);
for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
INIT_LIST_HEAD(&page_address_htable[i].lh);
spin_lock_init(&page_address_htable[i].lock);
}
spin_lock_init(&pool_lock);
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */

833
mm/hugetlb.c Normal file
View File

@@ -0,0 +1,833 @@
/*
* Generic hugetlb support.
* (C) William Irwin, April 2004
*/
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <linux/hugetlb.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
static DEFINE_SPINLOCK(hugetlb_lock);
static void clear_huge_page(struct page *page, unsigned long addr)
{
int i;
might_sleep();
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
cond_resched();
clear_user_highpage(page + i, addr);
}
}
static void copy_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma)
{
int i;
might_sleep();
for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
cond_resched();
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
}
}
static void enqueue_huge_page(struct page *page)
{
int nid = page_to_nid(page);
list_add(&page->lru, &hugepage_freelists[nid]);
free_huge_pages++;
free_huge_pages_node[nid]++;
}
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
unsigned long address)
{
int nid = numa_node_id();
struct page *page = NULL;
struct zonelist *zonelist = huge_zonelist(vma, address);
struct zone **z;
for (z = zonelist->zones; *z; z++) {
nid = zone_to_nid(*z);
if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
!list_empty(&hugepage_freelists[nid]))
break;
}
if (*z) {
page = list_entry(hugepage_freelists[nid].next,
struct page, lru);
list_del(&page->lru);
free_huge_pages--;
free_huge_pages_node[nid]--;
}
return page;
}
static void free_huge_page(struct page *page)
{
BUG_ON(page_count(page));
INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
enqueue_huge_page(page);
spin_unlock(&hugetlb_lock);
}
static int alloc_fresh_huge_page(void)
{
static int nid = 0;
struct page *page;
page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
nid = next_node(nid, node_online_map);
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
if (page) {
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
nr_huge_pages++;
nr_huge_pages_node[page_to_nid(page)]++;
spin_unlock(&hugetlb_lock);
put_page(page); /* free it into the hugepage allocator */
return 1;
}
return 0;
}
static struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr)
{
struct page *page;
spin_lock(&hugetlb_lock);
if (vma->vm_flags & VM_MAYSHARE)
resv_huge_pages--;
else if (free_huge_pages <= resv_huge_pages)
goto fail;
page = dequeue_huge_page(vma, addr);
if (!page)
goto fail;
spin_unlock(&hugetlb_lock);
set_page_refcounted(page);
return page;
fail:
if (vma->vm_flags & VM_MAYSHARE)
resv_huge_pages++;
spin_unlock(&hugetlb_lock);
return NULL;
}
static int __init hugetlb_init(void)
{
unsigned long i;
if (HPAGE_SHIFT == 0)
return 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&hugepage_freelists[i]);
for (i = 0; i < max_huge_pages; ++i) {
if (!alloc_fresh_huge_page())
break;
}
max_huge_pages = free_huge_pages = nr_huge_pages = i;
printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
return 0;
}
module_init(hugetlb_init);
static int __init hugetlb_setup(char *s)
{
if (sscanf(s, "%lu", &max_huge_pages) <= 0)
max_huge_pages = 0;
return 1;
}
__setup("hugepages=", hugetlb_setup);
#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
int i;
nr_huge_pages--;
nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1<< PG_writeback);
}
page[1].lru.next = NULL;
set_page_refcounted(page);
__free_pages(page, HUGETLB_PAGE_ORDER);
}
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
int i;
for (i = 0; i < MAX_NUMNODES; ++i) {
struct page *page, *next;
list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
if (PageHighMem(page))
continue;
list_del(&page->lru);
update_and_free_page(page);
free_huge_pages--;
free_huge_pages_node[page_to_nid(page)]--;
if (count >= nr_huge_pages)
return;
}
}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif
static unsigned long set_max_huge_pages(unsigned long count)
{
while (count > nr_huge_pages) {
if (!alloc_fresh_huge_page())
return nr_huge_pages;
}
if (count >= nr_huge_pages)
return nr_huge_pages;
spin_lock(&hugetlb_lock);
count = max(count, resv_huge_pages);
try_to_free_low(count);
while (count < nr_huge_pages) {
struct page *page = dequeue_huge_page(NULL, 0);
if (!page)
break;
update_and_free_page(page);
}
spin_unlock(&hugetlb_lock);
return nr_huge_pages;
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer,
size_t *length, loff_t *ppos)
{
proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
max_huge_pages = set_max_huge_pages(max_huge_pages);
return 0;
}
#endif /* CONFIG_SYSCTL */
int hugetlb_report_meminfo(char *buf)
{
return sprintf(buf,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
"HugePages_Rsvd: %5lu\n"
"Hugepagesize: %5lu kB\n",
nr_huge_pages,
free_huge_pages,
resv_huge_pages,
HPAGE_SIZE/1024);
}
int hugetlb_report_node_meminfo(int nid, char *buf)
{
return sprintf(buf,
"Node %d HugePages_Total: %5u\n"
"Node %d HugePages_Free: %5u\n",
nid, nr_huge_pages_node[nid],
nid, free_huge_pages_node[nid]);
}
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
* this far.
*/
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
unsigned long address, int *unused)
{
BUG();
return NULL;
}
struct vm_operations_struct hugetlb_vm_ops = {
.nopage = hugetlb_nopage,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int writable)
{
pte_t entry;
if (writable) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
} else {
entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
return entry;
}
static void set_huge_ptep_writable(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
pte_t entry;
entry = pte_mkwrite(pte_mkdirty(*ptep));
ptep_set_access_flags(vma, address, ptep, entry, 1);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
int cow;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
goto nomem;
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
if (!pte_none(*src_pte)) {
if (cow)
ptep_set_wrprotect(src, addr, src_pte);
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
spin_unlock(&dst->page_table_lock);
}
return 0;
nomem:
return -ENOMEM;
}
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
struct page *page;
struct page *tmp;
/*
* A page gathering list, protected by per file i_mmap_lock. The
* lock is used to avoid list corruption from multiple unmapping
* of the same page since we are using page->lru.
*/
LIST_HEAD(page_list);
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
if (huge_pmd_unshare(mm, &address, ptep))
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
if (pte_none(pte))
continue;
page = pte_page(pte);
if (pte_dirty(pte))
set_page_dirty(page);
list_add(&page->lru, &page_list);
}
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
list_del(&page->lru);
put_page(page);
}
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
/*
* It is undesirable to test vma->vm_file as it should be non-null
* for valid hugetlb area. However, vm_file will be NULL in the error
* cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
* do_mmap_pgoff() nullifies vma->vm_file before calling this function
* to clean up. Since no pte has actually been setup, it is safe to
* do nothing in this case.
*/
if (vma->vm_file) {
spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
__unmap_hugepage_range(vma, start, end);
spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
}
}
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte)
{
struct page *old_page, *new_page;
int avoidcopy;
old_page = pte_page(pte);
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
avoidcopy = (page_count(old_page) == 1);
if (avoidcopy) {
set_huge_ptep_writable(vma, address, ptep);
return VM_FAULT_MINOR;
}
page_cache_get(old_page);
new_page = alloc_huge_page(vma, address);
if (!new_page) {
page_cache_release(old_page);
return VM_FAULT_OOM;
}
spin_unlock(&mm->page_table_lock);
copy_huge_page(new_page, old_page, address, vma);
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & HPAGE_MASK);
if (likely(pte_same(*ptep, pte))) {
/* Break COW */
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
/* Make the old page be freed below */
new_page = old_page;
}
page_cache_release(new_page);
page_cache_release(old_page);
return VM_FAULT_MINOR;
}
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, int write_access)
{
int ret = VM_FAULT_SIGBUS;
unsigned long idx;
unsigned long size;
struct page *page;
struct address_space *mapping;
pte_t new_pte;
mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
/*
* Use page lock to guard against racing truncation
* before we get page_table_lock.
*/
retry:
page = find_lock_page(mapping, idx);
if (!page) {
size = i_size_read(mapping->host) >> HPAGE_SHIFT;
if (idx >= size)
goto out;
if (hugetlb_get_quota(mapping))
goto out;
page = alloc_huge_page(vma, address);
if (!page) {
hugetlb_put_quota(mapping);
ret = VM_FAULT_OOM;
goto out;
}
clear_huge_page(page, address);
if (vma->vm_flags & VM_SHARED) {
int err;
err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
if (err) {
put_page(page);
hugetlb_put_quota(mapping);
if (err == -EEXIST)
goto retry;
goto out;
}
} else
lock_page(page);
}
spin_lock(&mm->page_table_lock);
size = i_size_read(mapping->host) >> HPAGE_SHIFT;
if (idx >= size)
goto backout;
ret = VM_FAULT_MINOR;
if (!pte_none(*ptep))
goto backout;
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
if (write_access && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
}
spin_unlock(&mm->page_table_lock);
unlock_page(page);
out:
return ret;
backout:
spin_unlock(&mm->page_table_lock);
hugetlb_put_quota(mapping);
unlock_page(page);
put_page(page);
goto out;
}
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
pte_t *ptep;
pte_t entry;
int ret;
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
ptep = huge_pte_alloc(mm, address);
if (!ptep)
return VM_FAULT_OOM;
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
mutex_lock(&hugetlb_instantiation_mutex);
entry = *ptep;
if (pte_none(entry)) {
ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
mutex_unlock(&hugetlb_instantiation_mutex);
return ret;
}
ret = VM_FAULT_MINOR;
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (likely(pte_same(entry, *ptep)))
if (write_access && !pte_write(entry))
ret = hugetlb_cow(mm, vma, address, ptep, entry);
spin_unlock(&mm->page_table_lock);
mutex_unlock(&hugetlb_instantiation_mutex);
return ret;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
int remainder = *length;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
struct page *page;
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make * sure we get the
* first, for the page indexing below to work.
*/
pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
if (!pte || pte_none(*pte)) {
int ret;
spin_unlock(&mm->page_table_lock);
ret = hugetlb_fault(mm, vma, vaddr, 0);
spin_lock(&mm->page_table_lock);
if (ret == VM_FAULT_MINOR)
continue;
remainder = 0;
if (!i)
i = -EFAULT;
break;
}
pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
page = pte_page(*pte);
same_page:
if (pages) {
get_page(page);
pages[i] = page + pfn_offset;
}
if (vmas)
vmas[i] = vma;
vaddr += PAGE_SIZE;
++pfn_offset;
--remainder;
++i;
if (vaddr < vma->vm_end && remainder &&
pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
/*
* We use pfn_offset to avoid touching the pageframes
* of this compound page.
*/
goto same_page;
}
}
spin_unlock(&mm->page_table_lock);
*length = remainder;
*position = vaddr;
return i;
}
void hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long start = address;
pte_t *ptep;
pte_t pte;
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
spin_lock(&mm->page_table_lock);
for (; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
if (huge_pmd_unshare(mm, &address, ptep))
continue;
if (!pte_none(*ptep)) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
set_huge_pte_at(mm, address, ptep, pte);
lazy_mmu_prot_update(pte);
}
}
spin_unlock(&mm->page_table_lock);
spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
flush_tlb_range(vma, start, end);
}
struct file_region {
struct list_head link;
long from;
long to;
};
static long region_add(struct list_head *head, long f, long t)
{
struct file_region *rg, *nrg, *trg;
/* Locate the region we are either in or before. */
list_for_each_entry(rg, head, link)
if (f <= rg->to)
break;
/* Round our left edge to the current segment if it encloses us. */
if (f > rg->from)
f = rg->from;
/* Check for and consume any regions we now overlap with. */
nrg = rg;
list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
if (&rg->link == head)
break;
if (rg->from > t)
break;
/* If this area reaches higher then extend our area to
* include it completely. If this is not the first area
* which we intend to reuse, free it. */
if (rg->to > t)
t = rg->to;
if (rg != nrg) {
list_del(&rg->link);
kfree(rg);
}
}
nrg->from = f;
nrg->to = t;
return 0;
}
static long region_chg(struct list_head *head, long f, long t)
{
struct file_region *rg, *nrg;
long chg = 0;
/* Locate the region we are before or in. */
list_for_each_entry(rg, head, link)
if (f <= rg->to)
break;
/* If we are below the current region then a new region is required.
* Subtle, allocate a new region at the position but make it zero
* size such that we can guarentee to record the reservation. */
if (&rg->link == head || t < rg->from) {
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
if (nrg == 0)
return -ENOMEM;
nrg->from = f;
nrg->to = f;
INIT_LIST_HEAD(&nrg->link);
list_add(&nrg->link, rg->link.prev);
return t - f;
}
/* Round our left edge to the current segment if it encloses us. */
if (f > rg->from)
f = rg->from;
chg = t - f;
/* Check for and consume any regions we now overlap with. */
list_for_each_entry(rg, rg->link.prev, link) {
if (&rg->link == head)
break;
if (rg->from > t)
return chg;
/* We overlap with this area, if it extends futher than
* us then we must extend ourselves. Account for its
* existing reservation. */
if (rg->to > t) {
chg += rg->to - t;
t = rg->to;
}
chg -= rg->to - rg->from;
}
return chg;
}
static long region_truncate(struct list_head *head, long end)
{
struct file_region *rg, *trg;
long chg = 0;
/* Locate the region we are either in or before. */
list_for_each_entry(rg, head, link)
if (end <= rg->to)
break;
if (&rg->link == head)
return 0;
/* If we are in the middle of a region then adjust it. */
if (end > rg->from) {
chg = rg->to - end;
rg->to = end;
rg = list_entry(rg->link.next, typeof(*rg), link);
}
/* Drop any remaining regions. */
list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
if (&rg->link == head)
break;
chg += rg->to - rg->from;
list_del(&rg->link);
kfree(rg);
}
return chg;
}
static int hugetlb_acct_memory(long delta)
{
int ret = -ENOMEM;
spin_lock(&hugetlb_lock);
if ((delta + resv_huge_pages) <= free_huge_pages) {
resv_huge_pages += delta;
ret = 0;
}
spin_unlock(&hugetlb_lock);
return ret;
}
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
long ret, chg;
chg = region_chg(&inode->i_mapping->private_list, from, to);
if (chg < 0)
return chg;
ret = hugetlb_acct_memory(chg);
if (ret < 0)
return ret;
region_add(&inode->i_mapping->private_list, from, to);
return 0;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
long chg = region_truncate(&inode->i_mapping->private_list, offset);
hugetlb_acct_memory(freed - chg);
}

40
mm/internal.h Normal file
View File

@@ -0,0 +1,40 @@
/* internal.h: mm/ internal definitions
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
#include <linux/mm.h>
static inline void set_page_count(struct page *page, int v)
{
atomic_set(&page->_count, v);
}
/*
* Turn a non-refcounted page (->_count == 0) into refcounted with
* a count of one.
*/
static inline void set_page_refcounted(struct page *page)
{
VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
VM_BUG_ON(atomic_read(&page->_count));
set_page_count(page, 1);
}
static inline void __put_page(struct page *page)
{
atomic_dec(&page->_count);
}
extern void fastcall __init __free_pages_bootmem(struct page *page,
unsigned int order);
#endif

337
mm/madvise.c Normal file
View File

@@ -0,0 +1,337 @@
/*
* linux/mm/madvise.c
*
* Copyright (C) 1999 Linus Torvalds
* Copyright (C) 2002 Christoph Hellwig
*/
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
static long madvise_behavior(struct vm_area_struct * vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
struct mm_struct * mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
int new_flags = vma->vm_flags;
switch (behavior) {
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
case MADV_SEQUENTIAL:
new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
break;
case MADV_RANDOM:
new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
break;
case MADV_DONTFORK:
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
new_flags &= ~VM_DONTCOPY;
break;
}
if (new_flags == vma->vm_flags) {
*prev = vma;
goto out;
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
if (*prev) {
vma = *prev;
goto success;
}
*prev = vma;
if (start != vma->vm_start) {
error = split_vma(mm, vma, start, 1);
if (error)
goto out;
}
if (end != vma->vm_end) {
error = split_vma(mm, vma, end, 0);
if (error)
goto out;
}
success:
/*
* vm_flags is protected by the mmap_sem held in write mode.
*/
vma->vm_flags = new_flags;
out:
if (error == -ENOMEM)
error = -EAGAIN;
return error;
}
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
static long madvise_willneed(struct vm_area_struct * vma,
struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
if (!file)
return -EBADF;
if (file->f_mapping->a_ops->get_xip_page) {
/* no bad return value, but ignore advice */
return 0;
}
*prev = vma;
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (end > vma->vm_end)
end = vma->vm_end;
end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
force_page_cache_readahead(file->f_mapping,
file, start, max_sane_readahead(end - start));
return 0;
}
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
* zap_page_range call sets things up for refill_inactive to actually free
* these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
* refill_inactive to pick up before reclaiming other pages.
*
* NB: This interface discards data rather than pushes it out to swap,
* as some implementations do. This has performance implications for
* applications like large transactional databases which want to discard
* pages in anonymous maps after committing to backing store the data
* that was kept in them. There is no reason to write this data out to
* the swap area if the application is discarding it.
*
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
static long madvise_dontneed(struct vm_area_struct * vma,
struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
*prev = vma;
if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
return -EINVAL;
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
struct zap_details details = {
.nonlinear_vma = vma,
.last_index = ULONG_MAX,
};
zap_page_range(vma, start, end - start, &details);
} else
zap_page_range(vma, start, end - start, NULL);
return 0;
}
/*
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
*
* NOTE: Currently, only shmfs/tmpfs is supported for this operation.
* Other filesystems return -ENOSYS.
*/
static long madvise_remove(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
struct address_space *mapping;
loff_t offset, endoff;
int error;
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
return -EINVAL;
if (!vma->vm_file || !vma->vm_file->f_mapping
|| !vma->vm_file->f_mapping->host) {
return -EINVAL;
}
if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
return -EACCES;
mapping = vma->vm_file->f_mapping;
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
endoff = (loff_t)(end - vma->vm_start - 1)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
up_write(&current->mm->mmap_sem);
error = vmtruncate_range(mapping->host, offset, endoff);
down_write(&current->mm->mmap_sem);
return error;
}
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
long error;
switch (behavior) {
case MADV_DOFORK:
if (vma->vm_flags & VM_IO) {
error = -EINVAL;
break;
}
case MADV_DONTFORK:
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
error = madvise_behavior(vma, prev, start, end, behavior);
break;
case MADV_REMOVE:
error = madvise_remove(vma, prev, start, end);
break;
case MADV_WILLNEED:
error = madvise_willneed(vma, prev, start, end);
break;
case MADV_DONTNEED:
error = madvise_dontneed(vma, prev, start, end);
break;
default:
error = -EINVAL;
break;
}
return error;
}
/*
* The madvise(2) system call.
*
* Applications can use madvise() to advise the kernel how it should
* handle paging I/O in this VM area. The idea is to help the kernel
* use appropriate read-ahead and caching techniques. The information
* provided is advisory only, and can be safely disregarded by the
* kernel without affecting the correct operation of the application.
*
* behavior values:
* MADV_NORMAL - the default behavior is to read clusters. This
* results in some read-ahead and read-behind.
* MADV_RANDOM - the system should read the minimum amount of data
* on any access, since it is unlikely that the appli-
* cation will need more than what it asks for.
* MADV_SEQUENTIAL - pages in the given range will probably be accessed
* once, so they can be aggressively read ahead, and
* can be freed soon after they are accessed.
* MADV_WILLNEED - the application is notifying the system to read
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
*
* return values:
* zero - success
* -EINVAL - start + len < 0, start is not page-aligned,
* "behavior" is not a valid value, or application
* is attempting to release locked or shared pages.
* -ENOMEM - addresses in the specified range are not currently
* mapped, or are outside the AS of the process.
* -EIO - an I/O error occurred while paging in data.
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct * vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
size_t len;
down_write(&current->mm->mmap_sem);
if (start & ~PAGE_MASK)
goto out;
len = (len_in + ~PAGE_MASK) & PAGE_MASK;
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
goto out;
end = start + len;
if (end < start)
goto out;
error = 0;
if (end == start)
goto out;
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
vma = find_vma_prev(current->mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
for (;;) {
/* Still start < end. */
error = -ENOMEM;
if (!vma)
goto out;
/* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
if (start >= end)
goto out;
}
/* Here vma->vm_start <= start < (end|vma->vm_end) */
tmp = vma->vm_end;
if (end < tmp)
tmp = end;
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
error = madvise_vma(vma, &prev, start, tmp, behavior);
if (error)
goto out;
start = tmp;
if (prev && start < prev->vm_end)
start = prev->vm_end;
error = unmapped_error;
if (start >= end)
goto out;
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_sem */
vma = find_vma(current->mm, start);
}
out:
up_write(&current->mm->mmap_sem);
return error;
}

2738
mm/memory.c Normal file

File diff suppressed because it is too large Load Diff

310
mm/memory_hotplug.c Normal file
View File

@@ -0,0 +1,310 @@
/*
* linux/mm/memory_hotplug.c
*
* Copyright (C)
*/
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/cpuset.h>
#include <asm/tlbflush.h>
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
struct resource *res;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
BUG_ON(!res);
res->name = "System RAM";
res->start = start;
res->end = start + size - 1;
res->flags = IORESOURCE_MEM;
if (request_resource(&iomem_resource, res) < 0) {
printk("System RAM resource %llx - %llx cannot be added\n",
(unsigned long long)res->start, (unsigned long long)res->end);
kfree(res);
res = NULL;
}
return res;
}
static void release_memory_resource(struct resource *res)
{
if (!res)
return;
release_resource(res);
kfree(res);
return;
}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
int nid = pgdat->node_id;
int zone_type;
zone_type = zone - pgdat->node_zones;
if (!populated_zone(zone)) {
int ret = 0;
ret = init_currently_empty_zone(zone, phys_start_pfn,
nr_pages, MEMMAP_HOTPLUG);
if (ret < 0)
return ret;
}
memmap_init_zone(nr_pages, nid, zone_type,
phys_start_pfn, MEMMAP_HOTPLUG);
return 0;
}
static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
{
int nr_pages = PAGES_PER_SECTION;
int ret;
if (pfn_valid(phys_start_pfn))
return -EEXIST;
ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
if (ret < 0)
return ret;
ret = __add_zone(zone, phys_start_pfn);
if (ret < 0)
return ret;
return register_new_memory(__pfn_to_section(phys_start_pfn));
}
/*
* Reasonably generic function for adding memory. It is
* expected that archs that support memory hotplug will
* call this function after deciding the zone to which to
* add the new pages.
*/
int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
unsigned long nr_pages)
{
unsigned long i;
int err = 0;
int start_sec, end_sec;
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
for (i = start_sec; i <= end_sec; i++) {
err = __add_section(zone, i << PFN_SECTION_SHIFT);
/*
* EEXIST is finally dealed with by ioresource collision
* check. see add_memory() => register_memory_resource()
* Warning will be printed if there is collision.
*/
if (err && (err != -EEXIST))
break;
err = 0;
}
return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
static void grow_zone_span(struct zone *zone,
unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long old_zone_end_pfn;
zone_span_writelock(zone);
old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
if (start_pfn < zone->zone_start_pfn)
zone->zone_start_pfn = start_pfn;
zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
zone->zone_start_pfn;
zone_span_writeunlock(zone);
}
static void grow_pgdat_span(struct pglist_data *pgdat,
unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long old_pgdat_end_pfn =
pgdat->node_start_pfn + pgdat->node_spanned_pages;
if (start_pfn < pgdat->node_start_pfn)
pgdat->node_start_pfn = start_pfn;
pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
pgdat->node_start_pfn;
}
int online_pages(unsigned long pfn, unsigned long nr_pages)
{
unsigned long i;
unsigned long flags;
unsigned long onlined_pages = 0;
struct resource res;
u64 section_end;
unsigned long start_pfn;
struct zone *zone;
int need_zonelists_rebuild = 0;
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
* memory_block->state_sem.
*/
zone = page_zone(pfn_to_page(pfn));
pgdat_resize_lock(zone->zone_pgdat, &flags);
grow_zone_span(zone, pfn, pfn + nr_pages);
grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
/*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
*/
if (!populated_zone(zone))
need_zonelists_rebuild = 1;
res.start = (u64)pfn << PAGE_SHIFT;
res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1;
res.flags = IORESOURCE_MEM; /* we just need system ram */
section_end = res.end;
while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
nr_pages = (unsigned long)
((res.end + 1 - res.start) >> PAGE_SHIFT);
if (PageReserved(pfn_to_page(start_pfn))) {
/* this region's page is not onlined now */
for (i = 0; i < nr_pages; i++) {
struct page *page = pfn_to_page(start_pfn + i);
online_page(page);
onlined_pages++;
}
}
res.start = res.end + 1;
res.end = section_end;
}
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
setup_per_zone_pages_min();
if (need_zonelists_rebuild)
build_all_zonelists();
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
{
struct pglist_data *pgdat;
unsigned long zones_size[MAX_NR_ZONES] = {0};
unsigned long zholes_size[MAX_NR_ZONES] = {0};
unsigned long start_pfn = start >> PAGE_SHIFT;
pgdat = arch_alloc_nodedata(nid);
if (!pgdat)
return NULL;
arch_refresh_nodedata(nid, pgdat);
/* we can use NODE_DATA(nid) from here */
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
return pgdat;
}
static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
arch_refresh_nodedata(nid, NULL);
arch_free_nodedata(pgdat);
return;
}
int add_memory(int nid, u64 start, u64 size)
{
pg_data_t *pgdat = NULL;
int new_pgdat = 0;
struct resource *res;
int ret;
res = register_memory_resource(start, size);
if (!res)
return -EEXIST;
if (!node_online(nid)) {
pgdat = hotadd_new_pgdat(nid, start);
if (!pgdat)
return -ENOMEM;
new_pgdat = 1;
ret = kswapd_run(nid);
if (ret)
goto error;
}
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size);
if (ret < 0)
goto error;
/* we online node here. we can't roll back from here. */
node_set_online(nid);
cpuset_track_online_nodes();
if (new_pgdat) {
ret = register_one_node(nid);
/*
* If sysfs file of new node can't create, cpu on the node
* can't be hot-added. There is no rollback way now.
* So, check by BUG_ON() to catch it reluctantly..
*/
BUG_ON(ret);
}
return ret;
error:
/* rollback pgdat allocation and others */
if (new_pgdat)
rollback_node_hotadd(nid, pgdat);
if (res)
release_memory_resource(res);
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

1911
mm/mempolicy.c Normal file

File diff suppressed because it is too large Load Diff

338
mm/mempool.c Normal file
View File

@@ -0,0 +1,338 @@
/*
* linux/mm/mempool.c
*
* memory buffer pool support. Such pools are mostly used
* for guaranteed, deadlock-free memory allocations during
* extreme VM load.
*
* started by Ingo Molnar, Copyright (C) 2001
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
static void add_element(mempool_t *pool, void *element)
{
BUG_ON(pool->curr_nr >= pool->min_nr);
pool->elements[pool->curr_nr++] = element;
}
static void *remove_element(mempool_t *pool)
{
BUG_ON(pool->curr_nr <= 0);
return pool->elements[--pool->curr_nr];
}
static void free_pool(mempool_t *pool)
{
while (pool->curr_nr) {
void *element = remove_element(pool);
pool->free(element, pool->pool_data);
}
kfree(pool->elements);
kfree(pool);
}
/**
* mempool_create - create a memory pool
* @min_nr: the minimum number of elements guaranteed to be
* allocated for this pool.
* @alloc_fn: user-defined element-allocation function.
* @free_fn: user-defined element-freeing function.
* @pool_data: optional private data available to the user-defined functions.
*
* this function creates and allocates a guaranteed size, preallocated
* memory pool. The pool can be used from the mempool_alloc() and mempool_free()
* functions. This function might sleep. Both the alloc_fn() and the free_fn()
* functions might sleep - as long as the mempool_alloc() function is not called
* from IRQ contexts.
*/
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
{
return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
}
EXPORT_SYMBOL(mempool_create);
mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data, int node_id)
{
mempool_t *pool;
pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
if (!pool)
return NULL;
memset(pool, 0, sizeof(*pool));
pool->elements = kmalloc_node(min_nr * sizeof(void *),
GFP_KERNEL, node_id);
if (!pool->elements) {
kfree(pool);
return NULL;
}
spin_lock_init(&pool->lock);
pool->min_nr = min_nr;
pool->pool_data = pool_data;
init_waitqueue_head(&pool->wait);
pool->alloc = alloc_fn;
pool->free = free_fn;
/*
* First pre-allocate the guaranteed number of buffers.
*/
while (pool->curr_nr < pool->min_nr) {
void *element;
element = pool->alloc(GFP_KERNEL, pool->pool_data);
if (unlikely(!element)) {
free_pool(pool);
return NULL;
}
add_element(pool, element);
}
return pool;
}
EXPORT_SYMBOL(mempool_create_node);
/**
* mempool_resize - resize an existing memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
* @new_min_nr: the new minimum number of elements guaranteed to be
* allocated for this pool.
* @gfp_mask: the usual allocation bitmask.
*
* This function shrinks/grows the pool. In the case of growing,
* it cannot be guaranteed that the pool will be grown to the new
* size immediately, but new mempool_free() calls will refill it.
*
* Note, the caller must guarantee that no mempool_destroy is called
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
*/
int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
{
void *element;
void **new_elements;
unsigned long flags;
BUG_ON(new_min_nr <= 0);
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
while (new_min_nr < pool->curr_nr) {
element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data);
spin_lock_irqsave(&pool->lock, flags);
}
pool->min_nr = new_min_nr;
goto out_unlock;
}
spin_unlock_irqrestore(&pool->lock, flags);
/* Grow the pool */
new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
if (!new_elements)
return -ENOMEM;
spin_lock_irqsave(&pool->lock, flags);
if (unlikely(new_min_nr <= pool->min_nr)) {
/* Raced, other resize will do our work */
spin_unlock_irqrestore(&pool->lock, flags);
kfree(new_elements);
goto out;
}
memcpy(new_elements, pool->elements,
pool->curr_nr * sizeof(*new_elements));
kfree(pool->elements);
pool->elements = new_elements;
pool->min_nr = new_min_nr;
while (pool->curr_nr < pool->min_nr) {
spin_unlock_irqrestore(&pool->lock, flags);
element = pool->alloc(gfp_mask, pool->pool_data);
if (!element)
goto out;
spin_lock_irqsave(&pool->lock, flags);
if (pool->curr_nr < pool->min_nr) {
add_element(pool, element);
} else {
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data); /* Raced */
goto out;
}
}
out_unlock:
spin_unlock_irqrestore(&pool->lock, flags);
out:
return 0;
}
EXPORT_SYMBOL(mempool_resize);
/**
* mempool_destroy - deallocate a memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
*
* this function only sleeps if the free_fn() function sleeps. The caller
* has to guarantee that all elements have been returned to the pool (ie:
* freed) prior to calling mempool_destroy().
*/
void mempool_destroy(mempool_t *pool)
{
/* Check for outstanding elements */
BUG_ON(pool->curr_nr != pool->min_nr);
free_pool(pool);
}
EXPORT_SYMBOL(mempool_destroy);
/**
* mempool_alloc - allocate an element from a specific memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
* @gfp_mask: the usual allocation bitmask.
*
* this function only sleeps if the alloc_fn() function sleeps or
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
*/
void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
wait_queue_t wait;
gfp_t gfp_temp;
might_sleep_if(gfp_mask & __GFP_WAIT);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
repeat_alloc:
element = pool->alloc(gfp_temp, pool->pool_data);
if (likely(element != NULL))
return element;
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
return element;
}
spin_unlock_irqrestore(&pool->lock, flags);
/* We must not sleep in the GFP_ATOMIC case */
if (!(gfp_mask & __GFP_WAIT))
return NULL;
/* Now start performing page reclaim */
gfp_temp = gfp_mask;
init_wait(&wait);
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
smp_mb();
if (!pool->curr_nr) {
/*
* FIXME: this should be io_schedule(). The timeout is there
* as a workaround for some DM problems in 2.6.18.
*/
io_schedule_timeout(5*HZ);
}
finish_wait(&pool->wait, &wait);
goto repeat_alloc;
}
EXPORT_SYMBOL(mempool_alloc);
/**
* mempool_free - return an element to the pool.
* @element: pool element pointer.
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
*
* this function only sleeps if the free_fn() function sleeps.
*/
void mempool_free(void *element, mempool_t *pool)
{
unsigned long flags;
smp_mb();
if (pool->curr_nr < pool->min_nr) {
spin_lock_irqsave(&pool->lock, flags);
if (pool->curr_nr < pool->min_nr) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
wake_up(&pool->wait);
return;
}
spin_unlock_irqrestore(&pool->lock, flags);
}
pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);
/*
* A commonly used alloc and free fn.
*/
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
{
struct kmem_cache *mem = pool_data;
return kmem_cache_alloc(mem, gfp_mask);
}
EXPORT_SYMBOL(mempool_alloc_slab);
void mempool_free_slab(void *element, void *pool_data)
{
struct kmem_cache *mem = pool_data;
kmem_cache_free(mem, element);
}
EXPORT_SYMBOL(mempool_free_slab);
/*
* A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
* specfied by pool_data
*/
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t)(long)pool_data;
return kmalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kmalloc);
void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t) pool_data;
return kzalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kzalloc);
void mempool_kfree(void *element, void *pool_data)
{
kfree(element);
}
EXPORT_SYMBOL(mempool_kfree);
/*
* A simple mempool-backed page allocator that allocates pages
* of the order specified by pool_data.
*/
void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
{
int order = (int)(long)pool_data;
return alloc_pages(gfp_mask, order);
}
EXPORT_SYMBOL(mempool_alloc_pages);
void mempool_free_pages(void *element, void *pool_data)
{
int order = (int)(long)pool_data;
__free_pages(element, order);
}
EXPORT_SYMBOL(mempool_free_pages);

1017
mm/migrate.c Normal file

File diff suppressed because it is too large Load Diff

229
mm/mincore.c Normal file
View File

@@ -0,0 +1,229 @@
/*
* linux/mm/mincore.c
*
* Copyright (C) 1994-2006 Linus Torvalds
*/
/*
* The mincore() system call.
*/
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
* and is up to date; i.e. that no page-in operation would be required
* at this time if an application were to map and access this page.
*/
static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
{
unsigned char present = 0;
struct page *page;
/*
* When tmpfs swaps out a page from a file, any process mapping that
* file will not get a swp_entry_t in its pte, but rather it is like
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .nopage). So swapped out tmpfs mappings are tested here.
*
* However when tmpfs moves the page from pagecache and into swapcache,
* it is still in core, but the find_get_page below won't find it.
* No big deal, but make a note of it.
*/
page = find_get_page(mapping, pgoff);
if (page) {
present = PageUptodate(page);
page_cache_release(page);
}
return present;
}
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
* just return the amount of info we're asked for.
*/
static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
spinlock_t *ptl;
unsigned long nr;
int i;
pgoff_t pgoff;
struct vm_area_struct *vma = find_vma(current->mm, addr);
/*
* find_vma() didn't find anything above us, or we're
* in an unmapped hole in the address space: ENOMEM.
*/
if (!vma || addr < vma->vm_start)
return -ENOMEM;
/*
* Calculate how many pages there are left in the last level of the
* PTE array for our address.
*/
nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1));
/*
* Don't overrun this vma
*/
nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT);
/*
* Don't return more than the caller asked for
*/
nr = min(nr, pages);
pgd = pgd_offset(vma->vm_mm, addr);
if (pgd_none_or_clear_bad(pgd))
goto none_mapped;
pud = pud_offset(pgd, addr);
if (pud_none_or_clear_bad(pud))
goto none_mapped;
pmd = pmd_offset(pud, addr);
if (pmd_none_or_clear_bad(pmd))
goto none_mapped;
ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) {
unsigned char present;
pte_t pte = *ptep;
if (pte_present(pte)) {
present = 1;
} else if (pte_none(pte)) {
if (vma->vm_file) {
pgoff = linear_page_index(vma, addr);
present = mincore_page(vma->vm_file->f_mapping,
pgoff);
} else
present = 0;
} else if (pte_file(pte)) {
pgoff = pte_to_pgoff(pte);
present = mincore_page(vma->vm_file->f_mapping, pgoff);
} else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
if (is_migration_entry(entry)) {
/* migration entries are always uptodate */
present = 1;
} else {
#ifdef CONFIG_SWAP
pgoff = entry.val;
present = mincore_page(&swapper_space, pgoff);
#else
WARN_ON(1);
present = 1;
#endif
}
}
vec[i] = present;
}
pte_unmap_unlock(ptep-1, ptl);
return nr;
none_mapped:
if (vma->vm_file) {
pgoff = linear_page_index(vma, addr);
for (i = 0; i < nr; i++, pgoff++)
vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
} else {
for (i = 0; i < nr; i++)
vec[i] = 0;
}
return nr;
}
/*
* The mincore(2) system call.
*
* mincore() returns the memory residency status of the pages in the
* current process's address space specified by [addr, addr + len).
* The status is returned in a vector of bytes. The least significant
* bit of each byte is 1 if the referenced page is in memory, otherwise
* it is zero.
*
* Because the status of a page can change after mincore() checks it
* but before it returns to the application, the returned vector may
* contain stale information. Only locked pages are guaranteed to
* remain in memory.
*
* return values:
* zero - success
* -EFAULT - vec points to an illegal address
* -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
* -ENOMEM - Addresses in the range [addr, addr + len] are
* invalid for the address space of this process, or
* specify one or more pages which are not currently
* mapped
* -EAGAIN - A kernel resource was temporarily unavailable.
*/
asmlinkage long sys_mincore(unsigned long start, size_t len,
unsigned char __user * vec)
{
long retval;
unsigned long pages;
unsigned char *tmp;
/* Check the start address: needs to be page-aligned.. */
if (start & ~PAGE_CACHE_MASK)
return -EINVAL;
/* ..and we need to be passed a valid user-space range */
if (!access_ok(VERIFY_READ, (void __user *) start, len))
return -ENOMEM;
/* This also avoids any overflows on PAGE_CACHE_ALIGN */
pages = len >> PAGE_SHIFT;
pages += (len & ~PAGE_MASK) != 0;
if (!access_ok(VERIFY_WRITE, vec, pages))
return -EFAULT;
tmp = (void *) __get_free_page(GFP_USER);
if (!tmp)
return -EAGAIN;
retval = 0;
while (pages) {
/*
* Do at most PAGE_SIZE entries per iteration, due to
* the temporary buffer size.
*/
down_read(&current->mm->mmap_sem);
retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
up_read(&current->mm->mmap_sem);
if (retval <= 0)
break;
if (copy_to_user(vec, tmp, retval)) {
retval = -EFAULT;
break;
}
pages -= retval;
vec += retval;
start += retval << PAGE_SHIFT;
retval = 0;
}
free_page((unsigned long) tmp);
return retval;
}

254
mm/mlock.c Normal file
View File

@@ -0,0 +1,254 @@
/*
* linux/mm/mlock.c
*
* (C) Copyright 1995 Linus Torvalds
* (C) Copyright 2002 Christoph Hellwig
*/
#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, unsigned int newflags)
{
struct mm_struct * mm = vma->vm_mm;
pgoff_t pgoff;
int pages;
int ret = 0;
if (newflags == vma->vm_flags) {
*prev = vma;
goto out;
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
if (*prev) {
vma = *prev;
goto success;
}
*prev = vma;
if (start != vma->vm_start) {
ret = split_vma(mm, vma, start, 1);
if (ret)
goto out;
}
if (end != vma->vm_end) {
ret = split_vma(mm, vma, end, 0);
if (ret)
goto out;
}
success:
/*
* vm_flags is protected by the mmap_sem held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, make_pages_present below will bring it back.
*/
vma->vm_flags = newflags;
/*
* Keep track of amount of locked VM.
*/
pages = (end - start) >> PAGE_SHIFT;
if (newflags & VM_LOCKED) {
pages = -pages;
if (!(newflags & VM_IO))
ret = make_pages_present(start, end);
}
mm->locked_vm -= pages;
out:
if (ret == -ENOMEM)
ret = -EAGAIN;
return ret;
}
static int do_mlock(unsigned long start, size_t len, int on)
{
unsigned long nstart, end, tmp;
struct vm_area_struct * vma, * prev;
int error;
len = PAGE_ALIGN(len);
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
vma = find_vma_prev(current->mm, start, &prev);
if (!vma || vma->vm_start > start)
return -ENOMEM;
if (start > vma->vm_start)
prev = vma;
for (nstart = start ; ; ) {
unsigned int newflags;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
newflags = vma->vm_flags | VM_LOCKED;
if (!on)
newflags &= ~VM_LOCKED;
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
break;
nstart = tmp;
if (nstart < prev->vm_end)
nstart = prev->vm_end;
if (nstart >= end)
break;
vma = prev->vm_next;
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
}
}
return error;
}
asmlinkage long sys_mlock(unsigned long start, size_t len)
{
unsigned long locked;
unsigned long lock_limit;
int error = -ENOMEM;
if (!can_do_mlock())
return -EPERM;
down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
locked = len >> PAGE_SHIFT;
locked += current->mm->locked_vm;
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
lock_limit >>= PAGE_SHIFT;
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
return error;
}
asmlinkage long sys_munlock(unsigned long start, size_t len)
{
int ret;
down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
ret = do_mlock(start, len, 0);
up_write(&current->mm->mmap_sem);
return ret;
}
static int do_mlockall(int flags)
{
struct vm_area_struct * vma, * prev = NULL;
unsigned int def_flags = 0;
if (flags & MCL_FUTURE)
def_flags = VM_LOCKED;
current->mm->def_flags = def_flags;
if (flags == MCL_FUTURE)
goto out;
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
unsigned int newflags;
newflags = vma->vm_flags | VM_LOCKED;
if (!(flags & MCL_CURRENT))
newflags &= ~VM_LOCKED;
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
}
out:
return 0;
}
asmlinkage long sys_mlockall(int flags)
{
unsigned long lock_limit;
int ret = -EINVAL;
if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
goto out;
ret = -EPERM;
if (!can_do_mlock())
goto out;
down_write(&current->mm->mmap_sem);
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
lock_limit >>= PAGE_SHIFT;
ret = -ENOMEM;
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
up_write(&current->mm->mmap_sem);
out:
return ret;
}
asmlinkage long sys_munlockall(void)
{
int ret;
down_write(&current->mm->mmap_sem);
ret = do_mlockall(0);
up_write(&current->mm->mmap_sem);
return ret;
}
/*
* Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
* shm segments) get accounted against the user_struct instead.
*/
static DEFINE_SPINLOCK(shmlock_user_lock);
int user_shm_lock(size_t size, struct user_struct *user)
{
unsigned long lock_limit, locked;
int allowed = 0;
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock);
if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
goto out;
get_uid(user);
user->locked_shm += locked;
allowed = 1;
out:
spin_unlock(&shmlock_user_lock);
return allowed;
}
void user_shm_unlock(size_t size, struct user_struct *user)
{
spin_lock(&shmlock_user_lock);
user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
spin_unlock(&shmlock_user_lock);
free_uid(user);
}

2177
mm/mmap.c Normal file

File diff suppressed because it is too large Load Diff

44
mm/mmzone.c Normal file
View File

@@ -0,0 +1,44 @@
/*
* linux/mm/mmzone.c
*
* management codes for pgdats and zones.
*/
#include <linux/stddef.h>
#include <linux/mmzone.h>
#include <linux/module.h>
struct pglist_data *first_online_pgdat(void)
{
return NODE_DATA(first_online_node);
}
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
int nid = next_online_node(pgdat->node_id);
if (nid == MAX_NUMNODES)
return NULL;
return NODE_DATA(nid);
}
/*
* next_zone - helper magic for for_each_zone()
*/
struct zone *next_zone(struct zone *zone)
{
pg_data_t *pgdat = zone->zone_pgdat;
if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
zone++;
else {
pgdat = next_online_pgdat(pgdat);
if (pgdat)
zone = pgdat->node_zones;
else
zone = NULL;
}
return zone;
}

314
mm/mprotect.c Normal file
View File

@@ -0,0 +1,314 @@
/*
* mm/mprotect.c
*
* (C) Copyright 1994 Linus Torvalds
* (C) Copyright 2002 Christoph Hellwig
*
* Address space accounting code <alan@redhat.com>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
/* Avoid an SMP race with hardware updated dirty/clean
* bits by wiping the pte and then setting the new pte
* into place.
*/
ptent = ptep_get_and_clear(mm, addr, pte);
ptent = pte_modify(ptent, newprot);
/*
* Avoid taking write faults for pages we know to be
* dirty.
*/
if (dirty_accountable && pte_dirty(ptent))
ptent = pte_mkwrite(ptent);
set_pte_at(mm, addr, pte, ptent);
lazy_mmu_prot_update(ptent);
#ifdef CONFIG_MIGRATION
} else if (!pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
/*
* A protection check is difficult so
* just be safe and disable write
*/
make_migration_entry_read(&entry);
set_pte_at(mm, addr, pte,
swp_entry_to_pte(entry));
}
#endif
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
}
static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
} while (pmd++, addr = next, addr != end);
}
static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
pud_t *pud;
unsigned long next;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
} while (pud++, addr = next, addr != end);
}
static void change_protection(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
} while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end);
}
static int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long start, unsigned long end, unsigned long newflags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long oldflags = vma->vm_flags;
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned long charged = 0;
pgoff_t pgoff;
int error;
int dirty_accountable = 0;
if (newflags == oldflags) {
*pprev = vma;
return 0;
}
/*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
* make it unwritable again.
*
* FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
* a MAP_NORESERVE private mapping to writable will now reserve.
*/
if (newflags & VM_WRITE) {
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
charged = nrpages;
if (security_vm_enough_memory(charged))
return -ENOMEM;
newflags |= VM_ACCOUNT;
}
}
/*
* First try to merge with previous and/or next vma.
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
if (*pprev) {
vma = *pprev;
goto success;
}
*pprev = vma;
if (start != vma->vm_start) {
error = split_vma(mm, vma, start, 1);
if (error)
goto fail;
}
if (end != vma->vm_end) {
error = split_vma(mm, vma, end, 0);
if (error)
goto fail;
}
success:
/*
* vm_flags and vm_page_prot are protected by the mmap_sem
* held in write mode.
*/
vma->vm_flags = newflags;
vma->vm_page_prot = protection_map[newflags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
if (vma_wants_writenotify(vma)) {
vma->vm_page_prot = protection_map[newflags &
(VM_READ|VM_WRITE|VM_EXEC)];
dirty_accountable = 1;
}
if (is_vm_hugetlb_page(vma))
hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
else
change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
return 0;
fail:
vm_unacct_memory(charged);
return error;
}
asmlinkage long
sys_mprotect(unsigned long start, size_t len, unsigned long prot)
{
unsigned long vm_flags, nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
return -EINVAL;
if (start & ~PAGE_MASK)
return -EINVAL;
if (!len)
return 0;
len = PAGE_ALIGN(len);
end = start + len;
if (end <= start)
return -ENOMEM;
if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
return -EINVAL;
reqprot = prot;
/*
* Does the application expect PROT_READ to imply PROT_EXEC:
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
vm_flags = calc_vm_prot_bits(prot);
down_write(&current->mm->mmap_sem);
vma = find_vma_prev(current->mm, start, &prev);
error = -ENOMEM;
if (!vma)
goto out;
if (unlikely(grows & PROT_GROWSDOWN)) {
if (vma->vm_start >= end)
goto out;
start = vma->vm_start;
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto out;
}
else {
if (vma->vm_start > start)
goto out;
if (unlikely(grows & PROT_GROWSUP)) {
end = vma->vm_end;
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSUP))
goto out;
}
}
if (start > vma->vm_start)
prev = vma;
for (nstart = start ; ; ) {
unsigned long newflags;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
error = -EACCES;
goto out;
}
error = security_file_mprotect(vma, reqprot, prot);
if (error)
goto out;
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
goto out;
nstart = tmp;
if (nstart < prev->vm_end)
nstart = prev->vm_end;
if (nstart >= end)
goto out;
vma = prev->vm_next;
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
goto out;
}
}
out:
up_write(&current->mm->mmap_sem);
return error;
}

416
mm/mremap.c Normal file
View File

@@ -0,0 +1,416 @@
/*
* mm/mremap.c
*
* (C) Copyright 1996 Linus Torvalds
*
* Address space accounting code <alan@redhat.com>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (pgd_none_or_clear_bad(pgd))
return NULL;
pud = pud_offset(pgd, addr);
if (pud_none_or_clear_bad(pud))
return NULL;
pmd = pmd_offset(pud, addr);
if (pmd_none_or_clear_bad(pmd))
return NULL;
return pmd;
}
static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
pud = pud_alloc(mm, pgd, addr);
if (!pud)
return NULL;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return NULL;
if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
return NULL;
return pmd;
}
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr)
{
struct address_space *mapping = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
if (vma->vm_file) {
/*
* Subtle point from Rajesh Venkatasubramanian: before
* moving file-based ptes, we must lock vmtruncate out,
* since it might clean the dst vma before the src vma,
* and we propagate stale pages into the dst afterward.
*/
mapping = vma->vm_file->f_mapping;
spin_lock(&mapping->i_mmap_lock);
if (new_vma->vm_truncate_count &&
new_vma->vm_truncate_count != vma->vm_truncate_count)
new_vma->vm_truncate_count = 0;
}
/*
* We don't have to worry about the ordering of src and dst
* pte locks because exclusive mmap_sem prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
new_pte = pte_offset_map_nested(new_pmd, new_addr);
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
arch_enter_lazy_mmu_mode();
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
if (pte_none(*old_pte))
continue;
pte = ptep_clear_flush(vma, old_addr, old_pte);
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
set_pte_at(mm, new_addr, new_pte, pte);
}
arch_leave_lazy_mmu_mode();
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
pte_unmap_nested(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
static unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len)
{
unsigned long extent, next, old_end;
pmd_t *old_pmd, *new_pmd;
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
next = (old_addr + PMD_SIZE) & PMD_MASK;
if (next - 1 > old_end)
next = old_end;
extent = next - old_addr;
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
if (!new_pmd)
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
extent = next - new_addr;
if (extent > LATENCY_LIMIT)
extent = LATENCY_LIMIT;
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
new_vma, new_pmd, new_addr);
}
return len + old_addr - old_end; /* how much done */
}
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
unsigned long new_len, unsigned long new_addr)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
unsigned long new_pgoff;
unsigned long moved_len;
unsigned long excess = 0;
unsigned long hiwater_vm;
int split = 0;
/*
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/
if (mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
if (!new_vma)
return -ENOMEM;
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
if (moved_len < old_len) {
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
*/
move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
}
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
vma->vm_flags &= ~VM_ACCOUNT;
excess = vma->vm_end - vma->vm_start - old_len;
if (old_addr > vma->vm_start &&
old_addr + old_len < vma->vm_end)
split = 1;
}
/*
* If we failed to move page tables we still do total_vm increment
* since do_munmap() will decrement it by old_len == new_len.
*
* Since total_vm is about to be raised artificially high for a
* moment, we need to restore high watermark afterwards: if stats
* are taken meanwhile, total_vm and hiwater_vm appear too high.
* If this were a serious issue, we'd add a flag to do_munmap().
*/
hiwater_vm = mm->hiwater_vm;
mm->total_vm += new_len >> PAGE_SHIFT;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
if (do_munmap(mm, old_addr, old_len) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
}
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
if (excess) {
vma->vm_flags |= VM_ACCOUNT;
if (split)
vma->vm_next->vm_flags |= VM_ACCOUNT;
}
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
if (new_len > old_len)
make_pages_present(new_addr + old_len,
new_addr + new_len);
}
return new_addr;
}
/*
* Expand (or shrink) an existing mapping, potentially moving it at the
* same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
*
* MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
* This option implies MREMAP_MAYMOVE.
*/
unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
goto out;
if (addr & ~PAGE_MASK)
goto out;
old_len = PAGE_ALIGN(old_len);
new_len = PAGE_ALIGN(new_len);
/*
* We allow a zero old-len as a special case
* for DOS-emu "duplicate shm area" thing. But
* a zero new-len is nonsensical.
*/
if (!new_len)
goto out;
/* new_addr is only valid if MREMAP_FIXED is specified */
if (flags & MREMAP_FIXED) {
if (new_addr & ~PAGE_MASK)
goto out;
if (!(flags & MREMAP_MAYMOVE))
goto out;
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
goto out;
/* Check if the location we're moving into overlaps the
* old location at all, and fail if it does.
*/
if ((new_addr <= addr) && (new_addr+new_len) > addr)
goto out;
if ((addr <= new_addr) && (addr+old_len) > new_addr)
goto out;
ret = do_munmap(mm, new_addr, new_len);
if (ret)
goto out;
}
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
* do_munmap does all the needed commit accounting
*/
if (old_len >= new_len) {
ret = do_munmap(mm, addr+new_len, old_len - new_len);
if (ret && old_len != new_len)
goto out;
ret = addr;
if (!(flags & MREMAP_FIXED) || (new_addr == addr))
goto out;
old_len = new_len;
}
/*
* Ok, we need to grow.. or relocate.
*/
ret = -EFAULT;
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
goto out;
if (is_vm_hugetlb_page(vma)) {
ret = -EINVAL;
goto out;
}
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
goto out;
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
if (new_len > old_len)
goto out;
}
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
locked = mm->locked_vm << PAGE_SHIFT;
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
locked += new_len - old_len;
ret = -EAGAIN;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
goto out;
}
if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
ret = -ENOMEM;
goto out;
}
if (vma->vm_flags & VM_ACCOUNT) {
charged = (new_len - old_len) >> PAGE_SHIFT;
if (security_vm_enough_memory(charged))
goto out_nc;
}
/* old_len exactly to the end of the area..
* And we're not relocating the area.
*/
if (old_len == vma->vm_end - addr &&
!((flags & MREMAP_FIXED) && (addr != new_addr)) &&
(old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
unsigned long max_addr = TASK_SIZE;
if (vma->vm_next)
max_addr = vma->vm_next->vm_start;
/* can we just expand the current mapping? */
if (max_addr - addr >= new_len) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
vma_adjust(vma, vma->vm_start,
addr + new_len, vma->vm_pgoff, NULL);
mm->total_vm += pages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
if (vma->vm_flags & VM_LOCKED) {
mm->locked_vm += pages;
make_pages_present(addr + old_len,
addr + new_len);
}
ret = addr;
goto out;
}
}
/*
* We weren't able to just expand or shrink the area,
* we need to create a new one and move it..
*/
ret = -ENOMEM;
if (flags & MREMAP_MAYMOVE) {
if (!(flags & MREMAP_FIXED)) {
unsigned long map_flags = 0;
if (vma->vm_flags & VM_MAYSHARE)
map_flags |= MAP_SHARED;
new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
vma->vm_pgoff, map_flags);
ret = new_addr;
if (new_addr & ~PAGE_MASK)
goto out;
}
ret = move_vma(vma, addr, old_len, new_len, new_addr);
}
out:
if (ret & ~PAGE_MASK)
vm_unacct_memory(charged);
out_nc:
return ret;
}
asmlinkage unsigned long sys_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
unsigned long ret;
down_write(&current->mm->mmap_sem);
ret = do_mremap(addr, old_len, new_len, flags, new_addr);
up_write(&current->mm->mmap_sem);
return ret;
}

102
mm/msync.c Normal file
View File

@@ -0,0 +1,102 @@
/*
* linux/mm/msync.c
*
* Copyright (C) 1994-1999 Linus Torvalds
*/
/*
* The msync() system call.
*/
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/syscalls.h>
/*
* MS_SYNC syncs the entire file - including mappings.
*
* MS_ASYNC does not start I/O (it used to, up to 2.5.67).
* Nor does it marks the relevant pages dirty (it used to up to 2.6.17).
* Now it doesn't do anything, since dirty pages are properly tracked.
*
* The application may now run fsync() to
* write out the dirty pages and wait on the writeout and check the result.
* Or the application may run fadvise(FADV_DONTNEED) against the fd to start
* async writeout immediately.
* So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
* applications.
*/
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
unsigned long end;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int unmapped_error = 0;
int error = -EINVAL;
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out;
if (start & ~PAGE_MASK)
goto out;
if ((flags & MS_ASYNC) && (flags & MS_SYNC))
goto out;
error = -ENOMEM;
len = (len + ~PAGE_MASK) & PAGE_MASK;
end = start + len;
if (end < start)
goto out;
error = 0;
if (end == start)
goto out;
/*
* If the interval [start,end) covers some unmapped address ranges,
* just ignore them, but return -ENOMEM at the end.
*/
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
for (;;) {
struct file *file;
/* Still start < end. */
error = -ENOMEM;
if (!vma)
goto out_unlock;
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
start = vma->vm_start;
if (start >= end)
goto out_unlock;
unmapped_error = -ENOMEM;
}
/* Here vma->vm_start <= start < vma->vm_end. */
if ((flags & MS_INVALIDATE) &&
(vma->vm_flags & VM_LOCKED)) {
error = -EBUSY;
goto out_unlock;
}
file = vma->vm_file;
start = vma->vm_end;
if ((flags & MS_SYNC) && file &&
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
error = do_fsync(file, 0);
fput(file);
if (error || start >= end)
goto out;
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
} else {
if (start >= end) {
error = 0;
goto out_unlock;
}
vma = vma->vm_next;
}
}
out_unlock:
up_read(&mm->mmap_sem);
out:
return error ? : unmapped_error;
}

1377
mm/nommu.c Normal file

File diff suppressed because it is too large Load Diff

470
mm/oom_kill.c Normal file
View File

@@ -0,0 +1,470 @@
/*
* linux/mm/oom_kill.c
*
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
*
* The routines in this file are used to kill a process when
* we're seriously out of memory. This gets called from __alloc_pages()
* in mm/page_alloc.c when we really run out of memory.
*
* Since we won't call these routines often (on a well-configured
* machine) this file will double as a 'coding guide' and a signpost
* for newbie kernel hackers. It features several pointers to major
* kernel subsystems and hints as to where to find out what things do.
*/
#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/module.h>
#include <linux/notifier.h>
int sysctl_panic_on_oom;
/* #define DEBUG */
/**
* badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
* @uptime: current uptime in seconds
*
* The formula used is relatively simple and documented inline in the
* function. The main rationale is that we want to select a good task
* to kill when we run out of memory.
*
* Good in this context means that:
* 1) we lose the minimum amount of work done
* 2) we recover a large amount of memory
* 3) we don't kill anything innocent of eating tons of memory
* 4) we want to kill the minimum amount of processes (one)
* 5) we try to kill the process the user expects us to kill, this
* algorithm has been meticulously tuned to meet the principle
* of least surprise ... (be careful when you change it)
*/
unsigned long badness(struct task_struct *p, unsigned long uptime)
{
unsigned long points, cpu_time, run_time, s;
struct mm_struct *mm;
struct task_struct *child;
task_lock(p);
mm = p->mm;
if (!mm) {
task_unlock(p);
return 0;
}
/*
* The memory size of the process is the basis for the badness.
*/
points = mm->total_vm;
/*
* After this unlock we can no longer dereference local variable `mm'
*/
task_unlock(p);
/*
* swapoff can easily use up all memory, so kill those first.
*/
if (p->flags & PF_SWAPOFF)
return ULONG_MAX;
/*
* Processes which fork a lot of child processes are likely
* a good choice. We add half the vmsize of the children if they
* have an own mm. This prevents forking servers to flood the
* machine with an endless amount of children. In case a single
* child is eating the vast majority of memory, adding only half
* to the parents will make the child our kill candidate of choice.
*/
list_for_each_entry(child, &p->children, sibling) {
task_lock(child);
if (child->mm != mm && child->mm)
points += child->mm->total_vm/2 + 1;
task_unlock(child);
}
/*
* CPU time is in tens of seconds and run time is in thousands
* of seconds. There is no particular reason for this other than
* that it turned out to work very well in practice.
*/
cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
>> (SHIFT_HZ + 3);
if (uptime >= p->start_time.tv_sec)
run_time = (uptime - p->start_time.tv_sec) >> 10;
else
run_time = 0;
s = int_sqrt(cpu_time);
if (s)
points /= s;
s = int_sqrt(int_sqrt(run_time));
if (s)
points /= s;
/*
* Niced processes are most likely less important, so double
* their badness points.
*/
if (task_nice(p) > 0)
points *= 2;
/*
* Superuser processes are usually more important, so we make it
* less likely that we kill those.
*/
if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
p->uid == 0 || p->euid == 0)
points /= 4;
/*
* We don't want to kill a process with direct hardware access.
* Not only could that mess up the hardware, but usually users
* tend to only have this flag set on applications they think
* of as important.
*/
if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
points /= 4;
/*
* If p's nodes don't overlap ours, it may still help to kill p
* because p may have allocated or otherwise mapped memory on
* this node before. However it will be less likely.
*/
if (!cpuset_excl_nodes_overlap(p))
points /= 8;
/*
* Adjust the score by oomkilladj.
*/
if (p->oomkilladj) {
if (p->oomkilladj > 0)
points <<= p->oomkilladj;
else
points >>= -(p->oomkilladj);
}
#ifdef DEBUG
printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
p->pid, p->comm, points);
#endif
return points;
}
/*
* Types of limitations to the nodes from which allocations may occur
*/
#define CONSTRAINT_NONE 1
#define CONSTRAINT_MEMORY_POLICY 2
#define CONSTRAINT_CPUSET 3
/*
* Determine the type of allocation constraint.
*/
static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
{
#ifdef CONFIG_NUMA
struct zone **z;
nodemask_t nodes;
int node;
nodes_clear(nodes);
/* node has memory ? */
for_each_online_node(node)
if (NODE_DATA(node)->node_present_pages)
node_set(node, nodes);
for (z = zonelist->zones; *z; z++)
if (cpuset_zone_allowed_softwall(*z, gfp_mask))
node_clear(zone_to_nid(*z), nodes);
else
return CONSTRAINT_CPUSET;
if (!nodes_empty(nodes))
return CONSTRAINT_MEMORY_POLICY;
#endif
return CONSTRAINT_NONE;
}
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. We expect the caller will lock the tasklist.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
static struct task_struct *select_bad_process(unsigned long *ppoints)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
*ppoints = 0;
do_posix_clock_monotonic_gettime(&uptime);
do_each_thread(g, p) {
unsigned long points;
/*
* skip kernel threads and tasks which have already released
* their mm.
*/
if (!p->mm)
continue;
/* skip the init task */
if (is_init(p))
continue;
/*
* This task already has access to memory reserves and is
* being killed. Don't allow any other task access to the
* memory reserve.
*
* Note: this may have a chance of deadlock if it gets
* blocked waiting for another task which itself is waiting
* for memory. Is there a better alternative?
*/
if (test_tsk_thread_flag(p, TIF_MEMDIE))
return ERR_PTR(-1UL);
/*
* This is in the process of releasing memory so wait for it
* to finish before killing some other task by mistake.
*
* However, if p is the current task, we allow the 'kill' to
* go ahead if it is exiting: this will simply set TIF_MEMDIE,
* which will allow it to gain access to memory reserves in
* the process of exiting and releasing its resources.
* Otherwise we could get an easy OOM deadlock.
*/
if (p->flags & PF_EXITING) {
if (p != current)
return ERR_PTR(-1UL);
chosen = p;
*ppoints = ULONG_MAX;
}
if (p->oomkilladj == OOM_DISABLE)
continue;
points = badness(p, uptime.tv_sec);
if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
}
} while_each_thread(g, p);
return chosen;
}
/**
* Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
* flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
* set.
*/
static void __oom_kill_task(struct task_struct *p, int verbose)
{
if (is_init(p)) {
WARN_ON(1);
printk(KERN_WARNING "tried to kill init!\n");
return;
}
if (!p->mm) {
WARN_ON(1);
printk(KERN_WARNING "tried to kill an mm-less task!\n");
return;
}
if (verbose)
printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
/*
* We give our sacrificial lamb high priority and access to
* all the memory it needs. That way it should be able to
* exit() and clear out its resources quickly...
*/
p->time_slice = HZ;
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
}
static int oom_kill_task(struct task_struct *p)
{
struct mm_struct *mm;
struct task_struct *g, *q;
mm = p->mm;
/* WARNING: mm may not be dereferenced since we did not obtain its
* value from get_task_mm(p). This is OK since all we need to do is
* compare mm to q->mm below.
*
* Furthermore, even if mm contains a non-NULL value, p->mm may
* change to NULL at any time since we do not hold task_lock(p).
* However, this is of no concern to us.
*/
if (mm == NULL)
return 1;
/*
* Don't kill the process if any threads are set to OOM_DISABLE
*/
do_each_thread(g, q) {
if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
return 1;
} while_each_thread(g, q);
__oom_kill_task(p, 1);
/*
* kill all processes that share the ->mm (i.e. all threads),
* but are in a different thread group. Don't let them have access
* to memory reserves though, otherwise we might deplete all memory.
*/
do_each_thread(g, q) {
if (q->mm == mm && q->tgid != p->tgid)
force_sig(SIGKILL, q);
} while_each_thread(g, q);
return 0;
}
static int oom_kill_process(struct task_struct *p, unsigned long points,
const char *message)
{
struct task_struct *c;
struct list_head *tsk;
/*
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
if (p->flags & PF_EXITING) {
__oom_kill_task(p, 0);
return 0;
}
printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
message, p->pid, p->comm, points);
/* Try to kill a child first */
list_for_each(tsk, &p->children) {
c = list_entry(tsk, struct task_struct, sibling);
if (c->mm == p->mm)
continue;
if (!oom_kill_task(c))
return 0;
}
return oom_kill_task(p);
}
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
int register_oom_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);
int unregister_oom_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
/**
* out_of_memory - kill the "best" process when we run out of memory
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
struct task_struct *p;
unsigned long points = 0;
unsigned long freed = 0;
int constraint;
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
return;
if (printk_ratelimit()) {
printk(KERN_WARNING "%s invoked oom-killer: "
"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
current->comm, gfp_mask, order, current->oomkilladj);
dump_stack();
show_mem();
}
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
constraint = constrained_alloc(zonelist, gfp_mask);
cpuset_lock();
read_lock(&tasklist_lock);
switch (constraint) {
case CONSTRAINT_MEMORY_POLICY:
oom_kill_process(current, points,
"No available memory (MPOL_BIND)");
break;
case CONSTRAINT_CPUSET:
oom_kill_process(current, points,
"No available memory in cpuset");
break;
case CONSTRAINT_NONE:
if (sysctl_panic_on_oom)
panic("out of memory. panic_on_oom is selected\n");
retry:
/*
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
p = select_bad_process(&points);
if (PTR_ERR(p) == -1UL)
goto out;
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
cpuset_unlock();
panic("Out of memory and no killable processes...\n");
}
if (oom_kill_process(p, points, "Out of memory"))
goto retry;
break;
}
out:
read_unlock(&tasklist_lock);
cpuset_unlock();
/*
* Give "p" a good chance of killing itself before we
* retry to allocate memory unless "p" is current
*/
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}

981
mm/page-writeback.c Normal file
View File

@@ -0,0 +1,981 @@
/*
* mm/page-writeback.c
*
* Copyright (C) 2002, Linus Torvalds.
*
* Contains functions related to writing back dirty pages at the
* address_space level.
*
* 10Apr2002 akpm@zip.com.au
* Initial version
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
/*
* The maximum number of pages to writeout in a single bdflush/kupdate
* operation. We do this so we don't hold I_LOCK against an inode for
* enormous amounts of time, which would block a userspace task which has
* been forced to throttle against that inode. Also, the code reevaluates
* the dirty each time it has written this many pages.
*/
#define MAX_WRITEBACK_PAGES 1024
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
static long ratelimit_pages = 32;
static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
/*
* When balance_dirty_pages decides that the caller needs to perform some
* non-background writeback, this is how many pages it will attempt to write.
* It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
* large amounts of I/O are submitted.
*/
static inline long sync_writeback_pages(void)
{
return ratelimit_pages + ratelimit_pages / 2;
}
/* The following parameters are exported via /proc/sys/vm */
/*
* Start background writeback (via pdflush) at this percentage
*/
int dirty_background_ratio = 10;
/*
* The generator of dirty data starts writeback at this percentage
*/
int vm_dirty_ratio = 40;
/*
* The interval between `kupdate'-style writebacks, in jiffies
*/
int dirty_writeback_interval = 5 * HZ;
/*
* The longest number of jiffies for which data is allowed to remain dirty
*/
int dirty_expire_interval = 30 * HZ;
/*
* Flag that makes the machine dump writes/reads and block dirtyings.
*/
int block_dump;
/*
* Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
* a full sync is triggered after this time elapses without any disk activity.
*/
int laptop_mode;
EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
static void background_writeout(unsigned long _min_pages);
/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
*
* The main aim here is to lower them aggressively if there is a lot of mapped
* memory around. To avoid stressing page reclaim with lots of unreclaimable
* pages. It is better to clamp down on writers than to start swapping, and
* performing lots of scanning.
*
* We only allow 1/2 of the currently-unmapped memory to be dirtied.
*
* We don't permit the clamping level to fall below 5% - that is getting rather
* excessive.
*
* We make sure that the background writeout level is below the adjusted
* clamping level.
*/
static void
get_dirty_limits(long *pbackground, long *pdirty,
struct address_space *mapping)
{
int background_ratio; /* Percentages */
int dirty_ratio;
int unmapped_ratio;
long background;
long dirty;
unsigned long available_memory = vm_total_pages;
struct task_struct *tsk;
#ifdef CONFIG_HIGHMEM
/*
* We always exclude high memory from our count.
*/
available_memory -= totalhigh_pages;
#endif
unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES)) * 100) /
vm_total_pages;
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
dirty_ratio = unmapped_ratio / 2;
if (dirty_ratio < 5)
dirty_ratio = 5;
background_ratio = dirty_background_ratio;
if (background_ratio >= dirty_ratio)
background_ratio = dirty_ratio / 2;
background = (background_ratio * available_memory) / 100;
dirty = (dirty_ratio * available_memory) / 100;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
background += background / 4;
dirty += dirty / 4;
}
*pbackground = background;
*pdirty = dirty;
}
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
* the caller to perform writeback if the system is over `vm_dirty_ratio'.
* If we're over `background_thresh' then pdflush is woken to perform some
* writeout.
*/
static void balance_dirty_pages(struct address_space *mapping)
{
long nr_reclaimable;
long background_thresh;
long dirty_thresh;
unsigned long pages_written = 0;
unsigned long write_chunk = sync_writeback_pages();
struct backing_dev_info *bdi = mapping->backing_dev_info;
for (;;) {
struct writeback_control wbc = {
.bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = write_chunk,
.range_cyclic = 1,
};
get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
dirty_thresh)
break;
if (!dirty_exceeded)
dirty_exceeded = 1;
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
if (nr_reclaimable) {
writeback_inodes(&wbc);
get_dirty_limits(&background_thresh,
&dirty_thresh, mapping);
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
if (nr_reclaimable +
global_page_state(NR_WRITEBACK)
<= dirty_thresh)
break;
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
break; /* We've done our duty */
}
congestion_wait(WRITE, HZ/10);
}
if (nr_reclaimable + global_page_state(NR_WRITEBACK)
<= dirty_thresh && dirty_exceeded)
dirty_exceeded = 0;
if (writeback_in_progress(bdi))
return; /* pdflush is already working this queue */
/*
* In laptop mode, we wait until hitting the higher threshold before
* starting background writeout, and then write out all the way down
* to the lower threshold. So slow writers cause minimal disk activity.
*
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
(!laptop_mode && (nr_reclaimable > background_thresh)))
pdflush_operation(background_writeout, 0);
}
void set_page_dirty_balance(struct page *page)
{
if (set_page_dirty(page)) {
struct address_space *mapping = page_mapping(page);
if (mapping)
balance_dirty_pages_ratelimited(mapping);
}
}
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
* @nr_pages_dirtied: number of pages which the caller has just dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
* On really big machines, get_writeback_state is expensive, so try to avoid
* calling it too often (ratelimiting). But once we're over the dirty memory
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
unsigned long ratelimit;
unsigned long *p;
ratelimit = ratelimit_pages;
if (dirty_exceeded)
ratelimit = 8;
/*
* Check the rate limiting. Also, we do not want to throttle real-time
* tasks in balance_dirty_pages(). Period.
*/
preempt_disable();
p = &__get_cpu_var(ratelimits);
*p += nr_pages_dirtied;
if (unlikely(*p >= ratelimit)) {
*p = 0;
preempt_enable();
balance_dirty_pages(mapping);
return;
}
preempt_enable();
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
void throttle_vm_writeout(gfp_t gfp_mask)
{
long background_thresh;
long dirty_thresh;
if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
/*
* The caller might hold locks which can prevent IO completion
* or progress in the filesystem. So we cannot just sit here
* waiting for IO to complete.
*/
congestion_wait(WRITE, HZ/10);
return;
}
for ( ; ; ) {
get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
/*
* Boost the allowable dirty threshold a bit for page
* allocators so they don't get DoS'ed by heavy writers
*/
dirty_thresh += dirty_thresh / 10; /* wheeee... */
if (global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
congestion_wait(WRITE, HZ/10);
}
}
/*
* writeback at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
static void background_writeout(unsigned long _min_pages)
{
long min_pages = _min_pages;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = 0,
.nonblocking = 1,
.range_cyclic = 1,
};
for ( ; ; ) {
long background_thresh;
long dirty_thresh;
get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
if (global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) < background_thresh
&& min_pages <= 0)
break;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
writeback_inodes(&wbc);
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
/* Wrote less than expected */
congestion_wait(WRITE, HZ/10);
if (!wbc.encountered_congestion)
break;
}
}
}
/*
* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/
int wakeup_pdflush(long nr_pages)
{
if (nr_pages == 0)
nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
return pdflush_operation(background_writeout, nr_pages);
}
static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);
static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
/*
* Periodic writeback of "old" data.
*
* Define "old": the first time one of an inode's pages is dirtied, we mark the
* dirtying-time in the inode's address_space. So this periodic writeback code
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
* Try to run once per dirty_writeback_interval. But if a writeback event
* takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
static void wb_kupdate(unsigned long arg)
{
unsigned long oldest_jif;
unsigned long start_jif;
unsigned long next_jif;
long nr_to_write;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif,
.nr_to_write = 0,
.nonblocking = 1,
.for_kupdate = 1,
.range_cyclic = 1,
};
sync_supers();
oldest_jif = jiffies - dirty_expire_interval;
start_jif = jiffies;
next_jif = start_jif + dirty_writeback_interval;
nr_to_write = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc);
if (wbc.nr_to_write > 0) {
if (wbc.encountered_congestion)
congestion_wait(WRITE, HZ/10);
else
break; /* All the old data is written */
}
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
}
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
if (dirty_writeback_interval)
mod_timer(&wb_timer, next_jif);
}
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
if (dirty_writeback_interval) {
mod_timer(&wb_timer,
jiffies + dirty_writeback_interval);
} else {
del_timer(&wb_timer);
}
return 0;
}
static void wb_timer_fn(unsigned long unused)
{
if (pdflush_operation(wb_kupdate, 0) < 0)
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
}
static void laptop_flush(unsigned long unused)
{
sys_sync();
}
static void laptop_timer_fn(unsigned long unused)
{
pdflush_operation(laptop_flush, 0);
}
/*
* We've spun up the disk and we're in laptop mode: schedule writeback
* of all dirty data a few seconds from now. If the flush is already scheduled
* then push it back - the user is still using the disk.
*/
void laptop_io_completion(void)
{
mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
* We're in laptop mode and we've just synced. The sync's writes will have
* caused another writeback to be scheduled by laptop_io_completion.
* Nothing needs to be written back anymore, so we unschedule the writeback.
*/
void laptop_sync_completion(void)
{
del_timer(&laptop_mode_wb_timer);
}
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
* if a large number of processes all perform writes at the same time.
* If it is too low then SMP machines will call the (expensive)
* get_writeback_state too often.
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
* thresholds before writeback cuts in.
*
* But the limit should not be set too high. Because it also controls the
* amount of memory which the balance_dirty_pages() caller has to write back.
* If this is too large then the caller will block on the IO queue all the
* time. So limit it to four megabytes - the balance_dirty_pages() caller
* will write six megabyte chunks, max.
*/
void writeback_set_ratelimit(void)
{
ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
}
static int __cpuinit
ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
{
writeback_set_ratelimit();
return NOTIFY_DONE;
}
static struct notifier_block __cpuinitdata ratelimit_nb = {
.notifier_call = ratelimit_handler,
.next = NULL,
};
/*
* Called early on to tune the page writeback dirty limits.
*
* We used to scale dirty pages according to how total memory
* related to pages that could be allocated for buffers (by
* comparing nr_free_buffer_pages() to vm_total_pages.
*
* However, that was when we used "dirty_ratio" to scale with
* all memory, and we don't do that any more. "dirty_ratio"
* is now applied to total non-HIGHPAGE memory (by subtracting
* totalhigh_pages from vm_total_pages), and as such we can't
* get into the old insane situation any more where we had
* large amounts of dirty pages compared to a small amount of
* non-HIGHMEM memory.
*
* But we might still want to scale the dirty_ratio by how
* much memory the box has..
*/
void __init page_writeback_init(void)
{
mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
}
/**
* generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
*
* This is a library function, which implements the writepages()
* address_space_operation.
*
* If a page is already under I/O, generic_writepages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
* and msync() need to guarantee that all the data which was dirty at the time
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
*
* Derived from mpage_writepages() - if you fix this you should check that
* also!
*/
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
int ret = 0;
int done = 0;
int (*writepage)(struct page *page, struct writeback_control *wbc);
struct pagevec pvec;
int nr_pages;
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
int range_whole = 0;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
return 0;
}
writepage = mapping->a_ops->writepage;
/* deal with chardevs and other special file */
if (!writepage)
return 0;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
scanned = 1;
}
retry:
while (!done && (index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
unsigned i;
scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
/*
* At this point we hold neither mapping->tree_lock nor
* lock on the page itself: the page may be truncated or
* invalidated (changing page->mapping to NULL), or even
* swizzled back from swapper_space to tmpfs file
* mapping
*/
lock_page(page);
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
continue;
}
if (!wbc->range_cyclic && page->index > end) {
done = 1;
unlock_page(page);
continue;
}
if (wbc->sync_mode != WB_SYNC_NONE)
wait_on_page_writeback(page);
if (PageWriteback(page) ||
!clear_page_dirty_for_io(page)) {
unlock_page(page);
continue;
}
ret = (*writepage)(page, wbc);
if (ret) {
if (ret == -ENOSPC)
set_bit(AS_ENOSPC, &mapping->flags);
else
set_bit(AS_EIO, &mapping->flags);
}
if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
unlock_page(page);
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
done = 1;
}
}
pagevec_release(&pvec);
cond_resched();
}
if (!scanned && !done) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
*/
scanned = 1;
index = 0;
goto retry;
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
return ret;
}
EXPORT_SYMBOL(generic_writepages);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
if (wbc->nr_to_write <= 0)
return 0;
wbc->for_writepages = 1;
if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
else
ret = generic_writepages(mapping, wbc);
wbc->for_writepages = 0;
return ret;
}
/**
* write_one_page - write out a single page and optionally wait on I/O
* @page: the page to write
* @wait: if true, wait on writeout
*
* The page must be locked by the caller and will be unlocked upon return.
*
* write_one_page() returns a negative error code if I/O failed.
*/
int write_one_page(struct page *page, int wait)
{
struct address_space *mapping = page->mapping;
int ret = 0;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 1,
};
BUG_ON(!PageLocked(page));
if (wait)
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
page_cache_get(page);
ret = mapping->a_ops->writepage(page, &wbc);
if (ret == 0 && wait) {
wait_on_page_writeback(page);
if (PageError(page))
ret = -EIO;
}
page_cache_release(page);
} else {
unlock_page(page);
}
return ret;
}
EXPORT_SYMBOL(write_one_page);
/*
* For address_spaces which do not use buffers nor write back.
*/
int __set_page_dirty_no_writeback(struct page *page)
{
if (!PageDirty(page))
SetPageDirty(page);
return 0;
}
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
* This is also used when a single buffer is being dirtied: we want to set the
* page dirty in that case, but not all the buffers. This is a "bottom-up"
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
*
* Most callers have locked the page, which pins the address_space in memory.
* But zap_pte_range() does not lock the page, however in that case the
* mapping is pinned by the vma's ->vm_file reference.
*
* We take care to handle the case where the page was truncated from the
* mapping by re-checking page_mapping() insode tree_lock.
*/
int __set_page_dirty_nobuffers(struct page *page)
{
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
if (!mapping)
return 1;
write_lock_irq(&mapping->tree_lock);
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
task_io_account_write(PAGE_CACHE_SIZE);
}
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
write_unlock_irq(&mapping->tree_lock);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return 1;
}
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
* When a writepage implementation decides that it doesn't want to write this
* page for some reason, it should redirty the locked page via
* redirty_page_for_writepage() and it should then unlock the page and return 0
*/
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
wbc->pages_skipped++;
return __set_page_dirty_nobuffers(page);
}
EXPORT_SYMBOL(redirty_page_for_writepage);
/*
* If the mapping doesn't provide a set_page_dirty a_op, then
* just fall through and assume that it wants buffer_heads.
*/
int fastcall set_page_dirty(struct page *page)
{
struct address_space *mapping = page_mapping(page);
if (likely(mapping)) {
int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
#ifdef CONFIG_BLOCK
if (!spd)
spd = __set_page_dirty_buffers;
#endif
return (*spd)(page);
}
if (!PageDirty(page)) {
if (!TestSetPageDirty(page))
return 1;
}
return 0;
}
EXPORT_SYMBOL(set_page_dirty);
/*
* set_page_dirty() is racy if the caller has no reference against
* page->mapping->host, and if the page is unlocked. This is because another
* CPU could truncate the page off the mapping and then free the mapping.
*
* Usually, the page _is_ locked, or the caller is a user-space process which
* holds a reference on the inode by having an open file.
*
* In other cases, the page should be locked before running set_page_dirty().
*/
int set_page_dirty_lock(struct page *page)
{
int ret;
lock_page_nosync(page);
ret = set_page_dirty(page);
unlock_page(page);
return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);
/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
* Returns true if the page was previously dirty.
*
* This is for preparing to put the page under writeout. We leave the page
* tagged as dirty in the radix tree so that a concurrent write-for-sync
* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
* implementation will run either set_page_writeback() or set_page_dirty(),
* at which stage we bring the page's dirty flag and radix-tree dirty tag
* back into sync.
*
* This incoherency between the page's dirty flag and radix-tree tag is
* unfortunate, but it only exists while the page is locked.
*/
int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
if (mapping && mapping_cap_account_dirty(mapping)) {
/*
* Yes, Virginia, this is indeed insane.
*
* We use this sequence to make sure that
* (a) we account for dirty stats properly
* (b) we tell the low-level filesystem to
* mark the whole page dirty if it was
* dirty in a pagetable. Only to then
* (c) clean the page again and return 1 to
* cause the writeback.
*
* This way we avoid all nasty races with the
* dirty bit in multiple places and clearing
* them concurrently from different threads.
*
* Note! Normally the "set_page_dirty(page)"
* has no effect on the actual dirty bit - since
* that will already usually be set. But we
* need the side effects, and it can help us
* avoid races.
*
* We basically use the page "master dirty bit"
* as a serialization point for all the different
* threads doing their things.
*
* FIXME! We still have a race here: if somebody
* adds the page back to the page tables in
* between the "page_mkclean()" and the "TestClearPageDirty()",
* we might have it mapped without the dirty bit set.
*/
if (page_mkclean(page))
set_page_dirty(page);
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
return 1;
}
return 0;
}
return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
if (mapping) {
unsigned long flags;
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
if (ret)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
}
return ret;
}
int test_set_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
if (mapping) {
unsigned long flags;
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
if (!ret)
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (!PageDirty(page))
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
}
return ret;
}
EXPORT_SYMBOL(test_set_page_writeback);
/*
* Return true if any of the pages in the mapping are marged with the
* passed tag.
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
unsigned long flags;
int ret;
read_lock_irqsave(&mapping->tree_lock, flags);
ret = radix_tree_tagged(&mapping->page_tree, tag);
read_unlock_irqrestore(&mapping->tree_lock, flags);
return ret;
}
EXPORT_SYMBOL(mapping_tagged);

3394
mm/page_alloc.c Normal file

File diff suppressed because it is too large Load Diff

149
mm/page_io.c Normal file
View File

@@ -0,0 +1,149 @@
/*
* linux/mm/page_io.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95,
* Asynchronous swapping added 30.12.95. Stephen Tweedie
* Removed race in async swapping. 14.4.1996. Bruno Haible
* Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
* Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
*/
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
struct page *page, bio_end_io_t end_io)
{
struct bio *bio;
bio = bio_alloc(gfp_flags, 1);
if (bio) {
struct swap_info_struct *sis;
swp_entry_t entry = { .val = index, };
sis = get_swap_info_struct(swp_type(entry));
bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
(PAGE_SIZE >> 9);
bio->bi_bdev = sis->bdev;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
bio->bi_vcnt = 1;
bio->bi_idx = 0;
bio->bi_size = PAGE_SIZE;
bio->bi_end_io = end_io;
}
return bio;
}
static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
if (bio->bi_size)
return 1;
if (!uptodate) {
SetPageError(page);
/*
* We failed to write the page out to swap-space.
* Re-dirty the page in order to avoid it being reclaimed.
* Also print a dire warning that things will go BAD (tm)
* very quickly.
*
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
*/
set_page_dirty(page);
printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
(unsigned long long)bio->bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
bio_put(bio);
return 0;
}
int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
if (bio->bi_size)
return 1;
if (!uptodate) {
SetPageError(page);
ClearPageUptodate(page);
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
(unsigned long long)bio->bi_sector);
} else {
SetPageUptodate(page);
}
unlock_page(page);
bio_put(bio);
return 0;
}
/*
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
struct bio *bio;
int ret = 0, rw = WRITE;
if (remove_exclusive_swap_page(page)) {
unlock_page(page);
goto out;
}
bio = get_swap_bio(GFP_NOIO, page_private(page), page,
end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
ret = -ENOMEM;
goto out;
}
if (wbc->sync_mode == WB_SYNC_ALL)
rw |= (1 << BIO_RW_SYNC);
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
submit_bio(rw, bio);
out:
return ret;
}
int swap_readpage(struct file *file, struct page *page)
{
struct bio *bio;
int ret = 0;
BUG_ON(!PageLocked(page));
ClearPageUptodate(page);
bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
ret = -ENOMEM;
goto out;
}
count_vm_event(PSWPIN);
submit_bio(READ, bio);
out:
return ret;
}

240
mm/pdflush.c Normal file
View File

@@ -0,0 +1,240 @@
/*
* mm/pdflush.c - worker threads for writing back filesystem data
*
* Copyright (C) 2002, Linus Torvalds.
*
* 09Apr2002 akpm@zip.com.au
* Initial version
* 29Feb2004 kaos@sgi.com
* Move worker thread creation to kthread to avoid chewing
* up stack space with nested calls to kernel_thread.
*/
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/signal.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h> // Needed by writeback.h
#include <linux/writeback.h> // Prototypes pdflush_operation()
#include <linux/kthread.h>
#include <linux/cpuset.h>
#include <linux/freezer.h>
/*
* Minimum and maximum number of pdflush instances
*/
#define MIN_PDFLUSH_THREADS 2
#define MAX_PDFLUSH_THREADS 8
static void start_one_pdflush_thread(void);
/*
* The pdflush threads are worker threads for writing back dirty data.
* Ideally, we'd like one thread per active disk spindle. But the disk
* topology is very hard to divine at this level. Instead, we take
* care in various places to prevent more than one pdflush thread from
* performing writeback against a single filesystem. pdflush threads
* have the PF_FLUSHER flag set in current->flags to aid in this.
*/
/*
* All the pdflush threads. Protected by pdflush_lock
*/
static LIST_HEAD(pdflush_list);
static DEFINE_SPINLOCK(pdflush_lock);
/*
* The count of currently-running pdflush threads. Protected
* by pdflush_lock.
*
* Readable by sysctl, but not writable. Published to userspace at
* /proc/sys/vm/nr_pdflush_threads.
*/
int nr_pdflush_threads = 0;
/*
* The time at which the pdflush thread pool last went empty
*/
static unsigned long last_empty_jifs;
/*
* The pdflush thread.
*
* Thread pool management algorithm:
*
* - The minimum and maximum number of pdflush instances are bound
* by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
*
* - If there have been no idle pdflush instances for 1 second, create
* a new one.
*
* - If the least-recently-went-to-sleep pdflush thread has been asleep
* for more than one second, terminate a thread.
*/
/*
* A structure for passing work to a pdflush thread. Also for passing
* state information between pdflush threads. Protected by pdflush_lock.
*/
struct pdflush_work {
struct task_struct *who; /* The thread */
void (*fn)(unsigned long); /* A callback function */
unsigned long arg0; /* An argument to the callback */
struct list_head list; /* On pdflush_list, when idle */
unsigned long when_i_went_to_sleep;
};
static int __pdflush(struct pdflush_work *my_work)
{
current->flags |= PF_FLUSHER | PF_SWAPWRITE;
my_work->fn = NULL;
my_work->who = current;
INIT_LIST_HEAD(&my_work->list);
spin_lock_irq(&pdflush_lock);
nr_pdflush_threads++;
for ( ; ; ) {
struct pdflush_work *pdf;
set_current_state(TASK_INTERRUPTIBLE);
list_move(&my_work->list, &pdflush_list);
my_work->when_i_went_to_sleep = jiffies;
spin_unlock_irq(&pdflush_lock);
schedule();
try_to_freeze();
spin_lock_irq(&pdflush_lock);
if (!list_empty(&my_work->list)) {
/*
* Someone woke us up, but without removing our control
* structure from the global list. swsusp will do this
* in try_to_freeze()->refrigerator(). Handle it.
*/
my_work->fn = NULL;
continue;
}
if (my_work->fn == NULL) {
printk("pdflush: bogus wakeup\n");
continue;
}
spin_unlock_irq(&pdflush_lock);
(*my_work->fn)(my_work->arg0);
/*
* Thread creation: For how long have there been zero
* available threads?
*/
if (jiffies - last_empty_jifs > 1 * HZ) {
/* unlocked list_empty() test is OK here */
if (list_empty(&pdflush_list)) {
/* unlocked test is OK here */
if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
start_one_pdflush_thread();
}
}
spin_lock_irq(&pdflush_lock);
my_work->fn = NULL;
/*
* Thread destruction: For how long has the sleepiest
* thread slept?
*/
if (list_empty(&pdflush_list))
continue;
if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
continue;
pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
/* Limit exit rate */
pdf->when_i_went_to_sleep = jiffies;
break; /* exeunt */
}
}
nr_pdflush_threads--;
spin_unlock_irq(&pdflush_lock);
return 0;
}
/*
* Of course, my_work wants to be just a local in __pdflush(). It is
* separated out in this manner to hopefully prevent the compiler from
* performing unfortunate optimisations against the auto variables. Because
* these are visible to other tasks and CPUs. (No problem has actually
* been observed. This is just paranoia).
*/
static int pdflush(void *dummy)
{
struct pdflush_work my_work;
cpumask_t cpus_allowed;
/*
* pdflush can spend a lot of time doing encryption via dm-crypt. We
* don't want to do that at keventd's priority.
*/
set_user_nice(current, 0);
/*
* Some configs put our parent kthread in a limited cpuset,
* which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL.
* Our needs are more modest - cut back to our cpusets cpus_allowed.
* This is needed as pdflush's are dynamically created and destroyed.
* The boottime pdflush's are easily placed w/o these 2 lines.
*/
cpus_allowed = cpuset_cpus_allowed(current);
set_cpus_allowed(current, cpus_allowed);
return __pdflush(&my_work);
}
/*
* Attempt to wake up a pdflush thread, and get it to do some work for you.
* Returns zero if it indeed managed to find a worker thread, and passed your
* payload to it.
*/
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
{
unsigned long flags;
int ret = 0;
BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
spin_lock_irqsave(&pdflush_lock, flags);
if (list_empty(&pdflush_list)) {
spin_unlock_irqrestore(&pdflush_lock, flags);
ret = -1;
} else {
struct pdflush_work *pdf;
pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
list_del_init(&pdf->list);
if (list_empty(&pdflush_list))
last_empty_jifs = jiffies;
pdf->fn = fn;
pdf->arg0 = arg0;
wake_up_process(pdf->who);
spin_unlock_irqrestore(&pdflush_lock, flags);
}
return ret;
}
static void start_one_pdflush_thread(void)
{
kthread_run(pdflush, NULL, "pdflush");
}
static int __init pdflush_init(void)
{
int i;
for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
start_one_pdflush_thread();
return 0;
}
module_init(pdflush_init);

207
mm/prio_tree.c Normal file
View File

@@ -0,0 +1,207 @@
/*
* mm/prio_tree.c - priority search tree for mapping->i_mmap
*
* Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
*
* This file is released under the GPL v2.
*
* Based on the radix priority search tree proposed by Edward M. McCreight
* SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
*
* 02Feb2004 Initial version
*/
#include <linux/mm.h>
#include <linux/prio_tree.h>
/*
* See lib/prio_tree.c for details on the general radix priority search tree
* code.
*/
/*
* The following #defines are mirrored from lib/prio_tree.c. They're only used
* for debugging, and should be removed (along with the debugging code using
* them) when switching also VMAs to the regular prio_tree code.
*/
#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
/* avoid overflow */
#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
/*
* Radix priority search tree for address_space->i_mmap
*
* For each vma that map a unique set of file pages i.e., unique [radix_index,
* heap_index] value, we have a corresponing priority search tree node. If
* multiple vmas have identical [radix_index, heap_index] value, then one of
* them is used as a tree node and others are stored in a vm_set list. The tree
* node points to the first vma (head) of the list using vm_set.head.
*
* prio_tree_root
* |
* A vm_set.head
* / \ /
* L R -> H-I-J-K-M-N-O-P-Q-S
* ^ ^ <-- vm_set.list -->
* tree nodes
*
* We need some way to identify whether a vma is a tree node, head of a vm_set
* list, or just a member of a vm_set list. We cannot use vm_flags to store
* such information. The reason is, in the above figure, it is possible that
* vm_flags' of R and H are covered by the different mmap_sems. When R is
* removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
* H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
* That's why some trick involving shared.vm_set.parent is used for identifying
* tree nodes and list head nodes.
*
* vma radix priority search tree node rules:
*
* vma->shared.vm_set.parent != NULL ==> a tree node
* vma->shared.vm_set.head != NULL ==> list of others mapping same range
* vma->shared.vm_set.head == NULL ==> no others map the same range
*
* vma->shared.vm_set.parent == NULL
* vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
* vma->shared.vm_set.head == NULL ==> a list node
*/
/*
* Add a new vma known to map the same set of pages as the old vma:
* useful for fork's dup_mmap as well as vma_prio_tree_insert below.
* Note that it just happens to work correctly on i_mmap_nonlinear too.
*/
void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
{
/* Leave these BUG_ONs till prio_tree patch stabilizes */
BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
vma->shared.vm_set.head = NULL;
vma->shared.vm_set.parent = NULL;
if (!old->shared.vm_set.parent)
list_add(&vma->shared.vm_set.list,
&old->shared.vm_set.list);
else if (old->shared.vm_set.head)
list_add_tail(&vma->shared.vm_set.list,
&old->shared.vm_set.head->shared.vm_set.list);
else {
INIT_LIST_HEAD(&vma->shared.vm_set.list);
vma->shared.vm_set.head = old;
old->shared.vm_set.head = vma;
}
}
void vma_prio_tree_insert(struct vm_area_struct *vma,
struct prio_tree_root *root)
{
struct prio_tree_node *ptr;
struct vm_area_struct *old;
vma->shared.vm_set.head = NULL;
ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
old = prio_tree_entry(ptr, struct vm_area_struct,
shared.prio_tree_node);
vma_prio_tree_add(vma, old);
}
}
void vma_prio_tree_remove(struct vm_area_struct *vma,
struct prio_tree_root *root)
{
struct vm_area_struct *node, *head, *new_head;
if (!vma->shared.vm_set.head) {
if (!vma->shared.vm_set.parent)
list_del_init(&vma->shared.vm_set.list);
else
raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
} else {
/* Leave this BUG_ON till prio_tree patch stabilizes */
BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
if (vma->shared.vm_set.parent) {
head = vma->shared.vm_set.head;
if (!list_empty(&head->shared.vm_set.list)) {
new_head = list_entry(
head->shared.vm_set.list.next,
struct vm_area_struct,
shared.vm_set.list);
list_del_init(&head->shared.vm_set.list);
} else
new_head = NULL;
raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
&head->shared.prio_tree_node);
head->shared.vm_set.head = new_head;
if (new_head)
new_head->shared.vm_set.head = head;
} else {
node = vma->shared.vm_set.head;
if (!list_empty(&vma->shared.vm_set.list)) {
new_head = list_entry(
vma->shared.vm_set.list.next,
struct vm_area_struct,
shared.vm_set.list);
list_del_init(&vma->shared.vm_set.list);
node->shared.vm_set.head = new_head;
new_head->shared.vm_set.head = node;
} else
node->shared.vm_set.head = NULL;
}
}
}
/*
* Helper function to enumerate vmas that map a given file page or a set of
* contiguous file pages. The function returns vmas that at least map a single
* page in the given range of contiguous file pages.
*/
struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
struct prio_tree_iter *iter)
{
struct prio_tree_node *ptr;
struct vm_area_struct *next;
if (!vma) {
/*
* First call is with NULL vma
*/
ptr = prio_tree_next(iter);
if (ptr) {
next = prio_tree_entry(ptr, struct vm_area_struct,
shared.prio_tree_node);
prefetch(next->shared.vm_set.head);
return next;
} else
return NULL;
}
if (vma->shared.vm_set.parent) {
if (vma->shared.vm_set.head) {
next = vma->shared.vm_set.head;
prefetch(next->shared.vm_set.list.next);
return next;
}
} else {
next = list_entry(vma->shared.vm_set.list.next,
struct vm_area_struct, shared.vm_set.list);
if (!next->shared.vm_set.head) {
prefetch(next->shared.vm_set.list.next);
return next;
}
}
ptr = prio_tree_next(iter);
if (ptr) {
next = prio_tree_entry(ptr, struct vm_area_struct,
shared.prio_tree_node);
prefetch(next->shared.vm_set.head);
return next;
} else
return NULL;
}

580
mm/readahead.c Normal file
View File

@@ -0,0 +1,580 @@
/*
* mm/readahead.c - address_space-level file readahead.
*
* Copyright (C) 2002, Linus Torvalds
*
* 09Apr2002 akpm@zip.com.au
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
EXPORT_SYMBOL(default_unplug_io_fn);
struct backing_dev_info default_backing_dev_info = {
.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
.unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
/*
* Initialise a struct file's readahead state. Assumes that the caller has
* memset *ra to zero.
*/
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
ra->ra_pages = mapping->backing_dev_info->ra_pages;
ra->prev_page = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
/*
* Return max readahead size for this inode in number-of-pages.
*/
static inline unsigned long get_max_readahead(struct file_ra_state *ra)
{
return ra->ra_pages;
}
static inline unsigned long get_min_readahead(struct file_ra_state *ra)
{
return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
}
static inline void reset_ahead_window(struct file_ra_state *ra)
{
/*
* ... but preserve ahead_start + ahead_size value,
* see 'recheck:' label in page_cache_readahead().
* Note: We never use ->ahead_size as rvalue without
* checking ->ahead_start != 0 first.
*/
ra->ahead_size += ra->ahead_start;
ra->ahead_start = 0;
}
static inline void ra_off(struct file_ra_state *ra)
{
ra->start = 0;
ra->flags = 0;
ra->size = 0;
reset_ahead_window(ra);
return;
}
/*
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
* for 128k (32 page) max ra
* 1-8 page = 32k initial, > 8 page = 128k initial
*/
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
unsigned long newsize = roundup_pow_of_two(size);
if (newsize <= max / 32)
newsize = newsize * 4;
else if (newsize <= max / 4)
newsize = newsize * 2;
else
newsize = max;
return newsize;
}
/*
* Set the new window size, this is called only when I/O is to be submitted,
* not for each call to readahead. If a cache miss occured, reduce next I/O
* size, else increase depending on how close to max we are.
*/
static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
{
unsigned long max = get_max_readahead(ra);
unsigned long min = get_min_readahead(ra);
unsigned long cur = ra->size;
unsigned long newsize;
if (ra->flags & RA_FLAG_MISS) {
ra->flags &= ~RA_FLAG_MISS;
newsize = max((cur - 2), min);
} else if (cur < max / 16) {
newsize = 4 * cur;
} else {
newsize = 2 * cur;
}
return min(newsize, max);
}
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
/**
* read_cache_pages - populate an address space with some pages & start reads against them
* @mapping: the address_space
* @pages: The address of a list_head which contains the target pages. These
* pages have their ->index populated and are otherwise uninitialised.
* @filler: callback routine for filling a single page.
* @data: private data for the callback routine.
*
* Hides the details of the LRU cache etc from the filesystems.
*/
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
int (*filler)(void *, struct page *), void *data)
{
struct page *page;
struct pagevec lru_pvec;
int ret = 0;
pagevec_init(&lru_pvec, 0);
while (!list_empty(pages)) {
page = list_to_page(pages);
list_del(&page->lru);
if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
page_cache_release(page);
continue;
}
ret = filler(data, page);
if (!pagevec_add(&lru_pvec, page))
__pagevec_lru_add(&lru_pvec);
if (ret) {
put_pages_list(pages);
break;
}
task_io_account_read(PAGE_CACHE_SIZE);
}
pagevec_lru_add(&lru_pvec);
return ret;
}
EXPORT_SYMBOL(read_cache_pages);
static int read_pages(struct address_space *mapping, struct file *filp,
struct list_head *pages, unsigned nr_pages)
{
unsigned page_idx;
struct pagevec lru_pvec;
int ret;
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
goto out;
}
pagevec_init(&lru_pvec, 0);
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_to_page(pages);
list_del(&page->lru);
if (!add_to_page_cache(page, mapping,
page->index, GFP_KERNEL)) {
mapping->a_ops->readpage(filp, page);
if (!pagevec_add(&lru_pvec, page))
__pagevec_lru_add(&lru_pvec);
} else
page_cache_release(page);
}
pagevec_lru_add(&lru_pvec);
ret = 0;
out:
return ret;
}
/*
* Readahead design.
*
* The fields in struct file_ra_state represent the most-recently-executed
* readahead attempt:
*
* start: Page index at which we started the readahead
* size: Number of pages in that read
* Together, these form the "current window".
* Together, start and size represent the `readahead window'.
* prev_page: The page which the readahead algorithm most-recently inspected.
* It is mainly used to detect sequential file reading.
* If page_cache_readahead sees that it is again being called for
* a page which it just looked at, it can return immediately without
* making any state changes.
* ahead_start,
* ahead_size: Together, these form the "ahead window".
* ra_pages: The externally controlled max readahead for this fd.
*
* When readahead is in the off state (size == 0), readahead is disabled.
* In this state, prev_page is used to detect the resumption of sequential I/O.
*
* The readahead code manages two windows - the "current" and the "ahead"
* windows. The intent is that while the application is walking the pages
* in the current window, I/O is underway on the ahead window. When the
* current window is fully traversed, it is replaced by the ahead window
* and the ahead window is invalidated. When this copying happens, the
* new current window's pages are probably still locked. So
* we submit a new batch of I/O immediately, creating a new ahead window.
*
* So:
*
* ----|----------------|----------------|-----
* ^start ^start+size
* ^ahead_start ^ahead_start+ahead_size
*
* ^ When this page is read, we submit I/O for the
* ahead window.
*
* A `readahead hit' occurs when a read request is made against a page which is
* the next sequential page. Ahead window calculations are done only when it
* is time to submit a new IO. The code ramps up the size agressively at first,
* but slow down as it approaches max_readhead.
*
* Any seek/ramdom IO will result in readahead being turned off. It will resume
* at the first sequential access.
*
* There is a special-case: if the first page which the application tries to
* read happens to be the first page of the file, it is assumed that a linear
* read is about to happen and the window is immediately set to the initial size
* based on I/O request size and the max_readahead.
*
* This function is to be called for every read request, rather than when
* it is time to perform readahead. It is called only once for the entire I/O
* regardless of size unless readahead is unable to start enough I/O to satisfy
* the request (I/O request > max_readahead).
*/
/*
* do_page_cache_readahead actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*
* do_page_cache_readahead() returns -1 if it encountered request queue
* congestion.
*/
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
struct inode *inode = mapping->host;
struct page *page;
unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
int ret = 0;
loff_t isize = i_size_read(inode);
if (isize == 0)
goto out;
end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
/*
* Preallocate as many pages as we will need.
*/
read_lock_irq(&mapping->tree_lock);
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;
if (page_offset > end_index)
break;
page = radix_tree_lookup(&mapping->page_tree, page_offset);
if (page)
continue;
read_unlock_irq(&mapping->tree_lock);
page = page_cache_alloc_cold(mapping);
read_lock_irq(&mapping->tree_lock);
if (!page)
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
ret++;
}
read_unlock_irq(&mapping->tree_lock);
/*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
if (ret)
read_pages(mapping, filp, &page_pool, ret);
BUG_ON(!list_empty(&page_pool));
out:
return ret;
}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
int ret = 0;
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
while (nr_to_read) {
int err;
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
err = __do_page_cache_readahead(mapping, filp,
offset, this_chunk);
if (err < 0) {
ret = err;
break;
}
ret += err;
offset += this_chunk;
nr_to_read -= this_chunk;
}
return ret;
}
/*
* Check how effective readahead is being. If the amount of started IO is
* less than expected then the file is partly or fully in pagecache and
* readahead isn't helping.
*
*/
static inline int check_ra_success(struct file_ra_state *ra,
unsigned long nr_to_read, unsigned long actual)
{
if (actual == 0) {
ra->cache_hit += nr_to_read;
if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
ra_off(ra);
ra->flags |= RA_FLAG_INCACHE;
return 0;
}
} else {
ra->cache_hit=0;
}
return 1;
}
/*
* This version skips the IO if the queue is read-congested, and will tell the
* block layer to abandon the readahead if request allocation would block.
*
* force_page_cache_readahead() will ignore queue congestion and will block on
* request queues.
*/
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
if (bdi_read_congested(mapping->backing_dev_info))
return -1;
return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
}
/*
* Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
* is set wait till the read completes. Otherwise attempt to read without
* blocking.
* Returns 1 meaning 'success' if read is successful without switching off
* readahead mode. Otherwise return failure.
*/
static int
blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
struct file_ra_state *ra, int block)
{
int actual;
if (!block && bdi_read_congested(mapping->backing_dev_info))
return 0;
actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
return check_ra_success(ra, nr_to_read, actual);
}
static int make_ahead_window(struct address_space *mapping, struct file *filp,
struct file_ra_state *ra, int force)
{
int block, ret;
ra->ahead_size = get_next_ra_size(ra);
ra->ahead_start = ra->start + ra->size;
block = force || (ra->prev_page >= ra->ahead_start);
ret = blockable_page_cache_readahead(mapping, filp,
ra->ahead_start, ra->ahead_size, ra, block);
if (!ret && !force) {
/* A read failure in blocking mode, implies pages are
* all cached. So we can safely assume we have taken
* care of all the pages requested in this call.
* A read failure in non-blocking mode, implies we are
* reading more pages than requested in this call. So
* we safely assume we have taken care of all the pages
* requested in this call.
*
* Just reset the ahead window in case we failed due to
* congestion. The ahead window will any way be closed
* in case we failed due to excessive page cache hits.
*/
reset_ahead_window(ra);
}
return ret;
}
/**
* page_cache_readahead - generic adaptive readahead
* @mapping: address_space which holds the pagecache and I/O vectors
* @ra: file_ra_state which holds the readahead state
* @filp: passed on to ->readpage() and ->readpages()
* @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
* @req_size: hint: total size of the read which the caller is performing in
* PAGE_CACHE_SIZE units
*
* page_cache_readahead() is the main function. If performs the adaptive
* readahead window size management and submits the readahead I/O.
*
* Note that @filp is purely used for passing on to the ->readpage[s]()
* handler: it may refer to a different file from @mapping (so we may not use
* @filp->f_mapping or @filp->f_path.dentry->d_inode here).
* Also, @ra may not be equal to &@filp->f_ra.
*
*/
unsigned long
page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
struct file *filp, pgoff_t offset, unsigned long req_size)
{
unsigned long max, newsize;
int sequential;
/*
* We avoid doing extra work and bogusly perturbing the readahead
* window expansion logic.
*/
if (offset == ra->prev_page && --req_size)
++offset;
/* Note that prev_page == -1 if it is a first read */
sequential = (offset == ra->prev_page + 1);
ra->prev_page = offset;
max = get_max_readahead(ra);
newsize = min(req_size, max);
/* No readahead or sub-page sized read or file already in cache */
if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
goto out;
ra->prev_page += newsize - 1;
/*
* Special case - first read at start of file. We'll assume it's
* a whole-file read and grow the window fast. Or detect first
* sequential access
*/
if (sequential && ra->size == 0) {
ra->size = get_init_ra_size(newsize, max);
ra->start = offset;
if (!blockable_page_cache_readahead(mapping, filp, offset,
ra->size, ra, 1))
goto out;
/*
* If the request size is larger than our max readahead, we
* at least want to be sure that we get 2 IOs in flight and
* we know that we will definitly need the new I/O.
* once we do this, subsequent calls should be able to overlap
* IOs,* thus preventing stalls. so issue the ahead window
* immediately.
*/
if (req_size >= max)
make_ahead_window(mapping, filp, ra, 1);
goto out;
}
/*
* Now handle the random case:
* partial page reads and first access were handled above,
* so this must be the next page otherwise it is random
*/
if (!sequential) {
ra_off(ra);
blockable_page_cache_readahead(mapping, filp, offset,
newsize, ra, 1);
goto out;
}
/*
* If we get here we are doing sequential IO and this was not the first
* occurence (ie we have an existing window)
*/
if (ra->ahead_start == 0) { /* no ahead window yet */
if (!make_ahead_window(mapping, filp, ra, 0))
goto recheck;
}
/*
* Already have an ahead window, check if we crossed into it.
* If so, shift windows and issue a new ahead window.
* Only return the #pages that are in the current window, so that
* we get called back on the first page of the ahead window which
* will allow us to submit more IO.
*/
if (ra->prev_page >= ra->ahead_start) {
ra->start = ra->ahead_start;
ra->size = ra->ahead_size;
make_ahead_window(mapping, filp, ra, 0);
recheck:
/* prev_page shouldn't overrun the ahead window */
ra->prev_page = min(ra->prev_page,
ra->ahead_start + ra->ahead_size - 1);
}
out:
return ra->prev_page + 1;
}
EXPORT_SYMBOL_GPL(page_cache_readahead);
/*
* handle_ra_miss() is called when it is known that a page which should have
* been present in the pagecache (we just did some readahead there) was in fact
* not found. This will happen if it was evicted by the VM (readahead
* thrashing)
*
* Turn on the cache miss flag in the RA struct, this will cause the RA code
* to reduce the RA size on the next read.
*/
void handle_ra_miss(struct address_space *mapping,
struct file_ra_state *ra, pgoff_t offset)
{
ra->flags |= RA_FLAG_MISS;
ra->flags &= ~RA_FLAG_INCACHE;
ra->cache_hit = 0;
}
/*
* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
* sensible upper limit.
*/
unsigned long max_sane_readahead(unsigned long nr)
{
return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
}

941
mm/rmap.c Normal file
View File

@@ -0,0 +1,941 @@
/*
* mm/rmap.c - physical to virtual reverse mappings
*
* Copyright 2001, Rik van Riel <riel@conectiva.com.br>
* Released under the General Public License (GPL).
*
* Simple, low overhead reverse mapping scheme.
* Please try to keep this thing as modular as possible.
*
* Provides methods for unmapping each kind of mapped page:
* the anon methods track anonymous pages, and
* the file methods track pages belonging to an inode.
*
* Original design by Rik van Riel <riel@conectiva.com.br> 2001
* File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
* Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
* Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
*/
/*
* Lock ordering in mm:
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
* inode->i_alloc_sem (vmtruncate_range)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
* mapping->i_mmap_lock
* anon_vma->lock
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
* inode_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
* within inode_lock in __sync_single_inode)
*/
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <asm/tlbflush.h>
struct kmem_cache *anon_vma_cachep;
static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
#ifdef CONFIG_DEBUG_VM
struct anon_vma *anon_vma = find_vma->anon_vma;
struct vm_area_struct *vma;
unsigned int mapcount = 0;
int found = 0;
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
mapcount++;
BUG_ON(mapcount > 100000);
if (vma == find_vma)
found = 1;
}
BUG_ON(!found);
#endif
}
/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
might_sleep();
if (unlikely(!anon_vma)) {
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *allocated, *locked;
anon_vma = find_mergeable_anon_vma(vma);
if (anon_vma) {
allocated = NULL;
locked = anon_vma;
spin_lock(&locked->lock);
} else {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
return -ENOMEM;
allocated = anon_vma;
locked = NULL;
}
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
allocated = NULL;
}
spin_unlock(&mm->page_table_lock);
if (locked)
spin_unlock(&locked->lock);
if (unlikely(allocated))
anon_vma_free(allocated);
}
return 0;
}
void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
{
BUG_ON(vma->anon_vma != next->anon_vma);
list_del(&next->anon_vma_node);
}
void __anon_vma_link(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma) {
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
validate_anon_vma(vma);
}
}
void anon_vma_link(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma) {
spin_lock(&anon_vma->lock);
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
validate_anon_vma(vma);
spin_unlock(&anon_vma->lock);
}
}
void anon_vma_unlink(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
int empty;
if (!anon_vma)
return;
spin_lock(&anon_vma->lock);
validate_anon_vma(vma);
list_del(&vma->anon_vma_node);
/* We must garbage collect the anon_vma if it's empty */
empty = list_empty(&anon_vma->head);
spin_unlock(&anon_vma->lock);
if (empty)
anon_vma_free(anon_vma);
}
static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
unsigned long flags)
{
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
SLAB_CTOR_CONSTRUCTOR) {
struct anon_vma *anon_vma = data;
spin_lock_init(&anon_vma->lock);
INIT_LIST_HEAD(&anon_vma->head);
}
}
void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
}
/*
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
*/
static struct anon_vma *page_lock_anon_vma(struct page *page)
{
struct anon_vma *anon_vma;
unsigned long anon_mapping;
rcu_read_lock();
anon_mapping = (unsigned long) page->mapping;
if (!(anon_mapping & PAGE_MAPPING_ANON))
goto out;
if (!page_mapped(page))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
spin_lock(&anon_vma->lock);
return anon_vma;
out:
rcu_read_unlock();
return NULL;
}
static void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
spin_unlock(&anon_vma->lock);
rcu_read_unlock();
}
/*
* At what user virtual address is page expected in vma?
*/
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
unsigned long address;
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
/* page should be within any vma from prio_tree_next */
BUG_ON(!PageAnon(page));
return -EFAULT;
}
return address;
}
/*
* At what user virtual address is page expected in vma? checking that the
* page matches the vma: currently only used on anon pages, by unuse_vma;
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
if (PageAnon(page)) {
if ((void *)vma->anon_vma !=
(void *)page->mapping - PAGE_MAPPING_ANON)
return -EFAULT;
} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
vma->vm_file->f_mapping != page->mapping)
return -EFAULT;
} else
return -EFAULT;
return vma_address(page, vma);
}
/*
* Check that @page is mapped at @address into @mm.
*
* On success returns with pte mapped and locked.
*/
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
unsigned long address, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return NULL;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
return NULL;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return NULL;
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
if (!pte_present(*pte)) {
pte_unmap(pte);
return NULL;
}
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;
return pte;
}
pte_unmap_unlock(pte, ptl);
return NULL;
}
/*
* Subfunctions of page_referenced: page_referenced_one called
* repeatedly from either page_referenced_anon or page_referenced_file.
*/
static int page_referenced_one(struct page *page,
struct vm_area_struct *vma, unsigned int *mapcount)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
spinlock_t *ptl;
int referenced = 0;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
pte = page_check_address(page, mm, address, &ptl);
if (!pte)
goto out;
if (ptep_clear_flush_young(vma, address, pte))
referenced++;
/* Pretend the page is referenced if the task has the
swap token and is in the middle of a page fault. */
if (mm != current->mm && has_swap_token(mm) &&
rwsem_is_locked(&mm->mmap_sem))
referenced++;
(*mapcount)--;
pte_unmap_unlock(pte, ptl);
out:
return referenced;
}
static int page_referenced_anon(struct page *page)
{
unsigned int mapcount;
struct anon_vma *anon_vma;
struct vm_area_struct *vma;
int referenced = 0;
anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
return referenced;
mapcount = page_mapcount(page);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
referenced += page_referenced_one(page, vma, &mapcount);
if (!mapcount)
break;
}
page_unlock_anon_vma(anon_vma);
return referenced;
}
/**
* page_referenced_file - referenced check for object-based rmap
* @page: the page we're checking references on.
*
* For an object-based mapped page, find all the places it is mapped and
* check/clear the referenced flag. This is done by following the page->mapping
* pointer, then walking the chain of vmas it holds. It returns the number
* of references it found.
*
* This function is only called from page_referenced for object-based pages.
*/
static int page_referenced_file(struct page *page)
{
unsigned int mapcount;
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
int referenced = 0;
/*
* The caller's checks on page->mapping and !PageAnon have made
* sure that this is a file page: the check for page->mapping
* excludes the case just before it gets set on an anon page.
*/
BUG_ON(PageAnon(page));
/*
* The page lock not only makes sure that page->mapping cannot
* suddenly be NULLified by truncation, it makes sure that the
* structure at mapping cannot be freed and reused yet,
* so we can safely take mapping->i_mmap_lock.
*/
BUG_ON(!PageLocked(page));
spin_lock(&mapping->i_mmap_lock);
/*
* i_mmap_lock does not stabilize mapcount at all, but mapcount
* is more likely to be accurate if we note it after spinning.
*/
mapcount = page_mapcount(page);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
== (VM_LOCKED|VM_MAYSHARE)) {
referenced++;
break;
}
referenced += page_referenced_one(page, vma, &mapcount);
if (!mapcount)
break;
}
spin_unlock(&mapping->i_mmap_lock);
return referenced;
}
/**
* page_referenced - test if the page was referenced
* @page: the page to test
* @is_locked: caller holds lock on the page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
*/
int page_referenced(struct page *page, int is_locked)
{
int referenced = 0;
if (page_test_and_clear_young(page))
referenced++;
if (TestClearPageReferenced(page))
referenced++;
if (page_mapped(page) && page->mapping) {
if (PageAnon(page))
referenced += page_referenced_anon(page);
else if (is_locked)
referenced += page_referenced_file(page);
else if (TestSetPageLocked(page))
referenced++;
else {
if (page->mapping)
referenced += page_referenced_file(page);
unlock_page(page);
}
}
return referenced;
}
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
spinlock_t *ptl;
int ret = 0;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
pte = page_check_address(page, mm, address, &ptl);
if (!pte)
goto out;
if (pte_dirty(*pte) || pte_write(*pte)) {
pte_t entry;
flush_cache_page(vma, address, pte_pfn(*pte));
entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(mm, address, pte, entry);
lazy_mmu_prot_update(entry);
ret = 1;
}
pte_unmap_unlock(pte, ptl);
out:
return ret;
}
static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
int ret = 0;
BUG_ON(PageAnon(page));
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_flags & VM_SHARED)
ret += page_mkclean_one(page, vma);
}
spin_unlock(&mapping->i_mmap_lock);
return ret;
}
int page_mkclean(struct page *page)
{
int ret = 0;
BUG_ON(!PageLocked(page));
if (page_mapped(page)) {
struct address_space *mapping = page_mapping(page);
if (mapping)
ret = page_mkclean_file(mapping, page);
if (page_test_and_clear_dirty(page))
ret = 1;
}
return ret;
}
/**
* page_set_anon_rmap - setup new anonymous rmap
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*/
static void __page_set_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
struct anon_vma *anon_vma = vma->anon_vma;
BUG_ON(!anon_vma);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
/*
* nr_mapped state can be updated without turning off
* interrupts because it is not modified via interrupt.
*/
__inc_zone_page_state(page, NR_ANON_PAGES);
}
/**
* page_add_anon_rmap - add pte mapping to an anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
* The caller needs to hold the pte lock.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
if (atomic_inc_and_test(&page->_mapcount))
__page_set_anon_rmap(page, vma, address);
/* else checking page index and mapping is racy */
}
/*
* page_add_new_anon_rmap - add pte mapping to a new anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
*/
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
__page_set_anon_rmap(page, vma, address);
}
/**
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
*
* The caller needs to hold the pte lock.
*/
void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount))
__inc_zone_page_state(page, NR_FILE_MAPPED);
}
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
*
* The caller needs to hold the pte lock.
*/
void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
{
if (atomic_add_negative(-1, &page->_mapcount)) {
if (unlikely(page_mapcount(page) < 0)) {
printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
printk (KERN_EMERG " page->flags = %lx\n", page->flags);
printk (KERN_EMERG " page->count = %x\n", page_count(page));
printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
if (vma->vm_ops)
print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
if (vma->vm_file && vma->vm_file->f_op)
print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
BUG();
}
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
* which increments mapcount after us but sets mapping
* before us: so leave the reset to free_hot_cold_page,
* and remember that it's only reliable while mapped.
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
__dec_zone_page_state(page,
PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
}
}
/*
* Subfunctions of try_to_unmap: try_to_unmap_one called
* repeatedly from either try_to_unmap_anon or try_to_unmap_file.
*/
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
int migration)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
pte = page_check_address(page, mm, address, &ptl);
if (!pte)
goto out;
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
* skipped over this mm) then we should reactivate it.
*/
if (!migration && ((vma->vm_flags & VM_LOCKED) ||
(ptep_clear_flush_young(vma, address, pte)))) {
ret = SWAP_FAIL;
goto out_unmap;
}
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
pteval = ptep_clear_flush(vma, address, pte);
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
if (PageSwapCache(page)) {
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
swap_duplicate(entry);
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
dec_mm_counter(mm, anon_rss);
#ifdef CONFIG_MIGRATION
} else {
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
BUG_ON(!migration);
entry = make_migration_entry(page, pte_write(pteval));
#endif
}
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
} else
#ifdef CONFIG_MIGRATION
if (migration) {
/* Establish migration entry for a file page */
swp_entry_t entry;
entry = make_migration_entry(page, pte_write(pteval));
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
#endif
dec_mm_counter(mm, file_rss);
page_remove_rmap(page, vma);
page_cache_release(page);
out_unmap:
pte_unmap_unlock(pte, ptl);
out:
return ret;
}
/*
* objrmap doesn't work for nonlinear VMAs because the assumption that
* offset-into-file correlates with offset-into-virtual-addresses does not hold.
* Consequently, given a particular page and its ->index, we cannot locate the
* ptes which are mapping that page without an exhaustive linear search.
*
* So what this code does is a mini "virtual scan" of each nonlinear VMA which
* maps the file to which the target page belongs. The ->vm_private_data field
* holds the current cursor into that scan. Successive searches will circulate
* around the vma's virtual address space.
*
* So as more replacement pressure is applied to the pages in a nonlinear VMA,
* more scanning pressure is placed against them as well. Eventually pages
* will become fully unmapped and are eligible for eviction.
*
* For very sparsely populated VMAs this is a little inefficient - chances are
* there there won't be many ptes located within the scan cluster. In this case
* maybe we could scan further - to the end of the pte page, perhaps.
*/
#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
static void try_to_unmap_cluster(unsigned long cursor,
unsigned int *mapcount, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
struct page *page;
unsigned long address;
unsigned long end;
address = (vma->vm_start + cursor) & CLUSTER_MASK;
end = address + CLUSTER_SIZE;
if (address < vma->vm_start)
address = vma->vm_start;
if (end > vma->vm_end)
end = vma->vm_end;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
return;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return;
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
for (; address < end; pte++, address += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, address, *pte);
BUG_ON(!page || PageAnon(page));
if (ptep_clear_flush_young(vma, address, pte))
continue;
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);
page_remove_rmap(page, vma);
page_cache_release(page);
dec_mm_counter(mm, file_rss);
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
}
static int try_to_unmap_anon(struct page *page, int migration)
{
struct anon_vma *anon_vma;
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
return ret;
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
ret = try_to_unmap_one(page, vma, migration);
if (ret == SWAP_FAIL || !page_mapped(page))
break;
}
page_unlock_anon_vma(anon_vma);
return ret;
}
/**
* try_to_unmap_file - unmap file page using the object-based rmap method
* @page: the page to unmap
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*
* This function is only called from try_to_unmap for object-based pages.
*/
static int try_to_unmap_file(struct page *page, int migration)
{
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;
unsigned int mapcount;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
ret = try_to_unmap_one(page, vma, migration);
if (ret == SWAP_FAIL || !page_mapped(page))
goto out;
}
if (list_empty(&mapping->i_mmap_nonlinear))
goto out;
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
if ((vma->vm_flags & VM_LOCKED) && !migration)
continue;
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
cursor = vma->vm_end - vma->vm_start;
if (cursor > max_nl_size)
max_nl_size = cursor;
}
if (max_nl_size == 0) { /* any nonlinears locked or reserved */
ret = SWAP_FAIL;
goto out;
}
/*
* We don't try to search for this page in the nonlinear vmas,
* and page_referenced wouldn't have found it anyway. Instead
* just walk the nonlinear vmas trying to age and unmap some.
* The mapcount of the page we came in with is irrelevant,
* but even so use it as a guide to how hard we should try?
*/
mapcount = page_mapcount(page);
if (!mapcount)
goto out;
cond_resched_lock(&mapping->i_mmap_lock);
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
if (max_nl_cursor == 0)
max_nl_cursor = CLUSTER_SIZE;
do {
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
shared.vm_set.list) {
if ((vma->vm_flags & VM_LOCKED) && !migration)
continue;
cursor = (unsigned long) vma->vm_private_data;
while ( cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
try_to_unmap_cluster(cursor, &mapcount, vma);
cursor += CLUSTER_SIZE;
vma->vm_private_data = (void *) cursor;
if ((int)mapcount <= 0)
goto out;
}
vma->vm_private_data = (void *) max_nl_cursor;
}
cond_resched_lock(&mapping->i_mmap_lock);
max_nl_cursor += CLUSTER_SIZE;
} while (max_nl_cursor <= max_nl_size);
/*
* Don't loop forever (perhaps all the remaining pages are
* in locked vmas). Reset cursor on all unreserved nonlinear
* vmas, now forgetting on which ones it had fallen behind.
*/
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
vma->vm_private_data = NULL;
out:
spin_unlock(&mapping->i_mmap_lock);
return ret;
}
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
* Return values are:
*
* SWAP_SUCCESS - we succeeded in removing all mappings
* SWAP_AGAIN - we missed a mapping, try again later
* SWAP_FAIL - the page is unswappable
*/
int try_to_unmap(struct page *page, int migration)
{
int ret;
BUG_ON(!PageLocked(page));
if (PageAnon(page))
ret = try_to_unmap_anon(page, migration);
else
ret = try_to_unmap_file(page, migration);
if (!page_mapped(page))
ret = SWAP_SUCCESS;
return ret;
}

2605
mm/shmem.c Normal file

File diff suppressed because it is too large Load Diff

197
mm/shmem_acl.c Normal file
View File

@@ -0,0 +1,197 @@
/*
* mm/shmem_acl.c
*
* (C) 2005 Andreas Gruenbacher <agruen@suse.de>
*
* This file is released under the GPL.
*/
#include <linux/fs.h>
#include <linux/shmem_fs.h>
#include <linux/xattr.h>
#include <linux/generic_acl.h>
/**
* shmem_get_acl - generic_acl_operations->getacl() operation
*/
static struct posix_acl *
shmem_get_acl(struct inode *inode, int type)
{
struct posix_acl *acl = NULL;
spin_lock(&inode->i_lock);
switch(type) {
case ACL_TYPE_ACCESS:
acl = posix_acl_dup(SHMEM_I(inode)->i_acl);
break;
case ACL_TYPE_DEFAULT:
acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl);
break;
}
spin_unlock(&inode->i_lock);
return acl;
}
/**
* shmem_set_acl - generic_acl_operations->setacl() operation
*/
static void
shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
{
struct posix_acl *free = NULL;
spin_lock(&inode->i_lock);
switch(type) {
case ACL_TYPE_ACCESS:
free = SHMEM_I(inode)->i_acl;
SHMEM_I(inode)->i_acl = posix_acl_dup(acl);
break;
case ACL_TYPE_DEFAULT:
free = SHMEM_I(inode)->i_default_acl;
SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl);
break;
}
spin_unlock(&inode->i_lock);
posix_acl_release(free);
}
struct generic_acl_operations shmem_acl_ops = {
.getacl = shmem_get_acl,
.setacl = shmem_set_acl,
};
/**
* shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
* shmem_xattr_acl_access_handler - plumbing code to implement the
* system.posix_acl_access xattr using the generic acl functions.
*/
static size_t
shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
list, list_size);
}
static int
shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
size_t size)
{
if (strcmp(name, "") != 0)
return -EINVAL;
return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
size);
}
static int
shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
size_t size, int flags)
{
if (strcmp(name, "") != 0)
return -EINVAL;
return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
size);
}
struct xattr_handler shmem_xattr_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.list = shmem_list_acl_access,
.get = shmem_get_acl_access,
.set = shmem_set_acl_access,
};
/**
* shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
* shmem_xattr_acl_default_handler - plumbing code to implement the
* system.posix_acl_default xattr using the generic acl functions.
*/
static size_t
shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
list, list_size);
}
static int
shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
size_t size)
{
if (strcmp(name, "") != 0)
return -EINVAL;
return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
size);
}
static int
shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
size_t size, int flags)
{
if (strcmp(name, "") != 0)
return -EINVAL;
return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
size);
}
struct xattr_handler shmem_xattr_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.list = shmem_list_acl_default,
.get = shmem_get_acl_default,
.set = shmem_set_acl_default,
};
/**
* shmem_acl_init - Inizialize the acl(s) of a new inode
*/
int
shmem_acl_init(struct inode *inode, struct inode *dir)
{
return generic_acl_init(inode, dir, &shmem_acl_ops);
}
/**
* shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode
*
* This is done before destroying the actual inode.
*/
void
shmem_acl_destroy_inode(struct inode *inode)
{
if (SHMEM_I(inode)->i_acl)
posix_acl_release(SHMEM_I(inode)->i_acl);
SHMEM_I(inode)->i_acl = NULL;
if (SHMEM_I(inode)->i_default_acl)
posix_acl_release(SHMEM_I(inode)->i_default_acl);
SHMEM_I(inode)->i_default_acl = NULL;
}
/**
* shmem_check_acl - check_acl() callback for generic_permission()
*/
static int
shmem_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
if (acl) {
int error = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
return error;
}
return -EAGAIN;
}
/**
* shmem_permission - permission() inode operation
*/
int
shmem_permission(struct inode *inode, int mask, struct nameidata *nd)
{
return generic_permission(inode, mask, shmem_check_acl);
}

4487
mm/slab.c Normal file

File diff suppressed because it is too large Load Diff

348
mm/slob.c Normal file
View File

@@ -0,0 +1,348 @@
/*
* SLOB Allocator: Simple List Of Blocks
*
* Matt Mackall <mpm@selenic.com> 12/30/03
*
* How SLOB works:
*
* The core of SLOB is a traditional K&R style heap allocator, with
* support for returning aligned objects. The granularity of this
* allocator is 8 bytes on x86, though it's perhaps possible to reduce
* this to 4 if it's deemed worth the effort. The slob heap is a
* singly-linked list of pages from __get_free_page, grown on demand
* and allocation from the heap is currently first-fit.
*
* Above this is an implementation of kmalloc/kfree. Blocks returned
* from kmalloc are 8-byte aligned and prepended with a 8-byte header.
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
* __get_free_pages directly so that it can return page-aligned blocks
* and keeps a linked list of such pages and their orders. These
* objects are detected in kfree() by their page alignment.
*
* SLAB is emulated on top of SLOB by simply calling constructors and
* destructors for every SLAB allocation. Objects are returned with
* the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
* set, in which case the low-level allocator will fragment blocks to
* create the proper alignment. Again, objects of page-size or greater
* are allocated by calling __get_free_pages. As SLAB objects know
* their size, no separate size bookkeeping is necessary and there is
* essentially no allocation space overhead.
*/
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/timer.h>
struct slob_block {
int units;
struct slob_block *next;
};
typedef struct slob_block slob_t;
#define SLOB_UNIT sizeof(slob_t)
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
#define SLOB_ALIGN L1_CACHE_BYTES
struct bigblock {
int order;
void *pages;
struct bigblock *next;
};
typedef struct bigblock bigblock_t;
static slob_t arena = { .next = &arena, .units = 1 };
static slob_t *slobfree = &arena;
static bigblock_t *bigblocks;
static DEFINE_SPINLOCK(slob_lock);
static DEFINE_SPINLOCK(block_lock);
static void slob_free(void *b, int size);
static void slob_timer_cbk(void);
static void *slob_alloc(size_t size, gfp_t gfp, int align)
{
slob_t *prev, *cur, *aligned = 0;
int delta = 0, units = SLOB_UNITS(size);
unsigned long flags;
spin_lock_irqsave(&slob_lock, flags);
prev = slobfree;
for (cur = prev->next; ; prev = cur, cur = cur->next) {
if (align) {
aligned = (slob_t *)ALIGN((unsigned long)cur, align);
delta = aligned - cur;
}
if (cur->units >= units + delta) { /* room enough? */
if (delta) { /* need to fragment head to align? */
aligned->units = cur->units - delta;
aligned->next = cur->next;
cur->next = aligned;
cur->units = delta;
prev = cur;
cur = aligned;
}
if (cur->units == units) /* exact fit? */
prev->next = cur->next; /* unlink */
else { /* fragment */
prev->next = cur + units;
prev->next->units = cur->units - units;
prev->next->next = cur->next;
cur->units = units;
}
slobfree = prev;
spin_unlock_irqrestore(&slob_lock, flags);
return cur;
}
if (cur == slobfree) {
spin_unlock_irqrestore(&slob_lock, flags);
if (size == PAGE_SIZE) /* trying to shrink arena? */
return 0;
cur = (slob_t *)__get_free_page(gfp);
if (!cur)
return 0;
slob_free(cur, PAGE_SIZE);
spin_lock_irqsave(&slob_lock, flags);
cur = slobfree;
}
}
}
static void slob_free(void *block, int size)
{
slob_t *cur, *b = (slob_t *)block;
unsigned long flags;
if (!block)
return;
if (size)
b->units = SLOB_UNITS(size);
/* Find reinsertion point */
spin_lock_irqsave(&slob_lock, flags);
for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
if (cur >= cur->next && (b > cur || b < cur->next))
break;
if (b + b->units == cur->next) {
b->units += cur->next->units;
b->next = cur->next->next;
} else
b->next = cur->next;
if (cur + cur->units == b) {
cur->units += b->units;
cur->next = b->next;
} else
cur->next = b;
slobfree = cur;
spin_unlock_irqrestore(&slob_lock, flags);
}
void *__kmalloc(size_t size, gfp_t gfp)
{
slob_t *m;
bigblock_t *bb;
unsigned long flags;
if (size < PAGE_SIZE - SLOB_UNIT) {
m = slob_alloc(size + SLOB_UNIT, gfp, 0);
return m ? (void *)(m + 1) : 0;
}
bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
if (!bb)
return 0;
bb->order = get_order(size);
bb->pages = (void *)__get_free_pages(gfp, bb->order);
if (bb->pages) {
spin_lock_irqsave(&block_lock, flags);
bb->next = bigblocks;
bigblocks = bb;
spin_unlock_irqrestore(&block_lock, flags);
return bb->pages;
}
slob_free(bb, sizeof(bigblock_t));
return 0;
}
EXPORT_SYMBOL(__kmalloc);
void kfree(const void *block)
{
bigblock_t *bb, **last = &bigblocks;
unsigned long flags;
if (!block)
return;
if (!((unsigned long)block & (PAGE_SIZE-1))) {
/* might be on the big block list */
spin_lock_irqsave(&block_lock, flags);
for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
if (bb->pages == block) {
*last = bb->next;
spin_unlock_irqrestore(&block_lock, flags);
free_pages((unsigned long)block, bb->order);
slob_free(bb, sizeof(bigblock_t));
return;
}
}
spin_unlock_irqrestore(&block_lock, flags);
}
slob_free((slob_t *)block - 1, 0);
return;
}
EXPORT_SYMBOL(kfree);
unsigned int ksize(const void *block)
{
bigblock_t *bb;
unsigned long flags;
if (!block)
return 0;
if (!((unsigned long)block & (PAGE_SIZE-1))) {
spin_lock_irqsave(&block_lock, flags);
for (bb = bigblocks; bb; bb = bb->next)
if (bb->pages == block) {
spin_unlock_irqrestore(&slob_lock, flags);
return PAGE_SIZE << bb->order;
}
spin_unlock_irqrestore(&block_lock, flags);
}
return ((slob_t *)block - 1)->units * SLOB_UNIT;
}
struct kmem_cache {
unsigned int size, align;
const char *name;
void (*ctor)(void *, struct kmem_cache *, unsigned long);
void (*dtor)(void *, struct kmem_cache *, unsigned long);
};
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags,
void (*ctor)(void*, struct kmem_cache *, unsigned long),
void (*dtor)(void*, struct kmem_cache *, unsigned long))
{
struct kmem_cache *c;
c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
if (c) {
c->name = name;
c->size = size;
c->ctor = ctor;
c->dtor = dtor;
/* ignore alignment unless it's forced */
c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
if (c->align < align)
c->align = align;
}
return c;
}
EXPORT_SYMBOL(kmem_cache_create);
void kmem_cache_destroy(struct kmem_cache *c)
{
slob_free(c, sizeof(struct kmem_cache));
}
EXPORT_SYMBOL(kmem_cache_destroy);
void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
{
void *b;
if (c->size < PAGE_SIZE)
b = slob_alloc(c->size, flags, c->align);
else
b = (void *)__get_free_pages(flags, get_order(c->size));
if (c->ctor)
c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
return b;
}
EXPORT_SYMBOL(kmem_cache_alloc);
void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
{
void *ret = kmem_cache_alloc(c, flags);
if (ret)
memset(ret, 0, c->size);
return ret;
}
EXPORT_SYMBOL(kmem_cache_zalloc);
void kmem_cache_free(struct kmem_cache *c, void *b)
{
if (c->dtor)
c->dtor(b, c, 0);
if (c->size < PAGE_SIZE)
slob_free(b, c->size);
else
free_pages((unsigned long)b, get_order(c->size));
}
EXPORT_SYMBOL(kmem_cache_free);
unsigned int kmem_cache_size(struct kmem_cache *c)
{
return c->size;
}
EXPORT_SYMBOL(kmem_cache_size);
const char *kmem_cache_name(struct kmem_cache *c)
{
return c->name;
}
EXPORT_SYMBOL(kmem_cache_name);
static struct timer_list slob_timer = TIMER_INITIALIZER(
(void (*)(unsigned long))slob_timer_cbk, 0, 0);
int kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}
EXPORT_SYMBOL(kmem_cache_shrink);
int kmem_ptr_validate(struct kmem_cache *a, const void *b)
{
return 0;
}
void __init kmem_cache_init(void)
{
slob_timer_cbk();
}
static void slob_timer_cbk(void)
{
void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
if (p)
free_page((unsigned long)p);
mod_timer(&slob_timer, jiffies + HZ);
}

340
mm/sparse.c Normal file
View File

@@ -0,0 +1,340 @@
/*
* sparse memory mappings.
*/
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <asm/dma.h>
/*
* Permanent SPARSEMEM data:
*
* 1) mem_section - memory sections, mem_map's for valid memory
*/
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section *mem_section[NR_SECTION_ROOTS]
____cacheline_internodealigned_in_smp;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);
#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
* If we did not store the node number in the page then we have to
* do a lookup in the section_to_node_table in order to find which
* node the page belongs to.
*/
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif
int page_to_nid(struct page *page)
{
return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);
#endif
#ifdef CONFIG_SPARSEMEM_EXTREME
static struct mem_section *sparse_index_alloc(int nid)
{
struct mem_section *section = NULL;
unsigned long array_size = SECTIONS_PER_ROOT *
sizeof(struct mem_section);
if (slab_is_available())
section = kmalloc_node(array_size, GFP_KERNEL, nid);
else
section = alloc_bootmem_node(NODE_DATA(nid), array_size);
if (section)
memset(section, 0, array_size);
return section;
}
static int sparse_index_init(unsigned long section_nr, int nid)
{
static DEFINE_SPINLOCK(index_init_lock);
unsigned long root = SECTION_NR_TO_ROOT(section_nr);
struct mem_section *section;
int ret = 0;
#ifdef NODE_NOT_IN_PAGE_FLAGS
section_to_node_table[section_nr] = nid;
#endif
if (mem_section[root])
return -EEXIST;
section = sparse_index_alloc(nid);
/*
* This lock keeps two different sections from
* reallocating for the same index
*/
spin_lock(&index_init_lock);
if (mem_section[root]) {
ret = -EEXIST;
goto out;
}
mem_section[root] = section;
out:
spin_unlock(&index_init_lock);
return ret;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
return 0;
}
#endif
/*
* Although written for the SPARSEMEM_EXTREME case, this happens
* to also work for the flat array case becase
* NR_SECTION_ROOTS==NR_MEM_SECTIONS.
*/
int __section_nr(struct mem_section* ms)
{
unsigned long root_nr;
struct mem_section* root;
for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
if (!root)
continue;
if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
break;
}
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
/*
* During early boot, before section_mem_map is used for an actual
* mem_map, we use section_mem_map to store the section's NUMA
* node. This keeps us from having to use another data structure. The
* node information is cleared just before we store the real mem_map.
*/
static inline unsigned long sparse_encode_early_nid(int nid)
{
return (nid << SECTION_NID_SHIFT);
}
static inline int sparse_early_nid(struct mem_section *section)
{
return (section->section_mem_map >> SECTION_NID_SHIFT);
}
/* Record a memory area against a node. */
void memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
start &= PAGE_SECTION_MASK;
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
unsigned long section = pfn_to_section_nr(pfn);
struct mem_section *ms;
sparse_index_init(section, nid);
ms = __nr_to_section(section);
if (!ms->section_mem_map)
ms->section_mem_map = sparse_encode_early_nid(nid) |
SECTION_MARKED_PRESENT;
}
}
/*
* Only used by the i386 NUMA architecures, but relatively
* generic code.
*/
unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long pfn;
unsigned long nr_pages = 0;
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
if (nid != early_pfn_to_nid(pfn))
continue;
if (pfn_valid(pfn))
nr_pages += PAGES_PER_SECTION;
}
return nr_pages * sizeof(struct page);
}
/*
* Subtle, we encode the real pfn into the mem_map such that
* the identity pfn - section_mem_map will return the actual
* physical page frame number.
*/
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
}
/*
* We need this if we ever free the mem_maps. While not implemented yet,
* this function is included for parity with its sibling.
*/
static __attribute((unused))
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
static int sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map)
{
if (!valid_section(ms))
return -EINVAL;
ms->section_mem_map &= ~SECTION_MAP_MASK;
ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
return 1;
}
__attribute__((weak))
void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
{
return NULL;
}
static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
struct mem_section *ms = __nr_to_section(pnum);
int nid = sparse_early_nid(ms);
map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
if (map)
return map;
map = alloc_bootmem_high_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)
return map;
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)
return map;
printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
ms->section_mem_map = 0;
return NULL;
}
static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
{
struct page *page, *ret;
unsigned long memmap_size = sizeof(struct page) * nr_pages;
page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
if (page)
goto got_map_page;
ret = vmalloc(memmap_size);
if (ret)
goto got_map_ptr;
return NULL;
got_map_page:
ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:
memset(ret, 0, memmap_size);
return ret;
}
static int vaddr_in_vmalloc_area(void *addr)
{
if (addr >= (void *)VMALLOC_START &&
addr < (void *)VMALLOC_END)
return 1;
return 0;
}
static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
if (vaddr_in_vmalloc_area(memmap))
vfree(memmap);
else
free_pages((unsigned long)memmap,
get_order(sizeof(struct page) * nr_pages));
}
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
*/
void sparse_init(void)
{
unsigned long pnum;
struct page *map;
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
if (!valid_section_nr(pnum))
continue;
map = sparse_early_mem_map_alloc(pnum);
if (!map)
continue;
sparse_init_one_section(__nr_to_section(pnum), pnum, map);
}
}
/*
* returns the number of sections whose mem_maps were properly
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct pglist_data *pgdat = zone->zone_pgdat;
struct mem_section *ms;
struct page *memmap;
unsigned long flags;
int ret;
/*
* no locking for this, because it does its own
* plus, it does a kmalloc
*/
sparse_index_init(section_nr, pgdat->node_id);
memmap = __kmalloc_section_memmap(nr_pages);
pgdat_resize_lock(pgdat, &flags);
ms = __pfn_to_section(start_pfn);
if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
ret = -EEXIST;
goto out;
}
ms->section_mem_map |= SECTION_MARKED_PRESENT;
ret = sparse_init_one_section(ms, section_nr, memmap);
out:
pgdat_resize_unlock(pgdat, &flags);
if (ret <= 0)
__kfree_section_memmap(memmap, nr_pages);
return ret;
}

520
mm/swap.c Normal file
View File

@@ -0,0 +1,520 @@
/*
* linux/mm/swap.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*/
/*
* This file contains the default values for the opereation of the
* Linux VM subsystem. Fine-tuning documentation can be found in
* Documentation/sysctl/vm.txt.
* Started 18.12.91
* Swap aging added 23.2.95, Stephen Tweedie.
* Buffermem limits added 12.3.98, Rik van Riel.
*/
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h> /* for try_to_release_page() */
#include <linux/module.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/init.h>
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
/*
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
*/
static void fastcall __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
unsigned long flags;
struct zone *zone = page_zone(page);
spin_lock_irqsave(&zone->lru_lock, flags);
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
del_page_from_lru(zone, page);
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
free_hot_page(page);
}
static void put_compound_page(struct page *page)
{
page = (struct page *)page_private(page);
if (put_page_testzero(page)) {
compound_page_dtor *dtor;
dtor = get_compound_page_dtor(page);
(*dtor)(page);
}
}
void put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
put_compound_page(page);
else if (put_page_testzero(page))
__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);
/**
* put_pages_list(): release a list of pages
*
* Release a list of pages which are strung together on page.lru. Currently
* used by read_cache_pages() and related error recovery code.
*
* @pages: list of pages threaded on page->lru
*/
void put_pages_list(struct list_head *pages)
{
while (!list_empty(pages)) {
struct page *victim;
victim = list_entry(pages->prev, struct page, lru);
list_del(&victim->lru);
page_cache_release(victim);
}
}
EXPORT_SYMBOL(put_pages_list);
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
* inactive list. The page still has PageWriteback set, which will pin it.
*
* We don't expect many pages to come through here, so don't bother batching
* things up.
*
* To avoid placing the page at the tail of the LRU while PG_writeback is still
* set, this function will clear PG_writeback before performing the page
* motion. Do that inside the lru lock because once PG_writeback is cleared
* we may not touch the page.
*
* Returns zero if it cleared PG_writeback.
*/
int rotate_reclaimable_page(struct page *page)
{
struct zone *zone;
unsigned long flags;
if (PageLocked(page))
return 1;
if (PageDirty(page))
return 1;
if (PageActive(page))
return 1;
if (!PageLRU(page))
return 1;
zone = page_zone(page);
spin_lock_irqsave(&zone->lru_lock, flags);
if (PageLRU(page) && !PageActive(page)) {
list_move_tail(&page->lru, &zone->inactive_list);
__count_vm_event(PGROTATED);
}
if (!test_clear_page_writeback(page))
BUG();
spin_unlock_irqrestore(&zone->lru_lock, flags);
return 0;
}
/*
* FIXME: speed this up?
*/
void fastcall activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && !PageActive(page)) {
del_page_from_inactive_list(zone, page);
SetPageActive(page);
add_page_to_active_list(zone, page);
__count_vm_event(PGACTIVATE);
}
spin_unlock_irq(&zone->lru_lock);
}
/*
* Mark a page as having seen activity.
*
* inactive,unreferenced -> inactive,referenced
* inactive,referenced -> active,unreferenced
* active,unreferenced -> active,referenced
*/
void fastcall mark_page_accessed(struct page *page)
{
if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
activate_page(page);
ClearPageReferenced(page);
} else if (!PageReferenced(page)) {
SetPageReferenced(page);
}
}
EXPORT_SYMBOL(mark_page_accessed);
/**
* lru_cache_add: add a page to the page lists
* @page: the page to add
*/
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
void fastcall lru_cache_add(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
page_cache_get(page);
if (!pagevec_add(pvec, page))
__pagevec_lru_add(pvec);
put_cpu_var(lru_add_pvecs);
}
void fastcall lru_cache_add_active(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
page_cache_get(page);
if (!pagevec_add(pvec, page))
__pagevec_lru_add_active(pvec);
put_cpu_var(lru_add_active_pvecs);
}
static void __lru_add_drain(int cpu)
{
struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
/* CPU is dead, so no locking needed. */
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
pvec = &per_cpu(lru_add_active_pvecs, cpu);
if (pagevec_count(pvec))
__pagevec_lru_add_active(pvec);
}
void lru_add_drain(void)
{
__lru_add_drain(get_cpu());
put_cpu();
}
#ifdef CONFIG_NUMA
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
}
/*
* Returns 0 for success
*/
int lru_add_drain_all(void)
{
return schedule_on_each_cpu(lru_add_drain_per_cpu);
}
#else
/*
* Returns 0 for success
*/
int lru_add_drain_all(void)
{
lru_add_drain();
return 0;
}
#endif
/*
* Batched page_cache_release(). Decrement the reference count on all the
* passed pages. If it fell to zero then remove the page from the LRU and
* free it.
*
* Avoid taking zone->lru_lock if possible, but if it is taken, retain it
* for the remainder of the operation.
*
* The locking in this function is against shrink_cache(): we recheck the
* page count inside the lock to see whether shrink_cache grabbed the page
* via the LRU. If it did, give up: shrink_cache will free it.
*/
void release_pages(struct page **pages, int nr, int cold)
{
int i;
struct pagevec pages_to_free;
struct zone *zone = NULL;
pagevec_init(&pages_to_free, cold);
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
if (unlikely(PageCompound(page))) {
if (zone) {
spin_unlock_irq(&zone->lru_lock);
zone = NULL;
}
put_compound_page(page);
continue;
}
if (!put_page_testzero(page))
continue;
if (PageLRU(page)) {
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
del_page_from_lru(zone, page);
}
if (!pagevec_add(&pages_to_free, page)) {
if (zone) {
spin_unlock_irq(&zone->lru_lock);
zone = NULL;
}
__pagevec_free(&pages_to_free);
pagevec_reinit(&pages_to_free);
}
}
if (zone)
spin_unlock_irq(&zone->lru_lock);
pagevec_free(&pages_to_free);
}
/*
* The pages which we're about to release may be in the deferred lru-addition
* queues. That would prevent them from really being freed right now. That's
* OK from a correctness point of view but is inefficient - those pages may be
* cache-warm and we want to give them back to the page allocator ASAP.
*
* So __pagevec_release() will drain those queues here. __pagevec_lru_add()
* and __pagevec_lru_add_active() call release_pages() directly to avoid
* mutual recursion.
*/
void __pagevec_release(struct pagevec *pvec)
{
lru_add_drain();
release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
/*
* pagevec_release() for pages which are known to not be on the LRU
*
* This function reinitialises the caller's pagevec.
*/
void __pagevec_release_nonlru(struct pagevec *pvec)
{
int i;
struct pagevec pages_to_free;
pagevec_init(&pages_to_free, pvec->cold);
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
VM_BUG_ON(PageLRU(page));
if (put_page_testzero(page))
pagevec_add(&pages_to_free, page);
}
pagevec_free(&pages_to_free);
pagevec_reinit(pvec);
}
/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
void __pagevec_lru_add(struct pagevec *pvec)
{
int i;
struct zone *zone = NULL;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
add_page_to_inactive_list(zone, page);
}
if (zone)
spin_unlock_irq(&zone->lru_lock);
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_lru_add);
void __pagevec_lru_add_active(struct pagevec *pvec)
{
int i;
struct zone *zone = NULL;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
VM_BUG_ON(PageActive(page));
SetPageActive(page);
add_page_to_active_list(zone, page);
}
if (zone)
spin_unlock_irq(&zone->lru_lock);
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
/*
* Try to drop buffers from the pages in a pagevec
*/
void pagevec_strip(struct pagevec *pvec)
{
int i;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
if (PagePrivate(page) && !TestSetPageLocked(page)) {
if (PagePrivate(page))
try_to_release_page(page, 0);
unlock_page(page);
}
}
}
/**
* pagevec_lookup - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
* @start: The starting page index
* @nr_pages: The maximum number of pages
*
* pagevec_lookup() will search for and return a group of up to @nr_pages pages
* in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a
* reference against the pages in @pvec.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages.
*
* pagevec_lookup() returns the number of pages which were found.
*/
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned nr_pages)
{
pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
pgoff_t *index, int tag, unsigned nr_pages)
{
pvec->nr = find_get_pages_tag(mapping, index, tag,
nr_pages, pvec->pages);
return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);
#ifdef CONFIG_SMP
/*
* We tolerate a little inaccuracy to avoid ping-ponging the counter between
* CPUs
*/
#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
static DEFINE_PER_CPU(long, committed_space) = 0;
void vm_acct_memory(long pages)
{
long *local;
preempt_disable();
local = &__get_cpu_var(committed_space);
*local += pages;
if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
atomic_add(*local, &vm_committed_space);
*local = 0;
}
preempt_enable();
}
#ifdef CONFIG_HOTPLUG_CPU
/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
long *committed;
committed = &per_cpu(committed_space, (long)hcpu);
if (action == CPU_DEAD) {
atomic_add(*committed, &vm_committed_space);
*committed = 0;
__lru_add_drain((long)hcpu);
}
return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */
/*
* Perform any setup for the swap system
*/
void __init swap_setup(void)
{
unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
page_cluster = 2;
else
page_cluster = 3;
/*
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
#ifdef CONFIG_HOTPLUG_CPU
hotcpu_notifier(cpu_swap_callback, 0);
#endif
}

366
mm/swap_state.c Normal file
View File

@@ -0,0 +1,366 @@
/*
* linux/mm/swap_state.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
* Swap reorganised 29.12.95, Stephen Tweedie
*
* Rewritten to use page cache, (C) 1998 Stephen Tweedie
*/
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <asm/pgtable.h>
/*
* swapper_space is a fiction, retained to simplify the path through
* vmscan's shrink_list, to make sync_page look nicer, and to allow
* future use of radix_tree tags in the swap cache.
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
.sync_page = block_sync_page,
.set_page_dirty = __set_page_dirty_nobuffers,
.migratepage = migrate_page,
};
static struct backing_dev_info swap_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
.unplug_io_fn = swap_unplug_io_fn,
};
struct address_space swapper_space = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
.tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
.a_ops = &swap_aops,
.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
.backing_dev_info = &swap_backing_dev_info,
};
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
static struct {
unsigned long add_total;
unsigned long del_total;
unsigned long find_success;
unsigned long find_total;
unsigned long noent_race;
unsigned long exist_race;
} swap_cache_info;
void show_swap_cache_info(void)
{
printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
swap_cache_info.add_total, swap_cache_info.del_total,
swap_cache_info.find_success, swap_cache_info.find_total,
swap_cache_info.noent_race, swap_cache_info.exist_race);
printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
/*
* __add_to_swap_cache resembles add_to_page_cache on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
gfp_t gfp_mask)
{
int error;
BUG_ON(PageSwapCache(page));
BUG_ON(PagePrivate(page));
error = radix_tree_preload(gfp_mask);
if (!error) {
write_lock_irq(&swapper_space.tree_lock);
error = radix_tree_insert(&swapper_space.page_tree,
entry.val, page);
if (!error) {
page_cache_get(page);
SetPageLocked(page);
SetPageSwapCache(page);
set_page_private(page, entry.val);
total_swapcache_pages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
}
write_unlock_irq(&swapper_space.tree_lock);
radix_tree_preload_end();
}
return error;
}
static int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
int error;
if (!swap_duplicate(entry)) {
INC_CACHE_INFO(noent_race);
return -ENOENT;
}
error = __add_to_swap_cache(page, entry, GFP_KERNEL);
/*
* Anon pages are already on the LRU, we don't run lru_cache_add here.
*/
if (error) {
swap_free(entry);
if (error == -EEXIST)
INC_CACHE_INFO(exist_race);
return error;
}
INC_CACHE_INFO(add_total);
return 0;
}
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
void __delete_from_swap_cache(struct page *page)
{
BUG_ON(!PageLocked(page));
BUG_ON(!PageSwapCache(page));
BUG_ON(PageWriteback(page));
BUG_ON(PagePrivate(page));
radix_tree_delete(&swapper_space.page_tree, page_private(page));
set_page_private(page, 0);
ClearPageSwapCache(page);
total_swapcache_pages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(del_total);
}
/**
* add_to_swap - allocate swap space for a page
* @page: page we want to move to swap
*
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
*/
int add_to_swap(struct page * page, gfp_t gfp_mask)
{
swp_entry_t entry;
int err;
BUG_ON(!PageLocked(page));
for (;;) {
entry = get_swap_page();
if (!entry.val)
return 0;
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
* stops emergency reserves from being allocated.
*
* TODO: this could cause a theoretical memory reclaim
* deadlock in the swap out path.
*/
/*
* Add it to the swap cache and mark it dirty
*/
err = __add_to_swap_cache(page, entry,
gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
switch (err) {
case 0: /* Success */
SetPageUptodate(page);
SetPageDirty(page);
INC_CACHE_INFO(add_total);
return 1;
case -EEXIST:
/* Raced with "speculative" read_swap_cache_async */
INC_CACHE_INFO(exist_race);
swap_free(entry);
continue;
default:
/* -ENOMEM radix-tree allocation failure */
swap_free(entry);
return 0;
}
}
}
/*
* This must be called only on pages that have
* been verified to be in the swap cache and locked.
* It will never put the page into the free list,
* the caller has a reference on the page.
*/
void delete_from_swap_cache(struct page *page)
{
swp_entry_t entry;
entry.val = page_private(page);
write_lock_irq(&swapper_space.tree_lock);
__delete_from_swap_cache(page);
write_unlock_irq(&swapper_space.tree_lock);
swap_free(entry);
page_cache_release(page);
}
/*
* Strange swizzling function only for use by shmem_writepage
*/
int move_to_swap_cache(struct page *page, swp_entry_t entry)
{
int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
if (!err) {
remove_from_page_cache(page);
page_cache_release(page); /* pagecache ref */
if (!swap_duplicate(entry))
BUG();
SetPageDirty(page);
INC_CACHE_INFO(add_total);
} else if (err == -EEXIST)
INC_CACHE_INFO(exist_race);
return err;
}
/*
* Strange swizzling function for shmem_getpage (and shmem_unuse)
*/
int move_from_swap_cache(struct page *page, unsigned long index,
struct address_space *mapping)
{
int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
if (!err) {
delete_from_swap_cache(page);
/* shift page from clean_pages to dirty_pages list */
ClearPageDirty(page);
set_page_dirty(page);
}
return err;
}
/*
* If we are the only user, then try to free up the swap cache.
*
* Its ok to check for PageSwapCache without the page lock
* here because we are going to recheck again inside
* exclusive_swap_page() _with_ the lock.
* - Marcelo
*/
static inline void free_swap_cache(struct page *page)
{
if (PageSwapCache(page) && !TestSetPageLocked(page)) {
remove_exclusive_swap_page(page);
unlock_page(page);
}
}
/*
* Perform a free_page(), also freeing any swap cache associated with
* this page if it is the last user of the page.
*/
void free_page_and_swap_cache(struct page *page)
{
free_swap_cache(page);
page_cache_release(page);
}
/*
* Passed an array of pages, drop them all from swapcache and then release
* them. They are removed from the LRU and freed if this is their last use.
*/
void free_pages_and_swap_cache(struct page **pages, int nr)
{
struct page **pagep = pages;
lru_add_drain();
while (nr) {
int todo = min(nr, PAGEVEC_SIZE);
int i;
for (i = 0; i < todo; i++)
free_swap_cache(pagep[i]);
release_pages(pagep, todo, 0);
pagep += todo;
nr -= todo;
}
}
/*
* Lookup a swap entry in the swap cache. A found page will be returned
* unlocked and with its refcount incremented - we rely on the kernel
* lock getting page table operations atomic even if we drop the page
* lock before returning.
*/
struct page * lookup_swap_cache(swp_entry_t entry)
{
struct page *page;
page = find_get_page(&swapper_space, entry.val);
if (page)
INC_CACHE_INFO(find_success);
INC_CACHE_INFO(find_total);
return page;
}
/*
* Locate a page of swap in physical memory, reserving swap cache space
* and reading the disk if it is not already cached.
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
*/
struct page *read_swap_cache_async(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *found_page, *new_page = NULL;
int err;
do {
/*
* First check the swap cache. Since this is normally
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
found_page = find_get_page(&swapper_space, entry.val);
if (found_page)
break;
/*
* Get a new page to read into from swap.
*/
if (!new_page) {
new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
if (!new_page)
break; /* Out of memory */
}
/*
* Associate the page with swap entry in the swap cache.
* May fail (-ENOENT) if swap entry has been freed since
* our caller observed it. May fail (-EEXIST) if there
* is already a page associated with this entry in the
* swap cache: added by a racing read_swap_cache_async,
* or by try_to_swap_out (or shmem_writepage) re-using
* the just freed swap entry for an existing page.
* May fail (-ENOMEM) if radix-tree node allocation failed.
*/
err = add_to_swap_cache(new_page, entry);
if (!err) {
/*
* Initiate read into locked page and return.
*/
lru_cache_add_active(new_page);
swap_readpage(NULL, new_page);
return new_page;
}
} while (err != -ENOENT && err != -ENOMEM);
if (new_page)
page_cache_release(new_page);
return found_page;
}

1801
mm/swapfile.c Normal file

File diff suppressed because it is too large Load Diff

80
mm/thrash.c Normal file
View File

@@ -0,0 +1,80 @@
/*
* mm/thrash.c
*
* Copyright (C) 2004, Red Hat, Inc.
* Copyright (C) 2004, Rik van Riel <riel@redhat.com>
* Released under the GPL, see the file COPYING for details.
*
* Simple token based thrashing protection, using the algorithm
* described in: http://www.cs.wm.edu/~sjiang/token.pdf
*
* Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
* Improved algorithm to pass token:
* Each task has a priority which is incremented if it contended
* for the token in an interval less than its previous attempt.
* If the token is acquired, that task's priority is boosted to prevent
* the token from bouncing around too often and to let the task make
* some progress in its execution.
*/
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
static DEFINE_SPINLOCK(swap_token_lock);
struct mm_struct *swap_token_mm;
static unsigned int global_faults;
void grab_swap_token(void)
{
int current_interval;
global_faults++;
current_interval = global_faults - current->mm->faultstamp;
if (!spin_trylock(&swap_token_lock))
return;
/* First come first served */
if (swap_token_mm == NULL) {
current->mm->token_priority = current->mm->token_priority + 2;
swap_token_mm = current->mm;
goto out;
}
if (current->mm != swap_token_mm) {
if (current_interval < current->mm->last_interval)
current->mm->token_priority++;
else {
current->mm->token_priority--;
if (unlikely(current->mm->token_priority < 0))
current->mm->token_priority = 0;
}
/* Check if we deserve the token */
if (current->mm->token_priority >
swap_token_mm->token_priority) {
current->mm->token_priority += 2;
swap_token_mm = current->mm;
}
} else {
/* Token holder came in again! */
current->mm->token_priority += 2;
}
out:
current->mm->faultstamp = global_faults;
current->mm->last_interval = current_interval;
spin_unlock(&swap_token_lock);
return;
}
/* Called on process exit. */
void __put_swap_token(struct mm_struct *mm)
{
spin_lock(&swap_token_lock);
if (likely(mm == swap_token_mm))
swap_token_mm = NULL;
spin_unlock(&swap_token_lock);
}

150
mm/tiny-shmem.c Normal file
View File

@@ -0,0 +1,150 @@
/*
* tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
*
* Matt Mackall <mpm@selenic.com> January, 2004
* derived from mm/shmem.c and fs/ramfs/inode.c
*
* This is intended for small system where the benefits of the full
* shmem code (swap-backed and resource-limited) are outweighed by
* their complexity. On systems without swap this code should be
* effectively equivalent, but much lighter weight.
*/
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/ramfs.h>
static struct file_system_type tmpfs_fs_type = {
.name = "tmpfs",
.get_sb = ramfs_get_sb,
.kill_sb = kill_litter_super,
};
static struct vfsmount *shm_mnt;
static int __init init_tmpfs(void)
{
BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
shm_mnt = kern_mount(&tmpfs_fs_type);
BUG_ON(IS_ERR(shm_mnt));
return 0;
}
module_init(init_tmpfs)
/*
* shmem_file_setup - get an unlinked file living in tmpfs
*
* @name: name for dentry (to be seen in /proc/<pid>/maps
* @size: size to be set for the file
*
*/
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
{
int error;
struct file *file;
struct inode *inode;
struct dentry *dentry, *root;
struct qstr this;
if (IS_ERR(shm_mnt))
return (void *)shm_mnt;
error = -ENOMEM;
this.name = name;
this.len = strlen(name);
this.hash = 0; /* will go */
root = shm_mnt->mnt_root;
dentry = d_alloc(root, &this);
if (!dentry)
goto put_memory;
error = -ENFILE;
file = get_empty_filp();
if (!file)
goto put_dentry;
error = -ENOSPC;
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto close_file;
d_instantiate(dentry, inode);
inode->i_nlink = 0; /* It is unlinked */
file->f_path.mnt = mntget(shm_mnt);
file->f_path.dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_op = &ramfs_file_operations;
file->f_mode = FMODE_WRITE | FMODE_READ;
/* notify everyone as to the change of file size */
error = do_truncate(dentry, size, 0, file);
if (error < 0)
goto close_file;
return file;
close_file:
put_filp(file);
put_dentry:
dput(dentry);
put_memory:
return ERR_PTR(error);
}
/*
* shmem_zero_setup - setup a shared anonymous mapping
*
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
*/
int shmem_zero_setup(struct vm_area_struct *vma)
{
struct file *file;
loff_t size = vma->vm_end - vma->vm_start;
file = shmem_file_setup("dev/zero", size, vma->vm_flags);
if (IS_ERR(file))
return PTR_ERR(file);
if (vma->vm_file)
fput(vma->vm_file);
vma->vm_file = file;
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
int shmem_unuse(swp_entry_t entry, struct page *page)
{
return 0;
}
#if 0
int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
#ifndef CONFIG_MMU
return ramfs_nommu_mmap(file, vma);
#else
return 0;
#endif
}
#endif /* 0 */
#ifndef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
unsigned long addr,
unsigned long len,
unsigned long pgoff,
unsigned long flags)
{
return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif

444
mm/truncate.c Normal file
View File

@@ -0,0 +1,444 @@
/*
* mm/truncate.c - code for taking down pages from address_spaces
*
* Copyright (C) 2002, Linus Torvalds
*
* 10Sep2002 akpm@zip.com.au
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
/**
* do_invalidatepage - invalidate part of all of a page
* @page: the page which is affected
* @offset: the index of the truncation point
*
* do_invalidatepage() is called when all or part of the page has become
* invalidated by a truncate operation.
*
* do_invalidatepage() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
* is underway against any of the blocks which are outside the truncation
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
void do_invalidatepage(struct page *page, unsigned long offset)
{
void (*invalidatepage)(struct page *, unsigned long);
invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
if (!invalidatepage)
invalidatepage = block_invalidatepage;
#endif
if (invalidatepage)
(*invalidatepage)(page, offset);
}
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
if (PagePrivate(page))
do_invalidatepage(page, partial);
}
/*
* This cancels just the dirty bit on the kernel page itself, it
* does NOT actually remove dirty bits on any mmap's that may be
* around. It also leaves the page tagged dirty, so any sync
* activity will still find it on the dirty lists, and in particular,
* clear_page_dirty_for_io() will still look at the dirty bits in
* the VM.
*
* Doing this should *normally* only ever be done when a page
* is truncated, and is not actually mapped anywhere at all. However,
* fs/buffer.c does this when it notices that somebody has cleaned
* out all the buffers on a page without actually doing it through
* the VM. Can you say "ext3 is horribly ugly"? Tought you could.
*/
void cancel_dirty_page(struct page *page, unsigned int account_size)
{
if (TestClearPageDirty(page)) {
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
if (account_size)
task_io_account_cancelled_write(account_size);
}
}
}
EXPORT_SYMBOL(cancel_dirty_page);
/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes anonymous. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_nopage().
*
* We need to bale out if page->mapping is no longer equal to the original
* mapping. This happens a) when the VM reclaimed the page while we waited on
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
static void
truncate_complete_page(struct address_space *mapping, struct page *page)
{
if (page->mapping != mapping)
return;
cancel_dirty_page(page, PAGE_CACHE_SIZE);
if (PagePrivate(page))
do_invalidatepage(page, 0);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
remove_from_page_cache(page);
page_cache_release(page); /* pagecache ref */
}
/*
* This is for invalidate_mapping_pages(). That function can be called at
* any time, and is not supposed to throw away dirty pages. But pages can
* be marked dirty at any time too, so use remove_mapping which safely
* discards clean, unused pages.
*
* Returns non-zero if the page was successfully invalidated.
*/
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
int ret;
if (page->mapping != mapping)
return 0;
if (PagePrivate(page) && !try_to_release_page(page, 0))
return 0;
ret = remove_mapping(mapping, page);
return ret;
}
/**
* truncate_inode_pages - truncate range of pages specified by start and
* end byte offsets
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
* @lend: offset to which to truncate
*
* Truncate the page cache, removing the pages that are between
* specified offsets (and zeroing out partial page
* (if lstart is not page aligned)).
*
* Truncate takes two passes - the first pass is nonblocking. It will not
* block on page locks and it will not block on writeback. The second pass
* will wait. This is to prevent as much IO as possible in the affected region.
* The first pass will remove most pages, so the search cost of the second pass
* is low.
*
* When looking at page->index outside the page lock we need to be careful to
* copy it into a local to avoid races (it could change at any time).
*
* We pass down the cache-hot hint to the page freeing code. Even if the
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
*/
void truncate_inode_pages_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
pgoff_t end;
const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
struct pagevec pvec;
pgoff_t next;
int i;
if (mapping->nrpages == 0)
return;
BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
end = (lend >> PAGE_CACHE_SHIFT);
pagevec_init(&pvec, 0);
next = start;
while (next <= end &&
pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index = page->index;
if (page_index > end) {
next = page_index;
break;
}
if (page_index > next)
next = page_index;
next++;
if (TestSetPageLocked(page))
continue;
if (PageWriteback(page)) {
unlock_page(page);
continue;
}
truncate_complete_page(mapping, page);
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
if (partial) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
wait_on_page_writeback(page);
truncate_partial_page(page, partial);
unlock_page(page);
page_cache_release(page);
}
}
next = start;
for ( ; ; ) {
cond_resched();
if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
if (next == start)
break;
next = start;
continue;
}
if (pvec.pages[0]->index > end) {
pagevec_release(&pvec);
break;
}
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
if (page->index > end)
break;
lock_page(page);
wait_on_page_writeback(page);
if (page->index > next)
next = page->index;
next++;
truncate_complete_page(mapping, page);
unlock_page(page);
}
pagevec_release(&pvec);
}
}
EXPORT_SYMBOL(truncate_inode_pages_range);
/**
* truncate_inode_pages - truncate *all* the pages from an offset
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
*
* Called under (and serialised by) inode->i_mutex.
*/
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);
/**
* invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
* @mapping: the address_space which holds the pages to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
*
* This function only removes the unlocked pages, if you want to
* remove all the pages of one inode, you must call truncate_inode_pages.
*
* invalidate_mapping_pages() will not block on IO activity. It will not
* invalidate pages which are dirty, locked, under writeback or mapped into
* pagetables.
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
pgoff_t next = start;
unsigned long ret = 0;
int i;
pagevec_init(&pvec, 0);
while (next <= end &&
pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t index;
int lock_failed;
lock_failed = TestSetPageLocked(page);
/*
* We really shouldn't be looking at the ->index of an
* unlocked page. But we're not allowed to lock these
* pages. So we rely upon nobody altering the ->index
* of this (pinned-by-us) page.
*/
index = page->index;
if (index > next)
next = index;
next++;
if (lock_failed)
continue;
if (PageDirty(page) || PageWriteback(page))
goto unlock;
if (page_mapped(page))
goto unlock;
ret += invalidate_complete_page(mapping, page);
unlock:
unlock_page(page);
if (next > end)
break;
}
pagevec_release(&pvec);
}
return ret;
}
EXPORT_SYMBOL(invalidate_mapping_pages);
/*
* This is like invalidate_complete_page(), except it ignores the page's
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
* shrink_list() has a temp ref on them, or because they're transiently sitting
* in the lru_cache_add() pagevecs.
*/
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
if (page->mapping != mapping)
return 0;
if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
write_lock_irq(&mapping->tree_lock);
if (PageDirty(page))
goto failed;
BUG_ON(PagePrivate(page));
__remove_from_page_cache(page);
write_unlock_irq(&mapping->tree_lock);
ClearPageUptodate(page);
page_cache_release(page); /* pagecache ref */
return 1;
failed:
write_unlock_irq(&mapping->tree_lock);
return 0;
}
static int do_launder_page(struct address_space *mapping, struct page *page)
{
if (!PageDirty(page))
return 0;
if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
return 0;
return mapping->a_ops->launder_page(page);
}
/**
* invalidate_inode_pages2_range - remove range of pages from an address_space
* @mapping: the address_space
* @start: the page offset 'from' which to invalidate
* @end: the page offset 'to' which to invalidate (inclusive)
*
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
* Returns -EIO if any pages could not be invalidated.
*/
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
pgoff_t next;
int i;
int ret = 0;
int did_range_unmap = 0;
int wrapped = 0;
pagevec_init(&pvec, 0);
next = start;
while (next <= end && !wrapped &&
pagevec_lookup(&pvec, mapping, next,
min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index;
lock_page(page);
if (page->mapping != mapping) {
unlock_page(page);
continue;
}
page_index = page->index;
next = page_index + 1;
if (next == 0)
wrapped = 1;
if (page_index > end) {
unlock_page(page);
break;
}
wait_on_page_writeback(page);
while (page_mapped(page)) {
if (!did_range_unmap) {
/*
* Zap the rest of the file in one hit.
*/
unmap_mapping_range(mapping,
(loff_t)page_index<<PAGE_CACHE_SHIFT,
(loff_t)(end - page_index + 1)
<< PAGE_CACHE_SHIFT,
0);
did_range_unmap = 1;
} else {
/*
* Just zap this page
*/
unmap_mapping_range(mapping,
(loff_t)page_index<<PAGE_CACHE_SHIFT,
PAGE_CACHE_SIZE, 0);
}
}
ret = do_launder_page(mapping, page);
if (ret == 0 && !invalidate_complete_page2(mapping, page))
ret = -EIO;
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
/**
* invalidate_inode_pages2 - remove all pages from an address_space
* @mapping: the address_space
*
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
* Returns -EIO if any pages could not be invalidated.
*/
int invalidate_inode_pages2(struct address_space *mapping)
{
return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);

94
mm/util.c Normal file
View File

@@ -0,0 +1,94 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/err.h>
#include <asm/uaccess.h>
/**
* __kzalloc - allocate memory. The memory is set to zero.
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*/
void *__kzalloc(size_t size, gfp_t flags)
{
void *ret = kmalloc_track_caller(size, flags);
if (ret)
memset(ret, 0, size);
return ret;
}
EXPORT_SYMBOL(__kzalloc);
/*
* kstrdup - allocate space for and copy an existing string
*
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*/
char *kstrdup(const char *s, gfp_t gfp)
{
size_t len;
char *buf;
if (!s)
return NULL;
len = strlen(s) + 1;
buf = kmalloc_track_caller(len, gfp);
if (buf)
memcpy(buf, s, len);
return buf;
}
EXPORT_SYMBOL(kstrdup);
/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
* @len: memory region length
* @gfp: GFP mask to use
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
void *p;
p = kmalloc_track_caller(len, gfp);
if (p)
memcpy(p, src, len);
return p;
}
EXPORT_SYMBOL(kmemdup);
/*
* strndup_user - duplicate an existing string from user space
*
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
*/
char *strndup_user(const char __user *s, long n)
{
char *p;
long length;
length = strnlen_user(s, n);
if (!length)
return ERR_PTR(-EFAULT);
if (length > n)
return ERR_PTR(-EINVAL);
p = kmalloc(length, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
if (copy_from_user(p, s, length)) {
kfree(p);
return ERR_PTR(-EFAULT);
}
p[length - 1] = '\0';
return p;
}
EXPORT_SYMBOL(strndup_user);

749
mm/vmalloc.c Normal file
View File

@@ -0,0 +1,749 @@
/*
* linux/mm/vmalloc.c
*
* Copyright (C) 1993 Linus Torvalds
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
* Numa awareness, Christoph Lameter, SGI, June 2005
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
int node);
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
pte_t *pte;
pte = pte_offset_kernel(pmd, addr);
do {
pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
} while (pte++, addr += PAGE_SIZE, addr != end);
}
static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next);
} while (pmd++, addr = next, addr != end);
}
static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
unsigned long end)
{
pud_t *pud;
unsigned long next;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next);
} while (pud++, addr = next, addr != end);
}
void unmap_vm_area(struct vm_struct *area)
{
pgd_t *pgd;
unsigned long next;
unsigned long addr = (unsigned long) area->addr;
unsigned long end = addr + area->size;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
flush_cache_vunmap(addr, end);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
vunmap_pud_range(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
flush_tlb_kernel_range((unsigned long) area->addr, end);
}
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page ***pages)
{
pte_t *pte;
pte = pte_alloc_kernel(pmd, addr);
if (!pte)
return -ENOMEM;
do {
struct page *page = **pages;
WARN_ON(!pte_none(*pte));
if (!page)
return -ENOMEM;
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*pages)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
return 0;
}
static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page ***pages)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_alloc(&init_mm, pud, addr);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
if (vmap_pte_range(pmd, addr, next, prot, pages))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page ***pages)
{
pud_t *pud;
unsigned long next;
pud = pud_alloc(&init_mm, pgd, addr);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
if (vmap_pmd_range(pud, addr, next, prot, pages))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
{
pgd_t *pgd;
unsigned long next;
unsigned long addr = (unsigned long) area->addr;
unsigned long end = addr + area->size - PAGE_SIZE;
int err;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages);
if (err)
break;
} while (pgd++, addr = next, addr != end);
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
int node, gfp_t gfp_mask)
{
struct vm_struct **p, *tmp, *area;
unsigned long align = 1;
unsigned long addr;
BUG_ON(in_interrupt());
if (flags & VM_IOREMAP) {
int bit = fls(size);
if (bit > IOREMAP_MAX_ORDER)
bit = IOREMAP_MAX_ORDER;
else if (bit < PAGE_SHIFT)
bit = PAGE_SHIFT;
align = 1ul << bit;
}
addr = ALIGN(start, align);
size = PAGE_ALIGN(size);
if (unlikely(!size))
return NULL;
area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node);
if (unlikely(!area))
return NULL;
/*
* We always allocate a guard page.
*/
size += PAGE_SIZE;
write_lock(&vmlist_lock);
for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
if ((unsigned long)tmp->addr < addr) {
if((unsigned long)tmp->addr + tmp->size >= addr)
addr = ALIGN(tmp->size +
(unsigned long)tmp->addr, align);
continue;
}
if ((size + addr) < addr)
goto out;
if (size + addr <= (unsigned long)tmp->addr)
goto found;
addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
if (addr > end - size)
goto out;
}
found:
area->next = *p;
*p = area;
area->flags = flags;
area->addr = (void *)addr;
area->size = size;
area->pages = NULL;
area->nr_pages = 0;
area->phys_addr = 0;
write_unlock(&vmlist_lock);
return area;
out:
write_unlock(&vmlist_lock);
kfree(area);
if (printk_ratelimit())
printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
return NULL;
}
struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end)
{
return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
}
/**
* get_vm_area - reserve a contingous kernel virtual area
* @size: size of the area
* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
*
* Search an area of @size in the kernel virtual mapping area,
* and reserved it for out purposes. Returns the area descriptor
* on success or %NULL on failure.
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
}
struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
int node, gfp_t gfp_mask)
{
return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
gfp_mask);
}
/* Caller must hold vmlist_lock */
static struct vm_struct *__find_vm_area(void *addr)
{
struct vm_struct *tmp;
for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
if (tmp->addr == addr)
break;
}
return tmp;
}
/* Caller must hold vmlist_lock */
static struct vm_struct *__remove_vm_area(void *addr)
{
struct vm_struct **p, *tmp;
for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
if (tmp->addr == addr)
goto found;
}
return NULL;
found:
unmap_vm_area(tmp);
*p = tmp->next;
/*
* Remove the guard page.
*/
tmp->size -= PAGE_SIZE;
return tmp;
}
/**
* remove_vm_area - find and remove a contingous kernel virtual area
* @addr: base address
*
* Search for the kernel VM area starting at @addr, and remove it.
* This function returns the found VM area, but using it is NOT safe
* on SMP machines, except for its size or flags.
*/
struct vm_struct *remove_vm_area(void *addr)
{
struct vm_struct *v;
write_lock(&vmlist_lock);
v = __remove_vm_area(addr);
write_unlock(&vmlist_lock);
return v;
}
void __vunmap(void *addr, int deallocate_pages)
{
struct vm_struct *area;
if (!addr)
return;
if ((PAGE_SIZE-1) & (unsigned long)addr) {
printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
WARN_ON(1);
return;
}
area = remove_vm_area(addr);
if (unlikely(!area)) {
printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
WARN_ON(1);
return;
}
debug_check_no_locks_freed(addr, area->size);
if (deallocate_pages) {
int i;
for (i = 0; i < area->nr_pages; i++) {
BUG_ON(!area->pages[i]);
__free_page(area->pages[i]);
}
if (area->flags & VM_VPAGES)
vfree(area->pages);
else
kfree(area->pages);
}
kfree(area);
return;
}
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
*
* Free the virtually contiguous memory area starting at @addr, as
* obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
* NULL, no operation is performed.
*
* Must not be called in interrupt context.
*/
void vfree(void *addr)
{
BUG_ON(in_interrupt());
__vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
/**
* vunmap - release virtual mapping obtained by vmap()
* @addr: memory base address
*
* Free the virtually contiguous memory area starting at @addr,
* which was created from the page array passed to vmap().
*
* Must not be called in interrupt context.
*/
void vunmap(void *addr)
{
BUG_ON(in_interrupt());
__vunmap(addr, 0);
}
EXPORT_SYMBOL(vunmap);
/**
* vmap - map an array of pages into virtually contiguous space
* @pages: array of page pointers
* @count: number of pages to map
* @flags: vm_area->flags
* @prot: page protection for the mapping
*
* Maps @count pages from @pages into contiguous kernel virtual
* space.
*/
void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
if (count > num_physpages)
return NULL;
area = get_vm_area((count << PAGE_SHIFT), flags);
if (!area)
return NULL;
if (map_vm_area(area, prot, &pages)) {
vunmap(area->addr);
return NULL;
}
return area->addr;
}
EXPORT_SYMBOL(vmap);
void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size,
(gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)),
node);
}
area->pages = pages;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
memset(area->pages, 0, array_size);
for (i = 0; i < area->nr_pages; i++) {
if (node < 0)
area->pages[i] = alloc_page(gfp_mask);
else
area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
if (unlikely(!area->pages[i])) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
}
if (map_vm_area(area, prot, &pages))
goto fail;
return area->addr;
fail:
vfree(area->addr);
return NULL;
}
void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
{
return __vmalloc_area_node(area, gfp_mask, prot, -1);
}
/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @node: node to use for allocation or -1
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
int node)
{
struct vm_struct *area;
size = PAGE_ALIGN(size);
if (!size || (size >> PAGE_SHIFT) > num_physpages)
return NULL;
area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask);
if (!area)
return NULL;
return __vmalloc_area_node(area, gfp_mask, prot, node);
}
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
return __vmalloc_node(size, gfp_mask, prot, -1);
}
EXPORT_SYMBOL(__vmalloc);
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}
EXPORT_SYMBOL(vmalloc);
/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
* @size: allocation size
*
* The resulting memory area is zeroed so it can be mapped to userspace
* without leaking data.
*/
void *vmalloc_user(unsigned long size)
{
struct vm_struct *area;
void *ret;
ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
if (ret) {
write_lock(&vmlist_lock);
area = __find_vm_area(ret);
area->flags |= VM_USERMAP;
write_unlock(&vmlist_lock);
}
return ret;
}
EXPORT_SYMBOL(vmalloc_user);
/**
* vmalloc_node - allocate memory on a specific node
* @size: allocation size
* @node: numa node
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc_node(unsigned long size, int node)
{
return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
}
EXPORT_SYMBOL(vmalloc_node);
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
*
* Kernel-internal function to allocate enough pages to cover @size
* the page level allocator and map them into contiguous and
* executable kernel virtual space.
*
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc_exec(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
}
/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
* @size: allocation size
*
* Allocate enough 32bit PA addressable pages to cover @size from the
* page level allocator and map them into contiguous kernel virtual space.
*/
void *vmalloc_32(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}
EXPORT_SYMBOL(vmalloc_32);
/**
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
* @size: allocation size
*
* The resulting memory area is 32bit addressable and zeroed so it can be
* mapped to userspace without leaking data.
*/
void *vmalloc_32_user(unsigned long size)
{
struct vm_struct *area;
void *ret;
ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
if (ret) {
write_lock(&vmlist_lock);
area = __find_vm_area(ret);
area->flags |= VM_USERMAP;
write_unlock(&vmlist_lock);
}
return ret;
}
EXPORT_SYMBOL(vmalloc_32_user);
long vread(char *buf, char *addr, unsigned long count)
{
struct vm_struct *tmp;
char *vaddr, *buf_start = buf;
unsigned long n;
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
read_lock(&vmlist_lock);
for (tmp = vmlist; tmp; tmp = tmp->next) {
vaddr = (char *) tmp->addr;
if (addr >= vaddr + tmp->size - PAGE_SIZE)
continue;
while (addr < vaddr) {
if (count == 0)
goto finished;
*buf = '\0';
buf++;
addr++;
count--;
}
n = vaddr + tmp->size - PAGE_SIZE - addr;
do {
if (count == 0)
goto finished;
*buf = *addr;
buf++;
addr++;
count--;
} while (--n > 0);
}
finished:
read_unlock(&vmlist_lock);
return buf - buf_start;
}
long vwrite(char *buf, char *addr, unsigned long count)
{
struct vm_struct *tmp;
char *vaddr, *buf_start = buf;
unsigned long n;
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
read_lock(&vmlist_lock);
for (tmp = vmlist; tmp; tmp = tmp->next) {
vaddr = (char *) tmp->addr;
if (addr >= vaddr + tmp->size - PAGE_SIZE)
continue;
while (addr < vaddr) {
if (count == 0)
goto finished;
buf++;
addr++;
count--;
}
n = vaddr + tmp->size - PAGE_SIZE - addr;
do {
if (count == 0)
goto finished;
*addr = *buf;
buf++;
addr++;
count--;
} while (--n > 0);
}
finished:
read_unlock(&vmlist_lock);
return buf - buf_start;
}
/**
* remap_vmalloc_range - map vmalloc pages to userspace
* @vma: vma to cover (map full range of vma)
* @addr: vmalloc memory
* @pgoff: number of pages into addr before first page to map
* @returns: 0 for success, -Exxx on failure
*
* This function checks that addr is a valid vmalloc'ed area, and
* that it is big enough to cover the vma. Will return failure if
* that criteria isn't met.
*
* Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
{
struct vm_struct *area;
unsigned long uaddr = vma->vm_start;
unsigned long usize = vma->vm_end - vma->vm_start;
int ret;
if ((PAGE_SIZE-1) & (unsigned long)addr)
return -EINVAL;
read_lock(&vmlist_lock);
area = __find_vm_area(addr);
if (!area)
goto out_einval_locked;
if (!(area->flags & VM_USERMAP))
goto out_einval_locked;
if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
goto out_einval_locked;
read_unlock(&vmlist_lock);
addr += pgoff << PAGE_SHIFT;
do {
struct page *page = vmalloc_to_page(addr);
ret = vm_insert_page(vma, uaddr, page);
if (ret)
return ret;
uaddr += PAGE_SIZE;
addr += PAGE_SIZE;
usize -= PAGE_SIZE;
} while (usize > 0);
/* Prevent "things" like memory migration? VM_flags need a cleanup... */
vma->vm_flags |= VM_RESERVED;
return ret;
out_einval_locked:
read_unlock(&vmlist_lock);
return -EINVAL;
}
EXPORT_SYMBOL(remap_vmalloc_range);

1730
mm/vmscan.c Normal file

File diff suppressed because it is too large Load Diff

673
mm/vmstat.c Normal file
View File

@@ -0,0 +1,673 @@
/*
* linux/mm/vmstat.c
*
* Manages VM statistics
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* zoned VM statistics
* Copyright (C) 2006 Silicon Graphics, Inc.,
* Christoph Lameter <christoph@lameter.com>
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/cpu.h>
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
int cpu = 0;
int i;
memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
cpu = first_cpu(*cpumask);
while (cpu < NR_CPUS) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
cpu = next_cpu(cpu, *cpumask);
if (cpu < NR_CPUS)
prefetch(&per_cpu(vm_event_states, cpu));
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
ret[i] += this->event[i];
}
}
/*
* Accumulate the vm event counters across all CPUs.
* The result is unavoidably approximate - it can change
* during and after execution of this function.
*/
void all_vm_events(unsigned long *ret)
{
sum_vm_events(ret, &cpu_online_map);
}
EXPORT_SYMBOL_GPL(all_vm_events);
#ifdef CONFIG_HOTPLUG
/*
* Fold the foreign cpu events into our own.
*
* This is adding to the events on one processor
* but keeps the global counts constant.
*/
void vm_events_fold_cpu(int cpu)
{
struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
int i;
for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
count_vm_events(i, fold_state->event[i]);
fold_state->event[i] = 0;
}
}
#endif /* CONFIG_HOTPLUG */
#endif /* CONFIG_VM_EVENT_COUNTERS */
/*
* Manage combined zone based / global counters
*
* vm_stat contains the global counters
*/
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
static int calculate_threshold(struct zone *zone)
{
int threshold;
int mem; /* memory in 128 MB units */
/*
* The threshold scales with the number of processors and the amount
* of memory per zone. More memory means that we can defer updates for
* longer, more processors could lead to more contention.
* fls() is used to have a cheap way of logarithmic scaling.
*
* Some sample thresholds:
*
* Threshold Processors (fls) Zonesize fls(mem+1)
* ------------------------------------------------------------------
* 8 1 1 0.9-1 GB 4
* 16 2 2 0.9-1 GB 4
* 20 2 2 1-2 GB 5
* 24 2 2 2-4 GB 6
* 28 2 2 4-8 GB 7
* 32 2 2 8-16 GB 8
* 4 2 2 <128M 1
* 30 4 3 2-4 GB 5
* 48 4 3 8-16 GB 8
* 32 8 4 1-2 GB 4
* 32 8 4 0.9-1GB 4
* 10 16 5 <128M 1
* 40 16 5 900M 4
* 70 64 7 2-4 GB 5
* 84 64 7 4-8 GB 6
* 108 512 9 4-8 GB 6
* 125 1024 10 8-16 GB 8
* 125 1024 10 16-32 GB 9
*/
mem = zone->present_pages >> (27 - PAGE_SHIFT);
threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
/*
* Maximum threshold is 125
*/
threshold = min(125, threshold);
return threshold;
}
/*
* Refresh the thresholds for each zone.
*/
static void refresh_zone_stat_thresholds(void)
{
struct zone *zone;
int cpu;
int threshold;
for_each_zone(zone) {
if (!zone->present_pages)
continue;
threshold = calculate_threshold(zone);
for_each_online_cpu(cpu)
zone_pcp(zone, cpu)->stat_threshold = threshold;
}
}
/*
* For use when we know that interrupts are disabled.
*/
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
s8 *p = pcp->vm_stat_diff + item;
long x;
x = delta + *p;
if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
zone_page_state_add(x, zone, item);
x = 0;
}
*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);
/*
* For an unknown interrupt state
*/
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
unsigned long flags;
local_irq_save(flags);
__mod_zone_page_state(zone, item, delta);
local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);
/*
* Optimized increment and decrement functions.
*
* These are only for a single page and therefore can take a struct page *
* argument instead of struct zone *. This allows the inclusion of the code
* generated for page_zone(page) into the optimized functions.
*
* No overflow check is necessary and therefore the differential can be
* incremented or decremented in place which may allow the compilers to
* generate better code.
* The increment or decrement is known and therefore one boundary check can
* be omitted.
*
* NOTE: These functions are very performance sensitive. Change only
* with care.
*
* Some processors have inc/dec instructions that are atomic vs an interrupt.
* However, the code must first determine the differential location in a zone
* based on the processor number and then inc/dec the counter. There is no
* guarantee without disabling preemption that the processor will not change
* in between and therefore the atomicity vs. interrupt cannot be exploited
* in a useful way here.
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
s8 *p = pcp->vm_stat_diff + item;
(*p)++;
if (unlikely(*p > pcp->stat_threshold)) {
int overstep = pcp->stat_threshold / 2;
zone_page_state_add(*p + overstep, zone, item);
*p = -overstep;
}
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
s8 *p = pcp->vm_stat_diff + item;
(*p)--;
if (unlikely(*p < - pcp->stat_threshold)) {
int overstep = pcp->stat_threshold / 2;
zone_page_state_add(*p - overstep, zone, item);
*p = overstep;
}
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
unsigned long flags;
local_irq_save(flags);
__inc_zone_state(zone, item);
local_irq_restore(flags);
}
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
unsigned long flags;
struct zone *zone;
zone = page_zone(page);
local_irq_save(flags);
__inc_zone_state(zone, item);
local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
unsigned long flags;
local_irq_save(flags);
__dec_zone_page_state(page, item);
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
/*
* Update the zone counters for one cpu.
*/
void refresh_cpu_vm_stats(int cpu)
{
struct zone *zone;
int i;
unsigned long flags;
for_each_zone(zone) {
struct per_cpu_pageset *pcp;
if (!populated_zone(zone))
continue;
pcp = zone_pcp(zone, cpu);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
if (pcp->vm_stat_diff[i]) {
local_irq_save(flags);
zone_page_state_add(pcp->vm_stat_diff[i],
zone, i);
pcp->vm_stat_diff[i] = 0;
local_irq_restore(flags);
}
}
}
static void __refresh_cpu_vm_stats(void *dummy)
{
refresh_cpu_vm_stats(smp_processor_id());
}
/*
* Consolidate all counters.
*
* Note that the result is less inaccurate but still inaccurate
* if concurrent processes are allowed to run.
*/
void refresh_vm_stats(void)
{
on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
}
EXPORT_SYMBOL(refresh_vm_stats);
#endif
#ifdef CONFIG_NUMA
/*
* zonelist = the list of zones passed to the allocator
* z = the zone from which the allocation occurred.
*
* Must be called with interrupts disabled.
*/
void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
__inc_zone_state(z, NUMA_HIT);
} else {
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
}
if (z->node == numa_node_id())
__inc_zone_state(z, NUMA_LOCAL);
else
__inc_zone_state(z, NUMA_OTHER);
}
#endif
#ifdef CONFIG_PROC_FS
#include <linux/seq_file.h>
static void *frag_start(struct seq_file *m, loff_t *pos)
{
pg_data_t *pgdat;
loff_t node = *pos;
for (pgdat = first_online_pgdat();
pgdat && node;
pgdat = next_online_pgdat(pgdat))
--node;
return pgdat;
}
static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
pg_data_t *pgdat = (pg_data_t *)arg;
(*pos)++;
return next_online_pgdat(pgdat);
}
static void frag_stop(struct seq_file *m, void *arg)
{
}
/*
* This walks the free areas for each zone.
*/
static int frag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
struct zone *zone;
struct zone *node_zones = pgdat->node_zones;
unsigned long flags;
int order;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
if (!populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (order = 0; order < MAX_ORDER; ++order)
seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
}
return 0;
}
const struct seq_operations fragmentation_op = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
.show = frag_show,
};
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif
#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif
#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
TEXT_FOR_HIGHMEM(xx)
static const char * const vmstat_text[] = {
/* Zoned VM counters */
"nr_free_pages",
"nr_active",
"nr_inactive",
"nr_anon_pages",
"nr_mapped",
"nr_file_pages",
"nr_dirty",
"nr_writeback",
"nr_slab_reclaimable",
"nr_slab_unreclaimable",
"nr_page_table_pages",
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
#ifdef CONFIG_NUMA
"numa_hit",
"numa_miss",
"numa_foreign",
"numa_interleave",
"numa_local",
"numa_other",
#endif
#ifdef CONFIG_VM_EVENT_COUNTERS
"pgpgin",
"pgpgout",
"pswpin",
"pswpout",
TEXTS_FOR_ZONES("pgalloc")
"pgfree",
"pgactivate",
"pgdeactivate",
"pgfault",
"pgmajfault",
TEXTS_FOR_ZONES("pgrefill")
TEXTS_FOR_ZONES("pgsteal")
TEXTS_FOR_ZONES("pgscan_kswapd")
TEXTS_FOR_ZONES("pgscan_direct")
"pginodesteal",
"slabs_scanned",
"kswapd_steal",
"kswapd_inodesteal",
"pageoutrun",
"allocstall",
"pgrotated",
#endif
};
/*
* Output information about zones in @pgdat.
*/
static int zoneinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = arg;
struct zone *zone;
struct zone *node_zones = pgdat->node_zones;
unsigned long flags;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
int i;
if (!populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
seq_printf(m,
"\n pages free %lu"
"\n min %lu"
"\n low %lu"
"\n high %lu"
"\n scanned %lu (a: %lu i: %lu)"
"\n spanned %lu"
"\n present %lu",
zone_page_state(zone, NR_FREE_PAGES),
zone->pages_min,
zone->pages_low,
zone->pages_high,
zone->pages_scanned,
zone->nr_scan_active, zone->nr_scan_inactive,
zone->spanned_pages,
zone->present_pages);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", vmstat_text[i],
zone_page_state(zone, i));
seq_printf(m,
"\n protection: (%lu",
zone->lowmem_reserve[0]);
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
seq_printf(m,
")"
"\n pagesets");
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
int j;
pageset = zone_pcp(zone, i);
for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
seq_printf(m,
"\n cpu: %i pcp: %i"
"\n count: %i"
"\n high: %i"
"\n batch: %i",
i, j,
pageset->pcp[j].count,
pageset->pcp[j].high,
pageset->pcp[j].batch);
}
#ifdef CONFIG_SMP
seq_printf(m, "\n vm stats threshold: %d",
pageset->stat_threshold);
#endif
}
seq_printf(m,
"\n all_unreclaimable: %u"
"\n prev_priority: %i"
"\n start_pfn: %lu",
zone->all_unreclaimable,
zone->prev_priority,
zone->zone_start_pfn);
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
}
return 0;
}
const struct seq_operations zoneinfo_op = {
.start = frag_start, /* iterate over all zones. The same as in
* fragmentation. */
.next = frag_next,
.stop = frag_stop,
.show = zoneinfo_show,
};
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
unsigned long *e;
#endif
int i;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
#ifdef CONFIG_VM_EVENT_COUNTERS
v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
+ sizeof(struct vm_event_state), GFP_KERNEL);
#else
v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
GFP_KERNEL);
#endif
m->private = v;
if (!v)
return ERR_PTR(-ENOMEM);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
e = v + NR_VM_ZONE_STAT_ITEMS;
all_vm_events(e);
e[PGPGIN] /= 2; /* sectors -> kbytes */
e[PGPGOUT] /= 2;
#endif
return v + *pos;
}
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
(*pos)++;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
return (unsigned long *)m->private + *pos;
}
static int vmstat_show(struct seq_file *m, void *arg)
{
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
return 0;
}
static void vmstat_stop(struct seq_file *m, void *arg)
{
kfree(m->private);
m->private = NULL;
}
const struct seq_operations vmstat_op = {
.start = vmstat_start,
.next = vmstat_next,
.stop = vmstat_stop,
.show = vmstat_show,
};
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
/*
* Use the cpu notifier to insure that the thresholds are recalculated
* when necessary.
*/
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_CANCELED:
case CPU_DEAD:
refresh_zone_stat_thresholds();
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata vmstat_notifier =
{ &vmstat_cpuup_callback, NULL, 0 };
int __init setup_vmstat(void)
{
refresh_zone_stat_thresholds();
register_cpu_notifier(&vmstat_notifier);
return 0;
}
module_init(setup_vmstat)
#endif