Creation of Cybook 2416 (actually Gen4) repository
This commit is contained in:
165
mm/Kconfig
Normal file
165
mm/Kconfig
Normal file
@@ -0,0 +1,165 @@
|
||||
config SELECT_MEMORY_MODEL
|
||||
def_bool y
|
||||
depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
|
||||
|
||||
choice
|
||||
prompt "Memory model"
|
||||
depends on SELECT_MEMORY_MODEL
|
||||
default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
|
||||
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
|
||||
default FLATMEM_MANUAL
|
||||
|
||||
config FLATMEM_MANUAL
|
||||
bool "Flat Memory"
|
||||
depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
|
||||
help
|
||||
This option allows you to change some of the ways that
|
||||
Linux manages its memory internally. Most users will
|
||||
only have one option here: FLATMEM. This is normal
|
||||
and a correct option.
|
||||
|
||||
Some users of more advanced features like NUMA and
|
||||
memory hotplug may have different options here.
|
||||
DISCONTIGMEM is an more mature, better tested system,
|
||||
but is incompatible with memory hotplug and may suffer
|
||||
decreased performance over SPARSEMEM. If unsure between
|
||||
"Sparse Memory" and "Discontiguous Memory", choose
|
||||
"Discontiguous Memory".
|
||||
|
||||
If unsure, choose this option (Flat Memory) over any other.
|
||||
|
||||
config DISCONTIGMEM_MANUAL
|
||||
bool "Discontiguous Memory"
|
||||
depends on ARCH_DISCONTIGMEM_ENABLE
|
||||
help
|
||||
This option provides enhanced support for discontiguous
|
||||
memory systems, over FLATMEM. These systems have holes
|
||||
in their physical address spaces, and this option provides
|
||||
more efficient handling of these holes. However, the vast
|
||||
majority of hardware has quite flat address spaces, and
|
||||
can have degraded performance from extra overhead that
|
||||
this option imposes.
|
||||
|
||||
Many NUMA configurations will have this as the only option.
|
||||
|
||||
If unsure, choose "Flat Memory" over this option.
|
||||
|
||||
config SPARSEMEM_MANUAL
|
||||
bool "Sparse Memory"
|
||||
depends on ARCH_SPARSEMEM_ENABLE
|
||||
help
|
||||
This will be the only option for some systems, including
|
||||
memory hotplug systems. This is normal.
|
||||
|
||||
For many other systems, this will be an alternative to
|
||||
"Discontiguous Memory". This option provides some potential
|
||||
performance benefits, along with decreased code complexity,
|
||||
but it is newer, and more experimental.
|
||||
|
||||
If unsure, choose "Discontiguous Memory" or "Flat Memory"
|
||||
over this option.
|
||||
|
||||
endchoice
|
||||
|
||||
config DISCONTIGMEM
|
||||
def_bool y
|
||||
depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
|
||||
|
||||
config SPARSEMEM
|
||||
def_bool y
|
||||
depends on SPARSEMEM_MANUAL
|
||||
|
||||
config FLATMEM
|
||||
def_bool y
|
||||
depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
|
||||
|
||||
config FLAT_NODE_MEM_MAP
|
||||
def_bool y
|
||||
depends on !SPARSEMEM
|
||||
|
||||
#
|
||||
# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
|
||||
# to represent different areas of memory. This variable allows
|
||||
# those dependencies to exist individually.
|
||||
#
|
||||
config NEED_MULTIPLE_NODES
|
||||
def_bool y
|
||||
depends on DISCONTIGMEM || NUMA
|
||||
|
||||
config HAVE_MEMORY_PRESENT
|
||||
def_bool y
|
||||
depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
|
||||
|
||||
#
|
||||
# SPARSEMEM_EXTREME (which is the default) does some bootmem
|
||||
# allocations when memory_present() is called. If this cannot
|
||||
# be done on your architecture, select this option. However,
|
||||
# statically allocating the mem_section[] array can potentially
|
||||
# consume vast quantities of .bss, so be careful.
|
||||
#
|
||||
# This option will also potentially produce smaller runtime code
|
||||
# with gcc 3.4 and later.
|
||||
#
|
||||
config SPARSEMEM_STATIC
|
||||
def_bool n
|
||||
|
||||
#
|
||||
# Architecture platforms which require a two level mem_section in SPARSEMEM
|
||||
# must select this option. This is usually for architecture platforms with
|
||||
# an extremely sparse physical address space.
|
||||
#
|
||||
config SPARSEMEM_EXTREME
|
||||
def_bool y
|
||||
depends on SPARSEMEM && !SPARSEMEM_STATIC
|
||||
|
||||
# eventually, we can have this option just 'select SPARSEMEM'
|
||||
config MEMORY_HOTPLUG
|
||||
bool "Allow for memory hot-add"
|
||||
depends on SPARSEMEM || X86_64_ACPI_NUMA
|
||||
depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
depends on (IA64 || X86 || PPC64)
|
||||
|
||||
comment "Memory hotplug is currently incompatible with Software Suspend"
|
||||
depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
|
||||
|
||||
config MEMORY_HOTPLUG_SPARSE
|
||||
def_bool y
|
||||
depends on SPARSEMEM && MEMORY_HOTPLUG
|
||||
|
||||
# Heavily threaded applications may benefit from splitting the mm-wide
|
||||
# page_table_lock, so that faults on different parts of the user address
|
||||
# space can be handled with less contention: split it at this NR_CPUS.
|
||||
# Default to 4 for wider testing, though 8 might be more appropriate.
|
||||
# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
|
||||
# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
|
||||
#
|
||||
config SPLIT_PTLOCK_CPUS
|
||||
int
|
||||
default "4096" if ARM && !CPU_CACHE_VIPT
|
||||
default "4096" if PARISC && !PA20
|
||||
default "4"
|
||||
|
||||
#
|
||||
# support for page migration
|
||||
#
|
||||
config MIGRATION
|
||||
bool "Page migration"
|
||||
def_bool y
|
||||
depends on NUMA
|
||||
help
|
||||
Allows the migration of the physical location of pages of processes
|
||||
while the virtual addresses are not changed. This is useful for
|
||||
example on NUMA systems to put pages nearer to the processors accessing
|
||||
the page.
|
||||
|
||||
config RESOURCES_64BIT
|
||||
bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
|
||||
default 64BIT
|
||||
help
|
||||
This option allows memory and IO resources to be 64 bit.
|
||||
|
||||
config ZONE_DMA_FLAG
|
||||
int
|
||||
default "0" if !ZONE_DMA
|
||||
default "1"
|
||||
|
||||
31
mm/Makefile
Normal file
31
mm/Makefile
Normal file
@@ -0,0 +1,31 @@
|
||||
#
|
||||
# Makefile for the linux memory manager.
|
||||
#
|
||||
|
||||
mmu-y := nommu.o
|
||||
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
|
||||
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
|
||||
vmalloc.o
|
||||
|
||||
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
|
||||
page_alloc.o page-writeback.o pdflush.o \
|
||||
readahead.o swap.o truncate.o vmscan.o \
|
||||
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
|
||||
$(mmu-y)
|
||||
|
||||
ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
|
||||
obj-y += bounce.o
|
||||
endif
|
||||
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
|
||||
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
|
||||
obj-$(CONFIG_NUMA) += mempolicy.o
|
||||
obj-$(CONFIG_SPARSEMEM) += sparse.o
|
||||
obj-$(CONFIG_SHMEM) += shmem.o
|
||||
obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
|
||||
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
|
||||
obj-$(CONFIG_SLOB) += slob.o
|
||||
obj-$(CONFIG_SLAB) += slab.o
|
||||
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
|
||||
obj-$(CONFIG_FS_XIP) += filemap_xip.o
|
||||
obj-$(CONFIG_MIGRATION) += migrate.o
|
||||
obj-$(CONFIG_SMP) += allocpercpu.o
|
||||
130
mm/allocpercpu.c
Normal file
130
mm/allocpercpu.c
Normal file
@@ -0,0 +1,130 @@
|
||||
/*
|
||||
* linux/mm/allocpercpu.c
|
||||
*
|
||||
* Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
|
||||
*/
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
/**
|
||||
* percpu_depopulate - depopulate per-cpu data for given cpu
|
||||
* @__pdata: per-cpu data to depopulate
|
||||
* @cpu: depopulate per-cpu data for this cpu
|
||||
*
|
||||
* Depopulating per-cpu data for a cpu going offline would be a typical
|
||||
* use case. You need to register a cpu hotplug handler for that purpose.
|
||||
*/
|
||||
void percpu_depopulate(void *__pdata, int cpu)
|
||||
{
|
||||
struct percpu_data *pdata = __percpu_disguise(__pdata);
|
||||
|
||||
kfree(pdata->ptrs[cpu]);
|
||||
pdata->ptrs[cpu] = NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(percpu_depopulate);
|
||||
|
||||
/**
|
||||
* percpu_depopulate_mask - depopulate per-cpu data for some cpu's
|
||||
* @__pdata: per-cpu data to depopulate
|
||||
* @mask: depopulate per-cpu data for cpu's selected through mask bits
|
||||
*/
|
||||
void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
|
||||
{
|
||||
int cpu;
|
||||
for_each_cpu_mask(cpu, *mask)
|
||||
percpu_depopulate(__pdata, cpu);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
|
||||
|
||||
/**
|
||||
* percpu_populate - populate per-cpu data for given cpu
|
||||
* @__pdata: per-cpu data to populate further
|
||||
* @size: size of per-cpu object
|
||||
* @gfp: may sleep or not etc.
|
||||
* @cpu: populate per-data for this cpu
|
||||
*
|
||||
* Populating per-cpu data for a cpu coming online would be a typical
|
||||
* use case. You need to register a cpu hotplug handler for that purpose.
|
||||
* Per-cpu object is populated with zeroed buffer.
|
||||
*/
|
||||
void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
|
||||
{
|
||||
struct percpu_data *pdata = __percpu_disguise(__pdata);
|
||||
int node = cpu_to_node(cpu);
|
||||
|
||||
BUG_ON(pdata->ptrs[cpu]);
|
||||
if (node_online(node)) {
|
||||
/* FIXME: kzalloc_node(size, gfp, node) */
|
||||
pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
|
||||
if (pdata->ptrs[cpu])
|
||||
memset(pdata->ptrs[cpu], 0, size);
|
||||
} else
|
||||
pdata->ptrs[cpu] = kzalloc(size, gfp);
|
||||
return pdata->ptrs[cpu];
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(percpu_populate);
|
||||
|
||||
/**
|
||||
* percpu_populate_mask - populate per-cpu data for more cpu's
|
||||
* @__pdata: per-cpu data to populate further
|
||||
* @size: size of per-cpu object
|
||||
* @gfp: may sleep or not etc.
|
||||
* @mask: populate per-cpu data for cpu's selected through mask bits
|
||||
*
|
||||
* Per-cpu objects are populated with zeroed buffers.
|
||||
*/
|
||||
int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
|
||||
cpumask_t *mask)
|
||||
{
|
||||
cpumask_t populated = CPU_MASK_NONE;
|
||||
int cpu;
|
||||
|
||||
for_each_cpu_mask(cpu, *mask)
|
||||
if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
|
||||
__percpu_depopulate_mask(__pdata, &populated);
|
||||
return -ENOMEM;
|
||||
} else
|
||||
cpu_set(cpu, populated);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__percpu_populate_mask);
|
||||
|
||||
/**
|
||||
* percpu_alloc_mask - initial setup of per-cpu data
|
||||
* @size: size of per-cpu object
|
||||
* @gfp: may sleep or not etc.
|
||||
* @mask: populate per-data for cpu's selected through mask bits
|
||||
*
|
||||
* Populating per-cpu data for all online cpu's would be a typical use case,
|
||||
* which is simplified by the percpu_alloc() wrapper.
|
||||
* Per-cpu objects are populated with zeroed buffers.
|
||||
*/
|
||||
void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
|
||||
{
|
||||
void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
|
||||
void *__pdata = __percpu_disguise(pdata);
|
||||
|
||||
if (unlikely(!pdata))
|
||||
return NULL;
|
||||
if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
|
||||
return __pdata;
|
||||
kfree(pdata);
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
|
||||
|
||||
/**
|
||||
* percpu_free - final cleanup of per-cpu data
|
||||
* @__pdata: object to clean up
|
||||
*
|
||||
* We simply clean up any per-cpu object left. No need for the client to
|
||||
* track and specify through a bis mask which per-cpu objects are to free.
|
||||
*/
|
||||
void percpu_free(void *__pdata)
|
||||
{
|
||||
if (unlikely(!__pdata))
|
||||
return;
|
||||
__percpu_depopulate_mask(__pdata, &cpu_possible_map);
|
||||
kfree(__percpu_disguise(__pdata));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(percpu_free);
|
||||
85
mm/backing-dev.c
Normal file
85
mm/backing-dev.c
Normal file
@@ -0,0 +1,85 @@
|
||||
|
||||
#include <linux/wait.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
static wait_queue_head_t congestion_wqh[2] = {
|
||||
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
|
||||
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
|
||||
};
|
||||
|
||||
|
||||
void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
|
||||
{
|
||||
enum bdi_state bit;
|
||||
wait_queue_head_t *wqh = &congestion_wqh[rw];
|
||||
|
||||
bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
|
||||
clear_bit(bit, &bdi->state);
|
||||
smp_mb__after_clear_bit();
|
||||
if (waitqueue_active(wqh))
|
||||
wake_up(wqh);
|
||||
}
|
||||
EXPORT_SYMBOL(clear_bdi_congested);
|
||||
|
||||
void set_bdi_congested(struct backing_dev_info *bdi, int rw)
|
||||
{
|
||||
enum bdi_state bit;
|
||||
|
||||
bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
|
||||
set_bit(bit, &bdi->state);
|
||||
}
|
||||
EXPORT_SYMBOL(set_bdi_congested);
|
||||
|
||||
/**
|
||||
* congestion_wait - wait for a backing_dev to become uncongested
|
||||
* @rw: READ or WRITE
|
||||
* @timeout: timeout in jiffies
|
||||
*
|
||||
* Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
|
||||
* write congestion. If no backing_devs are congested then just wait for the
|
||||
* next write to be completed.
|
||||
*/
|
||||
long congestion_wait(int rw, long timeout)
|
||||
{
|
||||
long ret;
|
||||
DEFINE_WAIT(wait);
|
||||
wait_queue_head_t *wqh = &congestion_wqh[rw];
|
||||
|
||||
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
|
||||
ret = io_schedule_timeout(timeout);
|
||||
finish_wait(wqh, &wait);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(congestion_wait);
|
||||
|
||||
long congestion_wait_interruptible(int rw, long timeout)
|
||||
{
|
||||
long ret;
|
||||
DEFINE_WAIT(wait);
|
||||
wait_queue_head_t *wqh = &congestion_wqh[rw];
|
||||
|
||||
prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
|
||||
if (signal_pending(current))
|
||||
ret = -ERESTARTSYS;
|
||||
else
|
||||
ret = io_schedule_timeout(timeout);
|
||||
finish_wait(wqh, &wait);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(congestion_wait_interruptible);
|
||||
|
||||
/**
|
||||
* congestion_end - wake up sleepers on a congested backing_dev_info
|
||||
* @rw: READ or WRITE
|
||||
*/
|
||||
void congestion_end(int rw)
|
||||
{
|
||||
wait_queue_head_t *wqh = &congestion_wqh[rw];
|
||||
|
||||
if (waitqueue_active(wqh))
|
||||
wake_up(wqh);
|
||||
}
|
||||
EXPORT_SYMBOL(congestion_end);
|
||||
489
mm/bootmem.c
Normal file
489
mm/bootmem.c
Normal file
@@ -0,0 +1,489 @@
|
||||
/*
|
||||
* linux/mm/bootmem.c
|
||||
*
|
||||
* Copyright (C) 1999 Ingo Molnar
|
||||
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
|
||||
*
|
||||
* simple boot-time physical memory area allocator and
|
||||
* free memory collector. It's used to deal with reserved
|
||||
* system memory and memory holes as well.
|
||||
*/
|
||||
#include <linux/init.h>
|
||||
#include <linux/pfn.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <asm/bug.h>
|
||||
#include <asm/io.h>
|
||||
#include <asm/processor.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
/*
|
||||
* Access to this subsystem has to be serialized externally. (this is
|
||||
* true for the boot process anyway)
|
||||
*/
|
||||
unsigned long max_low_pfn;
|
||||
unsigned long min_low_pfn;
|
||||
unsigned long max_pfn;
|
||||
|
||||
static LIST_HEAD(bdata_list);
|
||||
#ifdef CONFIG_CRASH_DUMP
|
||||
/*
|
||||
* If we have booted due to a crash, max_pfn will be a very low value. We need
|
||||
* to know the amount of memory that the previous kernel used.
|
||||
*/
|
||||
unsigned long saved_max_pfn;
|
||||
#endif
|
||||
|
||||
/* return the number of _pages_ that will be allocated for the boot bitmap */
|
||||
unsigned long __init bootmem_bootmap_pages(unsigned long pages)
|
||||
{
|
||||
unsigned long mapsize;
|
||||
|
||||
mapsize = (pages+7)/8;
|
||||
mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
|
||||
mapsize >>= PAGE_SHIFT;
|
||||
|
||||
return mapsize;
|
||||
}
|
||||
|
||||
/*
|
||||
* link bdata in order
|
||||
*/
|
||||
static void __init link_bootmem(bootmem_data_t *bdata)
|
||||
{
|
||||
bootmem_data_t *ent;
|
||||
|
||||
if (list_empty(&bdata_list)) {
|
||||
list_add(&bdata->list, &bdata_list);
|
||||
return;
|
||||
}
|
||||
/* insert in order */
|
||||
list_for_each_entry(ent, &bdata_list, list) {
|
||||
if (bdata->node_boot_start < ent->node_boot_start) {
|
||||
list_add_tail(&bdata->list, &ent->list);
|
||||
return;
|
||||
}
|
||||
}
|
||||
list_add_tail(&bdata->list, &bdata_list);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given an initialised bdata, it returns the size of the boot bitmap
|
||||
*/
|
||||
static unsigned long __init get_mapsize(bootmem_data_t *bdata)
|
||||
{
|
||||
unsigned long mapsize;
|
||||
unsigned long start = PFN_DOWN(bdata->node_boot_start);
|
||||
unsigned long end = bdata->node_low_pfn;
|
||||
|
||||
mapsize = ((end - start) + 7) / 8;
|
||||
return ALIGN(mapsize, sizeof(long));
|
||||
}
|
||||
|
||||
/*
|
||||
* Called once to set up the allocator itself.
|
||||
*/
|
||||
static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
|
||||
unsigned long mapstart, unsigned long start, unsigned long end)
|
||||
{
|
||||
bootmem_data_t *bdata = pgdat->bdata;
|
||||
unsigned long mapsize;
|
||||
|
||||
bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
|
||||
bdata->node_boot_start = PFN_PHYS(start);
|
||||
bdata->node_low_pfn = end;
|
||||
link_bootmem(bdata);
|
||||
|
||||
/*
|
||||
* Initially all pages are reserved - setup_arch() has to
|
||||
* register free RAM areas explicitly.
|
||||
*/
|
||||
mapsize = get_mapsize(bdata);
|
||||
memset(bdata->node_bootmem_map, 0xff, mapsize);
|
||||
|
||||
return mapsize;
|
||||
}
|
||||
|
||||
/*
|
||||
* Marks a particular physical memory range as unallocatable. Usable RAM
|
||||
* might be used for boot-time allocations - or it might get added
|
||||
* to the free page pool later on.
|
||||
*/
|
||||
static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
|
||||
unsigned long size)
|
||||
{
|
||||
unsigned long sidx, eidx;
|
||||
unsigned long i;
|
||||
|
||||
/*
|
||||
* round up, partially reserved pages are considered
|
||||
* fully reserved.
|
||||
*/
|
||||
BUG_ON(!size);
|
||||
BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
|
||||
BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
|
||||
|
||||
sidx = PFN_DOWN(addr - bdata->node_boot_start);
|
||||
eidx = PFN_UP(addr + size - bdata->node_boot_start);
|
||||
|
||||
for (i = sidx; i < eidx; i++)
|
||||
if (test_and_set_bit(i, bdata->node_bootmem_map)) {
|
||||
#ifdef CONFIG_DEBUG_BOOTMEM
|
||||
printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
|
||||
unsigned long size)
|
||||
{
|
||||
unsigned long sidx, eidx;
|
||||
unsigned long i;
|
||||
|
||||
/*
|
||||
* round down end of usable mem, partially free pages are
|
||||
* considered reserved.
|
||||
*/
|
||||
BUG_ON(!size);
|
||||
BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
|
||||
|
||||
if (addr < bdata->last_success)
|
||||
bdata->last_success = addr;
|
||||
|
||||
/*
|
||||
* Round up the beginning of the address.
|
||||
*/
|
||||
sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
|
||||
eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
|
||||
|
||||
for (i = sidx; i < eidx; i++) {
|
||||
if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We 'merge' subsequent allocations to save space. We might 'lose'
|
||||
* some fraction of a page if allocations cannot be satisfied due to
|
||||
* size constraints on boxes where there is physical RAM space
|
||||
* fragmentation - in these cases (mostly large memory boxes) this
|
||||
* is not a problem.
|
||||
*
|
||||
* On low memory boxes we get it right in 100% of the cases.
|
||||
*
|
||||
* alignment has to be a power of 2 value.
|
||||
*
|
||||
* NOTE: This function is _not_ reentrant.
|
||||
*/
|
||||
void * __init
|
||||
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
|
||||
unsigned long align, unsigned long goal, unsigned long limit)
|
||||
{
|
||||
unsigned long offset, remaining_size, areasize, preferred;
|
||||
unsigned long i, start = 0, incr, eidx, end_pfn;
|
||||
void *ret;
|
||||
|
||||
if (!size) {
|
||||
printk("__alloc_bootmem_core(): zero-sized request\n");
|
||||
BUG();
|
||||
}
|
||||
BUG_ON(align & (align-1));
|
||||
|
||||
if (limit && bdata->node_boot_start >= limit)
|
||||
return NULL;
|
||||
|
||||
/* on nodes without memory - bootmem_map is NULL */
|
||||
if (!bdata->node_bootmem_map)
|
||||
return NULL;
|
||||
|
||||
end_pfn = bdata->node_low_pfn;
|
||||
limit = PFN_DOWN(limit);
|
||||
if (limit && end_pfn > limit)
|
||||
end_pfn = limit;
|
||||
|
||||
eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
|
||||
offset = 0;
|
||||
if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
|
||||
offset = align - (bdata->node_boot_start & (align - 1UL));
|
||||
offset = PFN_DOWN(offset);
|
||||
|
||||
/*
|
||||
* We try to allocate bootmem pages above 'goal'
|
||||
* first, then we try to allocate lower pages.
|
||||
*/
|
||||
if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
|
||||
preferred = goal - bdata->node_boot_start;
|
||||
|
||||
if (bdata->last_success >= preferred)
|
||||
if (!limit || (limit && limit > bdata->last_success))
|
||||
preferred = bdata->last_success;
|
||||
} else
|
||||
preferred = 0;
|
||||
|
||||
preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
|
||||
areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
|
||||
incr = align >> PAGE_SHIFT ? : 1;
|
||||
|
||||
restart_scan:
|
||||
for (i = preferred; i < eidx; i += incr) {
|
||||
unsigned long j;
|
||||
i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
|
||||
i = ALIGN(i, incr);
|
||||
if (i >= eidx)
|
||||
break;
|
||||
if (test_bit(i, bdata->node_bootmem_map))
|
||||
continue;
|
||||
for (j = i + 1; j < i + areasize; ++j) {
|
||||
if (j >= eidx)
|
||||
goto fail_block;
|
||||
if (test_bit(j, bdata->node_bootmem_map))
|
||||
goto fail_block;
|
||||
}
|
||||
start = i;
|
||||
goto found;
|
||||
fail_block:
|
||||
i = ALIGN(j, incr);
|
||||
}
|
||||
|
||||
if (preferred > offset) {
|
||||
preferred = offset;
|
||||
goto restart_scan;
|
||||
}
|
||||
return NULL;
|
||||
|
||||
found:
|
||||
bdata->last_success = PFN_PHYS(start);
|
||||
BUG_ON(start >= eidx);
|
||||
|
||||
/*
|
||||
* Is the next page of the previous allocation-end the start
|
||||
* of this allocation's buffer? If yes then we can 'merge'
|
||||
* the previous partial page with this allocation.
|
||||
*/
|
||||
if (align < PAGE_SIZE &&
|
||||
bdata->last_offset && bdata->last_pos+1 == start) {
|
||||
offset = ALIGN(bdata->last_offset, align);
|
||||
BUG_ON(offset > PAGE_SIZE);
|
||||
remaining_size = PAGE_SIZE - offset;
|
||||
if (size < remaining_size) {
|
||||
areasize = 0;
|
||||
/* last_pos unchanged */
|
||||
bdata->last_offset = offset + size;
|
||||
ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
|
||||
offset +
|
||||
bdata->node_boot_start);
|
||||
} else {
|
||||
remaining_size = size - remaining_size;
|
||||
areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
|
||||
ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
|
||||
offset +
|
||||
bdata->node_boot_start);
|
||||
bdata->last_pos = start + areasize - 1;
|
||||
bdata->last_offset = remaining_size;
|
||||
}
|
||||
bdata->last_offset &= ~PAGE_MASK;
|
||||
} else {
|
||||
bdata->last_pos = start + areasize - 1;
|
||||
bdata->last_offset = size & ~PAGE_MASK;
|
||||
ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Reserve the area now:
|
||||
*/
|
||||
for (i = start; i < start + areasize; i++)
|
||||
if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
|
||||
BUG();
|
||||
memset(ret, 0, size);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned long pfn;
|
||||
bootmem_data_t *bdata = pgdat->bdata;
|
||||
unsigned long i, count, total = 0;
|
||||
unsigned long idx;
|
||||
unsigned long *map;
|
||||
int gofast = 0;
|
||||
|
||||
BUG_ON(!bdata->node_bootmem_map);
|
||||
|
||||
count = 0;
|
||||
/* first extant page of the node */
|
||||
pfn = PFN_DOWN(bdata->node_boot_start);
|
||||
idx = bdata->node_low_pfn - pfn;
|
||||
map = bdata->node_bootmem_map;
|
||||
/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
|
||||
if (bdata->node_boot_start == 0 ||
|
||||
ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
|
||||
gofast = 1;
|
||||
for (i = 0; i < idx; ) {
|
||||
unsigned long v = ~map[i / BITS_PER_LONG];
|
||||
|
||||
if (gofast && v == ~0UL) {
|
||||
int order;
|
||||
|
||||
page = pfn_to_page(pfn);
|
||||
count += BITS_PER_LONG;
|
||||
order = ffs(BITS_PER_LONG) - 1;
|
||||
__free_pages_bootmem(page, order);
|
||||
i += BITS_PER_LONG;
|
||||
page += BITS_PER_LONG;
|
||||
} else if (v) {
|
||||
unsigned long m;
|
||||
|
||||
page = pfn_to_page(pfn);
|
||||
for (m = 1; m && i < idx; m<<=1, page++, i++) {
|
||||
if (v & m) {
|
||||
count++;
|
||||
__free_pages_bootmem(page, 0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
i += BITS_PER_LONG;
|
||||
}
|
||||
pfn += BITS_PER_LONG;
|
||||
}
|
||||
total += count;
|
||||
|
||||
/*
|
||||
* Now free the allocator bitmap itself, it's not
|
||||
* needed anymore:
|
||||
*/
|
||||
page = virt_to_page(bdata->node_bootmem_map);
|
||||
count = 0;
|
||||
idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
|
||||
for (i = 0; i < idx; i++, page++) {
|
||||
__free_pages_bootmem(page, 0);
|
||||
count++;
|
||||
}
|
||||
total += count;
|
||||
bdata->node_bootmem_map = NULL;
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
|
||||
unsigned long startpfn, unsigned long endpfn)
|
||||
{
|
||||
return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
|
||||
}
|
||||
|
||||
void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
|
||||
unsigned long size)
|
||||
{
|
||||
reserve_bootmem_core(pgdat->bdata, physaddr, size);
|
||||
}
|
||||
|
||||
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
|
||||
unsigned long size)
|
||||
{
|
||||
free_bootmem_core(pgdat->bdata, physaddr, size);
|
||||
}
|
||||
|
||||
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
|
||||
{
|
||||
return free_all_bootmem_core(pgdat);
|
||||
}
|
||||
|
||||
unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
|
||||
{
|
||||
max_low_pfn = pages;
|
||||
min_low_pfn = start;
|
||||
return init_bootmem_core(NODE_DATA(0), start, 0, pages);
|
||||
}
|
||||
|
||||
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
|
||||
void __init reserve_bootmem(unsigned long addr, unsigned long size)
|
||||
{
|
||||
reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
|
||||
}
|
||||
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
|
||||
|
||||
void __init free_bootmem(unsigned long addr, unsigned long size)
|
||||
{
|
||||
free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
|
||||
}
|
||||
|
||||
unsigned long __init free_all_bootmem(void)
|
||||
{
|
||||
return free_all_bootmem_core(NODE_DATA(0));
|
||||
}
|
||||
|
||||
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
|
||||
unsigned long goal)
|
||||
{
|
||||
bootmem_data_t *bdata;
|
||||
void *ptr;
|
||||
|
||||
list_for_each_entry(bdata, &bdata_list, list) {
|
||||
ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
|
||||
if (ptr)
|
||||
return ptr;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
|
||||
unsigned long goal)
|
||||
{
|
||||
void *mem = __alloc_bootmem_nopanic(size,align,goal);
|
||||
|
||||
if (mem)
|
||||
return mem;
|
||||
/*
|
||||
* Whoops, we cannot satisfy the allocation request.
|
||||
*/
|
||||
printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
|
||||
panic("Out of memory");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
|
||||
unsigned long align, unsigned long goal)
|
||||
{
|
||||
void *ptr;
|
||||
|
||||
ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
|
||||
if (ptr)
|
||||
return ptr;
|
||||
|
||||
return __alloc_bootmem(size, align, goal);
|
||||
}
|
||||
|
||||
#ifndef ARCH_LOW_ADDRESS_LIMIT
|
||||
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
|
||||
#endif
|
||||
|
||||
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
|
||||
unsigned long goal)
|
||||
{
|
||||
bootmem_data_t *bdata;
|
||||
void *ptr;
|
||||
|
||||
list_for_each_entry(bdata, &bdata_list, list) {
|
||||
ptr = __alloc_bootmem_core(bdata, size, align, goal,
|
||||
ARCH_LOW_ADDRESS_LIMIT);
|
||||
if (ptr)
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Whoops, we cannot satisfy the allocation request.
|
||||
*/
|
||||
printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
|
||||
panic("Out of low memory");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
|
||||
unsigned long align, unsigned long goal)
|
||||
{
|
||||
return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
|
||||
ARCH_LOW_ADDRESS_LIMIT);
|
||||
}
|
||||
302
mm/bounce.c
Normal file
302
mm/bounce.c
Normal file
@@ -0,0 +1,302 @@
|
||||
/* bounce buffer handling for block devices
|
||||
*
|
||||
* - Split from highmem.c
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/hash.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/blktrace_api.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
#define POOL_SIZE 64
|
||||
#define ISA_POOL_SIZE 16
|
||||
|
||||
static mempool_t *page_pool, *isa_page_pool;
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
static __init int init_emergency_pool(void)
|
||||
{
|
||||
struct sysinfo i;
|
||||
si_meminfo(&i);
|
||||
si_swapinfo(&i);
|
||||
|
||||
if (!i.totalhigh)
|
||||
return 0;
|
||||
|
||||
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
|
||||
BUG_ON(!page_pool);
|
||||
printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
__initcall(init_emergency_pool);
|
||||
|
||||
/*
|
||||
* highmem version, map in to vec
|
||||
*/
|
||||
static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
|
||||
{
|
||||
unsigned long flags;
|
||||
unsigned char *vto;
|
||||
|
||||
local_irq_save(flags);
|
||||
vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
|
||||
memcpy(vto + to->bv_offset, vfrom, to->bv_len);
|
||||
kunmap_atomic(vto, KM_BOUNCE_READ);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
#else /* CONFIG_HIGHMEM */
|
||||
|
||||
#define bounce_copy_vec(to, vfrom) \
|
||||
memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
|
||||
|
||||
#endif /* CONFIG_HIGHMEM */
|
||||
|
||||
/*
 * allocate pages in the DMA region for the ISA pool: identical to the
 * generic mempool page allocator except that GFP_DMA is OR'ed into the
 * mask so the page comes from the ISA-addressable zone.
 */
static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
{
	return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
}
|
||||
|
||||
/*
 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
 * as the max address, so check if the pool has already been created.
 *
 * Returns 0; a failure to create the pool is fatal (BUG_ON) since
 * ISA-limited devices cannot operate without it.
 */
int init_emergency_isa_pool(void)
{
	/* already created by an earlier queue init — nothing to do */
	if (isa_page_pool)
		return 0;

	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
				       mempool_free_pages, (void *) 0);
	BUG_ON(!isa_page_pool);

	printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
	return 0;
}
|
||||
|
||||
/*
 * Simple bounce buffer support for highmem pages. Depending on the
 * queue gfp mask set, *to may or may not be a highmem page. kmap it
 * always, it will do the Right Thing
 *
 * Copies read data back from the bounced segments of @from into the
 * corresponding (possibly highmem) segments of the original bio @to.
 */
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
	unsigned char *vfrom;
	struct bio_vec *tovec, *fromvec;
	int i;

	__bio_for_each_segment(tovec, to, i, 0) {
		fromvec = from->bi_io_vec + i;

		/*
		 * not bounced
		 */
		if (tovec->bv_page == fromvec->bv_page)
			continue;

		/*
		 * fromvec->bv_offset and fromvec->bv_len might have been
		 * modified by the block layer, so use the original copy,
		 * bounce_copy_vec already uses tovec->bv_len
		 */
		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;

		flush_dcache_page(tovec->bv_page);
		bounce_copy_vec(tovec, vfrom);
	}
}
|
||||
|
||||
/*
 * Common completion for a bounced bio: release every bounce page that
 * was substituted for an original segment, complete the original bio,
 * then drop the bounce bio itself.
 */
static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
{
	struct bio *bio_orig = bio->bi_private;	/* the original, un-bounced bio */
	struct bio_vec *bvec, *org_vec;
	int i;

	/* propagate "operation not supported" to the original request */
	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);

	/*
	 * free up bounce indirect pages used
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		org_vec = bio_orig->bi_io_vec + i;
		/* segment shares the original page -> it was never bounced */
		if (bvec->bv_page == org_vec->bv_page)
			continue;

		dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
		mempool_free(bvec->bv_page, pool);
	}

	bio_endio(bio_orig, bio_orig->bi_size, err);
	bio_put(bio);
}
|
||||
|
||||
static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
|
||||
{
|
||||
if (bio->bi_size)
|
||||
return 1;
|
||||
|
||||
bounce_end_io(bio, page_pool, err);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
|
||||
{
|
||||
if (bio->bi_size)
|
||||
return 1;
|
||||
|
||||
bounce_end_io(bio, isa_page_pool, err);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Read completion helper: if the read succeeded, copy the data from
 * the bounce pages back into the original (possibly highmem) pages,
 * then release the bounce resources.
 */
static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
{
	struct bio *bio_orig = bio->bi_private;

	/* only copy back data that was actually read successfully */
	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		copy_to_high_bio_irq(bio_orig, bio);

	bounce_end_io(bio, pool, err);
}
|
||||
|
||||
static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
|
||||
{
|
||||
if (bio->bi_size)
|
||||
return 1;
|
||||
|
||||
__bounce_end_io_read(bio, page_pool, err);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
|
||||
{
|
||||
if (bio->bi_size)
|
||||
return 1;
|
||||
|
||||
__bounce_end_io_read(bio, isa_page_pool, err);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Slow path of blk_queue_bounce(): walk every segment of *@bio_orig;
 * any segment whose page lies above the queue's bounce_pfn limit is
 * replaced with a page from @pool.  If at least one segment needed
 * bouncing, a shadow bio is built, wired to the matching completion
 * handler, and substituted for the original via *@bio_orig.  The
 * original bio is stashed in bi_private and completed from the bounce
 * completion path.
 */
static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
			       mempool_t *pool)
{
	struct page *page;
	struct bio *bio = NULL;		/* shadow bio, allocated lazily */
	int i, rw = bio_data_dir(*bio_orig);
	struct bio_vec *to, *from;

	bio_for_each_segment(from, *bio_orig, i) {
		page = from->bv_page;

		/*
		 * is destination page below bounce pfn?
		 */
		if (page_to_pfn(page) <= q->bounce_pfn)
			continue;

		/*
		 * irk, bounce it
		 */
		if (!bio)
			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);

		to = bio->bi_io_vec + i;

		/* NOTE(review): mempool_alloc result is used unchecked —
		 * presumably q->bounce_gfp allows blocking so the pool
		 * cannot fail; confirm against queue setup. */
		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
		to->bv_len = from->bv_len;
		to->bv_offset = from->bv_offset;
		inc_zone_page_state(to->bv_page, NR_BOUNCE);

		if (rw == WRITE) {
			char *vto, *vfrom;

			/* writes must carry the data down now: copy the
			 * original payload into the bounce page */
			flush_dcache_page(from->bv_page);
			vto = page_address(to->bv_page) + to->bv_offset;
			vfrom = kmap(from->bv_page) + from->bv_offset;
			memcpy(vto, vfrom, to->bv_len);
			kunmap(from->bv_page);
		}
	}

	/*
	 * no pages bounced
	 */
	if (!bio)
		return;

	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);

	/*
	 * at least one page was bounced, fill in possible non-highmem
	 * pages
	 */
	__bio_for_each_segment(from, *bio_orig, i, 0) {
		to = bio_iovec_idx(bio, i);
		if (!to->bv_page) {
			to->bv_page = from->bv_page;
			to->bv_len = from->bv_len;
			to->bv_offset = from->bv_offset;
		}
	}

	/* clone the remaining identity of the original bio */
	bio->bi_bdev = (*bio_orig)->bi_bdev;
	bio->bi_flags |= (1 << BIO_BOUNCED);
	bio->bi_sector = (*bio_orig)->bi_sector;
	bio->bi_rw = (*bio_orig)->bi_rw;

	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
	bio->bi_idx = (*bio_orig)->bi_idx;
	bio->bi_size = (*bio_orig)->bi_size;

	/* pick the completion handler matching direction and pool */
	if (pool == page_pool) {
		bio->bi_end_io = bounce_end_io_write;
		if (rw == READ)
			bio->bi_end_io = bounce_end_io_read;
	} else {
		bio->bi_end_io = bounce_end_io_write_isa;
		if (rw == READ)
			bio->bi_end_io = bounce_end_io_read_isa;
	}

	bio->bi_private = *bio_orig;
	*bio_orig = bio;
}
|
||||
|
||||
/*
 * Public entry point: bounce *@bio_orig if the queue @q cannot address
 * all of its pages directly.  Chooses the ISA pool when the queue is
 * restricted to GFP_DMA memory, otherwise the regular highmem pool.
 */
void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
{
	mempool_t *pool;

	/*
	 * for non-isa bounce case, just check if the bounce pfn is equal
	 * to or bigger than the highest pfn in the system -- in that case,
	 * don't waste time iterating over bio segments
	 */
	if (!(q->bounce_gfp & GFP_DMA)) {
		if (q->bounce_pfn >= blk_max_pfn)
			return;
		pool = page_pool;
	} else {
		/* queue setup must have called init_emergency_isa_pool() */
		BUG_ON(!isa_page_pool);
		pool = isa_page_pool;
	}

	/*
	 * slow path
	 */
	__blk_queue_bounce(q, bio_orig, pool);
}

EXPORT_SYMBOL(blk_queue_bounce);
|
||||
125
mm/fadvise.c
Normal file
125
mm/fadvise.c
Normal file
@@ -0,0 +1,125 @@
|
||||
/*
|
||||
* mm/fadvise.c
|
||||
*
|
||||
* Copyright (C) 2002, Linus Torvalds
|
||||
*
|
||||
* 11Jan2003 akpm@digeo.com
|
||||
* Initial version.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/fadvise.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
#include <asm/unistd.h>
|
||||
|
||||
/*
 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
 * deactivate the pages and clear PG_Referenced.
 *
 * posix_fadvise(2) backend: apply the caller's access-pattern advice
 * to the byte range [offset, offset+len) of the open file @fd.
 * len == 0 means "to the end of the file".  Returns 0 on success or a
 * negative errno (-EBADF, -ESPIPE, -EINVAL).
 */
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
	struct file *file = fget(fd);
	struct address_space *mapping;
	struct backing_dev_info *bdi;
	loff_t endbyte;			/* inclusive */
	pgoff_t start_index;
	pgoff_t end_index;
	unsigned long nrpages;
	int ret = 0;

	if (!file)
		return -EBADF;

	/* advice is meaningless on pipes/FIFOs */
	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
		ret = -ESPIPE;
		goto out;
	}

	mapping = file->f_mapping;
	if (!mapping || len < 0) {
		ret = -EINVAL;
		goto out;
	}

	if (mapping->a_ops->get_xip_page)
		/* no bad return value, but ignore advice */
		goto out;

	/* Careful about overflows. Len == 0 means "as much as possible" */
	endbyte = offset + len;
	if (!len || endbyte < len)
		endbyte = -1;
	else
		endbyte--;		/* inclusive */

	bdi = mapping->backing_dev_info;

	switch (advice) {
	case POSIX_FADV_NORMAL:
		/* restore the device's default readahead window */
		file->f_ra.ra_pages = bdi->ra_pages;
		break;
	case POSIX_FADV_RANDOM:
		/* random access: disable readahead entirely */
		file->f_ra.ra_pages = 0;
		break;
	case POSIX_FADV_SEQUENTIAL:
		/* sequential access: double the readahead window */
		file->f_ra.ra_pages = bdi->ra_pages * 2;
		break;
	case POSIX_FADV_WILLNEED:
		if (!mapping->a_ops->readpage) {
			ret = -EINVAL;
			break;
		}

		/* First and last PARTIAL page! */
		start_index = offset >> PAGE_CACHE_SHIFT;
		end_index = endbyte >> PAGE_CACHE_SHIFT;

		/* Careful about overflow on the "+1" */
		nrpages = end_index - start_index + 1;
		if (!nrpages)
			nrpages = ~0UL;

		ret = force_page_cache_readahead(mapping, file,
				start_index,
				max_sane_readahead(nrpages));
		/* a positive count of pages read is success, not an error */
		if (ret > 0)
			ret = 0;
		break;
	case POSIX_FADV_NOREUSE:
		break;
	case POSIX_FADV_DONTNEED:
		/* push dirty pages out first so they can be dropped */
		if (!bdi_write_congested(mapping->backing_dev_info))
			filemap_flush(mapping);

		/* First and last FULL page! */
		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
		end_index = (endbyte >> PAGE_CACHE_SHIFT);

		if (end_index >= start_index)
			invalidate_mapping_pages(mapping, start_index,
						end_index);
		break;
	default:
		ret = -EINVAL;
	}
out:
	fput(file);
	return ret;
}
|
||||
|
||||
#ifdef __ARCH_WANT_SYS_FADVISE64
|
||||
|
||||
asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
|
||||
{
|
||||
return sys_fadvise64_64(fd, offset, len, advice);
|
||||
}
|
||||
|
||||
#endif
|
||||
2463
mm/filemap.c
Normal file
2463
mm/filemap.c
Normal file
File diff suppressed because it is too large
Load Diff
103
mm/filemap.h
Normal file
103
mm/filemap.h
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
* linux/mm/filemap.h
|
||||
*
|
||||
* Copyright (C) 1994-1999 Linus Torvalds
|
||||
*/
|
||||
|
||||
#ifndef __FILEMAP_H
|
||||
#define __FILEMAP_H
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
size_t
|
||||
__filemap_copy_from_user_iovec_inatomic(char *vaddr,
|
||||
const struct iovec *iov,
|
||||
size_t base,
|
||||
size_t bytes);
|
||||
|
||||
/*
 * Copy as much as we can into the page and return the number of bytes which
 * were sucessfully copied. If a fault is encountered then clear the page
 * out to (offset+bytes) and return the number of bytes which were copied.
 *
 * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
 * to *NOT* zero any tail of the buffer that it failed to copy. If it does,
 * and if the following non-atomic copy succeeds, then there is a small window
 * where the target page contains neither the data before the write, nor the
 * data after the write (it contains zero). A read at this time will see
 * data that is inconsistent with any ordering of the read and the write.
 * (This has been detected in practice).
 */
static inline size_t
filemap_copy_from_user(struct page *page, unsigned long offset,
			const char __user *buf, unsigned bytes)
{
	char *kaddr;
	int left;

	/* fast path: atomic kmap + inatomic copy (may fault partway) */
	kaddr = kmap_atomic(page, KM_USER0);
	left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
	kunmap_atomic(kaddr, KM_USER0);

	if (left != 0) {
		/* Do it the slow way */
		kaddr = kmap(page);
		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
		kunmap(page);
	}
	return bytes - left;
}
|
||||
|
||||
/*
 * This has the same sideeffects and return value as filemap_copy_from_user().
 * The difference is that on a fault we need to memset the remainder of the
 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
 * single-segment behaviour.
 */
static inline size_t
filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
			const struct iovec *iov, size_t base, size_t bytes)
{
	char *kaddr;
	size_t copied;

	/* fast path: atomic kmap + inatomic iovec copy */
	kaddr = kmap_atomic(page, KM_USER0);
	copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
							 base, bytes);
	kunmap_atomic(kaddr, KM_USER0);
	if (copied != bytes) {
		/* slow path: retry under a sleeping kmap, then zero-fill
		 * whatever still could not be copied */
		kaddr = kmap(page);
		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
								 base, bytes);
		if (bytes - copied)
			memset(kaddr + offset + copied, 0, bytes - copied);
		kunmap(page);
	}
	return copied;
}
|
||||
|
||||
/*
 * Advance the (iovec pointer, intra-iovec offset) cursor by @bytes,
 * stepping to the next iovec whenever the current one is exhausted.
 * Uses do/while so that even a zero-byte advance normalizes a cursor
 * sitting exactly at the end of an iovec onto the start of the next.
 */
static inline void
filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t base = *basep;

	do {
		int copy = min(bytes, iov->iov_len - base);

		bytes -= copy;
		base += copy;
		if (iov->iov_len == base) {
			iov++;
			base = 0;
		}
	} while (bytes);
	*iovp = iov;
	*basep = base;
}
|
||||
#endif
|
||||
468
mm/filemap_xip.c
Normal file
468
mm/filemap_xip.c
Normal file
@@ -0,0 +1,468 @@
|
||||
/*
|
||||
* linux/mm/filemap_xip.c
|
||||
*
|
||||
* Copyright (C) 2005 IBM Corporation
|
||||
* Author: Carsten Otte <cotte@de.ibm.com>
|
||||
*
|
||||
* derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include "filemap.h"
|
||||
|
||||
/*
 * We do use our own empty page to avoid interference with other users
 * of ZERO_PAGE(), such as /dev/zero
 */
static struct page *__xip_sparse_page;

/*
 * Lazily allocate the shared zeroed page used to back sparse (hole)
 * regions of XIP mappings.  Safe against concurrent first callers: the
 * spinlock arbitrates which allocation wins; the loser frees its page.
 * Returns NULL if the one-time allocation failed.
 */
static struct page *xip_sparse_page(void)
{
	if (!__xip_sparse_page) {
		unsigned long zeroes = get_zeroed_page(GFP_HIGHUSER);
		if (zeroes) {
			static DEFINE_SPINLOCK(xip_alloc_lock);
			spin_lock(&xip_alloc_lock);
			if (!__xip_sparse_page)
				__xip_sparse_page = virt_to_page(zeroes);
			else
				free_page(zeroes);	/* lost the race */
			spin_unlock(&xip_alloc_lock);
		}
	}
	return __xip_sparse_page;
}
|
||||
|
||||
/*
 * This is a file read routine for execute in place files, and uses
 * the mapping->a_ops->get_xip_page() function for the actual low-level
 * stuff.
 *
 * Note the struct file* is not used at all. It may be NULL.
 *
 * Reads page by page from *ppos until desc->count is exhausted or EOF;
 * errors are reported through desc->error.  The @_ra readahead state is
 * accepted but never referenced (XIP has no readahead).
 */
static void
do_xip_mapping_read(struct address_space *mapping,
		    struct file_ra_state *_ra,
		    struct file *filp,
		    loff_t *ppos,
		    read_descriptor_t *desc,
		    read_actor_t actor)
{
	struct inode *inode = mapping->host;
	unsigned long index, end_index, offset;
	loff_t isize;

	BUG_ON(!mapping->a_ops->get_xip_page);

	index = *ppos >> PAGE_CACHE_SHIFT;	/* page containing *ppos */
	offset = *ppos & ~PAGE_CACHE_MASK;	/* byte offset within it */

	isize = i_size_read(inode);
	if (!isize)
		goto out;

	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	for (;;) {
		struct page *page;
		unsigned long nr, ret;

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index >= end_index) {
			if (index > end_index)
				goto out;
			/* last page: clamp to the bytes before EOF */
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				goto out;
			}
		}
		nr = nr - offset;

		page = mapping->a_ops->get_xip_page(mapping,
			index*(PAGE_SIZE/512), 0);
		if (!page)
			goto no_xip_page;
		if (unlikely(IS_ERR(page))) {
			if (PTR_ERR(page) == -ENODATA) {
				/* sparse */
				page = ZERO_PAGE(0);
			} else {
				desc->error = PTR_ERR(page);
				goto out;
			}
		}

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * Ok, we have the page, so now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		/* a short copy or a drained descriptor terminates the loop */
		if (ret == nr && desc->count)
			continue;
		goto out;

no_xip_page:
		/* Did not get the page. Report it */
		desc->error = -EIO;
		goto out;
	}

out:
	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	if (filp)
		file_accessed(filp);
}
|
||||
|
||||
ssize_t
|
||||
xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
|
||||
{
|
||||
read_descriptor_t desc;
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, buf, len))
|
||||
return -EFAULT;
|
||||
|
||||
desc.written = 0;
|
||||
desc.arg.buf = buf;
|
||||
desc.count = len;
|
||||
desc.error = 0;
|
||||
|
||||
do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
|
||||
ppos, &desc, file_read_actor);
|
||||
|
||||
if (desc.written)
|
||||
return desc.written;
|
||||
else
|
||||
return desc.error;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(xip_file_read);
|
||||
|
||||
ssize_t
|
||||
xip_file_sendfile(struct file *in_file, loff_t *ppos,
|
||||
size_t count, read_actor_t actor, void *target)
|
||||
{
|
||||
read_descriptor_t desc;
|
||||
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
desc.written = 0;
|
||||
desc.count = count;
|
||||
desc.arg.data = target;
|
||||
desc.error = 0;
|
||||
|
||||
do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
|
||||
ppos, &desc, actor);
|
||||
if (desc.written)
|
||||
return desc.written;
|
||||
return desc.error;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(xip_file_sendfile);
|
||||
|
||||
/*
 * __xip_unmap is invoked from xip_unmap and
 * xip_write
 *
 * This function walks all vmas of the address_space and unmaps the
 * __xip_sparse_page when found at pgoff.
 */
static void
__xip_unmap (struct address_space * mapping,
		     unsigned long pgoff)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	struct prio_tree_iter iter;
	unsigned long address;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	struct page *page;

	page = __xip_sparse_page;
	/* no sparse page was ever allocated -> nothing can be mapped */
	if (!page)
		return;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		mm = vma->vm_mm;
		/* translate the file offset to this vma's virtual address */
		address = vma->vm_start +
			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
		pte = page_check_address(page, mm, address, &ptl);
		if (pte) {
			/* Nuke the page table entry. */
			flush_cache_page(vma, address, pte_pfn(*pte));
			pteval = ptep_clear_flush(vma, address, pte);
			page_remove_rmap(page, vma);
			dec_mm_counter(mm, file_rss);
			/* the sparse page is read-only; it must never be dirty */
			BUG_ON(pte_dirty(pteval));
			pte_unmap_unlock(pte, ptl);
			page_cache_release(page);
		}
	}
	spin_unlock(&mapping->i_mmap_lock);
}
|
||||
|
||||
/*
|
||||
* xip_nopage() is invoked via the vma operations vector for a
|
||||
* mapped memory region to read in file data during a page fault.
|
||||
*
|
||||
* This function is derived from filemap_nopage, but used for execute in place
|
||||
*/
|
||||
static struct page *
|
||||
xip_file_nopage(struct vm_area_struct * area,
|
||||
unsigned long address,
|
||||
int *type)
|
||||
{
|
||||
struct file *file = area->vm_file;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct inode *inode = mapping->host;
|
||||
struct page *page;
|
||||
unsigned long size, pgoff, endoff;
|
||||
|
||||
pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
|
||||
+ area->vm_pgoff;
|
||||
endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
|
||||
+ area->vm_pgoff;
|
||||
|
||||
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
|
||||
if (pgoff >= size)
|
||||
return NOPAGE_SIGBUS;
|
||||
|
||||
page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
|
||||
if (!IS_ERR(page))
|
||||
goto out;
|
||||
if (PTR_ERR(page) != -ENODATA)
|
||||
return NOPAGE_SIGBUS;
|
||||
|
||||
/* sparse block */
|
||||
if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
|
||||
(area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
|
||||
(!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
|
||||
/* maybe shared writable, allocate new block */
|
||||
page = mapping->a_ops->get_xip_page (mapping,
|
||||
pgoff*(PAGE_SIZE/512), 1);
|
||||
if (IS_ERR(page))
|
||||
return NOPAGE_SIGBUS;
|
||||
/* unmap page at pgoff from all other vmas */
|
||||
__xip_unmap(mapping, pgoff);
|
||||
} else {
|
||||
/* not shared and writable, use xip_sparse_page() */
|
||||
page = xip_sparse_page();
|
||||
if (!page)
|
||||
return NOPAGE_OOM;
|
||||
}
|
||||
|
||||
out:
|
||||
page_cache_get(page);
|
||||
return page;
|
||||
}
|
||||
|
||||
/* vma operations for XIP mappings: faults are served by xip_file_nopage. */
static struct vm_operations_struct xip_file_vm_ops = {
	.nopage         = xip_file_nopage,
};
|
||||
|
||||
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
|
||||
{
|
||||
BUG_ON(!file->f_mapping->a_ops->get_xip_page);
|
||||
|
||||
file_accessed(file);
|
||||
vma->vm_ops = &xip_file_vm_ops;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(xip_file_mmap);
|
||||
|
||||
/*
 * Core write loop for execute-in-place files.  Copies @count bytes
 * from the user buffer @buf into the backing store page by page,
 * allocating blocks for holes as needed.  Caller holds i_mutex.
 * Returns bytes written, or a negative errno if nothing was written.
 */
static ssize_t
__xip_file_write(struct file *filp, const char __user *buf,
		  size_t count, loff_t pos, loff_t *ppos)
{
	struct address_space * mapping = filp->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	struct inode *inode = mapping->host;
	long status = 0;
	struct page *page;
	size_t bytes;
	ssize_t written = 0;

	BUG_ON(!mapping->a_ops->get_xip_page);

	do {
		unsigned long index;
		unsigned long offset;
		size_t copied;

		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 */
		fault_in_pages_readable(buf, bytes);

		page = a_ops->get_xip_page(mapping,
					   index*(PAGE_SIZE/512), 0);
		if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
			/* we allocate a new page unmap it */
			page = a_ops->get_xip_page(mapping,
						   index*(PAGE_SIZE/512), 1);
			if (!IS_ERR(page))
				/* unmap page at pgoff from all other vmas */
				__xip_unmap(mapping, index);
		}

		if (IS_ERR(page)) {
			status = PTR_ERR(page);
			break;
		}

		copied = filemap_copy_from_user(page, offset, buf, bytes);
		flush_dcache_page(page);
		if (likely(copied > 0)) {
			status = copied;

			if (status >= 0) {
				written += status;
				count -= status;
				pos += status;
				buf += status;
			}
		}
		/* a short copy means the user buffer faulted mid-way */
		if (unlikely(copied != bytes))
			if (status >= 0)
				status = -EFAULT;
		if (status < 0)
			break;
	} while (count);
	*ppos = pos;
	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 */
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}

	return written ? written : status;
}
|
||||
|
||||
/*
 * write(2) entry point for execute-in-place files: takes i_mutex,
 * validates the user buffer, runs the generic write checks (limits,
 * O_APPEND, suid stripping, mtime update) and hands off to
 * __xip_file_write().  Returns bytes written or a negative errno.
 */
ssize_t
xip_file_write(struct file *filp, const char __user *buf, size_t len,
	       loff_t *ppos)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	size_t count;
	loff_t pos;
	ssize_t ret;

	mutex_lock(&inode->i_mutex);

	if (!access_ok(VERIFY_READ, buf, len)) {
		ret=-EFAULT;
		goto out_up;
	}

	pos = *ppos;
	count = len;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
	if (ret)
		goto out_backing;
	if (count == 0)
		goto out_backing;

	/* drop setuid/setgid bits on write by an unprivileged user */
	ret = remove_suid(filp->f_path.dentry);
	if (ret)
		goto out_backing;

	file_update_time(filp);

	ret = __xip_file_write (filp, buf, count, pos, ppos);

 out_backing:
	current->backing_dev_info = NULL;
 out_up:
	mutex_unlock(&inode->i_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(xip_file_write);
|
||||
|
||||
/*
 * truncate a page used for execute in place
 * functionality is analog to block_truncate_page but does use get_xip_page
 * to get the page instead of page cache
 *
 * Zeroes the partial block containing byte offset @from out to the
 * next block boundary.  Returns 0 on success (including holes and
 * block-aligned offsets, which need no work) or a negative errno.
 */
int
xip_truncate_page(struct address_space *mapping, loff_t from)
{
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	unsigned length;
	struct page *page;
	void *kaddr;

	BUG_ON(!mapping->a_ops->get_xip_page);

	blocksize = 1 << mapping->host->i_blkbits;
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	/* number of bytes from @from to the end of its block */
	length = blocksize - length;

	page = mapping->a_ops->get_xip_page(mapping,
					    index*(PAGE_SIZE/512), 0);
	if (!page)
		return -ENOMEM;
	if (unlikely(IS_ERR(page))) {
		if (PTR_ERR(page) == -ENODATA)
			/* Hole? No need to truncate */
			return 0;
		else
			return PTR_ERR(page);
	}
	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr + offset, 0, length);
	kunmap_atomic(kaddr, KM_USER0);

	flush_dcache_page(page);
	return 0;
}
EXPORT_SYMBOL_GPL(xip_truncate_page);
|
||||
228
mm/fremap.c
Normal file
228
mm/fremap.c
Normal file
@@ -0,0 +1,228 @@
|
||||
/*
|
||||
* linux/mm/fremap.c
|
||||
*
|
||||
* Explicit pagetable population and nonlinear (random) mappings support.
|
||||
*
|
||||
* started by Ingo Molnar, Copyright (C) 2002, 2003
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
/*
 * Tear down the pte at @addr, releasing whatever it referenced: a
 * present pte drops its page (and rmap/rss accounting); a non-present,
 * non-file pte releases its swap entry.  Caller holds the pte lock.
 * Returns nonzero iff a real page was unmapped (so the caller can fix
 * up the file_rss counter).
 */
static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long addr, pte_t *ptep)
{
	pte_t pte = *ptep;
	struct page *page = NULL;

	if (pte_present(pte)) {
		flush_cache_page(vma, addr, pte_pfn(pte));
		pte = ptep_clear_flush(vma, addr, ptep);
		page = vm_normal_page(vma, addr, pte);
		if (page) {
			/* preserve dirtiness before the page loses its pte */
			if (pte_dirty(pte))
				set_page_dirty(page);
			page_remove_rmap(page, vma);
			page_cache_release(page);
		}
	} else {
		/* non-present: either a swap entry or a file pte */
		if (!pte_file(pte))
			free_swap_and_cache(pte_to_swp_entry(pte));
		pte_clear_not_present_full(mm, addr, ptep, 0);
	}
	return !!page;
}
|
||||
|
||||
/*
 * Install a file page to a given virtual memory address, release any
 * previously existing mapping.
 *
 * Returns 0 on success, -EINVAL if the page was truncated out from
 * under us, or -ENOMEM on pte allocation failure / mapcount overflow.
 */
int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long addr, struct page *page, pgprot_t prot)
{
	struct inode *inode;
	pgoff_t size;
	int err = -ENOMEM;
	pte_t *pte;
	pte_t pte_val;
	spinlock_t *ptl;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;

	/*
	 * This page may have been truncated. Tell the
	 * caller about it.
	 */
	err = -EINVAL;
	inode = vma->vm_file->f_mapping->host;
	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (!page->mapping || page->index >= size)
		goto unlock;
	err = -ENOMEM;
	/* guard against mapcount overflowing its signed counter */
	if (page_mapcount(page) > INT_MAX/2)
		goto unlock;

	/* only bump file_rss if we did not just unmap a real page */
	if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
		inc_mm_counter(mm, file_rss);

	flush_icache_page(vma, page);
	pte_val = mk_pte(page, prot);
	set_pte_at(mm, addr, pte, pte_val);
	page_add_file_rmap(page);
	update_mmu_cache(vma, addr, pte_val);
	lazy_mmu_prot_update(pte_val);
	err = 0;
unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return err;
}
EXPORT_SYMBOL(install_page);
|
||||
|
||||
/*
 * Install a file pte to a given virtual memory address, release any
 * previously existing mapping.
 *
 * The installed entry is a non-present "file pte" encoding @pgoff, to
 * be resolved on a later fault in a VM_NONLINEAR vma.  Returns 0 on
 * success or -ENOMEM if the pte could not be allocated.
 */
int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long addr, unsigned long pgoff, pgprot_t prot)
{
	int err = -ENOMEM;
	pte_t *pte;
	spinlock_t *ptl;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;

	/* a real page went away: account for it */
	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
		update_hiwater_rss(mm);
		dec_mm_counter(mm, file_rss);
	}

	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
	/*
	 * We don't need to run update_mmu_cache() here because the "file pte"
	 * being installed by install_file_pte() is not a real pte - it's a
	 * non-present entry (like a swap entry), noting what file offset should
	 * be mapped there when there's a fault (in a non-linear vma where
	 * that's not obvious).
	 */
	pte_unmap_unlock(pte, ptl);
	err = 0;
out:
	return err;
}
|
||||
|
||||
/***
|
||||
* sys_remap_file_pages - remap arbitrary pages of a shared backing store
|
||||
* file within an existing vma.
|
||||
* @start: start of the remapped virtual memory range
|
||||
* @size: size of the remapped virtual memory range
|
||||
* @prot: new protection bits of the range
|
||||
* @pgoff: to be mapped page of the backing store file
|
||||
* @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
|
||||
*
|
||||
* this syscall works purely via pagetables, so it's the most efficient
|
||||
* way to map the same (large) file into a given virtual window. Unlike
|
||||
* mmap()/mremap() it does not create any new vmas. The new mappings are
|
||||
* also safe across swapout.
|
||||
*
|
||||
* NOTE: the 'prot' parameter right now is ignored, and the vma's default
|
||||
* protection is used. Arbitrary protections might be implemented in the
|
||||
* future.
|
||||
*/
|
||||
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
|
||||
unsigned long __prot, unsigned long pgoff, unsigned long flags)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct address_space *mapping;
|
||||
unsigned long end = start + size;
|
||||
struct vm_area_struct *vma;
|
||||
int err = -EINVAL;
|
||||
int has_write_lock = 0;
|
||||
|
||||
if (__prot)
|
||||
return err;
|
||||
/*
|
||||
* Sanitize the syscall parameters:
|
||||
*/
|
||||
start = start & PAGE_MASK;
|
||||
size = size & PAGE_MASK;
|
||||
|
||||
/* Does the address range wrap, or is the span zero-sized? */
|
||||
if (start + size <= start)
|
||||
return err;
|
||||
|
||||
/* Can we represent this offset inside this architecture's pte's? */
|
||||
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
|
||||
if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
|
||||
return err;
|
||||
#endif
|
||||
|
||||
/* We need down_write() to change vma->vm_flags. */
|
||||
down_read(&mm->mmap_sem);
|
||||
retry:
|
||||
vma = find_vma(mm, start);
|
||||
|
||||
/*
|
||||
* Make sure the vma is shared, that it supports prefaulting,
|
||||
* and that the remapped range is valid and fully within
|
||||
* the single existing vma. vm_private_data is used as a
|
||||
* swapout cursor in a VM_NONLINEAR vma.
|
||||
*/
|
||||
if (vma && (vma->vm_flags & VM_SHARED) &&
|
||||
(!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
|
||||
vma->vm_ops && vma->vm_ops->populate &&
|
||||
end > start && start >= vma->vm_start &&
|
||||
end <= vma->vm_end) {
|
||||
|
||||
/* Must set VM_NONLINEAR before any pages are populated. */
|
||||
if (pgoff != linear_page_index(vma, start) &&
|
||||
!(vma->vm_flags & VM_NONLINEAR)) {
|
||||
if (!has_write_lock) {
|
||||
up_read(&mm->mmap_sem);
|
||||
down_write(&mm->mmap_sem);
|
||||
has_write_lock = 1;
|
||||
goto retry;
|
||||
}
|
||||
mapping = vma->vm_file->f_mapping;
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
flush_dcache_mmap_lock(mapping);
|
||||
vma->vm_flags |= VM_NONLINEAR;
|
||||
vma_prio_tree_remove(vma, &mapping->i_mmap);
|
||||
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
|
||||
flush_dcache_mmap_unlock(mapping);
|
||||
spin_unlock(&mapping->i_mmap_lock);
|
||||
}
|
||||
|
||||
err = vma->vm_ops->populate(vma, start, size,
|
||||
vma->vm_page_prot,
|
||||
pgoff, flags & MAP_NONBLOCK);
|
||||
|
||||
/*
|
||||
* We can't clear VM_NONLINEAR because we'd have to do
|
||||
* it after ->populate completes, and that would prevent
|
||||
* downgrading the lock. (Locks can't be upgraded).
|
||||
*/
|
||||
}
|
||||
if (likely(!has_write_lock))
|
||||
up_read(&mm->mmap_sem);
|
||||
else
|
||||
up_write(&mm->mmap_sem);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
335
mm/highmem.c
Normal file
335
mm/highmem.c
Normal file
@@ -0,0 +1,335 @@
|
||||
/*
|
||||
* High memory handling common code and variables.
|
||||
*
|
||||
* (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
|
||||
* Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
|
||||
*
|
||||
*
|
||||
* Redesigned the x86 32-bit VM architecture to deal with
|
||||
* 64-bit physical space. With current x86 CPUs this
|
||||
* means up to 64 Gigabytes physical RAM.
|
||||
*
|
||||
* Rewrote high memory support to move the page cache into
|
||||
* high memory. Implemented permanent (schedulable) kmaps
|
||||
* based on Linus' idea.
|
||||
*
|
||||
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/hash.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/blktrace_api.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
/*
|
||||
* Virtual_count is not a pure "count".
|
||||
* 0 means that it is not mapped, and has not been mapped
|
||||
* since a TLB flush - it is usable.
|
||||
* 1 means that there are no users, but it has been mapped
|
||||
* since the last TLB flush - so we can't use it.
|
||||
* n means that there are (n-1) current users of it.
|
||||
*/
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
|
||||
unsigned long totalhigh_pages __read_mostly;
|
||||
|
||||
unsigned int nr_free_highpages (void)
|
||||
{
|
||||
pg_data_t *pgdat;
|
||||
unsigned int pages = 0;
|
||||
|
||||
for_each_online_pgdat(pgdat)
|
||||
pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
|
||||
NR_FREE_PAGES);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
static int pkmap_count[LAST_PKMAP];
|
||||
static unsigned int last_pkmap_nr;
|
||||
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
|
||||
|
||||
pte_t * pkmap_page_table;
|
||||
|
||||
static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
|
||||
|
||||
static void flush_all_zero_pkmaps(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
flush_cache_kmaps();
|
||||
|
||||
for (i = 0; i < LAST_PKMAP; i++) {
|
||||
struct page *page;
|
||||
|
||||
/*
|
||||
* zero means we don't have anything to do,
|
||||
* >1 means that it is still in use. Only
|
||||
* a count of 1 means that it is free but
|
||||
* needs to be unmapped
|
||||
*/
|
||||
if (pkmap_count[i] != 1)
|
||||
continue;
|
||||
pkmap_count[i] = 0;
|
||||
|
||||
/* sanity check */
|
||||
BUG_ON(pte_none(pkmap_page_table[i]));
|
||||
|
||||
/*
|
||||
* Don't need an atomic fetch-and-clear op here;
|
||||
* no-one has the page mapped, and cannot get at
|
||||
* its virtual address (and hence PTE) without first
|
||||
* getting the kmap_lock (which is held here).
|
||||
* So no dangers, even with speculative execution.
|
||||
*/
|
||||
page = pte_page(pkmap_page_table[i]);
|
||||
pte_clear(&init_mm, (unsigned long)page_address(page),
|
||||
&pkmap_page_table[i]);
|
||||
|
||||
set_page_address(page, NULL);
|
||||
}
|
||||
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
|
||||
}
|
||||
|
||||
static inline unsigned long map_new_virtual(struct page *page)
|
||||
{
|
||||
unsigned long vaddr;
|
||||
int count;
|
||||
|
||||
start:
|
||||
count = LAST_PKMAP;
|
||||
/* Find an empty entry */
|
||||
for (;;) {
|
||||
last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
|
||||
if (!last_pkmap_nr) {
|
||||
flush_all_zero_pkmaps();
|
||||
count = LAST_PKMAP;
|
||||
}
|
||||
if (!pkmap_count[last_pkmap_nr])
|
||||
break; /* Found a usable entry */
|
||||
if (--count)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Sleep for somebody else to unmap their entries
|
||||
*/
|
||||
{
|
||||
DECLARE_WAITQUEUE(wait, current);
|
||||
|
||||
__set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
add_wait_queue(&pkmap_map_wait, &wait);
|
||||
spin_unlock(&kmap_lock);
|
||||
schedule();
|
||||
remove_wait_queue(&pkmap_map_wait, &wait);
|
||||
spin_lock(&kmap_lock);
|
||||
|
||||
/* Somebody else might have mapped it while we slept */
|
||||
if (page_address(page))
|
||||
return (unsigned long)page_address(page);
|
||||
|
||||
/* Re-start */
|
||||
goto start;
|
||||
}
|
||||
}
|
||||
vaddr = PKMAP_ADDR(last_pkmap_nr);
|
||||
set_pte_at(&init_mm, vaddr,
|
||||
&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
|
||||
|
||||
pkmap_count[last_pkmap_nr] = 1;
|
||||
set_page_address(page, (void *)vaddr);
|
||||
|
||||
return vaddr;
|
||||
}
|
||||
|
||||
/*
 * Map a highmem page into a permanent kernel virtual address and take
 * a reference on the mapping.  May block, so never call from interrupt
 * context.
 */
void fastcall *kmap_high(struct page *page)
{
	unsigned long vaddr;

	/*
	 * For highmem pages, "virtual" cannot be trusted until the lock
	 * is held.
	 */
	spin_lock(&kmap_lock);
	vaddr = (unsigned long)page_address(page);
	if (!vaddr)
		vaddr = map_new_virtual(page);
	pkmap_count[PKMAP_NR(vaddr)]++;
	BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
	spin_unlock(&kmap_lock);
	return (void *)vaddr;
}
|
||||
|
||||
EXPORT_SYMBOL(kmap_high);
|
||||
|
||||
/* Drop one reference on a highmem page's permanent kernel mapping. */
void fastcall kunmap_high(struct page *page)
{
	unsigned long vaddr;
	unsigned long nr;
	int need_wakeup = 0;

	spin_lock(&kmap_lock);
	vaddr = (unsigned long)page_address(page);
	BUG_ON(!vaddr);
	nr = PKMAP_NR(vaddr);

	/* A count must never reach zero without a TLB flush! */
	switch (--pkmap_count[nr]) {
	case 0:
		BUG();
	case 1:
		/*
		 * Avoid a needless wake_up(): the common case is a count
		 * of 1 with no waiters.  Waiters are guarded by both the
		 * wait-queue-head's lock and by kmap_lock; since kmap_lock
		 * is held here, simply testing whether the queue is empty
		 * is sufficient.
		 */
		need_wakeup = waitqueue_active(&pkmap_map_wait);
	}
	spin_unlock(&kmap_lock);

	/* Do the wake-up, if needed, race-free outside the spinlock. */
	if (need_wakeup)
		wake_up(&pkmap_map_wait);
}
|
||||
|
||||
EXPORT_SYMBOL(kunmap_high);
|
||||
#endif
|
||||
|
||||
#if defined(HASHED_PAGE_VIRTUAL)
|
||||
|
||||
#define PA_HASH_ORDER 7
|
||||
|
||||
/*
|
||||
* Describes one page->virtual association
|
||||
*/
|
||||
struct page_address_map {
|
||||
struct page *page;
|
||||
void *virtual;
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
/*
|
||||
* page_address_map freelist, allocated from page_address_maps.
|
||||
*/
|
||||
static struct list_head page_address_pool; /* freelist */
|
||||
static spinlock_t pool_lock; /* protects page_address_pool */
|
||||
|
||||
/*
|
||||
* Hash table bucket
|
||||
*/
|
||||
static struct page_address_slot {
|
||||
struct list_head lh; /* List of page_address_maps */
|
||||
spinlock_t lock; /* Protect this bucket's list */
|
||||
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
|
||||
|
||||
static struct page_address_slot *page_slot(struct page *page)
|
||||
{
|
||||
return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
|
||||
}
|
||||
|
||||
void *page_address(struct page *page)
|
||||
{
|
||||
unsigned long flags;
|
||||
void *ret;
|
||||
struct page_address_slot *pas;
|
||||
|
||||
if (!PageHighMem(page))
|
||||
return lowmem_page_address(page);
|
||||
|
||||
pas = page_slot(page);
|
||||
ret = NULL;
|
||||
spin_lock_irqsave(&pas->lock, flags);
|
||||
if (!list_empty(&pas->lh)) {
|
||||
struct page_address_map *pam;
|
||||
|
||||
list_for_each_entry(pam, &pas->lh, list) {
|
||||
if (pam->page == page) {
|
||||
ret = pam->virtual;
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
}
|
||||
done:
|
||||
spin_unlock_irqrestore(&pas->lock, flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(page_address);
|
||||
|
||||
void set_page_address(struct page *page, void *virtual)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct page_address_slot *pas;
|
||||
struct page_address_map *pam;
|
||||
|
||||
BUG_ON(!PageHighMem(page));
|
||||
|
||||
pas = page_slot(page);
|
||||
if (virtual) { /* Add */
|
||||
BUG_ON(list_empty(&page_address_pool));
|
||||
|
||||
spin_lock_irqsave(&pool_lock, flags);
|
||||
pam = list_entry(page_address_pool.next,
|
||||
struct page_address_map, list);
|
||||
list_del(&pam->list);
|
||||
spin_unlock_irqrestore(&pool_lock, flags);
|
||||
|
||||
pam->page = page;
|
||||
pam->virtual = virtual;
|
||||
|
||||
spin_lock_irqsave(&pas->lock, flags);
|
||||
list_add_tail(&pam->list, &pas->lh);
|
||||
spin_unlock_irqrestore(&pas->lock, flags);
|
||||
} else { /* Remove */
|
||||
spin_lock_irqsave(&pas->lock, flags);
|
||||
list_for_each_entry(pam, &pas->lh, list) {
|
||||
if (pam->page == page) {
|
||||
list_del(&pam->list);
|
||||
spin_unlock_irqrestore(&pas->lock, flags);
|
||||
spin_lock_irqsave(&pool_lock, flags);
|
||||
list_add_tail(&pam->list, &page_address_pool);
|
||||
spin_unlock_irqrestore(&pool_lock, flags);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&pas->lock, flags);
|
||||
}
|
||||
done:
|
||||
return;
|
||||
}
|
||||
|
||||
static struct page_address_map page_address_maps[LAST_PKMAP];
|
||||
|
||||
/* One-time setup of the page->virtual freelist and hash buckets. */
void __init page_address_init(void)
{
	int i;

	/* Seed the freelist with every static map entry. */
	INIT_LIST_HEAD(&page_address_pool);
	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
		list_add(&page_address_maps[i].list, &page_address_pool);

	/* Initialize every hash bucket. */
	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
		INIT_LIST_HEAD(&page_address_htable[i].lh);
		spin_lock_init(&page_address_htable[i].lock);
	}

	spin_lock_init(&pool_lock);
}
|
||||
|
||||
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
|
||||
833
mm/hugetlb.c
Normal file
833
mm/hugetlb.c
Normal file
@@ -0,0 +1,833 @@
|
||||
/*
|
||||
* Generic hugetlb support.
|
||||
* (C) William Irwin, April 2004
|
||||
*/
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/nodemask.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/mutex.h>
|
||||
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
#include <linux/hugetlb.h>
|
||||
#include "internal.h"
|
||||
|
||||
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
|
||||
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
|
||||
unsigned long max_huge_pages;
|
||||
static struct list_head hugepage_freelists[MAX_NUMNODES];
|
||||
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
|
||||
static unsigned int free_huge_pages_node[MAX_NUMNODES];
|
||||
/*
|
||||
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
|
||||
*/
|
||||
static DEFINE_SPINLOCK(hugetlb_lock);
|
||||
|
||||
static void clear_huge_page(struct page *page, unsigned long addr)
|
||||
{
|
||||
int i;
|
||||
|
||||
might_sleep();
|
||||
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
|
||||
cond_resched();
|
||||
clear_user_highpage(page + i, addr);
|
||||
}
|
||||
}
|
||||
|
||||
static void copy_huge_page(struct page *dst, struct page *src,
|
||||
unsigned long addr, struct vm_area_struct *vma)
|
||||
{
|
||||
int i;
|
||||
|
||||
might_sleep();
|
||||
for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
|
||||
cond_resched();
|
||||
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
|
||||
}
|
||||
}
|
||||
|
||||
static void enqueue_huge_page(struct page *page)
|
||||
{
|
||||
int nid = page_to_nid(page);
|
||||
list_add(&page->lru, &hugepage_freelists[nid]);
|
||||
free_huge_pages++;
|
||||
free_huge_pages_node[nid]++;
|
||||
}
|
||||
|
||||
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
|
||||
unsigned long address)
|
||||
{
|
||||
int nid = numa_node_id();
|
||||
struct page *page = NULL;
|
||||
struct zonelist *zonelist = huge_zonelist(vma, address);
|
||||
struct zone **z;
|
||||
|
||||
for (z = zonelist->zones; *z; z++) {
|
||||
nid = zone_to_nid(*z);
|
||||
if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
|
||||
!list_empty(&hugepage_freelists[nid]))
|
||||
break;
|
||||
}
|
||||
|
||||
if (*z) {
|
||||
page = list_entry(hugepage_freelists[nid].next,
|
||||
struct page, lru);
|
||||
list_del(&page->lru);
|
||||
free_huge_pages--;
|
||||
free_huge_pages_node[nid]--;
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
static void free_huge_page(struct page *page)
|
||||
{
|
||||
BUG_ON(page_count(page));
|
||||
|
||||
INIT_LIST_HEAD(&page->lru);
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
enqueue_huge_page(page);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
}
|
||||
|
||||
static int alloc_fresh_huge_page(void)
|
||||
{
|
||||
static int nid = 0;
|
||||
struct page *page;
|
||||
page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
|
||||
HUGETLB_PAGE_ORDER);
|
||||
nid = next_node(nid, node_online_map);
|
||||
if (nid == MAX_NUMNODES)
|
||||
nid = first_node(node_online_map);
|
||||
if (page) {
|
||||
set_compound_page_dtor(page, free_huge_page);
|
||||
spin_lock(&hugetlb_lock);
|
||||
nr_huge_pages++;
|
||||
nr_huge_pages_node[page_to_nid(page)]++;
|
||||
spin_unlock(&hugetlb_lock);
|
||||
put_page(page); /* free it into the hugepage allocator */
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
unsigned long addr)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if (vma->vm_flags & VM_MAYSHARE)
|
||||
resv_huge_pages--;
|
||||
else if (free_huge_pages <= resv_huge_pages)
|
||||
goto fail;
|
||||
|
||||
page = dequeue_huge_page(vma, addr);
|
||||
if (!page)
|
||||
goto fail;
|
||||
|
||||
spin_unlock(&hugetlb_lock);
|
||||
set_page_refcounted(page);
|
||||
return page;
|
||||
|
||||
fail:
|
||||
if (vma->vm_flags & VM_MAYSHARE)
|
||||
resv_huge_pages++;
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Boot-time setup: preallocate the pool requested via "hugepages=". */
static int __init hugetlb_init(void)
{
	unsigned long i;

	/* The architecture did not select a huge page size. */
	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	/* Record how many pages were actually obtained. */
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
|
||||
module_init(hugetlb_init);
|
||||
|
||||
/* Parse the "hugepages=N" boot option; bad input yields 0. */
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
|
||||
__setup("hugepages=", hugetlb_setup);
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
static void update_and_free_page(struct page *page)
|
||||
{
|
||||
int i;
|
||||
nr_huge_pages--;
|
||||
nr_huge_pages_node[page_to_nid(page)]--;
|
||||
for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
|
||||
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
|
||||
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
|
||||
1 << PG_private | 1<< PG_writeback);
|
||||
}
|
||||
page[1].lru.next = NULL;
|
||||
set_page_refcounted(page);
|
||||
__free_pages(page, HUGETLB_PAGE_ORDER);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
static void try_to_free_low(unsigned long count)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < MAX_NUMNODES; ++i) {
|
||||
struct page *page, *next;
|
||||
list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
|
||||
if (PageHighMem(page))
|
||||
continue;
|
||||
list_del(&page->lru);
|
||||
update_and_free_page(page);
|
||||
free_huge_pages--;
|
||||
free_huge_pages_node[page_to_nid(page)]--;
|
||||
if (count >= nr_huge_pages)
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
static inline void try_to_free_low(unsigned long count)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
static unsigned long set_max_huge_pages(unsigned long count)
|
||||
{
|
||||
while (count > nr_huge_pages) {
|
||||
if (!alloc_fresh_huge_page())
|
||||
return nr_huge_pages;
|
||||
}
|
||||
if (count >= nr_huge_pages)
|
||||
return nr_huge_pages;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
count = max(count, resv_huge_pages);
|
||||
try_to_free_low(count);
|
||||
while (count < nr_huge_pages) {
|
||||
struct page *page = dequeue_huge_page(NULL, 0);
|
||||
if (!page)
|
||||
break;
|
||||
update_and_free_page(page);
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return nr_huge_pages;
|
||||
}
|
||||
|
||||
/* sysctl handler: read/write vm.nr_hugepages and resize the pool. */
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
int hugetlb_report_meminfo(char *buf)
|
||||
{
|
||||
return sprintf(buf,
|
||||
"HugePages_Total: %5lu\n"
|
||||
"HugePages_Free: %5lu\n"
|
||||
"HugePages_Rsvd: %5lu\n"
|
||||
"Hugepagesize: %5lu kB\n",
|
||||
nr_huge_pages,
|
||||
free_huge_pages,
|
||||
resv_huge_pages,
|
||||
HPAGE_SIZE/1024);
|
||||
}
|
||||
|
||||
int hugetlb_report_node_meminfo(int nid, char *buf)
|
||||
{
|
||||
return sprintf(buf,
|
||||
"Node %d HugePages_Total: %5u\n"
|
||||
"Node %d HugePages_Free: %5u\n",
|
||||
nid, nr_huge_pages_node[nid],
|
||||
nid, free_huge_pages_node[nid]);
|
||||
}
|
||||
|
||||
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
|
||||
unsigned long hugetlb_total_pages(void)
|
||||
{
|
||||
return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
* We cannot handle pagefaults against hugetlb pages at all. They cause
|
||||
* handle_mm_fault() to try to instantiate regular-sized pages in the
|
||||
* hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
|
||||
* this far.
|
||||
*/
|
||||
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
|
||||
unsigned long address, int *unused)
|
||||
{
|
||||
BUG();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct vm_operations_struct hugetlb_vm_ops = {
|
||||
.nopage = hugetlb_nopage,
|
||||
};
|
||||
|
||||
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
|
||||
int writable)
|
||||
{
|
||||
pte_t entry;
|
||||
|
||||
if (writable) {
|
||||
entry =
|
||||
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
|
||||
} else {
|
||||
entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
|
||||
}
|
||||
entry = pte_mkyoung(entry);
|
||||
entry = pte_mkhuge(entry);
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
/* Mark an existing huge pte writable+dirty and push it to the MMU. */
static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry = pte_mkwrite(pte_mkdirty(*ptep));

	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}
|
||||
|
||||
|
||||
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
|
||||
struct vm_area_struct *vma)
|
||||
{
|
||||
pte_t *src_pte, *dst_pte, entry;
|
||||
struct page *ptepage;
|
||||
unsigned long addr;
|
||||
int cow;
|
||||
|
||||
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
|
||||
|
||||
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
|
||||
src_pte = huge_pte_offset(src, addr);
|
||||
if (!src_pte)
|
||||
continue;
|
||||
dst_pte = huge_pte_alloc(dst, addr);
|
||||
if (!dst_pte)
|
||||
goto nomem;
|
||||
spin_lock(&dst->page_table_lock);
|
||||
spin_lock(&src->page_table_lock);
|
||||
if (!pte_none(*src_pte)) {
|
||||
if (cow)
|
||||
ptep_set_wrprotect(src, addr, src_pte);
|
||||
entry = *src_pte;
|
||||
ptepage = pte_page(entry);
|
||||
get_page(ptepage);
|
||||
set_huge_pte_at(dst, addr, dst_pte, entry);
|
||||
}
|
||||
spin_unlock(&src->page_table_lock);
|
||||
spin_unlock(&dst->page_table_lock);
|
||||
}
|
||||
return 0;
|
||||
|
||||
nomem:
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
unsigned long address;
|
||||
pte_t *ptep;
|
||||
pte_t pte;
|
||||
struct page *page;
|
||||
struct page *tmp;
|
||||
/*
|
||||
* A page gathering list, protected by per file i_mmap_lock. The
|
||||
* lock is used to avoid list corruption from multiple unmapping
|
||||
* of the same page since we are using page->lru.
|
||||
*/
|
||||
LIST_HEAD(page_list);
|
||||
|
||||
WARN_ON(!is_vm_hugetlb_page(vma));
|
||||
BUG_ON(start & ~HPAGE_MASK);
|
||||
BUG_ON(end & ~HPAGE_MASK);
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
for (address = start; address < end; address += HPAGE_SIZE) {
|
||||
ptep = huge_pte_offset(mm, address);
|
||||
if (!ptep)
|
||||
continue;
|
||||
|
||||
if (huge_pmd_unshare(mm, &address, ptep))
|
||||
continue;
|
||||
|
||||
pte = huge_ptep_get_and_clear(mm, address, ptep);
|
||||
if (pte_none(pte))
|
||||
continue;
|
||||
|
||||
page = pte_page(pte);
|
||||
if (pte_dirty(pte))
|
||||
set_page_dirty(page);
|
||||
list_add(&page->lru, &page_list);
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
flush_tlb_range(vma, start, end);
|
||||
list_for_each_entry_safe(page, tmp, &page_list, lru) {
|
||||
list_del(&page->lru);
|
||||
put_page(page);
|
||||
}
|
||||
}
|
||||
|
||||
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
/*
|
||||
* It is undesirable to test vma->vm_file as it should be non-null
|
||||
* for valid hugetlb area. However, vm_file will be NULL in the error
|
||||
* cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
|
||||
* do_mmap_pgoff() nullifies vma->vm_file before calling this function
|
||||
* to clean up. Since no pte has actually been setup, it is safe to
|
||||
* do nothing in this case.
|
||||
*/
|
||||
if (vma->vm_file) {
|
||||
spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
|
||||
__unmap_hugepage_range(vma, start, end);
|
||||
spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/* Break copy-on-write for a huge page at @address. */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
		       unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;

	old_page = pte_page(pte);

	/*
	 * If nobody else is actually using this page, skip the copy and
	 * simply make the existing mapping writable.
	 */
	if (page_count(old_page) == 1) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	/* Copying may sleep; drop the page table lock around it. */
	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	/* Re-find the pte: it may have changed while unlocked. */
	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW. */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Arrange for the old page to be released below. */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}
|
||||
|
||||
/* Handle a fault on a not-yet-instantiated huge page. */
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
		    unsigned long address, pte_t *ptep, int write_access)
{
	struct address_space *mapping;
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	/* Huge-page index of the fault within the backing file. */
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * The page lock guards against racing truncation until we take
	 * page_table_lock below.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;	/* lost the insert race */
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	/* Re-check the file size now that truncation is excluded. */
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization: do the COW without taking a second fault. */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}
|
||||
|
||||
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long address, int write_access)
|
||||
{
|
||||
pte_t *ptep;
|
||||
pte_t entry;
|
||||
int ret;
|
||||
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
|
||||
|
||||
ptep = huge_pte_alloc(mm, address);
|
||||
if (!ptep)
|
||||
return VM_FAULT_OOM;
|
||||
|
||||
/*
|
||||
* Serialize hugepage allocation and instantiation, so that we don't
|
||||
* get spurious allocation failures if two CPUs race to instantiate
|
||||
* the same page in the page cache.
|
||||
*/
|
||||
mutex_lock(&hugetlb_instantiation_mutex);
|
||||
entry = *ptep;
|
||||
if (pte_none(entry)) {
|
||||
ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
|
||||
mutex_unlock(&hugetlb_instantiation_mutex);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = VM_FAULT_MINOR;
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
/* Check for a racing update before calling hugetlb_cow */
|
||||
if (likely(pte_same(entry, *ptep)))
|
||||
if (write_access && !pte_write(entry))
|
||||
ret = hugetlb_cow(mm, vma, address, ptep, entry);
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
mutex_unlock(&hugetlb_instantiation_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct page **pages, struct vm_area_struct **vmas,
|
||||
unsigned long *position, int *length, int i)
|
||||
{
|
||||
unsigned long pfn_offset;
|
||||
unsigned long vaddr = *position;
|
||||
int remainder = *length;
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
while (vaddr < vma->vm_end && remainder) {
|
||||
pte_t *pte;
|
||||
struct page *page;
|
||||
|
||||
/*
|
||||
* Some archs (sparc64, sh*) have multiple pte_ts to
|
||||
* each hugepage. We have to make * sure we get the
|
||||
* first, for the page indexing below to work.
|
||||
*/
|
||||
pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
|
||||
|
||||
if (!pte || pte_none(*pte)) {
|
||||
int ret;
|
||||
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
ret = hugetlb_fault(mm, vma, vaddr, 0);
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (ret == VM_FAULT_MINOR)
|
||||
continue;
|
||||
|
||||
remainder = 0;
|
||||
if (!i)
|
||||
i = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
|
||||
page = pte_page(*pte);
|
||||
same_page:
|
||||
if (pages) {
|
||||
get_page(page);
|
||||
pages[i] = page + pfn_offset;
|
||||
}
|
||||
|
||||
if (vmas)
|
||||
vmas[i] = vma;
|
||||
|
||||
vaddr += PAGE_SIZE;
|
||||
++pfn_offset;
|
||||
--remainder;
|
||||
++i;
|
||||
if (vaddr < vma->vm_end && remainder &&
|
||||
pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
|
||||
/*
|
||||
* We use pfn_offset to avoid touching the pageframes
|
||||
* of this compound page.
|
||||
*/
|
||||
goto same_page;
|
||||
}
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
*length = remainder;
|
||||
*position = vaddr;
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
/* Apply @newprot to every huge pte in [address, end). */
void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	/* i_mmap_lock excludes concurrent huge pmd sharing/unsharing. */
	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (pte_none(*ptep))
			continue;
		pte = huge_ptep_get_and_clear(mm, address, ptep);
		pte = pte_mkhuge(pte_modify(pte, newprot));
		set_huge_pte_at(mm, address, ptep, pte);
		lazy_mmu_prot_update(pte);
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}
|
||||
|
||||
struct file_region {
|
||||
struct list_head link;
|
||||
long from;
|
||||
long to;
|
||||
};
|
||||
|
||||
static long region_add(struct list_head *head, long f, long t)
|
||||
{
|
||||
struct file_region *rg, *nrg, *trg;
|
||||
|
||||
/* Locate the region we are either in or before. */
|
||||
list_for_each_entry(rg, head, link)
|
||||
if (f <= rg->to)
|
||||
break;
|
||||
|
||||
/* Round our left edge to the current segment if it encloses us. */
|
||||
if (f > rg->from)
|
||||
f = rg->from;
|
||||
|
||||
/* Check for and consume any regions we now overlap with. */
|
||||
nrg = rg;
|
||||
list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
|
||||
if (&rg->link == head)
|
||||
break;
|
||||
if (rg->from > t)
|
||||
break;
|
||||
|
||||
/* If this area reaches higher then extend our area to
|
||||
* include it completely. If this is not the first area
|
||||
* which we intend to reuse, free it. */
|
||||
if (rg->to > t)
|
||||
t = rg->to;
|
||||
if (rg != nrg) {
|
||||
list_del(&rg->link);
|
||||
kfree(rg);
|
||||
}
|
||||
}
|
||||
nrg->from = f;
|
||||
nrg->to = t;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long region_chg(struct list_head *head, long f, long t)
|
||||
{
|
||||
struct file_region *rg, *nrg;
|
||||
long chg = 0;
|
||||
|
||||
/* Locate the region we are before or in. */
|
||||
list_for_each_entry(rg, head, link)
|
||||
if (f <= rg->to)
|
||||
break;
|
||||
|
||||
/* If we are below the current region then a new region is required.
|
||||
* Subtle, allocate a new region at the position but make it zero
|
||||
* size such that we can guarentee to record the reservation. */
|
||||
if (&rg->link == head || t < rg->from) {
|
||||
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
|
||||
if (nrg == 0)
|
||||
return -ENOMEM;
|
||||
nrg->from = f;
|
||||
nrg->to = f;
|
||||
INIT_LIST_HEAD(&nrg->link);
|
||||
list_add(&nrg->link, rg->link.prev);
|
||||
|
||||
return t - f;
|
||||
}
|
||||
|
||||
/* Round our left edge to the current segment if it encloses us. */
|
||||
if (f > rg->from)
|
||||
f = rg->from;
|
||||
chg = t - f;
|
||||
|
||||
/* Check for and consume any regions we now overlap with. */
|
||||
list_for_each_entry(rg, rg->link.prev, link) {
|
||||
if (&rg->link == head)
|
||||
break;
|
||||
if (rg->from > t)
|
||||
return chg;
|
||||
|
||||
/* We overlap with this area, if it extends futher than
|
||||
* us then we must extend ourselves. Account for its
|
||||
* existing reservation. */
|
||||
if (rg->to > t) {
|
||||
chg += rg->to - t;
|
||||
t = rg->to;
|
||||
}
|
||||
chg -= rg->to - rg->from;
|
||||
}
|
||||
return chg;
|
||||
}
|
||||
|
||||
static long region_truncate(struct list_head *head, long end)
|
||||
{
|
||||
struct file_region *rg, *trg;
|
||||
long chg = 0;
|
||||
|
||||
/* Locate the region we are either in or before. */
|
||||
list_for_each_entry(rg, head, link)
|
||||
if (end <= rg->to)
|
||||
break;
|
||||
if (&rg->link == head)
|
||||
return 0;
|
||||
|
||||
/* If we are in the middle of a region then adjust it. */
|
||||
if (end > rg->from) {
|
||||
chg = rg->to - end;
|
||||
rg->to = end;
|
||||
rg = list_entry(rg->link.next, typeof(*rg), link);
|
||||
}
|
||||
|
||||
/* Drop any remaining regions. */
|
||||
list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
|
||||
if (&rg->link == head)
|
||||
break;
|
||||
chg += rg->to - rg->from;
|
||||
list_del(&rg->link);
|
||||
kfree(rg);
|
||||
}
|
||||
return chg;
|
||||
}
|
||||
|
||||
static int hugetlb_acct_memory(long delta)
|
||||
{
|
||||
int ret = -ENOMEM;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if ((delta + resv_huge_pages) <= free_huge_pages) {
|
||||
resv_huge_pages += delta;
|
||||
ret = 0;
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
|
||||
{
|
||||
long ret, chg;
|
||||
|
||||
chg = region_chg(&inode->i_mapping->private_list, from, to);
|
||||
if (chg < 0)
|
||||
return chg;
|
||||
ret = hugetlb_acct_memory(chg);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
region_add(&inode->i_mapping->private_list, from, to);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
||||
{
|
||||
long chg = region_truncate(&inode->i_mapping->private_list, offset);
|
||||
hugetlb_acct_memory(freed - chg);
|
||||
}
|
||||
40
mm/internal.h
Normal file
40
mm/internal.h
Normal file
@@ -0,0 +1,40 @@
|
||||
/* internal.h: mm/ internal definitions
|
||||
*
|
||||
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
|
||||
* Written by David Howells (dhowells@redhat.com)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
#ifndef __MM_INTERNAL_H
|
||||
#define __MM_INTERNAL_H
|
||||
|
||||
#include <linux/mm.h>
|
||||
|
||||
static inline void set_page_count(struct page *page, int v)
|
||||
{
|
||||
atomic_set(&page->_count, v);
|
||||
}
|
||||
|
||||
/*
|
||||
* Turn a non-refcounted page (->_count == 0) into refcounted with
|
||||
* a count of one.
|
||||
*/
|
||||
static inline void set_page_refcounted(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
|
||||
VM_BUG_ON(atomic_read(&page->_count));
|
||||
set_page_count(page, 1);
|
||||
}
|
||||
|
||||
static inline void __put_page(struct page *page)
|
||||
{
|
||||
atomic_dec(&page->_count);
|
||||
}
|
||||
|
||||
extern void fastcall __init __free_pages_bootmem(struct page *page,
|
||||
unsigned int order);
|
||||
|
||||
#endif
|
||||
337
mm/madvise.c
Normal file
337
mm/madvise.c
Normal file
@@ -0,0 +1,337 @@
|
||||
/*
|
||||
* linux/mm/madvise.c
|
||||
*
|
||||
* Copyright (C) 1999 Linus Torvalds
|
||||
* Copyright (C) 2002 Christoph Hellwig
|
||||
*/
|
||||
|
||||
#include <linux/mman.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/hugetlb.h>
|
||||
|
||||
/*
|
||||
* We can potentially split a vm area into separate
|
||||
* areas, each area with its own behavior.
|
||||
*/
|
||||
static long madvise_behavior(struct vm_area_struct * vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end, int behavior)
|
||||
{
|
||||
struct mm_struct * mm = vma->vm_mm;
|
||||
int error = 0;
|
||||
pgoff_t pgoff;
|
||||
int new_flags = vma->vm_flags;
|
||||
|
||||
switch (behavior) {
|
||||
case MADV_NORMAL:
|
||||
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
|
||||
break;
|
||||
case MADV_SEQUENTIAL:
|
||||
new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
|
||||
break;
|
||||
case MADV_RANDOM:
|
||||
new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
|
||||
break;
|
||||
case MADV_DONTFORK:
|
||||
new_flags |= VM_DONTCOPY;
|
||||
break;
|
||||
case MADV_DOFORK:
|
||||
new_flags &= ~VM_DONTCOPY;
|
||||
break;
|
||||
}
|
||||
|
||||
if (new_flags == vma->vm_flags) {
|
||||
*prev = vma;
|
||||
goto out;
|
||||
}
|
||||
|
||||
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
|
||||
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
|
||||
vma->vm_file, pgoff, vma_policy(vma));
|
||||
if (*prev) {
|
||||
vma = *prev;
|
||||
goto success;
|
||||
}
|
||||
|
||||
*prev = vma;
|
||||
|
||||
if (start != vma->vm_start) {
|
||||
error = split_vma(mm, vma, start, 1);
|
||||
if (error)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (end != vma->vm_end) {
|
||||
error = split_vma(mm, vma, end, 0);
|
||||
if (error)
|
||||
goto out;
|
||||
}
|
||||
|
||||
success:
|
||||
/*
|
||||
* vm_flags is protected by the mmap_sem held in write mode.
|
||||
*/
|
||||
vma->vm_flags = new_flags;
|
||||
|
||||
out:
|
||||
if (error == -ENOMEM)
|
||||
error = -EAGAIN;
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Schedule all required I/O operations. Do not wait for completion.
|
||||
*/
|
||||
static long madvise_willneed(struct vm_area_struct * vma,
|
||||
struct vm_area_struct ** prev,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
struct file *file = vma->vm_file;
|
||||
|
||||
if (!file)
|
||||
return -EBADF;
|
||||
|
||||
if (file->f_mapping->a_ops->get_xip_page) {
|
||||
/* no bad return value, but ignore advice */
|
||||
return 0;
|
||||
}
|
||||
|
||||
*prev = vma;
|
||||
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
|
||||
if (end > vma->vm_end)
|
||||
end = vma->vm_end;
|
||||
end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
|
||||
|
||||
force_page_cache_readahead(file->f_mapping,
|
||||
file, start, max_sane_readahead(end - start));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Application no longer needs these pages. If the pages are dirty,
|
||||
* it's OK to just throw them away. The app will be more careful about
|
||||
* data it wants to keep. Be sure to free swap resources too. The
|
||||
* zap_page_range call sets things up for refill_inactive to actually free
|
||||
* these pages later if no one else has touched them in the meantime,
|
||||
* although we could add these pages to a global reuse list for
|
||||
* refill_inactive to pick up before reclaiming other pages.
|
||||
*
|
||||
* NB: This interface discards data rather than pushes it out to swap,
|
||||
* as some implementations do. This has performance implications for
|
||||
* applications like large transactional databases which want to discard
|
||||
* pages in anonymous maps after committing to backing store the data
|
||||
* that was kept in them. There is no reason to write this data out to
|
||||
* the swap area if the application is discarding it.
|
||||
*
|
||||
* An interface that causes the system to free clean pages and flush
|
||||
* dirty pages is already available as msync(MS_INVALIDATE).
|
||||
*/
|
||||
static long madvise_dontneed(struct vm_area_struct * vma,
|
||||
struct vm_area_struct ** prev,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
*prev = vma;
|
||||
if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
|
||||
return -EINVAL;
|
||||
|
||||
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
|
||||
struct zap_details details = {
|
||||
.nonlinear_vma = vma,
|
||||
.last_index = ULONG_MAX,
|
||||
};
|
||||
zap_page_range(vma, start, end - start, &details);
|
||||
} else
|
||||
zap_page_range(vma, start, end - start, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Application wants to free up the pages and associated backing store.
|
||||
* This is effectively punching a hole into the middle of a file.
|
||||
*
|
||||
* NOTE: Currently, only shmfs/tmpfs is supported for this operation.
|
||||
* Other filesystems return -ENOSYS.
|
||||
*/
|
||||
static long madvise_remove(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
struct address_space *mapping;
|
||||
loff_t offset, endoff;
|
||||
int error;
|
||||
|
||||
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
|
||||
|
||||
if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
|
||||
return -EINVAL;
|
||||
|
||||
if (!vma->vm_file || !vma->vm_file->f_mapping
|
||||
|| !vma->vm_file->f_mapping->host) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
|
||||
return -EACCES;
|
||||
|
||||
mapping = vma->vm_file->f_mapping;
|
||||
|
||||
offset = (loff_t)(start - vma->vm_start)
|
||||
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
|
||||
endoff = (loff_t)(end - vma->vm_start - 1)
|
||||
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
|
||||
|
||||
/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
error = vmtruncate_range(mapping->host, offset, endoff);
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
return error;
|
||||
}
|
||||
|
||||
static long
|
||||
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end, int behavior)
|
||||
{
|
||||
long error;
|
||||
|
||||
switch (behavior) {
|
||||
case MADV_DOFORK:
|
||||
if (vma->vm_flags & VM_IO) {
|
||||
error = -EINVAL;
|
||||
break;
|
||||
}
|
||||
case MADV_DONTFORK:
|
||||
case MADV_NORMAL:
|
||||
case MADV_SEQUENTIAL:
|
||||
case MADV_RANDOM:
|
||||
error = madvise_behavior(vma, prev, start, end, behavior);
|
||||
break;
|
||||
case MADV_REMOVE:
|
||||
error = madvise_remove(vma, prev, start, end);
|
||||
break;
|
||||
|
||||
case MADV_WILLNEED:
|
||||
error = madvise_willneed(vma, prev, start, end);
|
||||
break;
|
||||
|
||||
case MADV_DONTNEED:
|
||||
error = madvise_dontneed(vma, prev, start, end);
|
||||
break;
|
||||
|
||||
default:
|
||||
error = -EINVAL;
|
||||
break;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* The madvise(2) system call.
|
||||
*
|
||||
* Applications can use madvise() to advise the kernel how it should
|
||||
* handle paging I/O in this VM area. The idea is to help the kernel
|
||||
* use appropriate read-ahead and caching techniques. The information
|
||||
* provided is advisory only, and can be safely disregarded by the
|
||||
* kernel without affecting the correct operation of the application.
|
||||
*
|
||||
* behavior values:
|
||||
* MADV_NORMAL - the default behavior is to read clusters. This
|
||||
* results in some read-ahead and read-behind.
|
||||
* MADV_RANDOM - the system should read the minimum amount of data
|
||||
* on any access, since it is unlikely that the appli-
|
||||
* cation will need more than what it asks for.
|
||||
* MADV_SEQUENTIAL - pages in the given range will probably be accessed
|
||||
* once, so they can be aggressively read ahead, and
|
||||
* can be freed soon after they are accessed.
|
||||
* MADV_WILLNEED - the application is notifying the system to read
|
||||
* some pages ahead.
|
||||
* MADV_DONTNEED - the application is finished with the given range,
|
||||
* so the kernel can free resources associated with it.
|
||||
* MADV_REMOVE - the application wants to free up the given range of
|
||||
* pages and associated backing store.
|
||||
*
|
||||
* return values:
|
||||
* zero - success
|
||||
* -EINVAL - start + len < 0, start is not page-aligned,
|
||||
* "behavior" is not a valid value, or application
|
||||
* is attempting to release locked or shared pages.
|
||||
* -ENOMEM - addresses in the specified range are not currently
|
||||
* mapped, or are outside the AS of the process.
|
||||
* -EIO - an I/O error occurred while paging in data.
|
||||
* -EBADF - map exists, but area maps something that isn't a file.
|
||||
* -EAGAIN - a kernel resource was temporarily unavailable.
|
||||
*/
|
||||
/*
 * The madvise(2) system call.
 *
 * Advise the kernel about expected paging behavior for [start,
 * start+len_in).  The advice is purely a hint.  See the man page for
 * the behavior values (MADV_NORMAL/RANDOM/SEQUENTIAL/WILLNEED/
 * DONTNEED/REMOVE/DONTFORK/DOFORK) and error codes; unmapped holes in
 * the range are skipped but reported as -ENOMEM at the end.
 */
asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	size_t len;

	down_write(&current->mm->mmap_sem);

	if (start & ~PAGE_MASK)
		goto out;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		goto out;

	end = start + len;
	if (end < start)
		goto out;

	error = 0;
	if (end == start)
		goto out;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}
|
||||
2738
mm/memory.c
Normal file
2738
mm/memory.c
Normal file
File diff suppressed because it is too large
Load Diff
310
mm/memory_hotplug.c
Normal file
310
mm/memory_hotplug.c
Normal file
@@ -0,0 +1,310 @@
|
||||
/*
|
||||
* linux/mm/memory_hotplug.c
|
||||
*
|
||||
* Copyright (C)
|
||||
*/
|
||||
|
||||
#include <linux/stddef.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/memory.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/ioport.h>
|
||||
#include <linux/cpuset.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
/* add this memory to iomem resource */
|
||||
/*
 * Add hot-plugged memory to the iomem resource tree as "System RAM".
 * Returns the new resource, or NULL if it collides with an existing
 * one (the caller treats that as -EEXIST).
 *
 * Fix vs. original: the failure printk lacked a log level; kernel
 * convention requires one (KERN_ERR here, since the hot-add fails).
 */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM;
	if (request_resource(&iomem_resource, res) < 0) {
		printk(KERN_ERR "System RAM resource %llx - %llx cannot be added\n",
		(unsigned long long)res->start, (unsigned long long)res->end);
		kfree(res);
		res = NULL;
	}
	return res;
}
|
||||
|
||||
/* Undo register_memory_resource(); tolerates a NULL resource. */
static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
}
|
||||
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
||||
static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
|
||||
{
|
||||
struct pglist_data *pgdat = zone->zone_pgdat;
|
||||
int nr_pages = PAGES_PER_SECTION;
|
||||
int nid = pgdat->node_id;
|
||||
int zone_type;
|
||||
|
||||
zone_type = zone - pgdat->node_zones;
|
||||
if (!populated_zone(zone)) {
|
||||
int ret = 0;
|
||||
ret = init_currently_empty_zone(zone, phys_start_pfn,
|
||||
nr_pages, MEMMAP_HOTPLUG);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
memmap_init_zone(nr_pages, nid, zone_type,
|
||||
phys_start_pfn, MEMMAP_HOTPLUG);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
|
||||
{
|
||||
int nr_pages = PAGES_PER_SECTION;
|
||||
int ret;
|
||||
|
||||
if (pfn_valid(phys_start_pfn))
|
||||
return -EEXIST;
|
||||
|
||||
ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
|
||||
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = __add_zone(zone, phys_start_pfn);
|
||||
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
return register_new_memory(__pfn_to_section(phys_start_pfn));
|
||||
}
|
||||
|
||||
/*
|
||||
* Reasonably generic function for adding memory. It is
|
||||
* expected that archs that support memory hotplug will
|
||||
* call this function after deciding the zone to which to
|
||||
* add the new pages.
|
||||
*/
|
||||
int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
unsigned long i;
|
||||
int err = 0;
|
||||
int start_sec, end_sec;
|
||||
/* during initialize mem_map, align hot-added range to section */
|
||||
start_sec = pfn_to_section_nr(phys_start_pfn);
|
||||
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
|
||||
|
||||
for (i = start_sec; i <= end_sec; i++) {
|
||||
err = __add_section(zone, i << PFN_SECTION_SHIFT);
|
||||
|
||||
/*
|
||||
* EEXIST is finally dealed with by ioresource collision
|
||||
* check. see add_memory() => register_memory_resource()
|
||||
* Warning will be printed if there is collision.
|
||||
*/
|
||||
if (err && (err != -EEXIST))
|
||||
break;
|
||||
err = 0;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__add_pages);
|
||||
|
||||
static void grow_zone_span(struct zone *zone,
|
||||
unsigned long start_pfn, unsigned long end_pfn)
|
||||
{
|
||||
unsigned long old_zone_end_pfn;
|
||||
|
||||
zone_span_writelock(zone);
|
||||
|
||||
old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
|
||||
if (start_pfn < zone->zone_start_pfn)
|
||||
zone->zone_start_pfn = start_pfn;
|
||||
|
||||
zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
|
||||
zone->zone_start_pfn;
|
||||
|
||||
zone_span_writeunlock(zone);
|
||||
}
|
||||
|
||||
static void grow_pgdat_span(struct pglist_data *pgdat,
|
||||
unsigned long start_pfn, unsigned long end_pfn)
|
||||
{
|
||||
unsigned long old_pgdat_end_pfn =
|
||||
pgdat->node_start_pfn + pgdat->node_spanned_pages;
|
||||
|
||||
if (start_pfn < pgdat->node_start_pfn)
|
||||
pgdat->node_start_pfn = start_pfn;
|
||||
|
||||
pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
|
||||
pgdat->node_start_pfn;
|
||||
}
|
||||
|
||||
int online_pages(unsigned long pfn, unsigned long nr_pages)
|
||||
{
|
||||
unsigned long i;
|
||||
unsigned long flags;
|
||||
unsigned long onlined_pages = 0;
|
||||
struct resource res;
|
||||
u64 section_end;
|
||||
unsigned long start_pfn;
|
||||
struct zone *zone;
|
||||
int need_zonelists_rebuild = 0;
|
||||
|
||||
/*
|
||||
* This doesn't need a lock to do pfn_to_page().
|
||||
* The section can't be removed here because of the
|
||||
* memory_block->state_sem.
|
||||
*/
|
||||
zone = page_zone(pfn_to_page(pfn));
|
||||
pgdat_resize_lock(zone->zone_pgdat, &flags);
|
||||
grow_zone_span(zone, pfn, pfn + nr_pages);
|
||||
grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
|
||||
pgdat_resize_unlock(zone->zone_pgdat, &flags);
|
||||
|
||||
/*
|
||||
* If this zone is not populated, then it is not in zonelist.
|
||||
* This means the page allocator ignores this zone.
|
||||
* So, zonelist must be updated after online.
|
||||
*/
|
||||
if (!populated_zone(zone))
|
||||
need_zonelists_rebuild = 1;
|
||||
|
||||
res.start = (u64)pfn << PAGE_SHIFT;
|
||||
res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1;
|
||||
res.flags = IORESOURCE_MEM; /* we just need system ram */
|
||||
section_end = res.end;
|
||||
|
||||
while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
|
||||
start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
|
||||
nr_pages = (unsigned long)
|
||||
((res.end + 1 - res.start) >> PAGE_SHIFT);
|
||||
|
||||
if (PageReserved(pfn_to_page(start_pfn))) {
|
||||
/* this region's page is not onlined now */
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
struct page *page = pfn_to_page(start_pfn + i);
|
||||
online_page(page);
|
||||
onlined_pages++;
|
||||
}
|
||||
}
|
||||
|
||||
res.start = res.end + 1;
|
||||
res.end = section_end;
|
||||
}
|
||||
zone->present_pages += onlined_pages;
|
||||
zone->zone_pgdat->node_present_pages += onlined_pages;
|
||||
|
||||
setup_per_zone_pages_min();
|
||||
|
||||
if (need_zonelists_rebuild)
|
||||
build_all_zonelists();
|
||||
vm_total_pages = nr_free_pagecache_pages();
|
||||
writeback_set_ratelimit();
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
|
||||
|
||||
/*
 * Allocate and minimally initialize a pgdat for a node coming online
 * via memory hot-add; all zones start empty (no present pages).
 */
static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = arch_alloc_nodedata(nid);
	if (!pgdat)
		return NULL;

	arch_refresh_nodedata(nid, pgdat);

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);

	return pgdat;
}
|
||||
|
||||
/* Undo hotadd_new_pgdat() when later hot-add steps fail. */
static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
}
|
||||
|
||||
|
||||
/*
 * Top-level memory hot-add: claim the iomem resource, create the node
 * if it is not yet online, hand the range to the arch, then publish
 * the node.  On error the pgdat allocation and resource are rolled
 * back; node onlining itself cannot be undone.
 */
int add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	int new_pgdat = 0;
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size);
	if (!res)
		return -EEXIST;

	if (!node_online(nid)) {
		pgdat = hotadd_new_pgdat(nid, start);
		if (!pgdat)
			return -ENOMEM;
		new_pgdat = 1;
		ret = kswapd_run(nid);
		if (ret)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online node here. we can't roll back from here. */
	node_set_online(nid);

	cpuset_track_online_nodes();

	if (new_pgdat) {
		ret = register_one_node(nid);
		/*
		 * If sysfs file of new node can't create, cpu on the node
		 * can't be hot-added. There is no rollback way now.
		 * So, check by BUG_ON() to catch it reluctantly..
		 */
		BUG_ON(ret);
	}

	return ret;
error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	if (res)
		release_memory_resource(res);

	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
|
||||
1911
mm/mempolicy.c
Normal file
1911
mm/mempolicy.c
Normal file
File diff suppressed because it is too large
Load Diff
338
mm/mempool.c
Normal file
338
mm/mempool.c
Normal file
@@ -0,0 +1,338 @@
|
||||
/*
|
||||
* linux/mm/mempool.c
|
||||
*
|
||||
* memory buffer pool support. Such pools are mostly used
|
||||
* for guaranteed, deadlock-free memory allocations during
|
||||
* extreme VM load.
|
||||
*
|
||||
* started by Ingo Molnar, Copyright (C) 2001
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/writeback.h>
|
||||
|
||||
/* Push a pre-allocated element onto the pool's reserve stack. */
static void add_element(mempool_t *pool, void *element)
{
	BUG_ON(pool->curr_nr >= pool->min_nr);
	pool->elements[pool->curr_nr++] = element;
}
|
||||
|
||||
/* Pop one element from the pool's reserve stack; pool must be non-empty. */
static void *remove_element(mempool_t *pool)
{
	BUG_ON(pool->curr_nr <= 0);
	return pool->elements[--pool->curr_nr];
}
|
||||
|
||||
/* Free every reserved element, the element array, and the pool itself. */
static void free_pool(mempool_t *pool)
{
	while (pool->curr_nr) {
		void *element = remove_element(pool);
		pool->free(element, pool->pool_data);
	}
	kfree(pool->elements);
	kfree(pool);
}
|
||||
|
||||
/**
|
||||
* mempool_create - create a memory pool
|
||||
* @min_nr: the minimum number of elements guaranteed to be
|
||||
* allocated for this pool.
|
||||
* @alloc_fn: user-defined element-allocation function.
|
||||
* @free_fn: user-defined element-freeing function.
|
||||
* @pool_data: optional private data available to the user-defined functions.
|
||||
*
|
||||
* this function creates and allocates a guaranteed size, preallocated
|
||||
* memory pool. The pool can be used from the mempool_alloc() and mempool_free()
|
||||
* functions. This function might sleep. Both the alloc_fn() and the free_fn()
|
||||
* functions might sleep - as long as the mempool_alloc() function is not called
|
||||
* from IRQ contexts.
|
||||
*/
|
||||
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
|
||||
mempool_free_t *free_fn, void *pool_data)
|
||||
{
|
||||
return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_create);
|
||||
|
||||
mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
|
||||
mempool_free_t *free_fn, void *pool_data, int node_id)
|
||||
{
|
||||
mempool_t *pool;
|
||||
pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
|
||||
if (!pool)
|
||||
return NULL;
|
||||
memset(pool, 0, sizeof(*pool));
|
||||
pool->elements = kmalloc_node(min_nr * sizeof(void *),
|
||||
GFP_KERNEL, node_id);
|
||||
if (!pool->elements) {
|
||||
kfree(pool);
|
||||
return NULL;
|
||||
}
|
||||
spin_lock_init(&pool->lock);
|
||||
pool->min_nr = min_nr;
|
||||
pool->pool_data = pool_data;
|
||||
init_waitqueue_head(&pool->wait);
|
||||
pool->alloc = alloc_fn;
|
||||
pool->free = free_fn;
|
||||
|
||||
/*
|
||||
* First pre-allocate the guaranteed number of buffers.
|
||||
*/
|
||||
while (pool->curr_nr < pool->min_nr) {
|
||||
void *element;
|
||||
|
||||
element = pool->alloc(GFP_KERNEL, pool->pool_data);
|
||||
if (unlikely(!element)) {
|
||||
free_pool(pool);
|
||||
return NULL;
|
||||
}
|
||||
add_element(pool, element);
|
||||
}
|
||||
return pool;
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_create_node);
|
||||
|
||||
/**
 * mempool_resize - resize an existing memory pool
 * @pool:       pointer to the memory pool which was allocated via
 *              mempool_create().
 * @new_min_nr: the new minimum number of elements guaranteed to be
 *              allocated for this pool.
 * @gfp_mask:   the usual allocation bitmask.
 *
 * This function shrinks/grows the pool. In the case of growing,
 * it cannot be guaranteed that the pool will be grown to the new
 * size immediately, but new mempool_free() calls will refill it.
 *
 * Note, the caller must guarantee that no mempool_destroy is called
 * while this function is running. mempool_alloc() & mempool_free()
 * might be called (eg. from IRQ contexts) while this function executes.
 */
int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
{
	void *element;
	void **new_elements;
	unsigned long flags;

	BUG_ON(new_min_nr <= 0);

	spin_lock_irqsave(&pool->lock, flags);
	if (new_min_nr <= pool->min_nr) {
		/*
		 * Shrinking: free surplus elements. The lock is dropped
		 * around each pool->free() call since it may sleep; the
		 * loop condition is re-evaluated after re-taking the lock.
		 */
		while (new_min_nr < pool->curr_nr) {
			element = remove_element(pool);
			spin_unlock_irqrestore(&pool->lock, flags);
			pool->free(element, pool->pool_data);
			spin_lock_irqsave(&pool->lock, flags);
		}
		pool->min_nr = new_min_nr;
		goto out_unlock;
	}
	spin_unlock_irqrestore(&pool->lock, flags);

	/* Grow the pool */
	new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
	if (!new_elements)
		return -ENOMEM;

	spin_lock_irqsave(&pool->lock, flags);
	if (unlikely(new_min_nr <= pool->min_nr)) {
		/* Raced, other resize will do our work */
		spin_unlock_irqrestore(&pool->lock, flags);
		kfree(new_elements);
		goto out;
	}
	/* Carry the existing elements over to the larger array. */
	memcpy(new_elements, pool->elements,
			pool->curr_nr * sizeof(*new_elements));
	kfree(pool->elements);
	pool->elements = new_elements;
	pool->min_nr = new_min_nr;

	while (pool->curr_nr < pool->min_nr) {
		/* Drop the lock: pool->alloc() may sleep. */
		spin_unlock_irqrestore(&pool->lock, flags);
		element = pool->alloc(gfp_mask, pool->pool_data);
		if (!element)
			goto out;
		spin_lock_irqsave(&pool->lock, flags);
		if (pool->curr_nr < pool->min_nr) {
			add_element(pool, element);
		} else {
			spin_unlock_irqrestore(&pool->lock, flags);
			pool->free(element, pool->pool_data);	/* Raced */
			goto out;
		}
	}
out_unlock:
	spin_unlock_irqrestore(&pool->lock, flags);
out:
	return 0;
}
EXPORT_SYMBOL(mempool_resize);
|
||||
|
||||
/**
 * mempool_destroy - deallocate a memory pool
 * @pool: pointer to the memory pool which was allocated via
 *        mempool_create().
 *
 * this function only sleeps if the free_fn() function sleeps. The caller
 * has to guarantee that all elements have been returned to the pool (ie:
 * freed) prior to calling mempool_destroy().
 */
void mempool_destroy(mempool_t *pool)
{
	/* Check for outstanding elements */
	BUG_ON(pool->curr_nr != pool->min_nr);
	free_pool(pool);
}
EXPORT_SYMBOL(mempool_destroy);
|
||||
|
||||
/**
 * mempool_alloc - allocate an element from a specific memory pool
 * @pool: pointer to the memory pool which was allocated via
 *        mempool_create().
 * @gfp_mask: the usual allocation bitmask.
 *
 * this function only sleeps if the alloc_fn() function sleeps or
 * returns NULL. Note that due to preallocation, this function
 * *never* fails when called from process contexts. (it might
 * fail if called from an IRQ context.)
 */
void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
	void *element;
	unsigned long flags;
	wait_queue_t wait;
	gfp_t gfp_temp;

	might_sleep_if(gfp_mask & __GFP_WAIT);

	/* Force flags that make pool->alloc() fail fast and quietly. */
	gfp_mask |= __GFP_NOMEMALLOC;	/* don't allocate emergency reserves */
	gfp_mask |= __GFP_NORETRY;	/* don't loop in __alloc_pages */
	gfp_mask |= __GFP_NOWARN;	/* failures are OK */

	/* First pass: no sleeping, no I/O - the reserve is the fallback. */
	gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);

repeat_alloc:

	element = pool->alloc(gfp_temp, pool->pool_data);
	if (likely(element != NULL))
		return element;

	/* Regular allocation failed: fall back to the preallocated reserve. */
	spin_lock_irqsave(&pool->lock, flags);
	if (likely(pool->curr_nr)) {
		element = remove_element(pool);
		spin_unlock_irqrestore(&pool->lock, flags);
		return element;
	}
	spin_unlock_irqrestore(&pool->lock, flags);

	/* We must not sleep in the GFP_ATOMIC case */
	if (!(gfp_mask & __GFP_WAIT))
		return NULL;

	/* Now start performing page reclaim */
	gfp_temp = gfp_mask;
	init_wait(&wait);
	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
	/* Pairs with the barrier in mempool_free(): see curr_nr re-check. */
	smp_mb();
	if (!pool->curr_nr) {
		/*
		 * FIXME: this should be io_schedule(). The timeout is there
		 * as a workaround for some DM problems in 2.6.18.
		 */
		io_schedule_timeout(5*HZ);
	}
	finish_wait(&pool->wait, &wait);

	goto repeat_alloc;
}
EXPORT_SYMBOL(mempool_alloc);
|
||||
|
||||
/**
 * mempool_free - return an element to the pool.
 * @element: pool element pointer.
 * @pool: pointer to the memory pool which was allocated via
 *        mempool_create().
 *
 * this function only sleeps if the free_fn() function sleeps.
 */
void mempool_free(void *element, mempool_t *pool)
{
	unsigned long flags;

	/* Pairs with the barrier in mempool_alloc()'s wait path. */
	smp_mb();
	/*
	 * Unlocked fast-path check, then re-check under the lock
	 * (double-checked pattern): only refill the reserve when it
	 * is below min_nr, otherwise hand the element straight back
	 * to pool->free().
	 */
	if (pool->curr_nr < pool->min_nr) {
		spin_lock_irqsave(&pool->lock, flags);
		if (pool->curr_nr < pool->min_nr) {
			add_element(pool, element);
			spin_unlock_irqrestore(&pool->lock, flags);
			/* A waiter in mempool_alloc() may need this element. */
			wake_up(&pool->wait);
			return;
		}
		spin_unlock_irqrestore(&pool->lock, flags);
	}
	pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);
|
||||
|
||||
/*
|
||||
* A commonly used alloc and free fn.
|
||||
*/
|
||||
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
|
||||
{
|
||||
struct kmem_cache *mem = pool_data;
|
||||
return kmem_cache_alloc(mem, gfp_mask);
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_alloc_slab);
|
||||
|
||||
/* Counterpart of mempool_alloc_slab(): pool_data is the kmem_cache. */
void mempool_free_slab(void *element, void *pool_data)
{
	kmem_cache_free((struct kmem_cache *)pool_data, element);
}
EXPORT_SYMBOL(mempool_free_slab);
|
||||
|
||||
/*
|
||||
* A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
|
||||
* specfied by pool_data
|
||||
*/
|
||||
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
|
||||
{
|
||||
size_t size = (size_t)(long)pool_data;
|
||||
return kmalloc(size, gfp_mask);
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_kmalloc);
|
||||
|
||||
/*
 * Zeroing variant of mempool_kmalloc(): pool_data encodes the
 * allocation size.  Cast through (long) first, matching
 * mempool_kmalloc()/mempool_alloc_pages(), instead of the direct
 * pointer-to-size_t cast the original used - keeps all three
 * pool_data decodings consistent and warning-free.
 */
void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
{
	size_t size = (size_t)(long)pool_data;
	return kzalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kzalloc);
|
||||
|
||||
/* Counterpart of mempool_kmalloc()/mempool_kzalloc(); pool_data unused. */
void mempool_kfree(void *element, void *pool_data)
{
	kfree(element);
}
EXPORT_SYMBOL(mempool_kfree);
|
||||
|
||||
/*
|
||||
* A simple mempool-backed page allocator that allocates pages
|
||||
* of the order specified by pool_data.
|
||||
*/
|
||||
void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
|
||||
{
|
||||
int order = (int)(long)pool_data;
|
||||
return alloc_pages(gfp_mask, order);
|
||||
}
|
||||
EXPORT_SYMBOL(mempool_alloc_pages);
|
||||
|
||||
/* Counterpart of mempool_alloc_pages(): pool_data encodes the order. */
void mempool_free_pages(void *element, void *pool_data)
{
	__free_pages(element, (int)(long)pool_data);
}
EXPORT_SYMBOL(mempool_free_pages);
|
||||
1017
mm/migrate.c
Normal file
1017
mm/migrate.c
Normal file
File diff suppressed because it is too large
Load Diff
229
mm/mincore.c
Normal file
229
mm/mincore.c
Normal file
@@ -0,0 +1,229 @@
|
||||
/*
|
||||
* linux/mm/mincore.c
|
||||
*
|
||||
* Copyright (C) 1994-2006 Linus Torvalds
|
||||
*/
|
||||
|
||||
/*
|
||||
* The mincore() system call.
|
||||
*/
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
/*
|
||||
* Later we can get more picky about what "in core" means precisely.
|
||||
* For now, simply check to see if the page is in the page cache,
|
||||
* and is up to date; i.e. that no page-in operation would be required
|
||||
* at this time if an application were to map and access this page.
|
||||
*/
|
||||
static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
|
||||
{
|
||||
unsigned char present = 0;
|
||||
struct page *page;
|
||||
|
||||
/*
|
||||
* When tmpfs swaps out a page from a file, any process mapping that
|
||||
* file will not get a swp_entry_t in its pte, but rather it is like
|
||||
* any other file mapping (ie. marked !present and faulted in with
|
||||
* tmpfs's .nopage). So swapped out tmpfs mappings are tested here.
|
||||
*
|
||||
* However when tmpfs moves the page from pagecache and into swapcache,
|
||||
* it is still in core, but the find_get_page below won't find it.
|
||||
* No big deal, but make a note of it.
|
||||
*/
|
||||
page = find_get_page(mapping, pgoff);
|
||||
if (page) {
|
||||
present = PageUptodate(page);
|
||||
page_cache_release(page);
|
||||
}
|
||||
|
||||
return present;
|
||||
}
|
||||
|
||||
/*
 * Do a chunk of "sys_mincore()". We've already checked
 * all the arguments, we hold the mmap semaphore: we should
 * just return the amount of info we're asked for.
 *
 * Returns the number of vector bytes filled in, or -ENOMEM if
 * @addr is not covered by any vma.
 */
static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long nr;
	int i;
	pgoff_t pgoff;
	struct vm_area_struct *vma = find_vma(current->mm, addr);

	/*
	 * find_vma() didn't find anything above us, or we're
	 * in an unmapped hole in the address space: ENOMEM.
	 */
	if (!vma || addr < vma->vm_start)
		return -ENOMEM;

	/*
	 * Calculate how many pages there are left in the last level of the
	 * PTE array for our address.
	 */
	nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1));

	/*
	 * Don't overrun this vma
	 */
	nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT);

	/*
	 * Don't return more than the caller asked for
	 */
	nr = min(nr, pages);

	/* Walk down to the PTE level; a missing level means nothing mapped. */
	pgd = pgd_offset(vma->vm_mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		goto none_mapped;
	pud = pud_offset(pgd, addr);
	if (pud_none_or_clear_bad(pud))
		goto none_mapped;
	pmd = pmd_offset(pud, addr);
	if (pmd_none_or_clear_bad(pmd))
		goto none_mapped;

	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) {
		unsigned char present;
		pte_t pte = *ptep;

		if (pte_present(pte)) {
			/* Mapped and resident. */
			present = 1;

		} else if (pte_none(pte)) {
			/* Not mapped here: for file vmas, ask the page cache. */
			if (vma->vm_file) {
				pgoff = linear_page_index(vma, addr);
				present = mincore_page(vma->vm_file->f_mapping,
							pgoff);
			} else
				present = 0;

		} else if (pte_file(pte)) {
			/* Nonlinear file pte: the pte encodes the file offset. */
			pgoff = pte_to_pgoff(pte);
			present = mincore_page(vma->vm_file->f_mapping, pgoff);

		} else { /* pte is a swap entry */
			swp_entry_t entry = pte_to_swp_entry(pte);
			if (is_migration_entry(entry)) {
				/* migration entries are always uptodate */
				present = 1;
			} else {
#ifdef CONFIG_SWAP
				/* Swapped out: check the swap cache. */
				pgoff = entry.val;
				present = mincore_page(&swapper_space, pgoff);
#else
				WARN_ON(1);
				present = 1;
#endif
			}
		}

		vec[i] = present;
	}
	pte_unmap_unlock(ptep-1, ptl);

	return nr;

none_mapped:
	/* No page table: resolve everything through the page cache (if any). */
	if (vma->vm_file) {
		pgoff = linear_page_index(vma, addr);
		for (i = 0; i < nr; i++, pgoff++)
			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
	} else {
		for (i = 0; i < nr; i++)
			vec[i] = 0;
	}

	return nr;
}
|
||||
|
||||
/*
|
||||
* The mincore(2) system call.
|
||||
*
|
||||
* mincore() returns the memory residency status of the pages in the
|
||||
* current process's address space specified by [addr, addr + len).
|
||||
* The status is returned in a vector of bytes. The least significant
|
||||
* bit of each byte is 1 if the referenced page is in memory, otherwise
|
||||
* it is zero.
|
||||
*
|
||||
* Because the status of a page can change after mincore() checks it
|
||||
* but before it returns to the application, the returned vector may
|
||||
* contain stale information. Only locked pages are guaranteed to
|
||||
* remain in memory.
|
||||
*
|
||||
* return values:
|
||||
* zero - success
|
||||
* -EFAULT - vec points to an illegal address
|
||||
* -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
|
||||
* -ENOMEM - Addresses in the range [addr, addr + len] are
|
||||
* invalid for the address space of this process, or
|
||||
* specify one or more pages which are not currently
|
||||
* mapped
|
||||
* -EAGAIN - A kernel resource was temporarily unavailable.
|
||||
*/
|
||||
asmlinkage long sys_mincore(unsigned long start, size_t len,
|
||||
unsigned char __user * vec)
|
||||
{
|
||||
long retval;
|
||||
unsigned long pages;
|
||||
unsigned char *tmp;
|
||||
|
||||
/* Check the start address: needs to be page-aligned.. */
|
||||
if (start & ~PAGE_CACHE_MASK)
|
||||
return -EINVAL;
|
||||
|
||||
/* ..and we need to be passed a valid user-space range */
|
||||
if (!access_ok(VERIFY_READ, (void __user *) start, len))
|
||||
return -ENOMEM;
|
||||
|
||||
/* This also avoids any overflows on PAGE_CACHE_ALIGN */
|
||||
pages = len >> PAGE_SHIFT;
|
||||
pages += (len & ~PAGE_MASK) != 0;
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, vec, pages))
|
||||
return -EFAULT;
|
||||
|
||||
tmp = (void *) __get_free_page(GFP_USER);
|
||||
if (!tmp)
|
||||
return -EAGAIN;
|
||||
|
||||
retval = 0;
|
||||
while (pages) {
|
||||
/*
|
||||
* Do at most PAGE_SIZE entries per iteration, due to
|
||||
* the temporary buffer size.
|
||||
*/
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
|
||||
if (retval <= 0)
|
||||
break;
|
||||
if (copy_to_user(vec, tmp, retval)) {
|
||||
retval = -EFAULT;
|
||||
break;
|
||||
}
|
||||
pages -= retval;
|
||||
vec += retval;
|
||||
start += retval << PAGE_SHIFT;
|
||||
retval = 0;
|
||||
}
|
||||
free_page((unsigned long) tmp);
|
||||
return retval;
|
||||
}
|
||||
254
mm/mlock.c
Normal file
254
mm/mlock.c
Normal file
@@ -0,0 +1,254 @@
|
||||
/*
|
||||
* linux/mm/mlock.c
|
||||
*
|
||||
* (C) Copyright 1995 Linus Torvalds
|
||||
* (C) Copyright 2002 Christoph Hellwig
|
||||
*/
|
||||
|
||||
#include <linux/capability.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
|
||||
/*
 * Apply the VM_LOCKED change in @newflags to the [start, end) slice of
 * @vma, merging or splitting vmas as needed, updating mm->locked_vm,
 * and faulting the pages in when locking.  *prev is updated to the vma
 * that now precedes (or covers) the range.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, unsigned int newflags)
{
	struct mm_struct * mm = vma->vm_mm;
	pgoff_t pgoff;
	int pages;
	int ret = 0;

	/* Nothing to change. */
	if (newflags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	/* Try merging with the neighbours before resorting to splitting. */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	/* Split off the leading part if the range starts mid-vma. */
	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	/* Split off the trailing part if the range ends mid-vma. */
	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, make_pages_present below will bring it back.
	 */
	vma->vm_flags = newflags;

	/*
	 * Keep track of amount of locked VM.
	 */
	pages = (end - start) >> PAGE_SHIFT;
	if (newflags & VM_LOCKED) {
		/*
		 * Negate so that the single "locked_vm -= pages" below
		 * adds when locking and subtracts when unlocking.
		 */
		pages = -pages;
		if (!(newflags & VM_IO))
			ret = make_pages_present(start, end);
	}

	mm->locked_vm -= pages;
out:
	/* Callers of mlock expect -EAGAIN rather than -ENOMEM here. */
	if (ret == -ENOMEM)
		ret = -EAGAIN;
	return ret;
}
|
||||
|
||||
/*
 * Walk the vmas covering [start, start+len) and set or clear VM_LOCKED
 * on each via mlock_fixup().  Caller holds mmap_sem for writing.
 * Returns 0 on success, -ENOMEM on an unmapped gap, or the first
 * mlock_fixup() error.
 */
static int do_mlock(unsigned long start, size_t len, int on)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct * vma, * prev;
	int error;

	len = PAGE_ALIGN(len);
	end = start + len;
	/* Overflow check: end wrapped past the top of the address space. */
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma_prev(current->mm, start, &prev);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned int newflags;

		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */

		newflags = vma->vm_flags | VM_LOCKED;
		if (!on)
			newflags &= ~VM_LOCKED;

		/* Clamp this step to the end of the current vma. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		/* mlock_fixup() may have merged vmas; resync from *prev. */
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		/* A hole between vmas inside the requested range: ENOMEM. */
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}
|
||||
|
||||
/*
 * mlock(2): lock [start, start+len) into memory, subject to the
 * RLIMIT_MEMLOCK resource limit (bypassed by CAP_IPC_LOCK).
 * Mis-encoded "&current" occurrences (garbled to "¤t") restored.
 */
asmlinkage long sys_mlock(unsigned long start, size_t len)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	if (!can_do_mlock())
		return -EPERM;

	down_write(&current->mm->mmap_sem);
	/* Round the range out to full pages. */
	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
	start &= PAGE_MASK;

	locked = len >> PAGE_SHIFT;
	locked += current->mm->locked_vm;

	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	lock_limit >>= PAGE_SHIFT;

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = do_mlock(start, len, 1);
	up_write(&current->mm->mmap_sem);
	return error;
}
|
||||
|
||||
/*
 * munlock(2): clear VM_LOCKED on [start, start+len).
 * Mis-encoded "&current" occurrences (garbled to "¤t") restored.
 */
asmlinkage long sys_munlock(unsigned long start, size_t len)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
	start &= PAGE_MASK;
	ret = do_mlock(start, len, 0);
	up_write(&current->mm->mmap_sem);
	return ret;
}
|
||||
|
||||
/*
 * Implement mlockall()/munlockall() semantics: MCL_FUTURE sets
 * mm->def_flags so future mappings are locked; MCL_CURRENT (or
 * flags == 0 from munlockall) walks every existing vma.  Caller
 * holds mmap_sem for writing.  Always returns 0.
 */
static int do_mlockall(int flags)
{
	struct vm_area_struct * vma, * prev = NULL;
	unsigned int def_flags = 0;

	if (flags & MCL_FUTURE)
		def_flags = VM_LOCKED;
	current->mm->def_flags = def_flags;
	/* MCL_FUTURE alone: no existing vmas to touch. */
	if (flags == MCL_FUTURE)
		goto out;

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		unsigned int newflags;

		newflags = vma->vm_flags | VM_LOCKED;
		if (!(flags & MCL_CURRENT))
			newflags &= ~VM_LOCKED;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
	}
out:
	return 0;
}
|
||||
|
||||
asmlinkage long sys_mlockall(int flags)
|
||||
{
|
||||
unsigned long lock_limit;
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
|
||||
goto out;
|
||||
|
||||
ret = -EPERM;
|
||||
if (!can_do_mlock())
|
||||
goto out;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
|
||||
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
|
||||
lock_limit >>= PAGE_SHIFT;
|
||||
|
||||
ret = -ENOMEM;
|
||||
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
|
||||
capable(CAP_IPC_LOCK))
|
||||
ret = do_mlockall(flags);
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
asmlinkage long sys_munlockall(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
ret = do_mlockall(0);
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
|
||||
* shm segments) get accounted against the user_struct instead.
|
||||
*/
|
||||
static DEFINE_SPINLOCK(shmlock_user_lock);
|
||||
|
||||
/*
 * Charge @size bytes of locked shm against @user's RLIMIT_MEMLOCK
 * quota.  Returns 1 and takes a uid reference if allowed, 0 otherwise.
 * Balanced by user_shm_unlock().
 */
int user_shm_lock(size_t size, struct user_struct *user)
{
	unsigned long lock_limit, locked;
	int allowed = 0;

	/* Round up to whole pages. */
	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
		goto out;
	/* Hold a uid reference for as long as the charge is outstanding. */
	get_uid(user);
	user->locked_shm += locked;
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}
|
||||
|
||||
/*
 * Undo a user_shm_lock() charge of @size bytes and drop the uid
 * reference it took.
 */
void user_shm_unlock(size_t size, struct user_struct *user)
{
	spin_lock(&shmlock_user_lock);
	user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	spin_unlock(&shmlock_user_lock);
	free_uid(user);
}
|
||||
44
mm/mmzone.c
Normal file
44
mm/mmzone.c
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* linux/mm/mmzone.c
|
||||
*
|
||||
* management codes for pgdats and zones.
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/stddef.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
/* Return the pgdat of the lowest-numbered online NUMA node. */
struct pglist_data *first_online_pgdat(void)
{
	return NODE_DATA(first_online_node);
}
|
||||
|
||||
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
|
||||
{
|
||||
int nid = next_online_node(pgdat->node_id);
|
||||
|
||||
if (nid == MAX_NUMNODES)
|
||||
return NULL;
|
||||
return NODE_DATA(nid);
|
||||
}
|
||||
|
||||
/*
|
||||
* next_zone - helper magic for for_each_zone()
|
||||
*/
|
||||
struct zone *next_zone(struct zone *zone)
|
||||
{
|
||||
pg_data_t *pgdat = zone->zone_pgdat;
|
||||
|
||||
if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
|
||||
zone++;
|
||||
else {
|
||||
pgdat = next_online_pgdat(pgdat);
|
||||
if (pgdat)
|
||||
zone = pgdat->node_zones;
|
||||
else
|
||||
zone = NULL;
|
||||
}
|
||||
return zone;
|
||||
}
|
||||
|
||||
314
mm/mprotect.c
Normal file
314
mm/mprotect.c
Normal file
@@ -0,0 +1,314 @@
|
||||
/*
|
||||
* mm/mprotect.c
|
||||
*
|
||||
* (C) Copyright 1994 Linus Torvalds
|
||||
* (C) Copyright 2002 Christoph Hellwig
|
||||
*
|
||||
* Address space accounting code <alan@redhat.com>
|
||||
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/shm.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/personality.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
/*
 * Rewrite the protection bits of every pte in [addr, end) under @pmd.
 * Runs with the pte page-table lock held and lazy-MMU batching active.
 */
static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable)
{
	pte_t *pte, oldpte;
	spinlock_t *ptl;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;

			/* Avoid an SMP race with hardware updated dirty/clean
			 * bits by wiping the pte and then setting the new pte
			 * into place.
			 */
			ptent = ptep_get_and_clear(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);
			/*
			 * Avoid taking write faults for pages we know to be
			 * dirty.
			 */
			if (dirty_accountable && pte_dirty(ptent))
				ptent = pte_mkwrite(ptent);
			set_pte_at(mm, addr, pte, ptent);
			lazy_mmu_prot_update(ptent);
#ifdef CONFIG_MIGRATION
		} else if (!pte_file(oldpte)) {
			/* Not present and not a file pte: swap entry. */
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
				set_pte_at(mm, addr, pte,
					swp_entry_to_pte(entry));
			}
#endif
		}

	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
}
|
||||
|
||||
/*
 * Walk the pmd entries covering [addr, end) under @pud and apply
 * change_pte_range() to each populated one.  Caller guarantees
 * addr < end (see BUG_ON in change_protection()).
 */
static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
	} while (pmd++, addr = next, addr != end);
}
|
||||
|
||||
/*
 * Walk the pud entries covering [addr, end) under @pgd and recurse
 * into change_pmd_range() for each populated one.
 */
static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
	} while (pud++, addr = next, addr != end);
}
|
||||
|
||||
/*
 * Top level of the protection-change walk: flush caches, rewrite all
 * page-table entries of @vma in [addr, end) to @newprot, then flush
 * the TLB for the whole range.
 */
static void change_protection(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
	} while (pgd++, addr = next, addr != end);
	flush_tlb_range(vma, start, end);
}
|
||||
|
||||
/*
 * Apply @newflags to the [start, end) slice of @vma: charge/uncharge
 * memory accounting, merge or split vmas, recompute vm_page_prot and
 * rewrite the page tables.  *pprev is updated to the vma now covering
 * or preceding the range.  Returns 0 or a negative errno.
 */
static int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again.
	 *
	 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
	 * a MAP_NORESERVE private mapping to writable will now reserve.
	 */
	if (newflags & VM_WRITE) {
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
			charged = nrpages;
			if (security_vm_enough_memory(charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
	if (*pprev) {
		vma = *pprev;
		goto success;
	}

	*pprev = vma;

	/* Range starts mid-vma: split off the head. */
	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	/* Range ends mid-vma: split off the tail. */
	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
	vma->vm_page_prot = protection_map[newflags &
		(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
	/*
	 * For shared writable file mappings that track dirtying, drop
	 * the write bit from the pte protection so writes fault and can
	 * be accounted; change_pte_range() re-adds it for known-dirty ptes.
	 */
	if (vma_wants_writenotify(vma)) {
		vma->vm_page_prot = protection_map[newflags &
			(VM_READ|VM_WRITE|VM_EXEC)];
		dirty_accountable = 1;
	}

	if (is_vm_hugetlb_page(vma))
		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
	else
		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}
|
||||
|
||||
asmlinkage long
|
||||
sys_mprotect(unsigned long start, size_t len, unsigned long prot)
|
||||
{
|
||||
unsigned long vm_flags, nstart, end, tmp, reqprot;
|
||||
struct vm_area_struct *vma, *prev;
|
||||
int error = -EINVAL;
|
||||
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
|
||||
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
|
||||
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
|
||||
return -EINVAL;
|
||||
|
||||
if (start & ~PAGE_MASK)
|
||||
return -EINVAL;
|
||||
if (!len)
|
||||
return 0;
|
||||
len = PAGE_ALIGN(len);
|
||||
end = start + len;
|
||||
if (end <= start)
|
||||
return -ENOMEM;
|
||||
if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
|
||||
return -EINVAL;
|
||||
|
||||
reqprot = prot;
|
||||
/*
|
||||
* Does the application expect PROT_READ to imply PROT_EXEC:
|
||||
*/
|
||||
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
|
||||
prot |= PROT_EXEC;
|
||||
|
||||
vm_flags = calc_vm_prot_bits(prot);
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
|
||||
vma = find_vma_prev(current->mm, start, &prev);
|
||||
error = -ENOMEM;
|
||||
if (!vma)
|
||||
goto out;
|
||||
if (unlikely(grows & PROT_GROWSDOWN)) {
|
||||
if (vma->vm_start >= end)
|
||||
goto out;
|
||||
start = vma->vm_start;
|
||||
error = -EINVAL;
|
||||
if (!(vma->vm_flags & VM_GROWSDOWN))
|
||||
goto out;
|
||||
}
|
||||
else {
|
||||
if (vma->vm_start > start)
|
||||
goto out;
|
||||
if (unlikely(grows & PROT_GROWSUP)) {
|
||||
end = vma->vm_end;
|
||||
error = -EINVAL;
|
||||
if (!(vma->vm_flags & VM_GROWSUP))
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if (start > vma->vm_start)
|
||||
prev = vma;
|
||||
|
||||
for (nstart = start ; ; ) {
|
||||
unsigned long newflags;
|
||||
|
||||
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
|
||||
|
||||
newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
|
||||
|
||||
/* newflags >> 4 shift VM_MAY% in place of VM_% */
|
||||
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
|
||||
error = -EACCES;
|
||||
goto out;
|
||||
}
|
||||
|
||||
error = security_file_mprotect(vma, reqprot, prot);
|
||||
if (error)
|
||||
goto out;
|
||||
|
||||
tmp = vma->vm_end;
|
||||
if (tmp > end)
|
||||
tmp = end;
|
||||
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
|
||||
if (error)
|
||||
goto out;
|
||||
nstart = tmp;
|
||||
|
||||
if (nstart < prev->vm_end)
|
||||
nstart = prev->vm_end;
|
||||
if (nstart >= end)
|
||||
goto out;
|
||||
|
||||
vma = prev->vm_next;
|
||||
if (!vma || vma->vm_start != nstart) {
|
||||
error = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
out:
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return error;
|
||||
}
|
||||
416
mm/mremap.c
Normal file
416
mm/mremap.c
Normal file
@@ -0,0 +1,416 @@
|
||||
/*
|
||||
* mm/mremap.c
|
||||
*
|
||||
* (C) Copyright 1996 Linus Torvalds
|
||||
*
|
||||
* Address space accounting code <alan@redhat.com>
|
||||
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/shm.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/capability.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
if (pgd_none_or_clear_bad(pgd))
|
||||
return NULL;
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
if (pud_none_or_clear_bad(pud))
|
||||
return NULL;
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (pmd_none_or_clear_bad(pmd))
|
||||
return NULL;
|
||||
|
||||
return pmd;
|
||||
}
|
||||
|
||||
static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
|
||||
pgd = pgd_offset(mm, addr);
|
||||
pud = pud_alloc(mm, pgd, addr);
|
||||
if (!pud)
|
||||
return NULL;
|
||||
|
||||
pmd = pmd_alloc(mm, pud, addr);
|
||||
if (!pmd)
|
||||
return NULL;
|
||||
|
||||
if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
|
||||
return NULL;
|
||||
|
||||
return pmd;
|
||||
}
|
||||
|
||||
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
|
||||
unsigned long old_addr, unsigned long old_end,
|
||||
struct vm_area_struct *new_vma, pmd_t *new_pmd,
|
||||
unsigned long new_addr)
|
||||
{
|
||||
struct address_space *mapping = NULL;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pte_t *old_pte, *new_pte, pte;
|
||||
spinlock_t *old_ptl, *new_ptl;
|
||||
|
||||
if (vma->vm_file) {
|
||||
/*
|
||||
* Subtle point from Rajesh Venkatasubramanian: before
|
||||
* moving file-based ptes, we must lock vmtruncate out,
|
||||
* since it might clean the dst vma before the src vma,
|
||||
* and we propagate stale pages into the dst afterward.
|
||||
*/
|
||||
mapping = vma->vm_file->f_mapping;
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
if (new_vma->vm_truncate_count &&
|
||||
new_vma->vm_truncate_count != vma->vm_truncate_count)
|
||||
new_vma->vm_truncate_count = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't have to worry about the ordering of src and dst
|
||||
* pte locks because exclusive mmap_sem prevents deadlock.
|
||||
*/
|
||||
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
|
||||
new_pte = pte_offset_map_nested(new_pmd, new_addr);
|
||||
new_ptl = pte_lockptr(mm, new_pmd);
|
||||
if (new_ptl != old_ptl)
|
||||
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
|
||||
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
|
||||
new_pte++, new_addr += PAGE_SIZE) {
|
||||
if (pte_none(*old_pte))
|
||||
continue;
|
||||
pte = ptep_clear_flush(vma, old_addr, old_pte);
|
||||
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
|
||||
set_pte_at(mm, new_addr, new_pte, pte);
|
||||
}
|
||||
|
||||
arch_leave_lazy_mmu_mode();
|
||||
if (new_ptl != old_ptl)
|
||||
spin_unlock(new_ptl);
|
||||
pte_unmap_nested(new_pte - 1);
|
||||
pte_unmap_unlock(old_pte - 1, old_ptl);
|
||||
if (mapping)
|
||||
spin_unlock(&mapping->i_mmap_lock);
|
||||
}
|
||||
|
||||
#define LATENCY_LIMIT (64 * PAGE_SIZE)
|
||||
|
||||
static unsigned long move_page_tables(struct vm_area_struct *vma,
|
||||
unsigned long old_addr, struct vm_area_struct *new_vma,
|
||||
unsigned long new_addr, unsigned long len)
|
||||
{
|
||||
unsigned long extent, next, old_end;
|
||||
pmd_t *old_pmd, *new_pmd;
|
||||
|
||||
old_end = old_addr + len;
|
||||
flush_cache_range(vma, old_addr, old_end);
|
||||
|
||||
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
|
||||
cond_resched();
|
||||
next = (old_addr + PMD_SIZE) & PMD_MASK;
|
||||
if (next - 1 > old_end)
|
||||
next = old_end;
|
||||
extent = next - old_addr;
|
||||
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
|
||||
if (!old_pmd)
|
||||
continue;
|
||||
new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
|
||||
if (!new_pmd)
|
||||
break;
|
||||
next = (new_addr + PMD_SIZE) & PMD_MASK;
|
||||
if (extent > next - new_addr)
|
||||
extent = next - new_addr;
|
||||
if (extent > LATENCY_LIMIT)
|
||||
extent = LATENCY_LIMIT;
|
||||
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
|
||||
new_vma, new_pmd, new_addr);
|
||||
}
|
||||
|
||||
return len + old_addr - old_end; /* how much done */
|
||||
}
|
||||
|
||||
static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
unsigned long old_addr, unsigned long old_len,
|
||||
unsigned long new_len, unsigned long new_addr)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct vm_area_struct *new_vma;
|
||||
unsigned long vm_flags = vma->vm_flags;
|
||||
unsigned long new_pgoff;
|
||||
unsigned long moved_len;
|
||||
unsigned long excess = 0;
|
||||
unsigned long hiwater_vm;
|
||||
int split = 0;
|
||||
|
||||
/*
|
||||
* We'd prefer to avoid failure later on in do_munmap:
|
||||
* which may split one vma into three before unmapping.
|
||||
*/
|
||||
if (mm->map_count >= sysctl_max_map_count - 3)
|
||||
return -ENOMEM;
|
||||
|
||||
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
|
||||
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
|
||||
if (!new_vma)
|
||||
return -ENOMEM;
|
||||
|
||||
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
|
||||
if (moved_len < old_len) {
|
||||
/*
|
||||
* On error, move entries back from new area to old,
|
||||
* which will succeed since page tables still there,
|
||||
* and then proceed to unmap new area instead of old.
|
||||
*/
|
||||
move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
|
||||
vma = new_vma;
|
||||
old_len = new_len;
|
||||
old_addr = new_addr;
|
||||
new_addr = -ENOMEM;
|
||||
}
|
||||
|
||||
/* Conceal VM_ACCOUNT so old reservation is not undone */
|
||||
if (vm_flags & VM_ACCOUNT) {
|
||||
vma->vm_flags &= ~VM_ACCOUNT;
|
||||
excess = vma->vm_end - vma->vm_start - old_len;
|
||||
if (old_addr > vma->vm_start &&
|
||||
old_addr + old_len < vma->vm_end)
|
||||
split = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we failed to move page tables we still do total_vm increment
|
||||
* since do_munmap() will decrement it by old_len == new_len.
|
||||
*
|
||||
* Since total_vm is about to be raised artificially high for a
|
||||
* moment, we need to restore high watermark afterwards: if stats
|
||||
* are taken meanwhile, total_vm and hiwater_vm appear too high.
|
||||
* If this were a serious issue, we'd add a flag to do_munmap().
|
||||
*/
|
||||
hiwater_vm = mm->hiwater_vm;
|
||||
mm->total_vm += new_len >> PAGE_SHIFT;
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
|
||||
|
||||
if (do_munmap(mm, old_addr, old_len) < 0) {
|
||||
/* OOM: unable to split vma, just get accounts right */
|
||||
vm_unacct_memory(excess >> PAGE_SHIFT);
|
||||
excess = 0;
|
||||
}
|
||||
mm->hiwater_vm = hiwater_vm;
|
||||
|
||||
/* Restore VM_ACCOUNT if one or two pieces of vma left */
|
||||
if (excess) {
|
||||
vma->vm_flags |= VM_ACCOUNT;
|
||||
if (split)
|
||||
vma->vm_next->vm_flags |= VM_ACCOUNT;
|
||||
}
|
||||
|
||||
if (vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm += new_len >> PAGE_SHIFT;
|
||||
if (new_len > old_len)
|
||||
make_pages_present(new_addr + old_len,
|
||||
new_addr + new_len);
|
||||
}
|
||||
|
||||
return new_addr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Expand (or shrink) an existing mapping, potentially moving it at the
|
||||
* same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
|
||||
*
|
||||
* MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
|
||||
* This option implies MREMAP_MAYMOVE.
|
||||
*/
|
||||
unsigned long do_mremap(unsigned long addr,
|
||||
unsigned long old_len, unsigned long new_len,
|
||||
unsigned long flags, unsigned long new_addr)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long ret = -EINVAL;
|
||||
unsigned long charged = 0;
|
||||
|
||||
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
|
||||
goto out;
|
||||
|
||||
if (addr & ~PAGE_MASK)
|
||||
goto out;
|
||||
|
||||
old_len = PAGE_ALIGN(old_len);
|
||||
new_len = PAGE_ALIGN(new_len);
|
||||
|
||||
/*
|
||||
* We allow a zero old-len as a special case
|
||||
* for DOS-emu "duplicate shm area" thing. But
|
||||
* a zero new-len is nonsensical.
|
||||
*/
|
||||
if (!new_len)
|
||||
goto out;
|
||||
|
||||
/* new_addr is only valid if MREMAP_FIXED is specified */
|
||||
if (flags & MREMAP_FIXED) {
|
||||
if (new_addr & ~PAGE_MASK)
|
||||
goto out;
|
||||
if (!(flags & MREMAP_MAYMOVE))
|
||||
goto out;
|
||||
|
||||
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
|
||||
goto out;
|
||||
|
||||
/* Check if the location we're moving into overlaps the
|
||||
* old location at all, and fail if it does.
|
||||
*/
|
||||
if ((new_addr <= addr) && (new_addr+new_len) > addr)
|
||||
goto out;
|
||||
|
||||
if ((addr <= new_addr) && (addr+old_len) > new_addr)
|
||||
goto out;
|
||||
|
||||
ret = do_munmap(mm, new_addr, new_len);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Always allow a shrinking remap: that just unmaps
|
||||
* the unnecessary pages..
|
||||
* do_munmap does all the needed commit accounting
|
||||
*/
|
||||
if (old_len >= new_len) {
|
||||
ret = do_munmap(mm, addr+new_len, old_len - new_len);
|
||||
if (ret && old_len != new_len)
|
||||
goto out;
|
||||
ret = addr;
|
||||
if (!(flags & MREMAP_FIXED) || (new_addr == addr))
|
||||
goto out;
|
||||
old_len = new_len;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ok, we need to grow.. or relocate.
|
||||
*/
|
||||
ret = -EFAULT;
|
||||
vma = find_vma(mm, addr);
|
||||
if (!vma || vma->vm_start > addr)
|
||||
goto out;
|
||||
if (is_vm_hugetlb_page(vma)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
/* We can't remap across vm area boundaries */
|
||||
if (old_len > vma->vm_end - addr)
|
||||
goto out;
|
||||
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
|
||||
if (new_len > old_len)
|
||||
goto out;
|
||||
}
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
unsigned long locked, lock_limit;
|
||||
locked = mm->locked_vm << PAGE_SHIFT;
|
||||
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
|
||||
locked += new_len - old_len;
|
||||
ret = -EAGAIN;
|
||||
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
|
||||
goto out;
|
||||
}
|
||||
if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (vma->vm_flags & VM_ACCOUNT) {
|
||||
charged = (new_len - old_len) >> PAGE_SHIFT;
|
||||
if (security_vm_enough_memory(charged))
|
||||
goto out_nc;
|
||||
}
|
||||
|
||||
/* old_len exactly to the end of the area..
|
||||
* And we're not relocating the area.
|
||||
*/
|
||||
if (old_len == vma->vm_end - addr &&
|
||||
!((flags & MREMAP_FIXED) && (addr != new_addr)) &&
|
||||
(old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
|
||||
unsigned long max_addr = TASK_SIZE;
|
||||
if (vma->vm_next)
|
||||
max_addr = vma->vm_next->vm_start;
|
||||
/* can we just expand the current mapping? */
|
||||
if (max_addr - addr >= new_len) {
|
||||
int pages = (new_len - old_len) >> PAGE_SHIFT;
|
||||
|
||||
vma_adjust(vma, vma->vm_start,
|
||||
addr + new_len, vma->vm_pgoff, NULL);
|
||||
|
||||
mm->total_vm += pages;
|
||||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm += pages;
|
||||
make_pages_present(addr + old_len,
|
||||
addr + new_len);
|
||||
}
|
||||
ret = addr;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We weren't able to just expand or shrink the area,
|
||||
* we need to create a new one and move it..
|
||||
*/
|
||||
ret = -ENOMEM;
|
||||
if (flags & MREMAP_MAYMOVE) {
|
||||
if (!(flags & MREMAP_FIXED)) {
|
||||
unsigned long map_flags = 0;
|
||||
if (vma->vm_flags & VM_MAYSHARE)
|
||||
map_flags |= MAP_SHARED;
|
||||
|
||||
new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
|
||||
vma->vm_pgoff, map_flags);
|
||||
ret = new_addr;
|
||||
if (new_addr & ~PAGE_MASK)
|
||||
goto out;
|
||||
}
|
||||
ret = move_vma(vma, addr, old_len, new_len, new_addr);
|
||||
}
|
||||
out:
|
||||
if (ret & ~PAGE_MASK)
|
||||
vm_unacct_memory(charged);
|
||||
out_nc:
|
||||
return ret;
|
||||
}
|
||||
|
||||
asmlinkage unsigned long sys_mremap(unsigned long addr,
|
||||
unsigned long old_len, unsigned long new_len,
|
||||
unsigned long flags, unsigned long new_addr)
|
||||
{
|
||||
unsigned long ret;
|
||||
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
ret = do_mremap(addr, old_len, new_len, flags, new_addr);
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
return ret;
|
||||
}
|
||||
102
mm/msync.c
Normal file
102
mm/msync.c
Normal file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
* linux/mm/msync.c
|
||||
*
|
||||
* Copyright (C) 1994-1999 Linus Torvalds
|
||||
*/
|
||||
|
||||
/*
|
||||
* The msync() system call.
|
||||
*/
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/syscalls.h>
|
||||
|
||||
/*
|
||||
* MS_SYNC syncs the entire file - including mappings.
|
||||
*
|
||||
* MS_ASYNC does not start I/O (it used to, up to 2.5.67).
|
||||
* Nor does it marks the relevant pages dirty (it used to up to 2.6.17).
|
||||
* Now it doesn't do anything, since dirty pages are properly tracked.
|
||||
*
|
||||
* The application may now run fsync() to
|
||||
* write out the dirty pages and wait on the writeout and check the result.
|
||||
* Or the application may run fadvise(FADV_DONTNEED) against the fd to start
|
||||
* async writeout immediately.
|
||||
* So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
|
||||
* applications.
|
||||
*/
|
||||
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
|
||||
{
|
||||
unsigned long end;
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
int unmapped_error = 0;
|
||||
int error = -EINVAL;
|
||||
|
||||
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
|
||||
goto out;
|
||||
if (start & ~PAGE_MASK)
|
||||
goto out;
|
||||
if ((flags & MS_ASYNC) && (flags & MS_SYNC))
|
||||
goto out;
|
||||
error = -ENOMEM;
|
||||
len = (len + ~PAGE_MASK) & PAGE_MASK;
|
||||
end = start + len;
|
||||
if (end < start)
|
||||
goto out;
|
||||
error = 0;
|
||||
if (end == start)
|
||||
goto out;
|
||||
/*
|
||||
* If the interval [start,end) covers some unmapped address ranges,
|
||||
* just ignore them, but return -ENOMEM at the end.
|
||||
*/
|
||||
down_read(&mm->mmap_sem);
|
||||
vma = find_vma(mm, start);
|
||||
for (;;) {
|
||||
struct file *file;
|
||||
|
||||
/* Still start < end. */
|
||||
error = -ENOMEM;
|
||||
if (!vma)
|
||||
goto out_unlock;
|
||||
/* Here start < vma->vm_end. */
|
||||
if (start < vma->vm_start) {
|
||||
start = vma->vm_start;
|
||||
if (start >= end)
|
||||
goto out_unlock;
|
||||
unmapped_error = -ENOMEM;
|
||||
}
|
||||
/* Here vma->vm_start <= start < vma->vm_end. */
|
||||
if ((flags & MS_INVALIDATE) &&
|
||||
(vma->vm_flags & VM_LOCKED)) {
|
||||
error = -EBUSY;
|
||||
goto out_unlock;
|
||||
}
|
||||
file = vma->vm_file;
|
||||
start = vma->vm_end;
|
||||
if ((flags & MS_SYNC) && file &&
|
||||
(vma->vm_flags & VM_SHARED)) {
|
||||
get_file(file);
|
||||
up_read(&mm->mmap_sem);
|
||||
error = do_fsync(file, 0);
|
||||
fput(file);
|
||||
if (error || start >= end)
|
||||
goto out;
|
||||
down_read(&mm->mmap_sem);
|
||||
vma = find_vma(mm, start);
|
||||
} else {
|
||||
if (start >= end) {
|
||||
error = 0;
|
||||
goto out_unlock;
|
||||
}
|
||||
vma = vma->vm_next;
|
||||
}
|
||||
}
|
||||
out_unlock:
|
||||
up_read(&mm->mmap_sem);
|
||||
out:
|
||||
return error ? : unmapped_error;
|
||||
}
|
||||
1377
mm/nommu.c
Normal file
1377
mm/nommu.c
Normal file
File diff suppressed because it is too large
Load Diff
470
mm/oom_kill.c
Normal file
470
mm/oom_kill.c
Normal file
@@ -0,0 +1,470 @@
|
||||
/*
|
||||
* linux/mm/oom_kill.c
|
||||
*
|
||||
* Copyright (C) 1998,2000 Rik van Riel
|
||||
* Thanks go out to Claus Fischer for some serious inspiration and
|
||||
* for goading me into coding this file...
|
||||
*
|
||||
* The routines in this file are used to kill a process when
|
||||
* we're seriously out of memory. This gets called from __alloc_pages()
|
||||
* in mm/page_alloc.c when we really run out of memory.
|
||||
*
|
||||
* Since we won't call these routines often (on a well-configured
|
||||
* machine) this file will double as a 'coding guide' and a signpost
|
||||
* for newbie kernel hackers. It features several pointers to major
|
||||
* kernel subsystems and hints as to where to find out what things do.
|
||||
*/
|
||||
|
||||
#include <linux/oom.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/timex.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/notifier.h>
|
||||
|
||||
int sysctl_panic_on_oom;
|
||||
/* #define DEBUG */
|
||||
|
||||
/**
|
||||
* badness - calculate a numeric value for how bad this task has been
|
||||
* @p: task struct of which task we should calculate
|
||||
* @uptime: current uptime in seconds
|
||||
*
|
||||
* The formula used is relatively simple and documented inline in the
|
||||
* function. The main rationale is that we want to select a good task
|
||||
* to kill when we run out of memory.
|
||||
*
|
||||
* Good in this context means that:
|
||||
* 1) we lose the minimum amount of work done
|
||||
* 2) we recover a large amount of memory
|
||||
* 3) we don't kill anything innocent of eating tons of memory
|
||||
* 4) we want to kill the minimum amount of processes (one)
|
||||
* 5) we try to kill the process the user expects us to kill, this
|
||||
* algorithm has been meticulously tuned to meet the principle
|
||||
* of least surprise ... (be careful when you change it)
|
||||
*/
|
||||
|
||||
unsigned long badness(struct task_struct *p, unsigned long uptime)
|
||||
{
|
||||
unsigned long points, cpu_time, run_time, s;
|
||||
struct mm_struct *mm;
|
||||
struct task_struct *child;
|
||||
|
||||
task_lock(p);
|
||||
mm = p->mm;
|
||||
if (!mm) {
|
||||
task_unlock(p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* The memory size of the process is the basis for the badness.
|
||||
*/
|
||||
points = mm->total_vm;
|
||||
|
||||
/*
|
||||
* After this unlock we can no longer dereference local variable `mm'
|
||||
*/
|
||||
task_unlock(p);
|
||||
|
||||
/*
|
||||
* swapoff can easily use up all memory, so kill those first.
|
||||
*/
|
||||
if (p->flags & PF_SWAPOFF)
|
||||
return ULONG_MAX;
|
||||
|
||||
/*
|
||||
* Processes which fork a lot of child processes are likely
|
||||
* a good choice. We add half the vmsize of the children if they
|
||||
* have an own mm. This prevents forking servers to flood the
|
||||
* machine with an endless amount of children. In case a single
|
||||
* child is eating the vast majority of memory, adding only half
|
||||
* to the parents will make the child our kill candidate of choice.
|
||||
*/
|
||||
list_for_each_entry(child, &p->children, sibling) {
|
||||
task_lock(child);
|
||||
if (child->mm != mm && child->mm)
|
||||
points += child->mm->total_vm/2 + 1;
|
||||
task_unlock(child);
|
||||
}
|
||||
|
||||
/*
|
||||
* CPU time is in tens of seconds and run time is in thousands
|
||||
* of seconds. There is no particular reason for this other than
|
||||
* that it turned out to work very well in practice.
|
||||
*/
|
||||
cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
|
||||
>> (SHIFT_HZ + 3);
|
||||
|
||||
if (uptime >= p->start_time.tv_sec)
|
||||
run_time = (uptime - p->start_time.tv_sec) >> 10;
|
||||
else
|
||||
run_time = 0;
|
||||
|
||||
s = int_sqrt(cpu_time);
|
||||
if (s)
|
||||
points /= s;
|
||||
s = int_sqrt(int_sqrt(run_time));
|
||||
if (s)
|
||||
points /= s;
|
||||
|
||||
/*
|
||||
* Niced processes are most likely less important, so double
|
||||
* their badness points.
|
||||
*/
|
||||
if (task_nice(p) > 0)
|
||||
points *= 2;
|
||||
|
||||
/*
|
||||
* Superuser processes are usually more important, so we make it
|
||||
* less likely that we kill those.
|
||||
*/
|
||||
if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
|
||||
p->uid == 0 || p->euid == 0)
|
||||
points /= 4;
|
||||
|
||||
/*
|
||||
* We don't want to kill a process with direct hardware access.
|
||||
* Not only could that mess up the hardware, but usually users
|
||||
* tend to only have this flag set on applications they think
|
||||
* of as important.
|
||||
*/
|
||||
if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
|
||||
points /= 4;
|
||||
|
||||
/*
|
||||
* If p's nodes don't overlap ours, it may still help to kill p
|
||||
* because p may have allocated or otherwise mapped memory on
|
||||
* this node before. However it will be less likely.
|
||||
*/
|
||||
if (!cpuset_excl_nodes_overlap(p))
|
||||
points /= 8;
|
||||
|
||||
/*
|
||||
* Adjust the score by oomkilladj.
|
||||
*/
|
||||
if (p->oomkilladj) {
|
||||
if (p->oomkilladj > 0)
|
||||
points <<= p->oomkilladj;
|
||||
else
|
||||
points >>= -(p->oomkilladj);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
|
||||
p->pid, p->comm, points);
|
||||
#endif
|
||||
return points;
|
||||
}
|
||||
|
||||
/*
|
||||
* Types of limitations to the nodes from which allocations may occur
|
||||
*/
|
||||
#define CONSTRAINT_NONE 1
|
||||
#define CONSTRAINT_MEMORY_POLICY 2
|
||||
#define CONSTRAINT_CPUSET 3
|
||||
|
||||
/*
|
||||
* Determine the type of allocation constraint.
|
||||
*/
|
||||
static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
|
||||
{
|
||||
#ifdef CONFIG_NUMA
|
||||
struct zone **z;
|
||||
nodemask_t nodes;
|
||||
int node;
|
||||
|
||||
nodes_clear(nodes);
|
||||
/* node has memory ? */
|
||||
for_each_online_node(node)
|
||||
if (NODE_DATA(node)->node_present_pages)
|
||||
node_set(node, nodes);
|
||||
|
||||
for (z = zonelist->zones; *z; z++)
|
||||
if (cpuset_zone_allowed_softwall(*z, gfp_mask))
|
||||
node_clear(zone_to_nid(*z), nodes);
|
||||
else
|
||||
return CONSTRAINT_CPUSET;
|
||||
|
||||
if (!nodes_empty(nodes))
|
||||
return CONSTRAINT_MEMORY_POLICY;
|
||||
#endif
|
||||
|
||||
return CONSTRAINT_NONE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Simple selection loop. We chose the process with the highest
|
||||
* number of 'points'. We expect the caller will lock the tasklist.
|
||||
*
|
||||
* (not docbooked, we don't want this one cluttering up the manual)
|
||||
*/
|
||||
static struct task_struct *select_bad_process(unsigned long *ppoints)
|
||||
{
|
||||
struct task_struct *g, *p;
|
||||
struct task_struct *chosen = NULL;
|
||||
struct timespec uptime;
|
||||
*ppoints = 0;
|
||||
|
||||
do_posix_clock_monotonic_gettime(&uptime);
|
||||
do_each_thread(g, p) {
|
||||
unsigned long points;
|
||||
|
||||
/*
|
||||
* skip kernel threads and tasks which have already released
|
||||
* their mm.
|
||||
*/
|
||||
if (!p->mm)
|
||||
continue;
|
||||
/* skip the init task */
|
||||
if (is_init(p))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* This task already has access to memory reserves and is
|
||||
* being killed. Don't allow any other task access to the
|
||||
* memory reserve.
|
||||
*
|
||||
* Note: this may have a chance of deadlock if it gets
|
||||
* blocked waiting for another task which itself is waiting
|
||||
* for memory. Is there a better alternative?
|
||||
*/
|
||||
if (test_tsk_thread_flag(p, TIF_MEMDIE))
|
||||
return ERR_PTR(-1UL);
|
||||
|
||||
/*
|
||||
* This is in the process of releasing memory so wait for it
|
||||
* to finish before killing some other task by mistake.
|
||||
*
|
||||
* However, if p is the current task, we allow the 'kill' to
|
||||
* go ahead if it is exiting: this will simply set TIF_MEMDIE,
|
||||
* which will allow it to gain access to memory reserves in
|
||||
* the process of exiting and releasing its resources.
|
||||
* Otherwise we could get an easy OOM deadlock.
|
||||
*/
|
||||
if (p->flags & PF_EXITING) {
|
||||
if (p != current)
|
||||
return ERR_PTR(-1UL);
|
||||
|
||||
chosen = p;
|
||||
*ppoints = ULONG_MAX;
|
||||
}
|
||||
|
||||
if (p->oomkilladj == OOM_DISABLE)
|
||||
continue;
|
||||
|
||||
points = badness(p, uptime.tv_sec);
|
||||
if (points > *ppoints || !chosen) {
|
||||
chosen = p;
|
||||
*ppoints = points;
|
||||
}
|
||||
} while_each_thread(g, p);
|
||||
|
||||
return chosen;
|
||||
}
|
||||
|
||||
/**
|
||||
* Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
|
||||
* flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
|
||||
* set.
|
||||
*/
|
||||
static void __oom_kill_task(struct task_struct *p, int verbose)
|
||||
{
|
||||
if (is_init(p)) {
|
||||
WARN_ON(1);
|
||||
printk(KERN_WARNING "tried to kill init!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!p->mm) {
|
||||
WARN_ON(1);
|
||||
printk(KERN_WARNING "tried to kill an mm-less task!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (verbose)
|
||||
printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
|
||||
|
||||
/*
|
||||
* We give our sacrificial lamb high priority and access to
|
||||
* all the memory it needs. That way it should be able to
|
||||
* exit() and clear out its resources quickly...
|
||||
*/
|
||||
p->time_slice = HZ;
|
||||
set_tsk_thread_flag(p, TIF_MEMDIE);
|
||||
|
||||
force_sig(SIGKILL, p);
|
||||
}
|
||||
|
||||
static int oom_kill_task(struct task_struct *p)
|
||||
{
|
||||
struct mm_struct *mm;
|
||||
struct task_struct *g, *q;
|
||||
|
||||
mm = p->mm;
|
||||
|
||||
/* WARNING: mm may not be dereferenced since we did not obtain its
|
||||
* value from get_task_mm(p). This is OK since all we need to do is
|
||||
* compare mm to q->mm below.
|
||||
*
|
||||
* Furthermore, even if mm contains a non-NULL value, p->mm may
|
||||
* change to NULL at any time since we do not hold task_lock(p).
|
||||
* However, this is of no concern to us.
|
||||
*/
|
||||
|
||||
if (mm == NULL)
|
||||
return 1;
|
||||
|
||||
/*
|
||||
* Don't kill the process if any threads are set to OOM_DISABLE
|
||||
*/
|
||||
do_each_thread(g, q) {
|
||||
if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
|
||||
return 1;
|
||||
} while_each_thread(g, q);
|
||||
|
||||
__oom_kill_task(p, 1);
|
||||
|
||||
/*
|
||||
* kill all processes that share the ->mm (i.e. all threads),
|
||||
* but are in a different thread group. Don't let them have access
|
||||
* to memory reserves though, otherwise we might deplete all memory.
|
||||
*/
|
||||
do_each_thread(g, q) {
|
||||
if (q->mm == mm && q->tgid != p->tgid)
|
||||
force_sig(SIGKILL, q);
|
||||
} while_each_thread(g, q);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int oom_kill_process(struct task_struct *p, unsigned long points,
|
||||
const char *message)
|
||||
{
|
||||
struct task_struct *c;
|
||||
struct list_head *tsk;
|
||||
|
||||
/*
|
||||
* If the task is already exiting, don't alarm the sysadmin or kill
|
||||
* its children or threads, just set TIF_MEMDIE so it can die quickly
|
||||
*/
|
||||
if (p->flags & PF_EXITING) {
|
||||
__oom_kill_task(p, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
|
||||
message, p->pid, p->comm, points);
|
||||
|
||||
/* Try to kill a child first */
|
||||
list_for_each(tsk, &p->children) {
|
||||
c = list_entry(tsk, struct task_struct, sibling);
|
||||
if (c->mm == p->mm)
|
||||
continue;
|
||||
if (!oom_kill_task(c))
|
||||
return 0;
|
||||
}
|
||||
return oom_kill_task(p);
|
||||
}
|
||||
|
||||
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
|
||||
|
||||
int register_oom_notifier(struct notifier_block *nb)
|
||||
{
|
||||
return blocking_notifier_chain_register(&oom_notify_list, nb);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(register_oom_notifier);
|
||||
|
||||
int unregister_oom_notifier(struct notifier_block *nb)
|
||||
{
|
||||
return blocking_notifier_chain_unregister(&oom_notify_list, nb);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
|
||||
|
||||
/**
|
||||
* out_of_memory - kill the "best" process when we run out of memory
|
||||
*
|
||||
* If we run out of memory, we have the choice between either
|
||||
* killing a random task (bad), letting the system crash (worse)
|
||||
* OR try to be smart about which process to kill. Note that we
|
||||
* don't have to be perfect here, we just have to be good.
|
||||
*/
|
||||
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
|
||||
{
|
||||
struct task_struct *p;
|
||||
unsigned long points = 0;
|
||||
unsigned long freed = 0;
|
||||
int constraint;
|
||||
|
||||
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
||||
if (freed > 0)
|
||||
/* Got some memory back in the last second. */
|
||||
return;
|
||||
|
||||
if (printk_ratelimit()) {
|
||||
printk(KERN_WARNING "%s invoked oom-killer: "
|
||||
"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
|
||||
current->comm, gfp_mask, order, current->oomkilladj);
|
||||
dump_stack();
|
||||
show_mem();
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if there were limitations on the allocation (only relevant for
|
||||
* NUMA) that may require different handling.
|
||||
*/
|
||||
constraint = constrained_alloc(zonelist, gfp_mask);
|
||||
cpuset_lock();
|
||||
read_lock(&tasklist_lock);
|
||||
|
||||
switch (constraint) {
|
||||
case CONSTRAINT_MEMORY_POLICY:
|
||||
oom_kill_process(current, points,
|
||||
"No available memory (MPOL_BIND)");
|
||||
break;
|
||||
|
||||
case CONSTRAINT_CPUSET:
|
||||
oom_kill_process(current, points,
|
||||
"No available memory in cpuset");
|
||||
break;
|
||||
|
||||
case CONSTRAINT_NONE:
|
||||
if (sysctl_panic_on_oom)
|
||||
panic("out of memory. panic_on_oom is selected\n");
|
||||
retry:
|
||||
/*
|
||||
* Rambo mode: Shoot down a process and hope it solves whatever
|
||||
* issues we may have.
|
||||
*/
|
||||
p = select_bad_process(&points);
|
||||
|
||||
if (PTR_ERR(p) == -1UL)
|
||||
goto out;
|
||||
|
||||
/* Found nothing?!?! Either we hang forever, or we panic. */
|
||||
if (!p) {
|
||||
read_unlock(&tasklist_lock);
|
||||
cpuset_unlock();
|
||||
panic("Out of memory and no killable processes...\n");
|
||||
}
|
||||
|
||||
if (oom_kill_process(p, points, "Out of memory"))
|
||||
goto retry;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
out:
|
||||
read_unlock(&tasklist_lock);
|
||||
cpuset_unlock();
|
||||
|
||||
/*
|
||||
* Give "p" a good chance of killing itself before we
|
||||
* retry to allocate memory unless "p" is current
|
||||
*/
|
||||
if (!test_thread_flag(TIF_MEMDIE))
|
||||
schedule_timeout_uninterruptible(1);
|
||||
}
|
||||
981
mm/page-writeback.c
Normal file
981
mm/page-writeback.c
Normal file
@@ -0,0 +1,981 @@
|
||||
/*
|
||||
* mm/page-writeback.c
|
||||
*
|
||||
* Copyright (C) 2002, Linus Torvalds.
|
||||
*
|
||||
* Contains functions related to writing back dirty pages at the
|
||||
* address_space level.
|
||||
*
|
||||
* 10Apr2002 akpm@zip.com.au
|
||||
* Initial version
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/mpage.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/pagevec.h>
|
||||
|
||||
/*
|
||||
* The maximum number of pages to writeout in a single bdflush/kupdate
|
||||
* operation. We do this so we don't hold I_LOCK against an inode for
|
||||
* enormous amounts of time, which would block a userspace task which has
|
||||
* been forced to throttle against that inode. Also, the code reevaluates
|
||||
 * the dirty limits each time it has written this many pages.
|
||||
*/
|
||||
#define MAX_WRITEBACK_PAGES 1024
|
||||
|
||||
/*
|
||||
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
|
||||
* will look to see if it needs to force writeback or throttling.
|
||||
*/
|
||||
static long ratelimit_pages = 32;
|
||||
|
||||
static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
|
||||
|
||||
/*
|
||||
* When balance_dirty_pages decides that the caller needs to perform some
|
||||
* non-background writeback, this is how many pages it will attempt to write.
|
||||
* It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
|
||||
* large amounts of I/O are submitted.
|
||||
*/
|
||||
static inline long sync_writeback_pages(void)
|
||||
{
|
||||
return ratelimit_pages + ratelimit_pages / 2;
|
||||
}
|
||||
|
||||
/* The following parameters are exported via /proc/sys/vm */
|
||||
|
||||
/*
|
||||
* Start background writeback (via pdflush) at this percentage
|
||||
*/
|
||||
int dirty_background_ratio = 10;
|
||||
|
||||
/*
|
||||
* The generator of dirty data starts writeback at this percentage
|
||||
*/
|
||||
int vm_dirty_ratio = 40;
|
||||
|
||||
/*
|
||||
* The interval between `kupdate'-style writebacks, in jiffies
|
||||
*/
|
||||
int dirty_writeback_interval = 5 * HZ;
|
||||
|
||||
/*
|
||||
* The longest number of jiffies for which data is allowed to remain dirty
|
||||
*/
|
||||
int dirty_expire_interval = 30 * HZ;
|
||||
|
||||
/*
|
||||
* Flag that makes the machine dump writes/reads and block dirtyings.
|
||||
*/
|
||||
int block_dump;
|
||||
|
||||
/*
|
||||
* Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
|
||||
* a full sync is triggered after this time elapses without any disk activity.
|
||||
*/
|
||||
int laptop_mode;
|
||||
|
||||
EXPORT_SYMBOL(laptop_mode);
|
||||
|
||||
/* End of sysctl-exported parameters */
|
||||
|
||||
|
||||
static void background_writeout(unsigned long _min_pages);
|
||||
|
||||
/*
|
||||
* Work out the current dirty-memory clamping and background writeout
|
||||
* thresholds.
|
||||
*
|
||||
* The main aim here is to lower them aggressively if there is a lot of mapped
|
||||
* memory around. To avoid stressing page reclaim with lots of unreclaimable
|
||||
* pages. It is better to clamp down on writers than to start swapping, and
|
||||
* performing lots of scanning.
|
||||
*
|
||||
* We only allow 1/2 of the currently-unmapped memory to be dirtied.
|
||||
*
|
||||
* We don't permit the clamping level to fall below 5% - that is getting rather
|
||||
* excessive.
|
||||
*
|
||||
* We make sure that the background writeout level is below the adjusted
|
||||
* clamping level.
|
||||
*/
|
||||
/*
 * Compute the current dirty-memory thresholds, in pages:
 *   *pbackground - level at which background writeout should start;
 *   *pdirty      - level at which the dirtying task must throttle itself.
 * NOTE(review): @mapping is not referenced in this body — apparently kept
 * for the callers' convenience / future use.
 */
static void
get_dirty_limits(long *pbackground, long *pdirty,
		struct address_space *mapping)
{
	int background_ratio;		/* Percentages */
	int dirty_ratio;
	int unmapped_ratio;
	long background;
	long dirty;
	unsigned long available_memory = vm_total_pages;
	struct task_struct *tsk;

#ifdef CONFIG_HIGHMEM
	/*
	 * We always exclude high memory from our count.
	 */
	available_memory -= totalhigh_pages;
#endif

	/* Percentage of total memory that is neither file- nor anon-mapped. */
	unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
				global_page_state(NR_ANON_PAGES)) * 100) /
					vm_total_pages;

	/* Allow at most half of the currently-unmapped memory to be dirty... */
	dirty_ratio = vm_dirty_ratio;
	if (dirty_ratio > unmapped_ratio / 2)
		dirty_ratio = unmapped_ratio / 2;

	/* ...but never clamp below 5%. */
	if (dirty_ratio < 5)
		dirty_ratio = 5;

	/* Keep the background level strictly below the clamping level. */
	background_ratio = dirty_background_ratio;
	if (background_ratio >= dirty_ratio)
		background_ratio = dirty_ratio / 2;

	background = (background_ratio * available_memory) / 100;
	dirty = (dirty_ratio * available_memory) / 100;
	tsk = current;
	/* PF_LESS_THROTTLE and real-time tasks get 25% extra headroom. */
	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
		background += background / 4;
		dirty += dirty / 4;
	}
	*pbackground = background;
	*pdirty = dirty;
}
|
||||
|
||||
/*
|
||||
* balance_dirty_pages() must be called by processes which are generating dirty
|
||||
* data. It looks at the number of dirty pages in the machine and will force
|
||||
* the caller to perform writeback if the system is over `vm_dirty_ratio'.
|
||||
* If we're over `background_thresh' then pdflush is woken to perform some
|
||||
* writeout.
|
||||
*/
|
||||
/*
 * Throttle the calling task: repeatedly write out up to write_chunk pages
 * on its behalf (sleeping in congestion_wait between passes) until the
 * machine is back under the dirty threshold, then possibly kick pdflush
 * for background writeout.
 */
static void balance_dirty_pages(struct address_space *mapping)
{
	long nr_reclaimable;
	long background_thresh;
	long dirty_thresh;
	unsigned long pages_written = 0;
	unsigned long write_chunk = sync_writeback_pages();

	struct backing_dev_info *bdi = mapping->backing_dev_info;

	for (;;) {
		struct writeback_control wbc = {
			.bdi		= bdi,
			.sync_mode	= WB_SYNC_NONE,
			.older_than_this = NULL,
			.nr_to_write	= write_chunk,
			.range_cyclic	= 1,
		};

		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
					global_page_state(NR_UNSTABLE_NFS);
		/* Under the limit: no throttling needed. */
		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
			dirty_thresh)
				break;

		if (!dirty_exceeded)
			dirty_exceeded = 1;

		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
		 * Unstable writes are a feature of certain networked
		 * filesystems (i.e. NFS) in which data may have been
		 * written to the server's write cache, but has not yet
		 * been flushed to permanent storage.
		 */
		if (nr_reclaimable) {
			writeback_inodes(&wbc);
			/* Re-sample: the limits move as memory usage changes. */
			get_dirty_limits(&background_thresh,
					&dirty_thresh, mapping);
			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
					global_page_state(NR_UNSTABLE_NFS);
			if (nr_reclaimable +
				global_page_state(NR_WRITEBACK)
					<= dirty_thresh)
						break;
			/* nr_to_write counts down; the difference is progress. */
			pages_written += write_chunk - wbc.nr_to_write;
			if (pages_written >= write_chunk)
				break;		/* We've done our duty */
		}
		congestion_wait(WRITE, HZ/10);
	}

	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
		<= dirty_thresh && dirty_exceeded)
			dirty_exceeded = 0;

	if (writeback_in_progress(bdi))
		return;		/* pdflush is already working this queue */

	/*
	 * In laptop mode, we wait until hitting the higher threshold before
	 * starting background writeout, and then write out all the way down
	 * to the lower threshold.  So slow writers cause minimal disk activity.
	 *
	 * In normal mode, we start background writeout at the lower
	 * background_thresh, to keep the amount of dirty memory low.
	 */
	if ((laptop_mode && pages_written) ||
	     (!laptop_mode && (nr_reclaimable > background_thresh)))
		pdflush_operation(background_writeout, 0);
}
|
||||
|
||||
/*
 * Dirty a page and, if that actually changed its state, apply the usual
 * per-task dirty-memory throttling.
 */
void set_page_dirty_balance(struct page *page)
{
	struct address_space *mapping;

	if (!set_page_dirty(page))
		return;

	mapping = page_mapping(page);
	if (mapping)
		balance_dirty_pages_ratelimited(mapping);
}
|
||||
|
||||
/**
|
||||
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
|
||||
* @mapping: address_space which was dirtied
|
||||
* @nr_pages_dirtied: number of pages which the caller has just dirtied
|
||||
*
|
||||
* Processes which are dirtying memory should call in here once for each page
|
||||
* which was newly dirtied. The function will periodically check the system's
|
||||
* dirty state and will initiate writeback if needed.
|
||||
*
|
||||
* On really big machines, get_writeback_state is expensive, so try to avoid
|
||||
* calling it too often (ratelimiting). But once we're over the dirty memory
|
||||
* limit we decrease the ratelimiting by a lot, to prevent individual processes
|
||||
* from overshooting the limit by (ratelimit_pages) each.
|
||||
*/
|
||||
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
					unsigned long nr_pages_dirtied)
{
	/* Per-CPU count of pages dirtied since the last balance check. */
	static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
	unsigned long ratelimit;
	unsigned long *p;

	ratelimit = ratelimit_pages;
	/* Already over the limit: check far more often to bound overshoot. */
	if (dirty_exceeded)
		ratelimit = 8;

	/*
	 * Check the rate limiting. Also, we do not want to throttle real-time
	 * tasks in balance_dirty_pages(). Period.
	 */
	preempt_disable();
	p = &__get_cpu_var(ratelimits);
	*p += nr_pages_dirtied;
	if (unlikely(*p >= ratelimit)) {
		*p = 0;
		/* Leave the per-CPU section before the (blocking) balance. */
		preempt_enable();
		balance_dirty_pages(mapping);
		return;
	}
	preempt_enable();
}
|
||||
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
|
||||
|
||||
/*
 * Throttle the caller while writeback + unstable pages exceed the
 * (slightly boosted) dirty threshold, sleeping in congestion_wait.
 */
void throttle_vm_writeout(gfp_t gfp_mask)
{
	long background_thresh;
	long dirty_thresh;

	/* Callers that cannot do FS or IO must not wait for IO completion. */
	if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
		/*
		 * The caller might hold locks which can prevent IO completion
		 * or progress in the filesystem.  So we cannot just sit here
		 * waiting for IO to complete.
		 */
		congestion_wait(WRITE, HZ/10);
		return;
	}

	for ( ; ; ) {
		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);

		/*
		 * Boost the allowable dirty threshold a bit for page
		 * allocators so they don't get DoS'ed by heavy writers
		 */
		dirty_thresh += dirty_thresh / 10;	/* wheeee... */

		if (global_page_state(NR_UNSTABLE_NFS) +
			global_page_state(NR_WRITEBACK) <= dirty_thresh)
				break;
		congestion_wait(WRITE, HZ/10);
	}
}
|
||||
|
||||
/*
|
||||
* writeback at least _min_pages, and keep writing until the amount of dirty
|
||||
* memory is less than the background threshold, or until we're all clean.
|
||||
*/
|
||||
static void background_writeout(unsigned long _min_pages)
{
	long min_pages = _min_pages;
	struct writeback_control wbc = {
		.bdi		= NULL,
		.sync_mode	= WB_SYNC_NONE,
		.older_than_this = NULL,
		.nr_to_write	= 0,
		.nonblocking	= 1,
		.range_cyclic	= 1,
	};

	for ( ; ; ) {
		long background_thresh;
		long dirty_thresh;

		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
		/* Below the background level and quota met: we are done. */
		if (global_page_state(NR_FILE_DIRTY) +
			global_page_state(NR_UNSTABLE_NFS) < background_thresh
				&& min_pages <= 0)
			break;
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		wbc.pages_skipped = 0;
		writeback_inodes(&wbc);
		/* nr_to_write counts down; the difference is pages written. */
		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
			/* Wrote less than expected */
			congestion_wait(WRITE, HZ/10);
			/* Not congested, so there is nothing left to write. */
			if (!wbc.encountered_congestion)
				break;
		}
	}
}
|
||||
|
||||
/*
|
||||
* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
|
||||
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
|
||||
* -1 if all pdflush threads were busy.
|
||||
*/
|
||||
int wakeup_pdflush(long nr_pages)
|
||||
{
|
||||
if (nr_pages == 0)
|
||||
nr_pages = global_page_state(NR_FILE_DIRTY) +
|
||||
global_page_state(NR_UNSTABLE_NFS);
|
||||
return pdflush_operation(background_writeout, nr_pages);
|
||||
}
|
||||
|
||||
static void wb_timer_fn(unsigned long unused);
|
||||
static void laptop_timer_fn(unsigned long unused);
|
||||
|
||||
static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
|
||||
static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
|
||||
|
||||
/*
|
||||
* Periodic writeback of "old" data.
|
||||
*
|
||||
* Define "old": the first time one of an inode's pages is dirtied, we mark the
|
||||
* dirtying-time in the inode's address_space. So this periodic writeback code
|
||||
* just walks the superblock inode list, writing back any inodes which are
|
||||
* older than a specific point in time.
|
||||
*
|
||||
* Try to run once per dirty_writeback_interval. But if a writeback event
|
||||
* takes longer than a dirty_writeback_interval interval, then leave a
|
||||
* one-second gap.
|
||||
*
|
||||
* older_than_this takes precedence over nr_to_write. So we'll only write back
|
||||
* all dirty pages if they are all attached to "old" mappings.
|
||||
*/
|
||||
/*
 * pdflush work function for periodic (kupdate-style) writeback: sync the
 * superblocks, then write back data dirtied before the expiry cutoff, and
 * finally re-arm the periodic timer.
 */
static void wb_kupdate(unsigned long arg)
{
	unsigned long oldest_jif;
	unsigned long start_jif;
	unsigned long next_jif;
	long nr_to_write;
	struct writeback_control wbc = {
		.bdi		= NULL,
		.sync_mode	= WB_SYNC_NONE,
		.older_than_this = &oldest_jif,
		.nr_to_write	= 0,
		.nonblocking	= 1,
		.for_kupdate	= 1,
		.range_cyclic	= 1,
	};

	sync_supers();

	/* Only inodes dirtied before this cutoff are eligible. */
	oldest_jif = jiffies - dirty_expire_interval;
	start_jif = jiffies;
	next_jif = start_jif + dirty_writeback_interval;
	nr_to_write = global_page_state(NR_FILE_DIRTY) +
			global_page_state(NR_UNSTABLE_NFS) +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
	while (nr_to_write > 0) {
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		writeback_inodes(&wbc);
		if (wbc.nr_to_write > 0) {
			if (wbc.encountered_congestion)
				congestion_wait(WRITE, HZ/10);
			else
				break;	/* All the old data is written */
		}
		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
	}
	/* If this run overran the interval, leave at least a one-second gap. */
	if (time_before(next_jif, jiffies + HZ))
		next_jif = jiffies + HZ;
	/* A zero interval means periodic writeback is disabled. */
	if (dirty_writeback_interval)
		mod_timer(&wb_timer, next_jif);
}
|
||||
|
||||
/*
|
||||
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
|
||||
*/
|
||||
/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs: update the
 * interval (converted from USER_HZ) and re-arm or cancel the periodic
 * writeback timer accordingly.
 */
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);

	/* A zero interval disables periodic writeback entirely. */
	if (!dirty_writeback_interval)
		del_timer(&wb_timer);
	else
		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);

	return 0;
}
|
||||
|
||||
static void wb_timer_fn(unsigned long unused)
|
||||
{
|
||||
if (pdflush_operation(wb_kupdate, 0) < 0)
|
||||
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
|
||||
}
|
||||
|
||||
/* pdflush work function: sync everything once the laptop-mode timer fires. */
static void laptop_flush(unsigned long unused)
{
	sys_sync();
}
|
||||
|
||||
/* Laptop-mode timer expired: defer the actual sync to a pdflush thread. */
static void laptop_timer_fn(unsigned long unused)
{
	pdflush_operation(laptop_flush, 0);
}
|
||||
|
||||
/*
|
||||
* We've spun up the disk and we're in laptop mode: schedule writeback
|
||||
* of all dirty data a few seconds from now. If the flush is already scheduled
|
||||
* then push it back - the user is still using the disk.
|
||||
*/
|
||||
void laptop_io_completion(void)
{
	/* laptop_mode doubles as the delay, in jiffies, before the flush. */
	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
}
|
||||
|
||||
/*
|
||||
* We're in laptop mode and we've just synced. The sync's writes will have
|
||||
* caused another writeback to be scheduled by laptop_io_completion.
|
||||
* Nothing needs to be written back anymore, so we unschedule the writeback.
|
||||
*/
|
||||
void laptop_sync_completion(void)
{
	/* The just-finished sync left nothing dirty: cancel the pending flush. */
	del_timer(&laptop_mode_wb_timer);
}
|
||||
|
||||
/*
|
||||
* If ratelimit_pages is too high then we can get into dirty-data overload
|
||||
* if a large number of processes all perform writes at the same time.
|
||||
* If it is too low then SMP machines will call the (expensive)
|
||||
* get_writeback_state too often.
|
||||
*
|
||||
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
|
||||
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
|
||||
* thresholds before writeback cuts in.
|
||||
*
|
||||
* But the limit should not be set too high. Because it also controls the
|
||||
* amount of memory which the balance_dirty_pages() caller has to write back.
|
||||
* If this is too large then the caller will block on the IO queue all the
|
||||
* time. So limit it to four megabytes - the balance_dirty_pages() caller
|
||||
* will write six megabyte chunks, max.
|
||||
*/
|
||||
|
||||
void writeback_set_ratelimit(void)
|
||||
{
|
||||
ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
|
||||
if (ratelimit_pages < 16)
|
||||
ratelimit_pages = 16;
|
||||
if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
|
||||
ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
|
||||
}
|
||||
|
||||
/*
 * CPU hotplug callback: the per-CPU ratelimit is derived from
 * num_online_cpus(), so recompute it on any CPU event.
 */
static int __cpuinit
ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
{
	writeback_set_ratelimit();
	return NOTIFY_DONE;
}
|
||||
|
||||
static struct notifier_block __cpuinitdata ratelimit_nb = {
|
||||
.notifier_call = ratelimit_handler,
|
||||
.next = NULL,
|
||||
};
|
||||
|
||||
/*
|
||||
* Called early on to tune the page writeback dirty limits.
|
||||
*
|
||||
* We used to scale dirty pages according to how total memory
|
||||
* related to pages that could be allocated for buffers (by
|
||||
 * comparing nr_free_buffer_pages() to vm_total_pages).
|
||||
*
|
||||
* However, that was when we used "dirty_ratio" to scale with
|
||||
* all memory, and we don't do that any more. "dirty_ratio"
|
||||
* is now applied to total non-HIGHPAGE memory (by subtracting
|
||||
* totalhigh_pages from vm_total_pages), and as such we can't
|
||||
* get into the old insane situation any more where we had
|
||||
* large amounts of dirty pages compared to a small amount of
|
||||
* non-HIGHMEM memory.
|
||||
*
|
||||
* But we might still want to scale the dirty_ratio by how
|
||||
* much memory the box has..
|
||||
*/
|
||||
void __init page_writeback_init(void)
{
	/* Arm the periodic (kupdate-style) writeback timer. */
	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
	writeback_set_ratelimit();
	/* Keep ratelimit_pages in step with CPU hotplug. */
	register_cpu_notifier(&ratelimit_nb);
}
|
||||
|
||||
/**
|
||||
* generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
|
||||
* @mapping: address space structure to write
|
||||
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
|
||||
*
|
||||
* This is a library function, which implements the writepages()
|
||||
* address_space_operation.
|
||||
*
|
||||
* If a page is already under I/O, generic_writepages() skips it, even
|
||||
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
|
||||
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
|
||||
* and msync() need to guarantee that all the data which was dirty at the time
|
||||
* the call was made get new I/O started against them. If wbc->sync_mode is
|
||||
* WB_SYNC_ALL then we were called for data integrity and we must wait for
|
||||
* existing IO to complete.
|
||||
*
|
||||
* Derived from mpage_writepages() - if you fix this you should check that
|
||||
* also!
|
||||
*/
|
||||
int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	int ret = 0;
	int done = 0;		/* set once we should stop scanning */
	int (*writepage)(struct page *page, struct writeback_control *wbc);
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;	/* nonzero once we've looked at any pages */
	int range_whole = 0;

	/* Non-blocking callers back off immediately on a congested queue. */
	if (wbc->nonblocking && bdi_write_congested(bdi)) {
		wbc->encountered_congestion = 1;
		return 0;
	}

	writepage = mapping->a_ops->writepage;

	/* deal with chardevs and other special file */
	if (!writepage)
		return 0;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		scanned = 1;
	}
retry:
	/* Batch up to PAGEVEC_SIZE dirty-tagged pages at a time. */
	while (!done && (index <= end) &&
	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
	       PAGECACHE_TAG_DIRTY,
	       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
		unsigned i;

		scanned = 1;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point we hold neither mapping->tree_lock nor
			 * lock on the page itself: the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or even
			 * swizzled back from swapper_space to tmpfs file
			 * mapping
			 */
			lock_page(page);

			if (unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			if (!wbc->range_cyclic && page->index > end) {
				done = 1;
				unlock_page(page);
				continue;
			}

			/* Data-integrity sync must wait on in-flight IO. */
			if (wbc->sync_mode != WB_SYNC_NONE)
				wait_on_page_writeback(page);

			if (PageWriteback(page) ||
			    !clear_page_dirty_for_io(page)) {
				unlock_page(page);
				continue;
			}

			ret = (*writepage)(page, wbc);
			/* Record the error on the mapping for later fsync(). */
			if (ret) {
				if (ret == -ENOSPC)
					set_bit(AS_ENOSPC, &mapping->flags);
				else
					set_bit(AS_EIO, &mapping->flags);
			}

			/* AOP_WRITEPAGE_ACTIVATE: ->writepage left it locked. */
			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
				unlock_page(page);
			if (ret || (--(wbc->nr_to_write) <= 0))
				done = 1;
			if (wbc->nonblocking && bdi_write_congested(bdi)) {
				wbc->encountered_congestion = 1;
				done = 1;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	/* Remember where to resume for the next cyclic pass. */
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = index;
	return ret;
}
|
||||
|
||||
EXPORT_SYMBOL(generic_writepages);
|
||||
|
||||
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (wbc->nr_to_write <= 0)
|
||||
return 0;
|
||||
wbc->for_writepages = 1;
|
||||
if (mapping->a_ops->writepages)
|
||||
ret = mapping->a_ops->writepages(mapping, wbc);
|
||||
else
|
||||
ret = generic_writepages(mapping, wbc);
|
||||
wbc->for_writepages = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* write_one_page - write out a single page and optionally wait on I/O
|
||||
* @page: the page to write
|
||||
* @wait: if true, wait on writeout
|
||||
*
|
||||
* The page must be locked by the caller and will be unlocked upon return.
|
||||
*
|
||||
* write_one_page() returns a negative error code if I/O failed.
|
||||
*/
|
||||
int write_one_page(struct page *page, int wait)
{
	struct address_space *mapping = page->mapping;
	int ret = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};

	BUG_ON(!PageLocked(page));

	if (wait)
		wait_on_page_writeback(page);

	if (clear_page_dirty_for_io(page)) {
		/* Pin the page across ->writepage, which unlocks it. */
		page_cache_get(page);
		ret = mapping->a_ops->writepage(page, &wbc);
		if (ret == 0 && wait) {
			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		page_cache_release(page);
	} else {
		/* Page was already clean: nothing to write, just unlock. */
		unlock_page(page);
	}
	return ret;
}
|
||||
EXPORT_SYMBOL(write_one_page);
|
||||
|
||||
/*
|
||||
* For address_spaces which do not use buffers nor write back.
|
||||
*/
|
||||
/*
 * For address_spaces which do not use buffers nor write back: set the
 * page's dirty flag only, never account or tag for writeback.  Always
 * returns 0.
 */
int __set_page_dirty_no_writeback(struct page *page)
{
	if (PageDirty(page))
		return 0;
	SetPageDirty(page);
	return 0;
}
|
||||
|
||||
/*
|
||||
* For address_spaces which do not use buffers. Just tag the page as dirty in
|
||||
* its radix tree.
|
||||
*
|
||||
* This is also used when a single buffer is being dirtied: we want to set the
|
||||
* page dirty in that case, but not all the buffers. This is a "bottom-up"
|
||||
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
|
||||
*
|
||||
* Most callers have locked the page, which pins the address_space in memory.
|
||||
* But zap_pte_range() does not lock the page, however in that case the
|
||||
* mapping is pinned by the vma's ->vm_file reference.
|
||||
*
|
||||
* We take care to handle the case where the page was truncated from the
|
||||
 * mapping by re-checking page_mapping() inside tree_lock.
|
||||
*/
|
||||
int __set_page_dirty_nobuffers(struct page *page)
{
	/* TestSetPageDirty is the serialization point for concurrent dirtiers. */
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
		struct address_space *mapping2;

		/* No mapping: nothing to tag or account. */
		if (!mapping)
			return 1;

		write_lock_irq(&mapping->tree_lock);
		mapping2 = page_mapping(page);
		if (mapping2) { /* Race with truncate? */
			BUG_ON(mapping2 != mapping);
			if (mapping_cap_account_dirty(mapping)) {
				__inc_zone_page_state(page, NR_FILE_DIRTY);
				task_io_account_write(PAGE_CACHE_SIZE);
			}
			radix_tree_tag_set(&mapping->page_tree,
				page_index(page), PAGECACHE_TAG_DIRTY);
		}
		write_unlock_irq(&mapping->tree_lock);
		if (mapping->host) {
			/* !PageAnon && !swapper_space */
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		}
		/* Return 1: this call transitioned the page to dirty. */
		return 1;
	}
	return 0;
}
|
||||
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
|
||||
|
||||
/*
|
||||
* When a writepage implementation decides that it doesn't want to write this
|
||||
* page for some reason, it should redirty the locked page via
|
||||
* redirty_page_for_writepage() and it should then unlock the page and return 0
|
||||
*/
|
||||
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
|
||||
{
|
||||
wbc->pages_skipped++;
|
||||
return __set_page_dirty_nobuffers(page);
|
||||
}
|
||||
EXPORT_SYMBOL(redirty_page_for_writepage);
|
||||
|
||||
/*
|
||||
* If the mapping doesn't provide a set_page_dirty a_op, then
|
||||
* just fall through and assume that it wants buffer_heads.
|
||||
*/
|
||||
int fastcall set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (likely(mapping)) {
		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
#ifdef CONFIG_BLOCK
		/* No set_page_dirty a_op: assume the mapping wants buffer_heads. */
		if (!spd)
			spd = __set_page_dirty_buffers;
#endif
		return (*spd)(page);
	}
	/* No mapping: only the page flag itself can be tracked. */
	if (!PageDirty(page)) {
		if (!TestSetPageDirty(page))
			return 1;
	}
	return 0;
}
|
||||
EXPORT_SYMBOL(set_page_dirty);
|
||||
|
||||
/*
|
||||
* set_page_dirty() is racy if the caller has no reference against
|
||||
* page->mapping->host, and if the page is unlocked. This is because another
|
||||
* CPU could truncate the page off the mapping and then free the mapping.
|
||||
*
|
||||
* Usually, the page _is_ locked, or the caller is a user-space process which
|
||||
* holds a reference on the inode by having an open file.
|
||||
*
|
||||
* In other cases, the page should be locked before running set_page_dirty().
|
||||
*/
|
||||
/*
 * Lock the page, dirty it, unlock it.  Returns the set_page_dirty()
 * result (nonzero if the page was newly dirtied).
 */
int set_page_dirty_lock(struct page *page)
{
	int newly_dirty;

	lock_page_nosync(page);
	newly_dirty = set_page_dirty(page);
	unlock_page(page);

	return newly_dirty;
}
|
||||
EXPORT_SYMBOL(set_page_dirty_lock);
|
||||
|
||||
/*
|
||||
* Clear a page's dirty flag, while caring for dirty memory accounting.
|
||||
* Returns true if the page was previously dirty.
|
||||
*
|
||||
* This is for preparing to put the page under writeout. We leave the page
|
||||
* tagged as dirty in the radix tree so that a concurrent write-for-sync
|
||||
* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
|
||||
* implementation will run either set_page_writeback() or set_page_dirty(),
|
||||
* at which stage we bring the page's dirty flag and radix-tree dirty tag
|
||||
* back into sync.
|
||||
*
|
||||
* This incoherency between the page's dirty flag and radix-tree tag is
|
||||
* unfortunate, but it only exists while the page is locked.
|
||||
*/
|
||||
int clear_page_dirty_for_io(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (mapping && mapping_cap_account_dirty(mapping)) {
		/*
		 * Yes, Virginia, this is indeed insane.
		 *
		 * We use this sequence to make sure that
		 *  (a) we account for dirty stats properly
		 *  (b) we tell the low-level filesystem to
		 *      mark the whole page dirty if it was
		 *      dirty in a pagetable. Only to then
		 *  (c) clean the page again and return 1 to
		 *      cause the writeback.
		 *
		 * This way we avoid all nasty races with the
		 * dirty bit in multiple places and clearing
		 * them concurrently from different threads.
		 *
		 * Note! Normally the "set_page_dirty(page)"
		 * has no effect on the actual dirty bit - since
		 * that will already usually be set. But we
		 * need the side effects, and it can help us
		 * avoid races.
		 *
		 * We basically use the page "master dirty bit"
		 * as a serialization point for all the different
		 * threads doing their things.
		 *
		 * FIXME! We still have a race here: if somebody
		 * adds the page back to the page tables in
		 * between the "page_mkclean()" and the "TestClearPageDirty()",
		 * we might have it mapped without the dirty bit set.
		 */
		if (page_mkclean(page))
			set_page_dirty(page);
		if (TestClearPageDirty(page)) {
			dec_zone_page_state(page, NR_FILE_DIRTY);
			return 1;
		}
		return 0;
	}
	/* Unaccounted mappings: only the page flag, no stats to maintain. */
	return TestClearPageDirty(page);
}
|
||||
EXPORT_SYMBOL(clear_page_dirty_for_io);
|
||||
|
||||
/*
 * Clear the page's writeback flag and, if it was set, also clear the
 * matching PAGECACHE_TAG_WRITEBACK tag in the mapping's radix tree.
 * Returns the old writeback state.
 */
int test_clear_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret;

	if (mapping) {
		unsigned long flags;

		write_lock_irqsave(&mapping->tree_lock, flags);
		ret = TestClearPageWriteback(page);
		if (ret)
			radix_tree_tag_clear(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_WRITEBACK);
		write_unlock_irqrestore(&mapping->tree_lock, flags);
	} else {
		/* No mapping: nothing in the radix tree to untag. */
		ret = TestClearPageWriteback(page);
	}
	return ret;
}
|
||||
|
||||
int test_set_page_writeback(struct page *page)
|
||||
{
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
int ret;
|
||||
|
||||
if (mapping) {
|
||||
unsigned long flags;
|
||||
|
||||
write_lock_irqsave(&mapping->tree_lock, flags);
|
||||
ret = TestSetPageWriteback(page);
|
||||
if (!ret)
|
||||
radix_tree_tag_set(&mapping->page_tree,
|
||||
page_index(page),
|
||||
PAGECACHE_TAG_WRITEBACK);
|
||||
if (!PageDirty(page))
|
||||
radix_tree_tag_clear(&mapping->page_tree,
|
||||
page_index(page),
|
||||
PAGECACHE_TAG_DIRTY);
|
||||
write_unlock_irqrestore(&mapping->tree_lock, flags);
|
||||
} else {
|
||||
ret = TestSetPageWriteback(page);
|
||||
}
|
||||
return ret;
|
||||
|
||||
}
|
||||
EXPORT_SYMBOL(test_set_page_writeback);
|
||||
|
||||
/*
|
||||
 * Return true if any of the pages in the mapping are marked with the
|
||||
* passed tag.
|
||||
*/
|
||||
int mapping_tagged(struct address_space *mapping, int tag)
|
||||
{
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
read_lock_irqsave(&mapping->tree_lock, flags);
|
||||
ret = radix_tree_tagged(&mapping->page_tree, tag);
|
||||
read_unlock_irqrestore(&mapping->tree_lock, flags);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(mapping_tagged);
|
||||
3394
mm/page_alloc.c
Normal file
3394
mm/page_alloc.c
Normal file
File diff suppressed because it is too large
Load Diff
149
mm/page_io.c
Normal file
149
mm/page_io.c
Normal file
@@ -0,0 +1,149 @@
|
||||
/*
|
||||
* linux/mm/page_io.c
|
||||
*
|
||||
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
|
||||
*
|
||||
* Swap reorganised 29.12.95,
|
||||
* Asynchronous swapping added 30.12.95. Stephen Tweedie
|
||||
* Removed race in async swapping. 14.4.1996. Bruno Haible
|
||||
* Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
|
||||
* Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
/*
 * Allocate and set up a single-page bio aimed at the swap-device slot
 * encoded in @index (interpreted as a swp_entry_t value).
 *
 * Returns the initialised bio, or NULL if allocation under @gfp_flags
 * failed.
 */
static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
				struct page *page, bio_end_io_t end_io)
{
	struct bio *bio;

	bio = bio_alloc(gfp_flags, 1);
	if (bio) {
		struct swap_info_struct *sis;
		swp_entry_t entry = { .val = index, };

		sis = get_swap_info_struct(swp_type(entry));
		/* Convert the swap slot to a 512-byte sector number */
		bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
					(PAGE_SIZE >> 9);
		bio->bi_bdev = sis->bdev;
		bio->bi_io_vec[0].bv_page = page;
		bio->bi_io_vec[0].bv_len = PAGE_SIZE;
		bio->bi_io_vec[0].bv_offset = 0;
		bio->bi_vcnt = 1;
		bio->bi_idx = 0;
		bio->bi_size = PAGE_SIZE;
		bio->bi_end_io = end_io;
	}
	return bio;
}
|
||||
|
||||
/*
 * bio completion handler for swap-out writes.  On failure, re-dirty the
 * page so reclaim does not drop data, and log loudly.  Always ends page
 * writeback and releases the bio.
 */
static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct page *page = bio->bi_io_vec[0].bv_page;

	/* Partial completion: wait until the whole bio is done */
	if (bio->bi_size)
		return 1;

	if (!uptodate) {
		SetPageError(page);
		/*
		 * We failed to write the page out to swap-space.
		 * Re-dirty the page in order to avoid it being reclaimed.
		 * Also print a dire warning that things will go BAD (tm)
		 * very quickly.
		 *
		 * Also clear PG_reclaim to avoid rotate_reclaimable_page()
		 */
		set_page_dirty(page);
		printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
				imajor(bio->bi_bdev->bd_inode),
				iminor(bio->bi_bdev->bd_inode),
				(unsigned long long)bio->bi_sector);
		ClearPageReclaim(page);
	}
	end_page_writeback(page);
	bio_put(bio);
	return 0;
}
|
||||
|
||||
/*
 * bio completion handler for swap-in reads.  Marks the page uptodate on
 * success (or error on failure), unlocks it, and releases the bio.
 */
int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct page *page = bio->bi_io_vec[0].bv_page;

	/* Partial completion: wait until the whole bio is done */
	if (bio->bi_size)
		return 1;

	if (!uptodate) {
		SetPageError(page);
		ClearPageUptodate(page);
		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
				imajor(bio->bi_bdev->bd_inode),
				iminor(bio->bi_bdev->bd_inode),
				(unsigned long long)bio->bi_sector);
	} else {
		SetPageUptodate(page);
	}
	/* Readers sleeping in lock_page() may now proceed */
	unlock_page(page);
	bio_put(bio);
	return 0;
}
|
||||
|
||||
/*
 * We may have stale swap cache pages in memory: notice
 * them here and get rid of the unnecessary final write.
 *
 * Writes @page to its swap slot asynchronously.  Returns 0 on success
 * (including the no-write fast path) or -ENOMEM if no bio could be
 * allocated; in the failure case the page is re-dirtied so the data is
 * not lost.
 */
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
	struct bio *bio;
	int ret = 0, rw = WRITE;

	/* Page is the sole user of its swap slot: skip the write entirely */
	if (remove_exclusive_swap_page(page)) {
		unlock_page(page);
		goto out;
	}
	bio = get_swap_bio(GFP_NOIO, page_private(page), page,
				end_swap_bio_write);
	if (bio == NULL) {
		/* Keep the data: re-dirty so writeback retries later */
		set_page_dirty(page);
		unlock_page(page);
		ret = -ENOMEM;
		goto out;
	}
	if (wbc->sync_mode == WB_SYNC_ALL)
		rw |= (1 << BIO_RW_SYNC);
	count_vm_event(PSWPOUT);
	set_page_writeback(page);
	unlock_page(page);
	submit_bio(rw, bio);
out:
	return ret;
}
|
||||
|
||||
/*
 * Start an asynchronous read of @page from its swap slot.  The page must
 * be locked on entry; end_swap_bio_read() unlocks it on completion.
 *
 * Returns 0 on successful submission or -ENOMEM if no bio could be
 * allocated (in which case the page is unlocked here).
 */
int swap_readpage(struct file *file, struct page *page)
{
	struct bio *bio;
	int ret = 0;

	BUG_ON(!PageLocked(page));
	ClearPageUptodate(page);
	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
				end_swap_bio_read);
	if (bio == NULL) {
		unlock_page(page);
		ret = -ENOMEM;
		goto out;
	}
	count_vm_event(PSWPIN);
	submit_bio(READ, bio);
out:
	return ret;
}
|
||||
240
mm/pdflush.c
Normal file
240
mm/pdflush.c
Normal file
@@ -0,0 +1,240 @@
|
||||
/*
|
||||
* mm/pdflush.c - worker threads for writing back filesystem data
|
||||
*
|
||||
* Copyright (C) 2002, Linus Torvalds.
|
||||
*
|
||||
* 09Apr2002 akpm@zip.com.au
|
||||
* Initial version
|
||||
* 29Feb2004 kaos@sgi.com
|
||||
* Move worker thread creation to kthread to avoid chewing
|
||||
* up stack space with nested calls to kernel_thread.
|
||||
*/
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/signal.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h> // Needed by writeback.h
|
||||
#include <linux/writeback.h> // Prototypes pdflush_operation()
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/freezer.h>
|
||||
|
||||
|
||||
/*
|
||||
* Minimum and maximum number of pdflush instances
|
||||
*/
|
||||
#define MIN_PDFLUSH_THREADS 2
|
||||
#define MAX_PDFLUSH_THREADS 8
|
||||
|
||||
static void start_one_pdflush_thread(void);
|
||||
|
||||
|
||||
/*
|
||||
* The pdflush threads are worker threads for writing back dirty data.
|
||||
* Ideally, we'd like one thread per active disk spindle. But the disk
|
||||
* topology is very hard to divine at this level. Instead, we take
|
||||
* care in various places to prevent more than one pdflush thread from
|
||||
* performing writeback against a single filesystem. pdflush threads
|
||||
* have the PF_FLUSHER flag set in current->flags to aid in this.
|
||||
*/
|
||||
|
||||
/*
|
||||
* All the pdflush threads. Protected by pdflush_lock
|
||||
*/
|
||||
static LIST_HEAD(pdflush_list);
|
||||
static DEFINE_SPINLOCK(pdflush_lock);
|
||||
|
||||
/*
|
||||
* The count of currently-running pdflush threads. Protected
|
||||
* by pdflush_lock.
|
||||
*
|
||||
* Readable by sysctl, but not writable. Published to userspace at
|
||||
* /proc/sys/vm/nr_pdflush_threads.
|
||||
*/
|
||||
int nr_pdflush_threads = 0;
|
||||
|
||||
/*
|
||||
* The time at which the pdflush thread pool last went empty
|
||||
*/
|
||||
static unsigned long last_empty_jifs;
|
||||
|
||||
/*
|
||||
* The pdflush thread.
|
||||
*
|
||||
* Thread pool management algorithm:
|
||||
*
|
||||
* - The minimum and maximum number of pdflush instances are bound
|
||||
* by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
|
||||
*
|
||||
* - If there have been no idle pdflush instances for 1 second, create
|
||||
* a new one.
|
||||
*
|
||||
* - If the least-recently-went-to-sleep pdflush thread has been asleep
|
||||
* for more than one second, terminate a thread.
|
||||
*/
|
||||
|
||||
/*
|
||||
* A structure for passing work to a pdflush thread. Also for passing
|
||||
* state information between pdflush threads. Protected by pdflush_lock.
|
||||
*/
|
||||
struct pdflush_work {
|
||||
struct task_struct *who; /* The thread */
|
||||
void (*fn)(unsigned long); /* A callback function */
|
||||
unsigned long arg0; /* An argument to the callback */
|
||||
struct list_head list; /* On pdflush_list, when idle */
|
||||
unsigned long when_i_went_to_sleep;
|
||||
};
|
||||
|
||||
/*
 * Main loop of a pdflush worker thread.
 *
 * The thread parks itself on pdflush_list (under pdflush_lock) and sleeps
 * until pdflush_operation() hands it a callback via my_work->fn.  After
 * running the callback it re-parks, possibly spawning an extra thread if
 * the pool has been empty for a while, or exiting if the pool has been
 * idle and is above the minimum size.  Returns 0 when the thread retires.
 */
static int __pdflush(struct pdflush_work *my_work)
{
	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
	my_work->fn = NULL;
	my_work->who = current;
	INIT_LIST_HEAD(&my_work->list);

	spin_lock_irq(&pdflush_lock);
	nr_pdflush_threads++;
	for ( ; ; ) {
		struct pdflush_work *pdf;

		/* Park on the idle list before releasing the lock */
		set_current_state(TASK_INTERRUPTIBLE);
		list_move(&my_work->list, &pdflush_list);
		my_work->when_i_went_to_sleep = jiffies;
		spin_unlock_irq(&pdflush_lock);
		schedule();
		try_to_freeze();
		spin_lock_irq(&pdflush_lock);
		if (!list_empty(&my_work->list)) {
			/*
			 * Someone woke us up, but without removing our control
			 * structure from the global list. swsusp will do this
			 * in try_to_freeze()->refrigerator(). Handle it.
			 */
			my_work->fn = NULL;
			continue;
		}
		if (my_work->fn == NULL) {
			/* Dequeued but no callback installed: shouldn't happen */
			printk("pdflush: bogus wakeup\n");
			continue;
		}
		spin_unlock_irq(&pdflush_lock);

		/* Run the caller-supplied writeback work, unlocked */
		(*my_work->fn)(my_work->arg0);

		/*
		 * Thread creation: For how long have there been zero
		 * available threads?
		 */
		if (jiffies - last_empty_jifs > 1 * HZ) {
			/* unlocked list_empty() test is OK here */
			if (list_empty(&pdflush_list)) {
				/* unlocked test is OK here */
				if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
					start_one_pdflush_thread();
			}
		}

		spin_lock_irq(&pdflush_lock);
		my_work->fn = NULL;

		/*
		 * Thread destruction: For how long has the sleepiest
		 * thread slept?
		 */
		if (list_empty(&pdflush_list))
			continue;
		if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
			continue;
		pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
		if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
			/* Limit exit rate */
			pdf->when_i_went_to_sleep = jiffies;
			break;			/* exeunt */
		}
	}
	nr_pdflush_threads--;
	spin_unlock_irq(&pdflush_lock);
	return 0;
}
|
||||
|
||||
/*
 * Of course, my_work wants to be just a local in __pdflush().  It is
 * separated out in this manner to hopefully prevent the compiler from
 * performing unfortunate optimisations against the auto variables.  Because
 * these are visible to other tasks and CPUs.  (No problem has actually
 * been observed.  This is just paranoia).
 */
static int pdflush(void *dummy)
{
	struct pdflush_work my_work;
	cpumask_t cpus_allowed;

	/*
	 * pdflush can spend a lot of time doing encryption via dm-crypt.  We
	 * don't want to do that at keventd's priority.
	 */
	set_user_nice(current, 0);

	/*
	 * Some configs put our parent kthread in a limited cpuset,
	 * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL.
	 * Our needs are more modest - cut back to our cpusets cpus_allowed.
	 * This is needed as pdflush's are dynamically created and destroyed.
	 * The boottime pdflush's are easily placed w/o these 2 lines.
	 */
	cpus_allowed = cpuset_cpus_allowed(current);
	set_cpus_allowed(current, cpus_allowed);

	return __pdflush(&my_work);
}
|
||||
|
||||
/*
 * Attempt to wake up a pdflush thread, and get it to do some work for you.
 * Returns zero if it indeed managed to find a worker thread, and passed your
 * payload to it.  Returns -1 if no idle worker was available.
 */
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
{
	unsigned long flags;
	int ret = 0;

	BUG_ON(fn == NULL);	/* Hard to diagnose if it's deferred */

	spin_lock_irqsave(&pdflush_lock, flags);
	if (list_empty(&pdflush_list)) {
		spin_unlock_irqrestore(&pdflush_lock, flags);
		ret = -1;
	} else {
		struct pdflush_work *pdf;

		/* Claim the first idle worker and hand it the callback */
		pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
		list_del_init(&pdf->list);
		if (list_empty(&pdflush_list))
			/* Pool just went empty: start the spawn timer */
			last_empty_jifs = jiffies;
		pdf->fn = fn;
		pdf->arg0 = arg0;
		wake_up_process(pdf->who);
		spin_unlock_irqrestore(&pdflush_lock, flags);
	}
	return ret;
}
|
||||
|
||||
/* Spawn one additional pdflush worker; it registers itself in __pdflush(). */
static void start_one_pdflush_thread(void)
{
	kthread_run(pdflush, NULL, "pdflush");
}
|
||||
|
||||
/* Boot-time setup: populate the pool with the minimum number of workers. */
static int __init pdflush_init(void)
{
	int i;

	for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
		start_one_pdflush_thread();
	return 0;
}

module_init(pdflush_init);
|
||||
207
mm/prio_tree.c
Normal file
207
mm/prio_tree.c
Normal file
@@ -0,0 +1,207 @@
|
||||
/*
|
||||
* mm/prio_tree.c - priority search tree for mapping->i_mmap
|
||||
*
|
||||
* Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
|
||||
*
|
||||
* This file is released under the GPL v2.
|
||||
*
|
||||
* Based on the radix priority search tree proposed by Edward M. McCreight
|
||||
* SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
|
||||
*
|
||||
* 02Feb2004 Initial version
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/prio_tree.h>
|
||||
|
||||
/*
|
||||
* See lib/prio_tree.c for details on the general radix priority search tree
|
||||
* code.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The following #defines are mirrored from lib/prio_tree.c. They're only used
|
||||
* for debugging, and should be removed (along with the debugging code using
|
||||
* them) when switching also VMAs to the regular prio_tree code.
|
||||
*/
|
||||
|
||||
#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
|
||||
#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
|
||||
/* avoid overflow */
|
||||
#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
|
||||
|
||||
/*
|
||||
* Radix priority search tree for address_space->i_mmap
|
||||
*
|
||||
* For each vma that map a unique set of file pages i.e., unique [radix_index,
|
||||
 * heap_index] value, we have a corresponding priority search tree node. If
|
||||
* multiple vmas have identical [radix_index, heap_index] value, then one of
|
||||
* them is used as a tree node and others are stored in a vm_set list. The tree
|
||||
* node points to the first vma (head) of the list using vm_set.head.
|
||||
*
|
||||
* prio_tree_root
|
||||
* |
|
||||
* A vm_set.head
|
||||
* / \ /
|
||||
* L R -> H-I-J-K-M-N-O-P-Q-S
|
||||
* ^ ^ <-- vm_set.list -->
|
||||
* tree nodes
|
||||
*
|
||||
* We need some way to identify whether a vma is a tree node, head of a vm_set
|
||||
* list, or just a member of a vm_set list. We cannot use vm_flags to store
|
||||
* such information. The reason is, in the above figure, it is possible that
|
||||
* vm_flags' of R and H are covered by the different mmap_sems. When R is
|
||||
* removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
|
||||
* H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
|
||||
* That's why some trick involving shared.vm_set.parent is used for identifying
|
||||
* tree nodes and list head nodes.
|
||||
*
|
||||
* vma radix priority search tree node rules:
|
||||
*
|
||||
* vma->shared.vm_set.parent != NULL ==> a tree node
|
||||
* vma->shared.vm_set.head != NULL ==> list of others mapping same range
|
||||
* vma->shared.vm_set.head == NULL ==> no others map the same range
|
||||
*
|
||||
* vma->shared.vm_set.parent == NULL
|
||||
* vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
|
||||
* vma->shared.vm_set.head == NULL ==> a list node
|
||||
*/
|
||||
|
||||
/*
 * Add a new vma known to map the same set of pages as the old vma:
 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
 * Note that it just happens to work correctly on i_mmap_nonlinear too.
 */
void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
{
	/* Leave these BUG_ONs till prio_tree patch stabilizes */
	BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
	BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));

	vma->shared.vm_set.head = NULL;
	vma->shared.vm_set.parent = NULL;

	if (!old->shared.vm_set.parent)
		/* old is a plain list member: link in right after it */
		list_add(&vma->shared.vm_set.list,
				&old->shared.vm_set.list);
	else if (old->shared.vm_set.head)
		/* old is a tree node with an existing vm_set: append to it */
		list_add_tail(&vma->shared.vm_set.list,
				&old->shared.vm_set.head->shared.vm_set.list);
	else {
		/* old is a lone tree node: vma becomes its vm_set list head */
		INIT_LIST_HEAD(&vma->shared.vm_set.list);
		vma->shared.vm_set.head = old;
		old->shared.vm_set.head = vma;
	}
}
|
||||
|
||||
/*
 * Insert @vma into the priority search tree @root.  If a node with the
 * same [radix_index, heap_index] already exists, @vma joins that node's
 * vm_set list instead of becoming a tree node itself.
 */
void vma_prio_tree_insert(struct vm_area_struct *vma,
			  struct prio_tree_root *root)
{
	struct prio_tree_node *ptr;
	struct vm_area_struct *old;

	vma->shared.vm_set.head = NULL;

	ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
	/* raw insert returns the pre-existing node on index collision */
	if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
		old = prio_tree_entry(ptr, struct vm_area_struct,
					shared.prio_tree_node);
		vma_prio_tree_add(vma, old);
	}
}
|
||||
|
||||
/*
 * Remove @vma from the priority search tree @root, handling all three
 * roles a vma can play: plain vm_set list member, tree node (possibly
 * with a vm_set list hanging off it), or vm_set list head.  When a tree
 * node with a vm_set is removed, its list head is promoted into the tree
 * in its place.
 */
void vma_prio_tree_remove(struct vm_area_struct *vma,
			  struct prio_tree_root *root)
{
	struct vm_area_struct *node, *head, *new_head;

	if (!vma->shared.vm_set.head) {
		if (!vma->shared.vm_set.parent)
			/* Plain vm_set list member: just unlink it */
			list_del_init(&vma->shared.vm_set.list);
		else
			/* Lone tree node with no vm_set: remove from tree */
			raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
	} else {
		/* Leave this BUG_ON till prio_tree patch stabilizes */
		BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
		if (vma->shared.vm_set.parent) {
			/* Tree node with a vm_set: promote the list head */
			head = vma->shared.vm_set.head;
			if (!list_empty(&head->shared.vm_set.list)) {
				new_head = list_entry(
					head->shared.vm_set.list.next,
					struct vm_area_struct,
					shared.vm_set.list);
				list_del_init(&head->shared.vm_set.list);
			} else
				new_head = NULL;

			raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
					&head->shared.prio_tree_node);
			head->shared.vm_set.head = new_head;
			if (new_head)
				new_head->shared.vm_set.head = head;

		} else {
			/* vm_set list head: the next member becomes head */
			node = vma->shared.vm_set.head;
			if (!list_empty(&vma->shared.vm_set.list)) {
				new_head = list_entry(
					vma->shared.vm_set.list.next,
					struct vm_area_struct,
					shared.vm_set.list);
				list_del_init(&vma->shared.vm_set.list);
				node->shared.vm_set.head = new_head;
				new_head->shared.vm_set.head = node;
			} else
				node->shared.vm_set.head = NULL;
		}
	}
}
|
||||
|
||||
/*
 * Helper function to enumerate vmas that map a given file page or a set of
 * contiguous file pages. The function returns vmas that at least map a single
 * page in the given range of contiguous file pages.
 *
 * Pass @vma == NULL on the first call; on subsequent calls pass the vma
 * returned last time.  Returns NULL when the iteration is exhausted.
 */
struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
					struct prio_tree_iter *iter)
{
	struct prio_tree_node *ptr;
	struct vm_area_struct *next;

	if (!vma) {
		/*
		 * First call is with NULL vma
		 */
		ptr = prio_tree_next(iter);
		if (ptr) {
			next = prio_tree_entry(ptr, struct vm_area_struct,
						shared.prio_tree_node);
			prefetch(next->shared.vm_set.head);
			return next;
		} else
			return NULL;
	}

	if (vma->shared.vm_set.parent) {
		/* Current vma is a tree node: descend into its vm_set list */
		if (vma->shared.vm_set.head) {
			next = vma->shared.vm_set.head;
			prefetch(next->shared.vm_set.list.next);
			return next;
		}
	} else {
		/* Current vma is a list member: walk to the next member */
		next = list_entry(vma->shared.vm_set.list.next,
				struct vm_area_struct, shared.vm_set.list);
		/* A non-NULL head marks the list head, i.e. end of the list */
		if (!next->shared.vm_set.head) {
			prefetch(next->shared.vm_set.list.next);
			return next;
		}
	}

	/* vm_set exhausted: advance to the next tree node */
	ptr = prio_tree_next(iter);
	if (ptr) {
		next = prio_tree_entry(ptr, struct vm_area_struct,
					shared.prio_tree_node);
		prefetch(next->shared.vm_set.head);
		return next;
	} else
		return NULL;
}
|
||||
580
mm/readahead.c
Normal file
580
mm/readahead.c
Normal file
@@ -0,0 +1,580 @@
|
||||
/*
|
||||
* mm/readahead.c - address_space-level file readahead.
|
||||
*
|
||||
* Copyright (C) 2002, Linus Torvalds
|
||||
*
|
||||
* 09Apr2002 akpm@zip.com.au
|
||||
* Initial version.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
#include <linux/pagevec.h>
|
||||
|
||||
/* No-op unplug callback for backing devices with no request queue. */
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
EXPORT_SYMBOL(default_unplug_io_fn);

/* Fallback backing_dev_info used when a mapping supplies none of its own. */
struct backing_dev_info default_backing_dev_info = {
	.ra_pages	= (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
	.state		= 0,
	.capabilities	= BDI_CAP_MAP_COPY,
	.unplug_io_fn	= default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
|
||||
|
||||
/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = mapping->backing_dev_info->ra_pages;
	/* -1 means "no page inspected yet" for sequentiality detection */
	ra->prev_page = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
|
||||
|
||||
/*
 * Return max readahead size for this inode in number-of-pages.
 */
static inline unsigned long get_max_readahead(struct file_ra_state *ra)
{
	return ra->ra_pages;
}
|
||||
|
||||
/* Minimum readahead window, in pages (independent of the file's state). */
static inline unsigned long get_min_readahead(struct file_ra_state *ra)
{
	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
}
|
||||
|
||||
/* Invalidate the ahead window while keeping its end position recoverable. */
static inline void reset_ahead_window(struct file_ra_state *ra)
{
	/*
	 * ... but preserve ahead_start + ahead_size value,
	 * see 'recheck:' label in page_cache_readahead().
	 * Note: We never use ->ahead_size as rvalue without
	 * checking ->ahead_start != 0 first.
	 */
	ra->ahead_size += ra->ahead_start;
	ra->ahead_start = 0;
}
|
||||
|
||||
static inline void ra_off(struct file_ra_state *ra)
|
||||
{
|
||||
ra->start = 0;
|
||||
ra->flags = 0;
|
||||
ra->size = 0;
|
||||
reset_ahead_window(ra);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the initial window size, round to next power of 2 and square
|
||||
* for small size, x 4 for medium, and x 2 for large
|
||||
* for 128k (32 page) max ra
|
||||
* 1-8 page = 32k initial, > 8 page = 128k initial
|
||||
*/
|
||||
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
|
||||
{
|
||||
unsigned long newsize = roundup_pow_of_two(size);
|
||||
|
||||
if (newsize <= max / 32)
|
||||
newsize = newsize * 4;
|
||||
else if (newsize <= max / 4)
|
||||
newsize = newsize * 2;
|
||||
else
|
||||
newsize = max;
|
||||
return newsize;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the new window size, this is called only when I/O is to be submitted,
|
||||
* not for each call to readahead. If a cache miss occured, reduce next I/O
|
||||
* size, else increase depending on how close to max we are.
|
||||
*/
|
||||
static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
|
||||
{
|
||||
unsigned long max = get_max_readahead(ra);
|
||||
unsigned long min = get_min_readahead(ra);
|
||||
unsigned long cur = ra->size;
|
||||
unsigned long newsize;
|
||||
|
||||
if (ra->flags & RA_FLAG_MISS) {
|
||||
ra->flags &= ~RA_FLAG_MISS;
|
||||
newsize = max((cur - 2), min);
|
||||
} else if (cur < max / 16) {
|
||||
newsize = 4 * cur;
|
||||
} else {
|
||||
newsize = 2 * cur;
|
||||
}
|
||||
return min(newsize, max);
|
||||
}
|
||||
|
||||
/* The page most recently added to @head sits at head->prev. */
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
|
||||
|
||||
/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 *
 * Consumes the @pages list: pages are either handed to @filler (and the
 * LRU) or released.  Returns 0, or the first non-zero value returned by
 * @filler, at which point the remaining pages are dropped.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
			int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	struct pagevec lru_pvec;
	int ret = 0;

	pagevec_init(&lru_pvec, 0);

	while (!list_empty(pages)) {
		page = list_to_page(pages);
		list_del(&page->lru);
		/* Someone else may have instantiated this index already */
		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
			page_cache_release(page);
			continue;
		}
		ret = filler(data, page);
		if (!pagevec_add(&lru_pvec, page))
			__pagevec_lru_add(&lru_pvec);
		if (ret) {
			/* Filler failed: drop whatever pages remain */
			put_pages_list(pages);
			break;
		}
		task_io_account_read(PAGE_CACHE_SIZE);
	}
	/* Flush any pages still batched in the pagevec onto the LRU */
	pagevec_lru_add(&lru_pvec);
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);
|
||||
|
||||
/*
 * Submit reads for the pages on @pages.  Uses the filesystem's batched
 * ->readpages() when available, otherwise falls back to adding each page
 * to the pagecache and calling ->readpage() one at a time.  The @pages
 * list is consumed either way.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
			struct list_head *pages, unsigned nr_pages)
{
	unsigned page_idx;
	struct pagevec lru_pvec;
	int ret;

	if (mapping->a_ops->readpages) {
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Clean up the remaining pages */
		put_pages_list(pages);
		goto out;
	}

	pagevec_init(&lru_pvec, 0);
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);
		if (!add_to_page_cache(page, mapping,
					page->index, GFP_KERNEL)) {
			mapping->a_ops->readpage(filp, page);
			if (!pagevec_add(&lru_pvec, page))
				__pagevec_lru_add(&lru_pvec);
		} else
			/* Already in pagecache: drop our reference */
			page_cache_release(page);
	}
	pagevec_lru_add(&lru_pvec);
	ret = 0;
out:
	return ret;
}
|
||||
|
||||
/*
|
||||
* Readahead design.
|
||||
*
|
||||
* The fields in struct file_ra_state represent the most-recently-executed
|
||||
* readahead attempt:
|
||||
*
|
||||
* start: Page index at which we started the readahead
|
||||
* size: Number of pages in that read
|
||||
* Together, these form the "current window".
|
||||
* Together, start and size represent the `readahead window'.
|
||||
* prev_page: The page which the readahead algorithm most-recently inspected.
|
||||
* It is mainly used to detect sequential file reading.
|
||||
* If page_cache_readahead sees that it is again being called for
|
||||
* a page which it just looked at, it can return immediately without
|
||||
* making any state changes.
|
||||
* ahead_start,
|
||||
* ahead_size: Together, these form the "ahead window".
|
||||
* ra_pages: The externally controlled max readahead for this fd.
|
||||
*
|
||||
* When readahead is in the off state (size == 0), readahead is disabled.
|
||||
* In this state, prev_page is used to detect the resumption of sequential I/O.
|
||||
*
|
||||
* The readahead code manages two windows - the "current" and the "ahead"
|
||||
* windows. The intent is that while the application is walking the pages
|
||||
* in the current window, I/O is underway on the ahead window. When the
|
||||
* current window is fully traversed, it is replaced by the ahead window
|
||||
* and the ahead window is invalidated. When this copying happens, the
|
||||
* new current window's pages are probably still locked. So
|
||||
* we submit a new batch of I/O immediately, creating a new ahead window.
|
||||
*
|
||||
* So:
|
||||
*
|
||||
* ----|----------------|----------------|-----
|
||||
* ^start ^start+size
|
||||
* ^ahead_start ^ahead_start+ahead_size
|
||||
*
|
||||
* ^ When this page is read, we submit I/O for the
|
||||
* ahead window.
|
||||
*
|
||||
* A `readahead hit' occurs when a read request is made against a page which is
|
||||
* the next sequential page. Ahead window calculations are done only when it
|
||||
 * is time to submit a new IO. The code ramps up the size aggressively at first,
|
||||
 * but slows down as it approaches max_readahead.
|
||||
*
|
||||
 * Any seek/random IO will result in readahead being turned off. It will resume
|
||||
* at the first sequential access.
|
||||
*
|
||||
* There is a special-case: if the first page which the application tries to
|
||||
* read happens to be the first page of the file, it is assumed that a linear
|
||||
* read is about to happen and the window is immediately set to the initial size
|
||||
* based on I/O request size and the max_readahead.
|
||||
*
|
||||
* This function is to be called for every read request, rather than when
|
||||
* it is time to perform readahead. It is called only once for the entire I/O
|
||||
* regardless of size unless readahead is unable to start enough I/O to satisfy
|
||||
* the request (I/O request > max_readahead).
|
||||
*/
|
||||
|
||||
/*
 * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
 * the pages first, then submits them all for I/O.  This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 *
 * do_page_cache_readahead() returns -1 if it encountered request queue
 * congestion.
 */
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	if (isize == 0)
		goto out;

 	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Preallocate as many pages as we will need.
	 */
	read_lock_irq(&mapping->tree_lock);
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		if (page_offset > end_index)
			break;

		/* Skip pages that are already present in the cache */
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		if (page)
			continue;

		/* tree_lock must be dropped across the (sleeping) allocation */
		read_unlock_irq(&mapping->tree_lock);
		page = page_cache_alloc_cold(mapping);
		read_lock_irq(&mapping->tree_lock);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
		ret++;
	}
	read_unlock_irq(&mapping->tree_lock);

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}
|
||||
|
||||
/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 *
 * Returns the total number of pages for which I/O was started, or a
 * negative errno from the first failing chunk.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
		pgoff_t offset, unsigned long nr_to_read)
{
	int ret = 0;

	/* Nothing to drive the reads with: refuse up front */
	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
		return -EINVAL;

	while (nr_to_read) {
		int err;

		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		err = __do_page_cache_readahead(mapping, filp,
						offset, this_chunk);
		if (err < 0) {
			ret = err;
			break;
		}
		ret += err;
		offset += this_chunk;
		nr_to_read -= this_chunk;
	}
	return ret;
}
|
||||
|
||||
/*
|
||||
* Check how effective readahead is being. If the amount of started IO is
|
||||
* less than expected then the file is partly or fully in pagecache and
|
||||
* readahead isn't helping.
|
||||
*
|
||||
*/
|
||||
static inline int check_ra_success(struct file_ra_state *ra,
|
||||
unsigned long nr_to_read, unsigned long actual)
|
||||
{
|
||||
if (actual == 0) {
|
||||
ra->cache_hit += nr_to_read;
|
||||
if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
|
||||
ra_off(ra);
|
||||
ra->flags |= RA_FLAG_INCACHE;
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
ra->cache_hit=0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* This version skips the IO if the queue is read-congested, and will tell the
|
||||
* block layer to abandon the readahead if request allocation would block.
|
||||
*
|
||||
* force_page_cache_readahead() will ignore queue congestion and will block on
|
||||
* request queues.
|
||||
*/
|
||||
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
|
||||
pgoff_t offset, unsigned long nr_to_read)
|
||||
{
|
||||
if (bdi_read_congested(mapping->backing_dev_info))
|
||||
return -1;
|
||||
|
||||
return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
|
||||
}
|
||||
|
||||
/*
|
||||
* Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
|
||||
* is set wait till the read completes. Otherwise attempt to read without
|
||||
* blocking.
|
||||
* Returns 1 meaning 'success' if read is successful without switching off
|
||||
* readahead mode. Otherwise return failure.
|
||||
*/
|
||||
static int
|
||||
blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
|
||||
pgoff_t offset, unsigned long nr_to_read,
|
||||
struct file_ra_state *ra, int block)
|
||||
{
|
||||
int actual;
|
||||
|
||||
if (!block && bdi_read_congested(mapping->backing_dev_info))
|
||||
return 0;
|
||||
|
||||
actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
|
||||
|
||||
return check_ra_success(ra, nr_to_read, actual);
|
||||
}
|
||||
|
||||
/*
 * Set up and submit the next ahead window: the window directly following the
 * current one (ra->start + ra->size).  @force makes the read blocking even if
 * the reader has not yet crossed into the ahead window.  Returns the value of
 * blockable_page_cache_readahead() (1 = readahead still on, 0 = off/failed).
 */
static int make_ahead_window(struct address_space *mapping, struct file *filp,
				struct file_ra_state *ra, int force)
{
	int block, ret;

	ra->ahead_size = get_next_ra_size(ra);
	ra->ahead_start = ra->start + ra->size;

	/* Block if forced, or if the reader already overran the new window. */
	block = force || (ra->prev_page >= ra->ahead_start);
	ret = blockable_page_cache_readahead(mapping, filp,
			ra->ahead_start, ra->ahead_size, ra, block);

	if (!ret && !force) {
		/* A read failure in blocking mode, implies pages are
		 * all cached. So we can safely assume we have taken
		 * care of all the pages requested in this call.
		 * A read failure in non-blocking mode, implies we are
		 * reading more pages than requested in this call.  So
		 * we safely assume we have taken care of all the pages
		 * requested in this call.
		 *
		 * Just reset the ahead window in case we failed due to
		 * congestion.  The ahead window will any way be closed
		 * in case we failed due to excessive page cache hits.
		 */
		reset_ahead_window(ra);
	}

	return ret;
}
|
||||
|
||||
/**
 * page_cache_readahead - generic adaptive readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
 * @req_size: hint: total size of the read which the caller is performing in
 *            PAGE_CACHE_SIZE units
 *
 * page_cache_readahead() is the main function.  It performs the adaptive
 * readahead window size management and submits the readahead I/O.
 *
 * Note that @filp is purely used for passing on to the ->readpage[s]()
 * handler: it may refer to a different file from @mapping (so we may not use
 * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
 * Also, @ra may not be equal to &@filp->f_ra.
 *
 * Returns the index of the first page past what has been (or is being) read.
 */
unsigned long
page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
		     struct file *filp, pgoff_t offset, unsigned long req_size)
{
	unsigned long max, newsize;
	int sequential;

	/*
	 * We avoid doing extra work and bogusly perturbing the readahead
	 * window expansion logic.
	 */
	if (offset == ra->prev_page && --req_size)
		++offset;

	/* Note that prev_page == -1 if it is a first read */
	sequential = (offset == ra->prev_page + 1);
	ra->prev_page = offset;

	max = get_max_readahead(ra);
	newsize = min(req_size, max);

	/* No readahead or sub-page sized read or file already in cache */
	if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
		goto out;

	/* Point prev_page at the last page this request will touch. */
	ra->prev_page += newsize - 1;

	/*
	 * Special case - first read at start of file.  We'll assume it's
	 * a whole-file read and grow the window fast.  Or detect first
	 * sequential access.
	 */
	if (sequential && ra->size == 0) {
		ra->size = get_init_ra_size(newsize, max);
		ra->start = offset;
		if (!blockable_page_cache_readahead(mapping, filp, offset,
							 ra->size, ra, 1))
			goto out;

		/*
		 * If the request size is larger than our max readahead, we
		 * at least want to be sure that we get 2 IOs in flight and
		 * we know that we will definitely need the new I/O.
		 * Once we do this, subsequent calls should be able to overlap
		 * IOs, thus preventing stalls.  So issue the ahead window
		 * immediately.
		 */
		if (req_size >= max)
			make_ahead_window(mapping, filp, ra, 1);

		goto out;
	}

	/*
	 * Now handle the random case:
	 * partial page reads and first access were handled above,
	 * so this must be the next page otherwise it is random
	 */
	if (!sequential) {
		ra_off(ra);
		blockable_page_cache_readahead(mapping, filp, offset,
				 newsize, ra, 1);
		goto out;
	}

	/*
	 * If we get here we are doing sequential IO and this was not the first
	 * occurrence (ie we have an existing window)
	 */
	if (ra->ahead_start == 0) {	 /* no ahead window yet */
		if (!make_ahead_window(mapping, filp, ra, 0))
			goto recheck;	/* window creation failed: clamp prev_page */
	}

	/*
	 * Already have an ahead window, check if we crossed into it.
	 * If so, shift windows and issue a new ahead window.
	 * Only return the #pages that are in the current window, so that
	 * we get called back on the first page of the ahead window which
	 * will allow us to submit more IO.
	 */
	if (ra->prev_page >= ra->ahead_start) {
		ra->start = ra->ahead_start;
		ra->size = ra->ahead_size;
		make_ahead_window(mapping, filp, ra, 0);
recheck:
		/* prev_page shouldn't overrun the ahead window */
		ra->prev_page = min(ra->prev_page,
			ra->ahead_start + ra->ahead_size - 1);
	}

out:
	return ra->prev_page + 1;
}
EXPORT_SYMBOL_GPL(page_cache_readahead);
|
||||
|
||||
/*
|
||||
* handle_ra_miss() is called when it is known that a page which should have
|
||||
* been present in the pagecache (we just did some readahead there) was in fact
|
||||
* not found. This will happen if it was evicted by the VM (readahead
|
||||
* thrashing)
|
||||
*
|
||||
* Turn on the cache miss flag in the RA struct, this will cause the RA code
|
||||
* to reduce the RA size on the next read.
|
||||
*/
|
||||
void handle_ra_miss(struct address_space *mapping,
|
||||
struct file_ra_state *ra, pgoff_t offset)
|
||||
{
|
||||
ra->flags |= RA_FLAG_MISS;
|
||||
ra->flags &= ~RA_FLAG_INCACHE;
|
||||
ra->cache_hit = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
|
||||
* sensible upper limit.
|
||||
*/
|
||||
unsigned long max_sane_readahead(unsigned long nr)
|
||||
{
|
||||
return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
|
||||
+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
|
||||
}
|
||||
941
mm/rmap.c
Normal file
941
mm/rmap.c
Normal file
@@ -0,0 +1,941 @@
|
||||
/*
|
||||
* mm/rmap.c - physical to virtual reverse mappings
|
||||
*
|
||||
* Copyright 2001, Rik van Riel <riel@conectiva.com.br>
|
||||
* Released under the General Public License (GPL).
|
||||
*
|
||||
* Simple, low overhead reverse mapping scheme.
|
||||
* Please try to keep this thing as modular as possible.
|
||||
*
|
||||
* Provides methods for unmapping each kind of mapped page:
|
||||
* the anon methods track anonymous pages, and
|
||||
* the file methods track pages belonging to an inode.
|
||||
*
|
||||
* Original design by Rik van Riel <riel@conectiva.com.br> 2001
|
||||
* File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
|
||||
* Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
|
||||
* Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
|
||||
*/
|
||||
|
||||
/*
|
||||
* Lock ordering in mm:
|
||||
*
|
||||
* inode->i_mutex (while writing or truncating, not reading or faulting)
|
||||
* inode->i_alloc_sem (vmtruncate_range)
|
||||
* mm->mmap_sem
|
||||
* page->flags PG_locked (lock_page)
|
||||
* mapping->i_mmap_lock
|
||||
* anon_vma->lock
|
||||
* mm->page_table_lock or pte_lock
|
||||
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
|
||||
* swap_lock (in swap_duplicate, swap_info_get)
|
||||
* mmlist_lock (in mmput, drain_mmlist and others)
|
||||
* mapping->private_lock (in __set_page_dirty_buffers)
|
||||
* inode_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* sb_lock (within inode_lock in fs/fs-writeback.c)
|
||||
* mapping->tree_lock (widely used, in set_page_dirty,
|
||||
* in arch-dependent flush_dcache_mmap_lock,
|
||||
* within inode_lock in __sync_single_inode)
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kallsyms.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
/* Slab cache for struct anon_vma objects; created in anon_vma_init(). */
struct kmem_cache *anon_vma_cachep;
|
||||
|
||||
/*
 * Debug-only sanity check (CONFIG_DEBUG_VM): verify that @find_vma really is
 * on its anon_vma's list, and that the list is not absurdly long.  Caller
 * must hold the anon_vma lock (or otherwise have the list stable).
 */
static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
#ifdef CONFIG_DEBUG_VM
	struct anon_vma *anon_vma = find_vma->anon_vma;
	struct vm_area_struct *vma;
	unsigned int mapcount = 0;
	int found = 0;

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		mapcount++;
		/* Arbitrary upper bound: catches list corruption/cycles. */
		BUG_ON(mapcount > 100000);
		if (vma == find_vma)
			found = 1;
	}
	BUG_ON(!found);
#endif
}
|
||||
|
||||
/*
 * Make sure @vma has an anon_vma before anonymous pages are mapped into it.
 * Tries to reuse a mergeable neighbour's anon_vma, otherwise allocates one.
 * Returns 0 on success, -ENOMEM on allocation failure.
 *
 * This must be called under the mmap_sem.
 */
int anon_vma_prepare(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	might_sleep();
	if (unlikely(!anon_vma)) {
		struct mm_struct *mm = vma->vm_mm;
		struct anon_vma *allocated, *locked;

		/* Prefer sharing an adjacent vma's anon_vma. */
		anon_vma = find_mergeable_anon_vma(vma);
		if (anon_vma) {
			allocated = NULL;
			locked = anon_vma;
			spin_lock(&locked->lock);
		} else {
			anon_vma = anon_vma_alloc();
			if (unlikely(!anon_vma))
				return -ENOMEM;
			allocated = anon_vma;
			locked = NULL;	/* a fresh anon_vma needs no lock yet */
		}

		/* page_table_lock to protect against threads */
		spin_lock(&mm->page_table_lock);
		if (likely(!vma->anon_vma)) {
			vma->anon_vma = anon_vma;
			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
			allocated = NULL;	/* our allocation was consumed */
		}
		spin_unlock(&mm->page_table_lock);

		if (locked)
			spin_unlock(&locked->lock);
		/* Another thread raced and installed one first: free ours. */
		if (unlikely(allocated))
			anon_vma_free(allocated);
	}
	return 0;
}
|
||||
|
||||
void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
|
||||
{
|
||||
BUG_ON(vma->anon_vma != next->anon_vma);
|
||||
list_del(&next->anon_vma_node);
|
||||
}
|
||||
|
||||
void __anon_vma_link(struct vm_area_struct *vma)
|
||||
{
|
||||
struct anon_vma *anon_vma = vma->anon_vma;
|
||||
|
||||
if (anon_vma) {
|
||||
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
|
||||
validate_anon_vma(vma);
|
||||
}
|
||||
}
|
||||
|
||||
void anon_vma_link(struct vm_area_struct *vma)
|
||||
{
|
||||
struct anon_vma *anon_vma = vma->anon_vma;
|
||||
|
||||
if (anon_vma) {
|
||||
spin_lock(&anon_vma->lock);
|
||||
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
|
||||
validate_anon_vma(vma);
|
||||
spin_unlock(&anon_vma->lock);
|
||||
}
|
||||
}
|
||||
|
||||
void anon_vma_unlink(struct vm_area_struct *vma)
|
||||
{
|
||||
struct anon_vma *anon_vma = vma->anon_vma;
|
||||
int empty;
|
||||
|
||||
if (!anon_vma)
|
||||
return;
|
||||
|
||||
spin_lock(&anon_vma->lock);
|
||||
validate_anon_vma(vma);
|
||||
list_del(&vma->anon_vma_node);
|
||||
|
||||
/* We must garbage collect the anon_vma if it's empty */
|
||||
empty = list_empty(&anon_vma->head);
|
||||
spin_unlock(&anon_vma->lock);
|
||||
|
||||
if (empty)
|
||||
anon_vma_free(anon_vma);
|
||||
}
|
||||
|
||||
static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
|
||||
unsigned long flags)
|
||||
{
|
||||
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
|
||||
SLAB_CTOR_CONSTRUCTOR) {
|
||||
struct anon_vma *anon_vma = data;
|
||||
|
||||
spin_lock_init(&anon_vma->lock);
|
||||
INIT_LIST_HEAD(&anon_vma->head);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Boot-time setup: create the anon_vma slab cache.  SLAB_DESTROY_BY_RCU lets
 * page_lock_anon_vma() dereference a possibly-freed anon_vma under RCU.
 */
void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
}
|
||||
|
||||
/*
 * Getting a lock on a stable anon_vma from a page off the LRU is
 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
 *
 * Returns the locked anon_vma, or NULL if the page is not (or no longer)
 * anonymously mapped.  On success the RCU read lock is left held; the caller
 * must release both via page_unlock_anon_vma().
 */
static struct anon_vma *page_lock_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	/* page->mapping doubles as an anon_vma pointer with a tag bit. */
	anon_mapping = (unsigned long) page->mapping;
	if (!(anon_mapping & PAGE_MAPPING_ANON))
		goto out;
	/* An unmapped page's anon_vma may already be on its way out. */
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);
	return anon_vma;	/* note: rcu_read_lock still held on success */
out:
	rcu_read_unlock();
	return NULL;
}
|
||||
|
||||
/*
 * Undo page_lock_anon_vma(): drop the anon_vma lock, then the RCU read lock
 * that kept the anon_vma memory stable.
 */
static void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
	spin_unlock(&anon_vma->lock);
	rcu_read_unlock();
}
|
||||
|
||||
/*
 * At what user virtual address is page expected in vma?
 *
 * Returns the virtual address, or -EFAULT (as an unsigned long — callers
 * compare with == -EFAULT) when the page falls outside the vma.
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
	/* Convert page->index from PAGE_CACHE_SIZE to PAGE_SIZE units. */
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		/* page should be within any vma from prio_tree_next */
		BUG_ON(!PageAnon(page));
		return -EFAULT;
	}
	return address;
}
|
||||
|
||||
/*
 * At what user virtual address is page expected in vma? checking that the
 * page matches the vma: currently only used on anon pages, by unuse_vma;
 *
 * Returns -EFAULT when the page does not belong to this vma's mapping.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	if (PageAnon(page)) {
		/* For anon pages, mapping is the tagged anon_vma pointer. */
		if ((void *)vma->anon_vma !=
		    (void *)page->mapping - PAGE_MAPPING_ANON)
			return -EFAULT;
	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
		/* File page: the vma must map the same address_space. */
		if (!vma->vm_file ||
		    vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	return vma_address(page, vma);
}
|
||||
|
||||
/*
 * Check that @page is mapped at @address into @mm.
 *
 * On success returns with pte mapped and locked: *ptlp is set to the pte
 * lock, which the caller releases with pte_unmap_unlock().  Returns NULL
 * when the page is not mapped there.
 */
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
			  unsigned long address, spinlock_t **ptlp)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;

	/* Walk the page-table hierarchy; bail at the first absent level. */
	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return NULL;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return NULL;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return NULL;

	pte = pte_offset_map(pmd, address);
	/* Make a quick check before getting the lock */
	if (!pte_present(*pte)) {
		pte_unmap(pte);
		return NULL;
	}

	/* Recheck under the pte lock: the pte may have changed meanwhile. */
	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
		*ptlp = ptl;
		return pte;	/* still mapped and locked */
	}
	pte_unmap_unlock(pte, ptl);
	return NULL;
}
|
||||
|
||||
/*
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 *
 * Tests and clears the "young" bit for @page in @vma, decrementing
 * *mapcount for each mapping examined.  Returns the referenced count
 * contributed by this vma (0, 1 or 2).
 */
static int page_referenced_one(struct page *page,
	struct vm_area_struct *vma, unsigned int *mapcount)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	spinlock_t *ptl;
	int referenced = 0;

	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		goto out;

	/* Hardware/software "accessed" bit was set since last check. */
	if (ptep_clear_flush_young(vma, address, pte))
		referenced++;

	/* Pretend the page is referenced if the task has the
	   swap token and is in the middle of a page fault. */
	if (mm != current->mm && has_swap_token(mm) &&
			rwsem_is_locked(&mm->mmap_sem))
		referenced++;

	(*mapcount)--;
	pte_unmap_unlock(pte, ptl);
out:
	return referenced;
}
|
||||
|
||||
/*
 * Referenced check for anonymous pages: walk every vma on the page's
 * anon_vma list and sum the per-vma referenced counts.  Stops early once
 * all mappings of the page have been accounted for.
 */
static int page_referenced_anon(struct page *page)
{
	unsigned int mapcount;
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	int referenced = 0;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return referenced;

	mapcount = page_mapcount(page);
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		referenced += page_referenced_one(page, vma, &mapcount);
		/* All known mappings visited: no point scanning further. */
		if (!mapcount)
			break;
	}

	page_unlock_anon_vma(anon_vma);
	return referenced;
}
|
||||
|
||||
/**
 * page_referenced_file - referenced check for object-based rmap
 * @page: the page we're checking references on.
 *
 * For an object-based mapped page, find all the places it is mapped and
 * check/clear the referenced flag.  This is done by following the page->mapping
 * pointer, then walking the chain of vmas it holds.  It returns the number
 * of references it found.
 *
 * This function is only called from page_referenced for object-based pages.
 */
static int page_referenced_file(struct page *page)
{
	unsigned int mapcount;
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int referenced = 0;

	/*
	 * The caller's checks on page->mapping and !PageAnon have made
	 * sure that this is a file page: the check for page->mapping
	 * excludes the case just before it gets set on an anon page.
	 */
	BUG_ON(PageAnon(page));

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_lock.
	 */
	BUG_ON(!PageLocked(page));

	spin_lock(&mapping->i_mmap_lock);

	/*
	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
	 * is more likely to be accurate if we note it after spinning.
	 */
	mapcount = page_mapcount(page);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		/* A locked, shared mapping counts as referenced outright. */
		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
				  == (VM_LOCKED|VM_MAYSHARE)) {
			referenced++;
			break;
		}
		referenced += page_referenced_one(page, vma, &mapcount);
		if (!mapcount)
			break;
	}

	spin_unlock(&mapping->i_mmap_lock);
	return referenced;
}
|
||||
|
||||
/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page, int is_locked)
{
	int referenced = 0;

	/* Architecture-level (e.g. storage-key) young bit. */
	if (page_test_and_clear_young(page))
		referenced++;

	/* Software PG_referenced flag. */
	if (TestClearPageReferenced(page))
		referenced++;

	if (page_mapped(page) && page->mapping) {
		if (PageAnon(page))
			referenced += page_referenced_anon(page);
		else if (is_locked)
			referenced += page_referenced_file(page);
		else if (TestSetPageLocked(page))
			/* Couldn't trylock: someone is using it — call that
			 * a reference rather than blocking here. */
			referenced++;
		else {
			/* We took the lock; mapping may have gone away
			 * before we got it, so recheck. */
			if (page->mapping)
				referenced += page_referenced_file(page);
			unlock_page(page);
		}
	}
	return referenced;
}
|
||||
|
||||
/*
 * Write-protect and clean @page's pte in @vma, flushing the TLB so that the
 * next write faults and re-dirties the page.  Returns 1 if the pte was
 * dirty or writable (i.e. we changed it), 0 otherwise.
 */
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	spinlock_t *ptl;
	int ret = 0;

	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		goto out;

	if (pte_dirty(*pte) || pte_write(*pte)) {
		pte_t entry;

		flush_cache_page(vma, address, pte_pfn(*pte));
		/* Clear the pte (with TLB flush) before rewriting it, so a
		 * racing hardware walker never sees a stale writable pte. */
		entry = ptep_clear_flush(vma, address, pte);
		entry = pte_wrprotect(entry);
		entry = pte_mkclean(entry);
		set_pte_at(mm, address, pte, entry);
		lazy_mmu_prot_update(entry);
		ret = 1;
	}

	pte_unmap_unlock(pte, ptl);
out:
	return ret;
}
|
||||
|
||||
/*
 * Clean @page in every shared mapping of @mapping.  Returns the number of
 * ptes that were actually cleaned (nonzero means the page was writably
 * mapped somewhere).
 */
static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = 0;

	BUG_ON(PageAnon(page));

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		/* Only shared mappings can write the file page back. */
		if (vma->vm_flags & VM_SHARED)
			ret += page_mkclean_one(page, vma);
	}
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}
|
||||
|
||||
/*
 * Write-protect all mappings of @page so future writes fault, and collect
 * any architecture-level dirty state.  Caller must hold the page lock.
 * Returns nonzero when the page was dirty/writable anywhere.
 */
int page_mkclean(struct page *page)
{
	int ret = 0;

	BUG_ON(!PageLocked(page));

	if (page_mapped(page)) {
		struct address_space *mapping = page_mapping(page);
		if (mapping)
			ret = page_mkclean_file(mapping, page);
		/* Also fold in the per-page hardware dirty bit, if any. */
		if (page_test_and_clear_dirty(page))
			ret = 1;
	}

	return ret;
}
|
||||
|
||||
/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Points page->mapping at the vma's anon_vma (tagged with
 * PAGE_MAPPING_ANON) and records the page's linear index.
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	/* anon_vma_prepare() must have run before any anon fault. */
	BUG_ON(!anon_vma);
	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;

	page->index = linear_page_index(vma, address);

	/*
	 * nr_mapped state can be updated without turning off
	 * interrupts because it is not modified via interrupt.
	 */
	__inc_zone_page_state(page, NR_ANON_PAGES);
}
|
||||
|
||||
/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * The caller needs to hold the pte lock.
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	/* _mapcount starts at -1; only the first mapper sets up the rmap. */
	if (atomic_inc_and_test(&page->_mapcount))
		__page_set_anon_rmap(page, vma, address);
	/* else checking page index and mapping is racy */
}
|
||||
|
||||
/*
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
	__page_set_anon_rmap(page, vma, address);
}
|
||||
|
||||
/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page)
{
	/* Only the transition from unmapped (-1) to mapped (0) counts. */
	if (atomic_inc_and_test(&page->_mapcount))
		__inc_zone_page_state(page, NR_FILE_MAPPED);
}
|
||||
|
||||
/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 * @vma: the vm area the mapping is being removed from (used for diagnostics)
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
{
	/* True only when this was the last mapping (count drops to -1). */
	if (atomic_add_negative(-1, &page->_mapcount)) {
		/* Underflow: more removals than additions — dump state. */
		if (unlikely(page_mapcount(page) < 0)) {
			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
			printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
			print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
			if (vma->vm_ops)
				print_symbol (KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
			if (vma->vm_file && vma->vm_file->f_op)
				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
			BUG();
		}

		/*
		 * It would be tidy to reset the PageAnon mapping here,
		 * but that might overwrite a racing page_add_anon_rmap
		 * which increments mapcount after us but sets mapping
		 * before us: so leave the reset to free_hot_cold_page,
		 * and remember that it's only reliable while mapped.
		 * Leaving it set also helps swapoff to reinstate ptes
		 * faster for those pages still in swapcache.
		 */
		if (page_test_and_clear_dirty(page))
			set_page_dirty(page);
		__dec_zone_page_state(page,
				PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
	}
}
|
||||
|
||||
/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 *
 * Removes @page's pte from @vma, recording swap or migration information in
 * the pte as appropriate.  Returns SWAP_AGAIN on success, SWAP_FAIL when
 * the page is mlocked or was recently referenced.
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
				int migration)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	int ret = SWAP_AGAIN;

	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		goto out;

	/*
	 * If the page is mlock()d, we cannot swap it out.
	 * If it's recently referenced (perhaps page_referenced
	 * skipped over this mm) then we should reactivate it.
	 */
	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
			(ptep_clear_flush_young(vma, address, pte)))) {
		ret = SWAP_FAIL;
		goto out_unmap;
	}

	/* Nuke the page table entry. */
	flush_cache_page(vma, address, page_to_pfn(page));
	pteval = ptep_clear_flush(vma, address, pte);

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pteval))
		set_page_dirty(page);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	if (PageAnon(page)) {
		/*
		 * NOTE(review): if !CONFIG_MIGRATION and the page is not in
		 * swapcache, entry holds raw page_private() and is written
		 * into the pte below — presumably callers guarantee anon
		 * pages reach here only via swapcache or migration; confirm.
		 */
		swp_entry_t entry = { .val = page_private(page) };

		if (PageSwapCache(page)) {
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			swap_duplicate(entry);
			/* Make this mm visible to swapoff's mm scan. */
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, anon_rss);
#ifdef CONFIG_MIGRATION
		} else {
			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			BUG_ON(!migration);
			entry = make_migration_entry(page, pte_write(pteval));
#endif
		}
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*pte));
	} else
#ifdef CONFIG_MIGRATION
	if (migration) {
		/* Establish migration entry for a file page */
		swp_entry_t entry;
		entry = make_migration_entry(page, pte_write(pteval));
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
	} else
#endif
		dec_mm_counter(mm, file_rss);


	page_remove_rmap(page, vma);
	page_cache_release(page);

out_unmap:
	pte_unmap_unlock(pte, ptl);
out:
	return ret;
}
|
||||
|
||||
/*
|
||||
* objrmap doesn't work for nonlinear VMAs because the assumption that
|
||||
* offset-into-file correlates with offset-into-virtual-addresses does not hold.
|
||||
* Consequently, given a particular page and its ->index, we cannot locate the
|
||||
* ptes which are mapping that page without an exhaustive linear search.
|
||||
*
|
||||
* So what this code does is a mini "virtual scan" of each nonlinear VMA which
|
||||
* maps the file to which the target page belongs. The ->vm_private_data field
|
||||
* holds the current cursor into that scan. Successive searches will circulate
|
||||
* around the vma's virtual address space.
|
||||
*
|
||||
* So as more replacement pressure is applied to the pages in a nonlinear VMA,
|
||||
* more scanning pressure is placed against them as well. Eventually pages
|
||||
* will become fully unmapped and are eligible for eviction.
|
||||
*
|
||||
* For very sparsely populated VMAs this is a little inefficient - chances are
|
||||
* there there won't be many ptes located within the scan cluster. In this case
|
||||
* maybe we could scan further - to the end of the pte page, perhaps.
|
||||
*/
|
||||
#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
|
||||
#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
|
||||
|
||||
static void try_to_unmap_cluster(unsigned long cursor,
|
||||
unsigned int *mapcount, struct vm_area_struct *vma)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
pte_t pteval;
|
||||
spinlock_t *ptl;
|
||||
struct page *page;
|
||||
unsigned long address;
|
||||
unsigned long end;
|
||||
|
||||
address = (vma->vm_start + cursor) & CLUSTER_MASK;
|
||||
end = address + CLUSTER_SIZE;
|
||||
if (address < vma->vm_start)
|
||||
address = vma->vm_start;
|
||||
if (end > vma->vm_end)
|
||||
end = vma->vm_end;
|
||||
|
||||
pgd = pgd_offset(mm, address);
|
||||
if (!pgd_present(*pgd))
|
||||
return;
|
||||
|
||||
pud = pud_offset(pgd, address);
|
||||
if (!pud_present(*pud))
|
||||
return;
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
if (!pmd_present(*pmd))
|
||||
return;
|
||||
|
||||
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
|
||||
|
||||
/* Update high watermark before we lower rss */
|
||||
update_hiwater_rss(mm);
|
||||
|
||||
for (; address < end; pte++, address += PAGE_SIZE) {
|
||||
if (!pte_present(*pte))
|
||||
continue;
|
||||
page = vm_normal_page(vma, address, *pte);
|
||||
BUG_ON(!page || PageAnon(page));
|
||||
|
||||
if (ptep_clear_flush_young(vma, address, pte))
|
||||
continue;
|
||||
|
||||
/* Nuke the page table entry. */
|
||||
flush_cache_page(vma, address, pte_pfn(*pte));
|
||||
pteval = ptep_clear_flush(vma, address, pte);
|
||||
|
||||
/* If nonlinear, store the file page offset in the pte. */
|
||||
if (page->index != linear_page_index(vma, address))
|
||||
set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
|
||||
|
||||
/* Move the dirty bit to the physical page now the pte is gone. */
|
||||
if (pte_dirty(pteval))
|
||||
set_page_dirty(page);
|
||||
|
||||
page_remove_rmap(page, vma);
|
||||
page_cache_release(page);
|
||||
dec_mm_counter(mm, file_rss);
|
||||
(*mapcount)--;
|
||||
}
|
||||
pte_unmap_unlock(pte - 1, ptl);
|
||||
}
|
||||
|
||||
static int try_to_unmap_anon(struct page *page, int migration)
|
||||
{
|
||||
struct anon_vma *anon_vma;
|
||||
struct vm_area_struct *vma;
|
||||
int ret = SWAP_AGAIN;
|
||||
|
||||
anon_vma = page_lock_anon_vma(page);
|
||||
if (!anon_vma)
|
||||
return ret;
|
||||
|
||||
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
|
||||
ret = try_to_unmap_one(page, vma, migration);
|
||||
if (ret == SWAP_FAIL || !page_mapped(page))
|
||||
break;
|
||||
}
|
||||
|
||||
page_unlock_anon_vma(anon_vma);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
 * try_to_unmap_file - unmap file page using the object-based rmap method
 * @page: the page to unmap
 * @migration: nonzero when unmapping for page migration (also unmaps
 *             VM_LOCKED nonlinear vmas)
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * Linear vmas are found via the prio tree; nonlinear vmas cannot be
 * searched by offset, so they are aged and unmapped cluster by cluster
 * using a per-vma cursor stored in vm_private_data.
 *
 * This function is only called from try_to_unmap for object-based pages.
 */
static int try_to_unmap_file(struct page *page, int migration)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;
	unsigned long cursor;
	unsigned long max_nl_cursor = 0;
	unsigned long max_nl_size = 0;
	unsigned int mapcount;

	spin_lock(&mapping->i_mmap_lock);
	/* First the easy case: linear vmas indexed by file offset. */
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		ret = try_to_unmap_one(page, vma, migration);
		if (ret == SWAP_FAIL || !page_mapped(page))
			goto out;
	}

	if (list_empty(&mapping->i_mmap_nonlinear))
		goto out;

	/*
	 * Pass 1 over the nonlinear vmas: find the furthest-advanced
	 * cursor and the largest vma size, to bound the scan below.
	 */
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
		if ((vma->vm_flags & VM_LOCKED) && !migration)
			continue;
		cursor = (unsigned long) vma->vm_private_data;
		if (cursor > max_nl_cursor)
			max_nl_cursor = cursor;
		cursor = vma->vm_end - vma->vm_start;
		if (cursor > max_nl_size)
			max_nl_size = cursor;
	}

	if (max_nl_size == 0) {	/* any nonlinears locked or reserved */
		ret = SWAP_FAIL;
		goto out;
	}

	/*
	 * We don't try to search for this page in the nonlinear vmas,
	 * and page_referenced wouldn't have found it anyway.  Instead
	 * just walk the nonlinear vmas trying to age and unmap some.
	 * The mapcount of the page we came in with is irrelevant,
	 * but even so use it as a guide to how hard we should try?
	 */
	mapcount = page_mapcount(page);
	if (!mapcount)
		goto out;
	cond_resched_lock(&mapping->i_mmap_lock);

	/* Round the scan bound up to a whole number of clusters. */
	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
	if (max_nl_cursor == 0)
		max_nl_cursor = CLUSTER_SIZE;

	/*
	 * Pass 2: repeatedly sweep each vma's cursor forward one cluster
	 * at a time, up to the current max_nl_cursor, then raise the bar,
	 * until mapcount is exhausted or every vma has been fully scanned.
	 */
	do {
		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
			if ((vma->vm_flags & VM_LOCKED) && !migration)
				continue;
			cursor = (unsigned long) vma->vm_private_data;
			while ( cursor < max_nl_cursor &&
				cursor < vma->vm_end - vma->vm_start) {
				try_to_unmap_cluster(cursor, &mapcount, vma);
				cursor += CLUSTER_SIZE;
				vma->vm_private_data = (void *) cursor;
				if ((int)mapcount <= 0)
					goto out;
			}
			vma->vm_private_data = (void *) max_nl_cursor;
		}
		cond_resched_lock(&mapping->i_mmap_lock);
		max_nl_cursor += CLUSTER_SIZE;
	} while (max_nl_cursor <= max_nl_size);

	/*
	 * Don't loop forever (perhaps all the remaining pages are
	 * in locked vmas).  Reset cursor on all unreserved nonlinear
	 * vmas, now forgetting on which ones it had fallen behind.
	 */
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		vma->vm_private_data = NULL;
out:
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}
|
||||
|
||||
/**
|
||||
* try_to_unmap - try to remove all page table mappings to a page
|
||||
* @page: the page to get unmapped
|
||||
*
|
||||
* Tries to remove all the page table entries which are mapping this
|
||||
* page, used in the pageout path. Caller must hold the page lock.
|
||||
* Return values are:
|
||||
*
|
||||
* SWAP_SUCCESS - we succeeded in removing all mappings
|
||||
* SWAP_AGAIN - we missed a mapping, try again later
|
||||
* SWAP_FAIL - the page is unswappable
|
||||
*/
|
||||
int try_to_unmap(struct page *page, int migration)
|
||||
{
|
||||
int ret;
|
||||
|
||||
BUG_ON(!PageLocked(page));
|
||||
|
||||
if (PageAnon(page))
|
||||
ret = try_to_unmap_anon(page, migration);
|
||||
else
|
||||
ret = try_to_unmap_file(page, migration);
|
||||
|
||||
if (!page_mapped(page))
|
||||
ret = SWAP_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
2605
mm/shmem.c
Normal file
2605
mm/shmem.c
Normal file
File diff suppressed because it is too large
Load Diff
197
mm/shmem_acl.c
Normal file
197
mm/shmem_acl.c
Normal file
@@ -0,0 +1,197 @@
|
||||
/*
|
||||
* mm/shmem_acl.c
|
||||
*
|
||||
* (C) 2005 Andreas Gruenbacher <agruen@suse.de>
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/generic_acl.h>
|
||||
|
||||
/**
|
||||
* shmem_get_acl - generic_acl_operations->getacl() operation
|
||||
*/
|
||||
static struct posix_acl *
|
||||
shmem_get_acl(struct inode *inode, int type)
|
||||
{
|
||||
struct posix_acl *acl = NULL;
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
switch(type) {
|
||||
case ACL_TYPE_ACCESS:
|
||||
acl = posix_acl_dup(SHMEM_I(inode)->i_acl);
|
||||
break;
|
||||
|
||||
case ACL_TYPE_DEFAULT:
|
||||
acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl);
|
||||
break;
|
||||
}
|
||||
spin_unlock(&inode->i_lock);
|
||||
|
||||
return acl;
|
||||
}
|
||||
|
||||
/**
|
||||
* shmem_set_acl - generic_acl_operations->setacl() operation
|
||||
*/
|
||||
static void
|
||||
shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
|
||||
{
|
||||
struct posix_acl *free = NULL;
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
switch(type) {
|
||||
case ACL_TYPE_ACCESS:
|
||||
free = SHMEM_I(inode)->i_acl;
|
||||
SHMEM_I(inode)->i_acl = posix_acl_dup(acl);
|
||||
break;
|
||||
|
||||
case ACL_TYPE_DEFAULT:
|
||||
free = SHMEM_I(inode)->i_default_acl;
|
||||
SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl);
|
||||
break;
|
||||
}
|
||||
spin_unlock(&inode->i_lock);
|
||||
posix_acl_release(free);
|
||||
}
|
||||
|
||||
struct generic_acl_operations shmem_acl_ops = {
|
||||
.getacl = shmem_get_acl,
|
||||
.setacl = shmem_set_acl,
|
||||
};
|
||||
|
||||
/**
|
||||
* shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
|
||||
* shmem_xattr_acl_access_handler - plumbing code to implement the
|
||||
* system.posix_acl_access xattr using the generic acl functions.
|
||||
*/
|
||||
|
||||
static size_t
|
||||
shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
|
||||
const char *name, size_t name_len)
|
||||
{
|
||||
return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
|
||||
list, list_size);
|
||||
}
|
||||
|
||||
static int
|
||||
shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
|
||||
size_t size)
|
||||
{
|
||||
if (strcmp(name, "") != 0)
|
||||
return -EINVAL;
|
||||
return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
|
||||
size);
|
||||
}
|
||||
|
||||
static int
|
||||
shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
|
||||
size_t size, int flags)
|
||||
{
|
||||
if (strcmp(name, "") != 0)
|
||||
return -EINVAL;
|
||||
return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
|
||||
size);
|
||||
}
|
||||
|
||||
struct xattr_handler shmem_xattr_acl_access_handler = {
|
||||
.prefix = POSIX_ACL_XATTR_ACCESS,
|
||||
.list = shmem_list_acl_access,
|
||||
.get = shmem_get_acl_access,
|
||||
.set = shmem_set_acl_access,
|
||||
};
|
||||
|
||||
/**
|
||||
* shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
|
||||
* shmem_xattr_acl_default_handler - plumbing code to implement the
|
||||
* system.posix_acl_default xattr using the generic acl functions.
|
||||
*/
|
||||
|
||||
static size_t
|
||||
shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
|
||||
const char *name, size_t name_len)
|
||||
{
|
||||
return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
|
||||
list, list_size);
|
||||
}
|
||||
|
||||
static int
|
||||
shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
|
||||
size_t size)
|
||||
{
|
||||
if (strcmp(name, "") != 0)
|
||||
return -EINVAL;
|
||||
return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
|
||||
size);
|
||||
}
|
||||
|
||||
static int
|
||||
shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
|
||||
size_t size, int flags)
|
||||
{
|
||||
if (strcmp(name, "") != 0)
|
||||
return -EINVAL;
|
||||
return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
|
||||
size);
|
||||
}
|
||||
|
||||
struct xattr_handler shmem_xattr_acl_default_handler = {
|
||||
.prefix = POSIX_ACL_XATTR_DEFAULT,
|
||||
.list = shmem_list_acl_default,
|
||||
.get = shmem_get_acl_default,
|
||||
.set = shmem_set_acl_default,
|
||||
};
|
||||
|
||||
/**
|
||||
 * shmem_acl_init - Initialize the acl(s) of a new inode
|
||||
*/
|
||||
int
|
||||
shmem_acl_init(struct inode *inode, struct inode *dir)
|
||||
{
|
||||
return generic_acl_init(inode, dir, &shmem_acl_ops);
|
||||
}
|
||||
|
||||
/**
|
||||
* shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode
|
||||
*
|
||||
* This is done before destroying the actual inode.
|
||||
*/
|
||||
|
||||
void
|
||||
shmem_acl_destroy_inode(struct inode *inode)
|
||||
{
|
||||
if (SHMEM_I(inode)->i_acl)
|
||||
posix_acl_release(SHMEM_I(inode)->i_acl);
|
||||
SHMEM_I(inode)->i_acl = NULL;
|
||||
if (SHMEM_I(inode)->i_default_acl)
|
||||
posix_acl_release(SHMEM_I(inode)->i_default_acl);
|
||||
SHMEM_I(inode)->i_default_acl = NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* shmem_check_acl - check_acl() callback for generic_permission()
|
||||
*/
|
||||
static int
|
||||
shmem_check_acl(struct inode *inode, int mask)
|
||||
{
|
||||
struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
|
||||
|
||||
if (acl) {
|
||||
int error = posix_acl_permission(inode, acl, mask);
|
||||
posix_acl_release(acl);
|
||||
return error;
|
||||
}
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
/**
|
||||
* shmem_permission - permission() inode operation
|
||||
*/
|
||||
int
|
||||
shmem_permission(struct inode *inode, int mask, struct nameidata *nd)
|
||||
{
|
||||
return generic_permission(inode, mask, shmem_check_acl);
|
||||
}
|
||||
348
mm/slob.c
Normal file
348
mm/slob.c
Normal file
@@ -0,0 +1,348 @@
|
||||
/*
|
||||
* SLOB Allocator: Simple List Of Blocks
|
||||
*
|
||||
* Matt Mackall <mpm@selenic.com> 12/30/03
|
||||
*
|
||||
* How SLOB works:
|
||||
*
|
||||
* The core of SLOB is a traditional K&R style heap allocator, with
|
||||
* support for returning aligned objects. The granularity of this
|
||||
* allocator is 8 bytes on x86, though it's perhaps possible to reduce
|
||||
* this to 4 if it's deemed worth the effort. The slob heap is a
|
||||
* singly-linked list of pages from __get_free_page, grown on demand
|
||||
* and allocation from the heap is currently first-fit.
|
||||
*
|
||||
* Above this is an implementation of kmalloc/kfree. Blocks returned
|
||||
* from kmalloc are 8-byte aligned and prepended with a 8-byte header.
|
||||
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
|
||||
* __get_free_pages directly so that it can return page-aligned blocks
|
||||
* and keeps a linked list of such pages and their orders. These
|
||||
* objects are detected in kfree() by their page alignment.
|
||||
*
|
||||
* SLAB is emulated on top of SLOB by simply calling constructors and
|
||||
* destructors for every SLAB allocation. Objects are returned with
|
||||
* the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
|
||||
* set, in which case the low-level allocator will fragment blocks to
|
||||
* create the proper alignment. Again, objects of page-size or greater
|
||||
* are allocated by calling __get_free_pages. As SLAB objects know
|
||||
* their size, no separate size bookkeeping is necessary and there is
|
||||
* essentially no allocation space overhead.
|
||||
*/
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/cache.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/timer.h>
|
||||
|
||||
struct slob_block {
|
||||
int units;
|
||||
struct slob_block *next;
|
||||
};
|
||||
typedef struct slob_block slob_t;
|
||||
|
||||
#define SLOB_UNIT sizeof(slob_t)
|
||||
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
|
||||
#define SLOB_ALIGN L1_CACHE_BYTES
|
||||
|
||||
struct bigblock {
|
||||
int order;
|
||||
void *pages;
|
||||
struct bigblock *next;
|
||||
};
|
||||
typedef struct bigblock bigblock_t;
|
||||
|
||||
static slob_t arena = { .next = &arena, .units = 1 };
|
||||
static slob_t *slobfree = &arena;
|
||||
static bigblock_t *bigblocks;
|
||||
static DEFINE_SPINLOCK(slob_lock);
|
||||
static DEFINE_SPINLOCK(block_lock);
|
||||
|
||||
static void slob_free(void *b, int size);
|
||||
static void slob_timer_cbk(void);
|
||||
|
||||
|
||||
/*
 * slob_alloc - first-fit allocation from the circular SLOB free list.
 *
 * Walks the list starting at slobfree, optionally fragmenting a block's
 * head to satisfy @align, and grows the arena by one page when a full
 * circuit finds no fit.  Returns the allocated block, or 0 on failure.
 * Sizes are in slob units (SLOB_UNIT bytes); protected by slob_lock.
 */
static void *slob_alloc(size_t size, gfp_t gfp, int align)
{
	slob_t *prev, *cur, *aligned = 0;
	int delta = 0, units = SLOB_UNITS(size);
	unsigned long flags;

	spin_lock_irqsave(&slob_lock, flags);
	prev = slobfree;
	for (cur = prev->next; ; prev = cur, cur = cur->next) {
		if (align) {
			/* delta = units wasted before the aligned start */
			aligned = (slob_t *)ALIGN((unsigned long)cur, align);
			delta = aligned - cur;
		}
		if (cur->units >= units + delta) { /* room enough? */
			if (delta) { /* need to fragment head to align? */
				aligned->units = cur->units - delta;
				aligned->next = cur->next;
				cur->next = aligned;
				cur->units = delta;
				prev = cur;
				cur = aligned;
			}

			if (cur->units == units) /* exact fit? */
				prev->next = cur->next; /* unlink */
			else { /* fragment */
				prev->next = cur + units;
				prev->next->units = cur->units - units;
				prev->next->next = cur->next;
				cur->units = units;
			}

			slobfree = prev;
			spin_unlock_irqrestore(&slob_lock, flags);
			return cur;
		}
		if (cur == slobfree) {
			/* Full circuit with no fit: grow the arena. */
			spin_unlock_irqrestore(&slob_lock, flags);

			if (size == PAGE_SIZE) /* trying to shrink arena? */
				return 0;

			cur = (slob_t *)__get_free_page(gfp);
			if (!cur)
				return 0;

			/* Seed the new page into the free list and retry. */
			slob_free(cur, PAGE_SIZE);
			spin_lock_irqsave(&slob_lock, flags);
			cur = slobfree;
		}
	}
}
|
||||
|
||||
static void slob_free(void *block, int size)
|
||||
{
|
||||
slob_t *cur, *b = (slob_t *)block;
|
||||
unsigned long flags;
|
||||
|
||||
if (!block)
|
||||
return;
|
||||
|
||||
if (size)
|
||||
b->units = SLOB_UNITS(size);
|
||||
|
||||
/* Find reinsertion point */
|
||||
spin_lock_irqsave(&slob_lock, flags);
|
||||
for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
|
||||
if (cur >= cur->next && (b > cur || b < cur->next))
|
||||
break;
|
||||
|
||||
if (b + b->units == cur->next) {
|
||||
b->units += cur->next->units;
|
||||
b->next = cur->next->next;
|
||||
} else
|
||||
b->next = cur->next;
|
||||
|
||||
if (cur + cur->units == b) {
|
||||
cur->units += b->units;
|
||||
cur->next = b->next;
|
||||
} else
|
||||
cur->next = b;
|
||||
|
||||
slobfree = cur;
|
||||
|
||||
spin_unlock_irqrestore(&slob_lock, flags);
|
||||
}
|
||||
|
||||
/*
 * __kmalloc - allocate @size bytes.  Small requests come from the slob
 * heap with a slob_t header prepended; requests of roughly a page or
 * more get whole pages, tracked on the bigblocks list for kfree().
 */
void *__kmalloc(size_t size, gfp_t gfp)
{
	slob_t *unit;
	bigblock_t *big;
	unsigned long flags;

	if (size < PAGE_SIZE - SLOB_UNIT) {
		unit = slob_alloc(size + SLOB_UNIT, gfp, 0);
		return unit ? (void *)(unit + 1) : 0;
	}

	big = slob_alloc(sizeof(bigblock_t), gfp, 0);
	if (!big)
		return 0;

	big->order = get_order(size);
	big->pages = (void *)__get_free_pages(gfp, big->order);
	if (!big->pages) {
		/* Page allocation failed: give the tracker back too. */
		slob_free(big, sizeof(bigblock_t));
		return 0;
	}

	spin_lock_irqsave(&block_lock, flags);
	big->next = bigblocks;
	bigblocks = big;
	spin_unlock_irqrestore(&block_lock, flags);
	return big->pages;
}
EXPORT_SYMBOL(__kmalloc);
|
||||
|
||||
void kfree(const void *block)
|
||||
{
|
||||
bigblock_t *bb, **last = &bigblocks;
|
||||
unsigned long flags;
|
||||
|
||||
if (!block)
|
||||
return;
|
||||
|
||||
if (!((unsigned long)block & (PAGE_SIZE-1))) {
|
||||
/* might be on the big block list */
|
||||
spin_lock_irqsave(&block_lock, flags);
|
||||
for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
|
||||
if (bb->pages == block) {
|
||||
*last = bb->next;
|
||||
spin_unlock_irqrestore(&block_lock, flags);
|
||||
free_pages((unsigned long)block, bb->order);
|
||||
slob_free(bb, sizeof(bigblock_t));
|
||||
return;
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&block_lock, flags);
|
||||
}
|
||||
|
||||
slob_free((slob_t *)block - 1, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(kfree);
|
||||
|
||||
unsigned int ksize(const void *block)
|
||||
{
|
||||
bigblock_t *bb;
|
||||
unsigned long flags;
|
||||
|
||||
if (!block)
|
||||
return 0;
|
||||
|
||||
if (!((unsigned long)block & (PAGE_SIZE-1))) {
|
||||
spin_lock_irqsave(&block_lock, flags);
|
||||
for (bb = bigblocks; bb; bb = bb->next)
|
||||
if (bb->pages == block) {
|
||||
spin_unlock_irqrestore(&slob_lock, flags);
|
||||
return PAGE_SIZE << bb->order;
|
||||
}
|
||||
spin_unlock_irqrestore(&block_lock, flags);
|
||||
}
|
||||
|
||||
return ((slob_t *)block - 1)->units * SLOB_UNIT;
|
||||
}
|
||||
|
||||
struct kmem_cache {
|
||||
unsigned int size, align;
|
||||
const char *name;
|
||||
void (*ctor)(void *, struct kmem_cache *, unsigned long);
|
||||
void (*dtor)(void *, struct kmem_cache *, unsigned long);
|
||||
};
|
||||
|
||||
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
|
||||
size_t align, unsigned long flags,
|
||||
void (*ctor)(void*, struct kmem_cache *, unsigned long),
|
||||
void (*dtor)(void*, struct kmem_cache *, unsigned long))
|
||||
{
|
||||
struct kmem_cache *c;
|
||||
|
||||
c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
|
||||
|
||||
if (c) {
|
||||
c->name = name;
|
||||
c->size = size;
|
||||
c->ctor = ctor;
|
||||
c->dtor = dtor;
|
||||
/* ignore alignment unless it's forced */
|
||||
c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
|
||||
if (c->align < align)
|
||||
c->align = align;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_create);
|
||||
|
||||
void kmem_cache_destroy(struct kmem_cache *c)
|
||||
{
|
||||
slob_free(c, sizeof(struct kmem_cache));
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_destroy);
|
||||
|
||||
void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
|
||||
{
|
||||
void *b;
|
||||
|
||||
if (c->size < PAGE_SIZE)
|
||||
b = slob_alloc(c->size, flags, c->align);
|
||||
else
|
||||
b = (void *)__get_free_pages(flags, get_order(c->size));
|
||||
|
||||
if (c->ctor)
|
||||
c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
|
||||
|
||||
return b;
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_alloc);
|
||||
|
||||
void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
|
||||
{
|
||||
void *ret = kmem_cache_alloc(c, flags);
|
||||
if (ret)
|
||||
memset(ret, 0, c->size);
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_zalloc);
|
||||
|
||||
void kmem_cache_free(struct kmem_cache *c, void *b)
|
||||
{
|
||||
if (c->dtor)
|
||||
c->dtor(b, c, 0);
|
||||
|
||||
if (c->size < PAGE_SIZE)
|
||||
slob_free(b, c->size);
|
||||
else
|
||||
free_pages((unsigned long)b, get_order(c->size));
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_free);
|
||||
|
||||
unsigned int kmem_cache_size(struct kmem_cache *c)
|
||||
{
|
||||
return c->size;
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_size);
|
||||
|
||||
const char *kmem_cache_name(struct kmem_cache *c)
|
||||
{
|
||||
return c->name;
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_name);
|
||||
|
||||
static struct timer_list slob_timer = TIMER_INITIALIZER(
|
||||
(void (*)(unsigned long))slob_timer_cbk, 0, 0);
|
||||
|
||||
int kmem_cache_shrink(struct kmem_cache *d)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_shrink);
|
||||
|
||||
int kmem_ptr_validate(struct kmem_cache *a, const void *b)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __init kmem_cache_init(void)
|
||||
{
|
||||
slob_timer_cbk();
|
||||
}
|
||||
|
||||
static void slob_timer_cbk(void)
|
||||
{
|
||||
void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
|
||||
|
||||
if (p)
|
||||
free_page((unsigned long)p);
|
||||
|
||||
mod_timer(&slob_timer, jiffies + HZ);
|
||||
}
|
||||
340
mm/sparse.c
Normal file
340
mm/sparse.c
Normal file
@@ -0,0 +1,340 @@
|
||||
/*
|
||||
* sparse memory mappings.
|
||||
*/
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mmzone.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <asm/dma.h>
|
||||
|
||||
/*
|
||||
* Permanent SPARSEMEM data:
|
||||
*
|
||||
* 1) mem_section - memory sections, mem_map's for valid memory
|
||||
*/
|
||||
#ifdef CONFIG_SPARSEMEM_EXTREME
|
||||
struct mem_section *mem_section[NR_SECTION_ROOTS]
|
||||
____cacheline_internodealigned_in_smp;
|
||||
#else
|
||||
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
|
||||
____cacheline_internodealigned_in_smp;
|
||||
#endif
|
||||
EXPORT_SYMBOL(mem_section);
|
||||
|
||||
#ifdef NODE_NOT_IN_PAGE_FLAGS
|
||||
/*
|
||||
* If we did not store the node number in the page then we have to
|
||||
* do a lookup in the section_to_node_table in order to find which
|
||||
* node the page belongs to.
|
||||
*/
|
||||
#if MAX_NUMNODES <= 256
|
||||
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
|
||||
#else
|
||||
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
|
||||
#endif
|
||||
|
||||
int page_to_nid(struct page *page)
|
||||
{
|
||||
return section_to_node_table[page_to_section(page)];
|
||||
}
|
||||
EXPORT_SYMBOL(page_to_nid);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SPARSEMEM_EXTREME
|
||||
static struct mem_section *sparse_index_alloc(int nid)
|
||||
{
|
||||
struct mem_section *section = NULL;
|
||||
unsigned long array_size = SECTIONS_PER_ROOT *
|
||||
sizeof(struct mem_section);
|
||||
|
||||
if (slab_is_available())
|
||||
section = kmalloc_node(array_size, GFP_KERNEL, nid);
|
||||
else
|
||||
section = alloc_bootmem_node(NODE_DATA(nid), array_size);
|
||||
|
||||
if (section)
|
||||
memset(section, 0, array_size);
|
||||
|
||||
return section;
|
||||
}
|
||||
|
||||
static int sparse_index_init(unsigned long section_nr, int nid)
|
||||
{
|
||||
static DEFINE_SPINLOCK(index_init_lock);
|
||||
unsigned long root = SECTION_NR_TO_ROOT(section_nr);
|
||||
struct mem_section *section;
|
||||
int ret = 0;
|
||||
|
||||
#ifdef NODE_NOT_IN_PAGE_FLAGS
|
||||
section_to_node_table[section_nr] = nid;
|
||||
#endif
|
||||
|
||||
if (mem_section[root])
|
||||
return -EEXIST;
|
||||
|
||||
section = sparse_index_alloc(nid);
|
||||
/*
|
||||
* This lock keeps two different sections from
|
||||
* reallocating for the same index
|
||||
*/
|
||||
spin_lock(&index_init_lock);
|
||||
|
||||
if (mem_section[root]) {
|
||||
ret = -EEXIST;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mem_section[root] = section;
|
||||
out:
|
||||
spin_unlock(&index_init_lock);
|
||||
return ret;
|
||||
}
|
||||
#else /* !SPARSEMEM_EXTREME */
|
||||
static inline int sparse_index_init(unsigned long section_nr, int nid)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Although written for the SPARSEMEM_EXTREME case, this happens
|
||||
 * to also work for the flat array case because
|
||||
* NR_SECTION_ROOTS==NR_MEM_SECTIONS.
|
||||
*/
|
||||
int __section_nr(struct mem_section* ms)
|
||||
{
|
||||
unsigned long root_nr;
|
||||
struct mem_section* root;
|
||||
|
||||
for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
|
||||
root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
|
||||
if (!root)
|
||||
continue;
|
||||
|
||||
if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
|
||||
break;
|
||||
}
|
||||
|
||||
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
|
||||
}
|
||||
|
||||
/*
|
||||
* During early boot, before section_mem_map is used for an actual
|
||||
* mem_map, we use section_mem_map to store the section's NUMA
|
||||
* node. This keeps us from having to use another data structure. The
|
||||
* node information is cleared just before we store the real mem_map.
|
||||
*/
|
||||
static inline unsigned long sparse_encode_early_nid(int nid)
|
||||
{
|
||||
return (nid << SECTION_NID_SHIFT);
|
||||
}
|
||||
|
||||
static inline int sparse_early_nid(struct mem_section *section)
|
||||
{
|
||||
return (section->section_mem_map >> SECTION_NID_SHIFT);
|
||||
}
|
||||
|
||||
/* Record a memory area against a node. */
|
||||
void memory_present(int nid, unsigned long start, unsigned long end)
|
||||
{
|
||||
unsigned long pfn;
|
||||
|
||||
start &= PAGE_SECTION_MASK;
|
||||
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
|
||||
unsigned long section = pfn_to_section_nr(pfn);
|
||||
struct mem_section *ms;
|
||||
|
||||
sparse_index_init(section, nid);
|
||||
|
||||
ms = __nr_to_section(section);
|
||||
if (!ms->section_mem_map)
|
||||
ms->section_mem_map = sparse_encode_early_nid(nid) |
|
||||
SECTION_MARKED_PRESENT;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
 * Only used by the i386 NUMA architectures, but relatively
|
||||
* generic code.
|
||||
*/
|
||||
unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
unsigned long pfn;
|
||||
unsigned long nr_pages = 0;
|
||||
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
|
||||
if (nid != early_pfn_to_nid(pfn))
|
||||
continue;
|
||||
|
||||
if (pfn_valid(pfn))
|
||||
nr_pages += PAGES_PER_SECTION;
|
||||
}
|
||||
|
||||
return nr_pages * sizeof(struct page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Subtle, we encode the real pfn into the mem_map such that
|
||||
* the identity pfn - section_mem_map will return the actual
|
||||
* physical page frame number.
|
||||
*/
|
||||
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
|
||||
{
|
||||
return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
|
||||
}
|
||||
|
||||
/*
|
||||
* We need this if we ever free the mem_maps. While not implemented yet,
|
||||
* this function is included for parity with its sibling.
|
||||
*/
|
||||
static __attribute((unused))
|
||||
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
|
||||
{
|
||||
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
|
||||
}
|
||||
|
||||
static int sparse_init_one_section(struct mem_section *ms,
|
||||
unsigned long pnum, struct page *mem_map)
|
||||
{
|
||||
if (!valid_section(ms))
|
||||
return -EINVAL;
|
||||
|
||||
ms->section_mem_map &= ~SECTION_MAP_MASK;
|
||||
ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
__attribute__((weak))
|
||||
void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
|
||||
{
|
||||
struct page *map;
|
||||
struct mem_section *ms = __nr_to_section(pnum);
|
||||
int nid = sparse_early_nid(ms);
|
||||
|
||||
map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
|
||||
if (map)
|
||||
return map;
|
||||
|
||||
map = alloc_bootmem_high_node(NODE_DATA(nid),
|
||||
sizeof(struct page) * PAGES_PER_SECTION);
|
||||
if (map)
|
||||
return map;
|
||||
|
||||
map = alloc_bootmem_node(NODE_DATA(nid),
|
||||
sizeof(struct page) * PAGES_PER_SECTION);
|
||||
if (map)
|
||||
return map;
|
||||
|
||||
printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
|
||||
ms->section_mem_map = 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
|
||||
{
|
||||
struct page *page, *ret;
|
||||
unsigned long memmap_size = sizeof(struct page) * nr_pages;
|
||||
|
||||
page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
|
||||
if (page)
|
||||
goto got_map_page;
|
||||
|
||||
ret = vmalloc(memmap_size);
|
||||
if (ret)
|
||||
goto got_map_ptr;
|
||||
|
||||
return NULL;
|
||||
got_map_page:
|
||||
ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
|
||||
got_map_ptr:
|
||||
memset(ret, 0, memmap_size);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int vaddr_in_vmalloc_area(void *addr)
|
||||
{
|
||||
if (addr >= (void *)VMALLOC_START &&
|
||||
addr < (void *)VMALLOC_END)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
|
||||
{
|
||||
if (vaddr_in_vmalloc_area(memmap))
|
||||
vfree(memmap);
|
||||
else
|
||||
free_pages((unsigned long)memmap,
|
||||
get_order(sizeof(struct page) * nr_pages));
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate the accumulated non-linear sections, allocate a mem_map
|
||||
* for each and record the physical to section mapping.
|
||||
*/
|
||||
void sparse_init(void)
|
||||
{
|
||||
unsigned long pnum;
|
||||
struct page *map;
|
||||
|
||||
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
|
||||
if (!valid_section_nr(pnum))
|
||||
continue;
|
||||
|
||||
map = sparse_early_mem_map_alloc(pnum);
|
||||
if (!map)
|
||||
continue;
|
||||
sparse_init_one_section(__nr_to_section(pnum), pnum, map);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* returns the number of sections whose mem_maps were properly
|
||||
* set. If this is <=0, then that means that the passed-in
|
||||
* map was not consumed and must be freed.
|
||||
*/
|
||||
int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
|
||||
int nr_pages)
|
||||
{
|
||||
unsigned long section_nr = pfn_to_section_nr(start_pfn);
|
||||
struct pglist_data *pgdat = zone->zone_pgdat;
|
||||
struct mem_section *ms;
|
||||
struct page *memmap;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* no locking for this, because it does its own
|
||||
* plus, it does a kmalloc
|
||||
*/
|
||||
sparse_index_init(section_nr, pgdat->node_id);
|
||||
memmap = __kmalloc_section_memmap(nr_pages);
|
||||
|
||||
pgdat_resize_lock(pgdat, &flags);
|
||||
|
||||
ms = __pfn_to_section(start_pfn);
|
||||
if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
|
||||
ret = -EEXIST;
|
||||
goto out;
|
||||
}
|
||||
ms->section_mem_map |= SECTION_MARKED_PRESENT;
|
||||
|
||||
ret = sparse_init_one_section(ms, section_nr, memmap);
|
||||
|
||||
out:
|
||||
pgdat_resize_unlock(pgdat, &flags);
|
||||
if (ret <= 0)
|
||||
__kfree_section_memmap(memmap, nr_pages);
|
||||
return ret;
|
||||
}
|
||||
520
mm/swap.c
Normal file
520
mm/swap.c
Normal file
@@ -0,0 +1,520 @@
|
||||
/*
|
||||
* linux/mm/swap.c
|
||||
*
|
||||
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
|
||||
*/
|
||||
|
||||
/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/buffer_head.h> /* for try_to_release_page() */
|
||||
#include <linux/module.h>
|
||||
#include <linux/percpu_counter.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/init.h>
|
||||
|
||||
/* How many pages do we try to swap or page in/out together? */
/* Stored as a power of two; initialized from memory size in swap_setup(). */
int page_cluster;
|
||||
|
||||
/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs. But it gets used by networking.
 *
 * Final free of a single page whose refcount reached zero: detach it
 * from its zone's LRU (under zone->lru_lock with IRQs disabled) and
 * give it back to the per-cpu hot-page allocator.
 */
static void fastcall __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	free_hot_page(page);
}
|
||||
|
||||
/*
 * Drop a reference on a compound (higher-order) page.  page_private()
 * points at the head page, which carries the refcount; when it drops
 * to zero the destructor recorded on the page tears the whole compound
 * page down.
 */
static void put_compound_page(struct page *page)
{
	page = (struct page *)page_private(page);	/* tail -> head page */
	if (put_page_testzero(page)) {
		compound_page_dtor *dtor;

		dtor = get_compound_page_dtor(page);
		(*dtor)(page);
	}
}
|
||||
|
||||
/*
 * Release one reference to @page; frees the page when the count hits
 * zero.  Compound pages are routed through their own refcounting.
 */
void put_page(struct page *page)
{
	if (unlikely(PageCompound(page))) {
		put_compound_page(page);
		return;
	}
	if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);
|
||||
|
||||
/**
|
||||
* put_pages_list(): release a list of pages
|
||||
*
|
||||
* Release a list of pages which are strung together on page.lru. Currently
|
||||
* used by read_cache_pages() and related error recovery code.
|
||||
*
|
||||
* @pages: list of pages threaded on page->lru
|
||||
*/
|
||||
void put_pages_list(struct list_head *pages)
|
||||
{
|
||||
while (!list_empty(pages)) {
|
||||
struct page *victim;
|
||||
|
||||
victim = list_entry(pages->prev, struct page, lru);
|
||||
list_del(&victim->lru);
|
||||
page_cache_release(victim);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(put_pages_list);
|
||||
|
||||
/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.  The page still has PageWriteback set, which will pin it.
 *
 * We don't expect many pages to come through here, so don't bother batching
 * things up.
 *
 * To avoid placing the page at the tail of the LRU while PG_writeback is still
 * set, this function will clear PG_writeback before performing the page
 * motion.  Do that inside the lru lock because once PG_writeback is cleared
 * we may not touch the page.
 *
 * Returns zero if it cleared PG_writeback.
 */
int rotate_reclaimable_page(struct page *page)
{
	struct zone *zone;
	unsigned long flags;

	/* Cheap, unlocked pre-checks; each bail-out leaves PG_writeback set. */
	if (PageLocked(page))
		return 1;
	if (PageDirty(page))
		return 1;
	if (PageActive(page))
		return 1;
	if (!PageLRU(page))
		return 1;

	zone = page_zone(page);
	spin_lock_irqsave(&zone->lru_lock, flags);
	/* Re-check under the lock: the page may have changed state meanwhile. */
	if (PageLRU(page) && !PageActive(page)) {
		list_move_tail(&page->lru, &zone->inactive_list);
		__count_vm_event(PGROTATED);
	}
	/* Must happen while still holding lru_lock - see comment above. */
	if (!test_clear_page_writeback(page))
		BUG();
	spin_unlock_irqrestore(&zone->lru_lock, flags);
	return 0;
}
|
||||
|
||||
/*
 * FIXME: speed this up?
 *
 * Promote @page from the zone's inactive list to its active list and
 * set PageActive, all under zone->lru_lock with IRQs disabled.  Pages
 * not currently on the inactive LRU are left untouched.
 */
void fastcall activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page)) {
		del_page_from_inactive_list(zone, page);
		SetPageActive(page);
		add_page_to_active_list(zone, page);
		__count_vm_event(PGACTIVATE);
	}
	spin_unlock_irq(&zone->lru_lock);
}
|
||||
|
||||
/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * The PG_referenced bit acts as a one-step buffer: the first access
 * only sets it, the second access (while still on the LRU) actually
 * activates the page.
 */
void fastcall mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
		/* second touch: promote and reset the reference bit */
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		/* first touch: just remember it */
		SetPageReferenced(page);
	}
}

EXPORT_SYMBOL(mark_page_accessed);
|
||||
|
||||
/**
 * lru_cache_add: add a page to the page lists
 * @page: the page to add
 */
/* Per-cpu staging buffers: pages queued here are spliced onto the
 * (in)active LRU lists in batches by __pagevec_lru_add{,_active}(). */
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
|
||||
|
||||
/*
 * Queue @page (taking an extra reference) for deferred addition to the
 * inactive LRU list; drains the per-cpu pagevec once it fills up.
 */
void fastcall lru_cache_add(struct page *page)
{
	/* get_cpu_var() disables preemption until put_cpu_var() below. */
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}
|
||||
|
||||
/*
 * Same as lru_cache_add(), but destined for the active LRU list.
 */
void fastcall lru_cache_add_active(struct page *page)
{
	/* get_cpu_var() disables preemption until put_cpu_var() below. */
	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add_active(pvec);
	put_cpu_var(lru_add_active_pvecs);
}
|
||||
|
||||
/*
 * Flush @cpu's pending lru-addition pagevecs onto the LRU lists.
 * Caller must guarantee the pagevecs cannot be touched concurrently
 * (it is the local cpu, or the cpu is dead).
 */
static void __lru_add_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);

	/* CPU is dead, so no locking needed. */
	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);
	pvec = &per_cpu(lru_add_active_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add_active(pvec);
}
|
||||
|
||||
/*
 * Drain the calling cpu's lru-addition pagevecs, pinning ourselves to
 * the cpu for the duration so the per-cpu data stays ours.
 */
void lru_add_drain(void)
{
	int cpu = get_cpu();

	__lru_add_drain(cpu);
	put_cpu();
}
|
||||
|
||||
#ifdef CONFIG_NUMA
/* Workqueue callback: drain the lru pagevecs of whichever cpu runs it. */
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 *
 * Drain every cpu's pagevecs by scheduling work on each of them.
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu);
}

#else

/*
 * Returns 0 for success
 *
 * !NUMA build: draining only the local cpu is considered sufficient.
 */
int lru_add_drain_all(void)
{
	lru_add_drain();
	return 0;
}
#endif
|
||||
|
||||
/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_cache(): we recheck the
 * page count inside the lock to see whether shrink_cache grabbed the page
 * via the LRU.  If it did, give up: shrink_cache will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;	/* zone whose lru_lock we currently hold */

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			/* Compound pages use their own refcounting; drop
			 * the lru_lock first since put may sleep-free. */
			if (zone) {
				spin_unlock_irq(&zone->lru_lock);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);
			/* Lazily switch lru_lock only when the zone changes. */
			if (pagezone != zone) {
				if (zone)
					spin_unlock_irq(&zone->lru_lock);
				zone = pagezone;
				spin_lock_irq(&zone->lru_lock);
			}
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

		if (!pagevec_add(&pages_to_free, page)) {
			/* Local pagevec full: free the batch outside the lock. */
			if (zone) {
				spin_unlock_irq(&zone->lru_lock);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
  		}
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);

	pagevec_free(&pages_to_free);
}
|
||||
|
||||
/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);	/* pvec is empty and reusable on return */
}

EXPORT_SYMBOL(__pagevec_release);
|
||||
|
||||
/*
 * pagevec_release() for pages which are known to not be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 *
 * Because no page is on the LRU, no lru_lock is needed: pages whose
 * refcount drops to zero are simply collected and freed in one batch.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
	int i;
	struct pagevec pages_to_free;

	pagevec_init(&pages_to_free, pvec->cold);
	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		VM_BUG_ON(PageLRU(page));
		if (put_page_testzero(page))
			pagevec_add(&pages_to_free, page);
	}
	pagevec_free(&pages_to_free);
	pagevec_reinit(pvec);
}
|
||||
|
||||
/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 *
 * zone->lru_lock is taken lazily and only re-acquired when consecutive
 * pages belong to different zones.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;	/* zone whose lru_lock we currently hold */

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		add_page_to_inactive_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	/* Drop the reference taken by lru_cache_add(). */
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_lru_add);
|
||||
|
||||
/*
 * Like __pagevec_lru_add(), but places the pages on the active list
 * and sets PageActive on each.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add_active(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;	/* zone whose lru_lock we currently hold */

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		add_page_to_active_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	/* Drop the reference taken by lru_cache_add_active(). */
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}
|
||||
|
||||
/*
 * Try to drop buffers from the pages in a pagevec
 *
 * Only pages we can trylock are touched; PagePrivate is re-tested
 * after the lock is taken because the buffers may have been released
 * in the meantime.
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PagePrivate(page) && !TestSetPageLocked(page)) {
			if (PagePrivate(page))	/* re-check under page lock */
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}
|
||||
|
||||
/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);
|
||||
|
||||
/**
 * pagevec_lookup_tag - gang pagecache lookup, restricted to a radix-tree tag
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @index:	In/out: start index, advanced past the last page found
 * @tag:	The radix-tree tag the pages must carry
 * @nr_pages:	The maximum number of pages
 *
 * Like pagevec_lookup(), but only returns pages tagged with @tag.
 * A reference is taken on each returned page.
 */
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* We tolerate a little inaccuracy to avoid ping-ponging the counter between
|
||||
* CPUs
|
||||
*/
|
||||
#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
|
||||
|
||||
static DEFINE_PER_CPU(long, committed_space) = 0;
|
||||
|
||||
void vm_acct_memory(long pages)
|
||||
{
|
||||
long *local;
|
||||
|
||||
preempt_disable();
|
||||
local = &__get_cpu_var(committed_space);
|
||||
*local += pages;
|
||||
if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
|
||||
atomic_add(*local, &vm_committed_space);
|
||||
*local = 0;
|
||||
}
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
|
||||
/* Drop the CPU's cached committed space back into the central pool. */
|
||||
static int cpu_swap_callback(struct notifier_block *nfb,
|
||||
unsigned long action,
|
||||
void *hcpu)
|
||||
{
|
||||
long *committed;
|
||||
|
||||
committed = &per_cpu(committed_space, (long)hcpu);
|
||||
if (action == CPU_DEAD) {
|
||||
atomic_add(*committed, &vm_committed_space);
|
||||
*committed = 0;
|
||||
__lru_add_drain((long)hcpu);
|
||||
}
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
#endif /* CONFIG_HOTPLUG_CPU */
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
/*
 * Perform any setup for the swap system
 *
 * Sizes the swap readahead cluster from the amount of physical memory
 * and registers the cpu-hotplug callback when hotplug is configured.
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system means that we
	 * _really_ don't want to cluster much more
	 */
#ifdef CONFIG_HOTPLUG_CPU
	hotcpu_notifier(cpu_swap_callback, 0);
#endif
}
|
||||
366
mm/swap_state.c
Normal file
366
mm/swap_state.c
Normal file
@@ -0,0 +1,366 @@
|
||||
/*
|
||||
* linux/mm/swap_state.c
|
||||
*
|
||||
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
|
||||
* Swap reorganised 29.12.95, Stephen Tweedie
|
||||
*
|
||||
* Rewritten to use page cache, (C) 1998 Stephen Tweedie
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/migrate.h>
|
||||
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_list, to make sync_page look nicer, and to allow
 * future use of radix_tree tags in the swap cache.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.sync_page	= block_sync_page,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.migratepage	= migrate_page,
};

/* Swap cache never does writeback accounting or dirty tracking. */
static struct backing_dev_info swap_backing_dev_info = {
	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
	.unplug_io_fn	= swap_unplug_io_fn,
};

/* The single pseudo address_space holding all swap-cache pages,
 * indexed by swp_entry_t value rather than file offset. */
struct address_space swapper_space = {
	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
	.tree_lock	= __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
	.a_ops		= &swap_aops,
	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
	.backing_dev_info = &swap_backing_dev_info,
};
|
||||
|
||||
/* Bump one of the swap_cache_info counters below (statistics only,
 * intentionally unlocked). */
#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)

/* Lifetime statistics for the swap cache, reported by
 * show_swap_cache_info(). */
static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
	unsigned long noent_race;
	unsigned long exist_race;
} swap_cache_info;
|
||||
|
||||
/* Dump swap-cache statistics and free/total swap sizes to the log. */
void show_swap_cache_info(void)
{
	printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total,
		swap_cache_info.noent_race, swap_cache_info.exist_race);
	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
|
||||
|
||||
/*
 * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 *
 * On success the page is locked, holds an extra reference, and has
 * been inserted into the swapper_space radix tree keyed by entry.val.
 * Returns 0 or a radix-tree error (-EEXIST, -ENOMEM).
 */
static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
			       gfp_t gfp_mask)
{
	int error;

	BUG_ON(PageSwapCache(page));
	BUG_ON(PagePrivate(page));
	error = radix_tree_preload(gfp_mask);
	if (!error) {
		write_lock_irq(&swapper_space.tree_lock);
		error = radix_tree_insert(&swapper_space.page_tree,
						entry.val, page);
		if (!error) {
			page_cache_get(page);
			SetPageLocked(page);
			SetPageSwapCache(page);
			set_page_private(page, entry.val);
			total_swapcache_pages++;
			/* swap cache counts as file-backed in vmstat */
			__inc_zone_page_state(page, NR_FILE_PAGES);
		}
		write_unlock_irq(&swapper_space.tree_lock);
		radix_tree_preload_end();
	}
	return error;
}
|
||||
|
||||
/*
 * Add @page to the swap cache for @entry, first taking a reference on
 * the swap entry itself.  Returns 0 on success; -ENOENT if the entry
 * was freed under us, or the __add_to_swap_cache() error otherwise.
 */
static int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int error;

	if (!swap_duplicate(entry)) {
		INC_CACHE_INFO(noent_race);
		return -ENOENT;
	}
	error = __add_to_swap_cache(page, entry, GFP_KERNEL);
	/*
	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
	 */
	if (error) {
		/* give back the swap-entry reference taken above */
		swap_free(entry);
		if (error == -EEXIST)
			INC_CACHE_INFO(exist_race);
		return error;
	}
	INC_CACHE_INFO(add_total);
	return 0;
}
|
||||
|
||||
/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 *
 * Caller holds swapper_space.tree_lock; the page must be locked, not
 * under writeback, and free of buffers.  Undoes everything
 * __add_to_swap_cache() set up except the page reference and the swap
 * entry reference, which the caller must drop.
 */
void __delete_from_swap_cache(struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(!PageSwapCache(page));
	BUG_ON(PageWriteback(page));
	BUG_ON(PagePrivate(page));

	radix_tree_delete(&swapper_space.page_tree, page_private(page));
	set_page_private(page, 0);
	ClearPageSwapCache(page);
	total_swapcache_pages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	INC_CACHE_INFO(del_total);
}
|
||||
|
||||
/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 * @gfp_mask: allocation flags for the radix-tree insertion
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 *
 * Returns 1 on success (page is uptodate, dirty, and in the swap
 * cache), 0 when no swap entry could be obtained or on -ENOMEM.
 */
int add_to_swap(struct page * page, gfp_t gfp_mask)
{
	swp_entry_t entry;
	int err;

	BUG_ON(!PageLocked(page));

	for (;;) {
		entry = get_swap_page();
		if (!entry.val)
			return 0;	/* swap space exhausted */

		/*
		 * Radix-tree node allocations from PF_MEMALLOC contexts could
		 * completely exhaust the page allocator. __GFP_NOMEMALLOC
		 * stops emergency reserves from being allocated.
		 *
		 * TODO: this could cause a theoretical memory reclaim
		 * deadlock in the swap out path.
		 */
		/*
		 * Add it to the swap cache and mark it dirty
		 */
		err = __add_to_swap_cache(page, entry,
				gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);

		switch (err) {
		case 0:				/* Success */
			SetPageUptodate(page);
			SetPageDirty(page);
			INC_CACHE_INFO(add_total);
			return 1;
		case -EEXIST:
			/* Raced with "speculative" read_swap_cache_async */
			INC_CACHE_INFO(exist_race);
			swap_free(entry);
			continue;	/* retry with a fresh entry */
		default:
			/* -ENOMEM radix-tree allocation failure */
			swap_free(entry);
			return 0;
		}
	}
}
|
||||
|
||||
/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 *
 * Takes swapper_space.tree_lock itself, then drops the swap-entry
 * reference and the swap cache's page reference.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;

	/* remember the entry before the tree no longer holds it */
	entry.val = page_private(page);

	write_lock_irq(&swapper_space.tree_lock);
	__delete_from_swap_cache(page);
	write_unlock_irq(&swapper_space.tree_lock);

	swap_free(entry);
	page_cache_release(page);
}
|
||||
|
||||
/*
 * Strange swizzling function only for use by shmem_writepage
 *
 * Atomically (GFP_ATOMIC) migrate @page from its file mapping into the
 * swap cache under @entry, transferring the page-cache reference and
 * marking the page dirty so it gets written to swap.
 */
int move_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
	if (!err) {
		remove_from_page_cache(page);
		page_cache_release(page);	/* pagecache ref */
		if (!swap_duplicate(entry))
			BUG();
		SetPageDirty(page);
		INC_CACHE_INFO(add_total);
	} else if (err == -EEXIST)
		INC_CACHE_INFO(exist_race);
	return err;
}
|
||||
|
||||
/*
 * Strange swizzling function for shmem_getpage (and shmem_unuse)
 *
 * Reverse of move_to_swap_cache(): re-attach @page to @mapping at
 * @index and drop it from the swap cache, redirtying it so the new
 * mapping writes it out.
 */
int move_from_swap_cache(struct page *page, unsigned long index,
		struct address_space *mapping)
{
	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
	if (!err) {
		delete_from_swap_cache(page);
		/* shift page from clean_pages to dirty_pages list */
		ClearPageDirty(page);
		set_page_dirty(page);
	}
	return err;
}
|
||||
|
||||
/*
 * If we are the only user, then try to free up the swap cache.
 *
 * Its ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * exclusive_swap_page() _with_ the lock.
 * 					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	/* trylock: never block here, this is a best-effort cleanup */
	if (PageSwapCache(page) && !TestSetPageLocked(page)) {
		remove_exclusive_swap_page(page);
		unlock_page(page);
	}
}
|
||||
|
||||
/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	page_cache_release(page);
}
|
||||
|
||||
/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 *
 * Processes the array in PAGEVEC_SIZE batches so release_pages() can
 * amortize its lru_lock acquisition.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;

	lru_add_drain();
	while (nr) {
		int todo = min(nr, PAGEVEC_SIZE);
		int i;

		for (i = 0; i < todo; i++)
			free_swap_cache(pagep[i]);
		release_pages(pagep, todo, 0);
		pagep += todo;
		nr -= todo;
	}
}
|
||||
|
||||
/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page * lookup_swap_cache(swp_entry_t entry)
{
	struct page *page;

	page = find_get_page(&swapper_space, entry.val);

	if (page)
		INC_CACHE_INFO(find_success);

	INC_CACHE_INFO(find_total);
	return page;
}
|
||||
|
||||
/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 *
 * @vma/@addr are used only for NUMA-aware page placement of a newly
 * allocated page.  On a cache hit the returned page holds a reference;
 * on a read, the new (locked) page is returned with I/O in flight.
 */
struct page *read_swap_cache_async(swp_entry_t entry,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;
	int err;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(&swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 * (allocated once, reused across -EEXIST retries)
		 */
		if (!new_page) {
			new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Associate the page with swap entry in the swap cache.
		 * May fail (-ENOENT) if swap entry has been freed since
		 * our caller observed it.  May fail (-EEXIST) if there
		 * is already a page associated with this entry in the
		 * swap cache: added by a racing read_swap_cache_async,
		 * or by try_to_swap_out (or shmem_writepage) re-using
		 * the just freed swap entry for an existing page.
		 * May fail (-ENOMEM) if radix-tree node allocation failed.
		 */
		err = add_to_swap_cache(new_page, entry);
		if (!err) {
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_active(new_page);
			swap_readpage(NULL, new_page);
			return new_page;
		}
	} while (err != -ENOENT && err != -ENOMEM);	/* retry only on -EEXIST */

	if (new_page)
		page_cache_release(new_page);
	return found_page;
}
|
||||
1801
mm/swapfile.c
Normal file
1801
mm/swapfile.c
Normal file
File diff suppressed because it is too large
Load Diff
80
mm/thrash.c
Normal file
80
mm/thrash.c
Normal file
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
* mm/thrash.c
|
||||
*
|
||||
* Copyright (C) 2004, Red Hat, Inc.
|
||||
* Copyright (C) 2004, Rik van Riel <riel@redhat.com>
|
||||
* Released under the GPL, see the file COPYING for details.
|
||||
*
|
||||
* Simple token based thrashing protection, using the algorithm
|
||||
* described in: http://www.cs.wm.edu/~sjiang/token.pdf
|
||||
*
|
||||
* Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
|
||||
* Improved algorithm to pass token:
|
||||
* Each task has a priority which is incremented if it contended
|
||||
* for the token in an interval less than its previous attempt.
|
||||
* If the token is acquired, that task's priority is boosted to prevent
|
||||
* the token from bouncing around too often and to let the task make
|
||||
* some progress in its execution.
|
||||
*/
|
||||
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/swap.h>
|
||||
|
||||
/* Protects swap_token_mm handoff below. */
static DEFINE_SPINLOCK(swap_token_lock);
/* The mm currently holding the swap token (NULL if unowned). */
struct mm_struct * swap_token_mm;
/* Global page-fault counter used to measure fault intervals.
 * NOTE(review): incremented without the lock; small races are
 * apparently tolerated since this only feeds a heuristic. */
static unsigned int global_faults;
|
||||
/*
 * Called on a page fault: compete for the swap token on behalf of
 * current->mm.  An mm that faults again after a shorter interval than
 * last time gains priority; the winner's priority is boosted so the
 * token does not bounce between tasks too quickly.
 */
void grab_swap_token(void)
{
	int current_interval;

	global_faults++;

	/* faults since this mm last tried to grab the token */
	current_interval = global_faults - current->mm->faultstamp;

	/* trylock: a fault path never waits for the token bookkeeping */
	if (!spin_trylock(&swap_token_lock))
		return;

	/* First come first served */
	if (swap_token_mm == NULL) {
		current->mm->token_priority = current->mm->token_priority + 2;
		swap_token_mm = current->mm;
		goto out;
	}

	if (current->mm != swap_token_mm) {
		if (current_interval < current->mm->last_interval)
			current->mm->token_priority++;
		else {
			current->mm->token_priority--;
			/* NOTE(review): if token_priority is an unsigned
			 * type in mm_struct, this < 0 test can never fire
			 * after the wrap above - confirm its declaration. */
			if (unlikely(current->mm->token_priority < 0))
				current->mm->token_priority = 0;
		}
		/* Check if we deserve the token */
		if (current->mm->token_priority >
				swap_token_mm->token_priority) {
			current->mm->token_priority += 2;
			swap_token_mm = current->mm;
		}
	} else {
		/* Token holder came in again! */
		current->mm->token_priority += 2;
	}

out:
	current->mm->faultstamp = global_faults;
	current->mm->last_interval = current_interval;
	spin_unlock(&swap_token_lock);
	return;
}
|
||||
|
||||
/* Called on process exit. */
/* Release the swap token if (and only if) @mm still holds it. */
void __put_swap_token(struct mm_struct *mm)
{
	spin_lock(&swap_token_lock);
	if (likely(mm == swap_token_mm))
		swap_token_mm = NULL;
	spin_unlock(&swap_token_lock);
}
|
||||
150
mm/tiny-shmem.c
Normal file
150
mm/tiny-shmem.c
Normal file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
* tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
|
||||
*
|
||||
* Matt Mackall <mpm@selenic.com> January, 2004
|
||||
* derived from mm/shmem.c and fs/ramfs/inode.c
|
||||
*
|
||||
* This is intended for small system where the benefits of the full
|
||||
* shmem code (swap-backed and resource-limited) are outweighed by
|
||||
* their complexity. On systems without swap this code should be
|
||||
* effectively equivalent, but much lighter weight.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/vfs.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/ramfs.h>
|
||||
|
||||
/* Minimal tmpfs: plain ramfs registered under the "tmpfs" name. */
static struct file_system_type tmpfs_fs_type = {
	.name = "tmpfs",
	.get_sb = ramfs_get_sb,
	.kill_sb = kill_litter_super,
};

/* Kernel-internal tmpfs mount used by shmem_file_setup() below. */
static struct vfsmount *shm_mnt;
||||
/* Register and internally mount the tmpfs instance at boot. */
static int __init init_tmpfs(void)
{
	/* The kernel cannot run without tmpfs; failure here is fatal. */
	BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);

	shm_mnt = kern_mount(&tmpfs_fs_type);
	BUG_ON(IS_ERR(shm_mnt));

	return 0;
}
module_init(init_tmpfs)
|
||||
|
||||
/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: vm flags (not used by this implementation)
 *
 * Returns the new file, or an ERR_PTR() on failure.  The caller owns
 * the returned reference and must fput() it.
 */
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
{
	int error;
	struct file *file;
	struct inode *inode;
	struct dentry *dentry, *root;
	struct qstr this;

	/* The internal mount itself may have failed to come up. */
	if (IS_ERR(shm_mnt))
		return (void *)shm_mnt;

	error = -ENOMEM;
	this.name = name;
	this.len = strlen(name);
	this.hash = 0; /* will go */
	root = shm_mnt->mnt_root;
	dentry = d_alloc(root, &this);
	if (!dentry)
		goto put_memory;

	error = -ENFILE;
	file = get_empty_filp();
	if (!file)
		goto put_dentry;

	error = -ENOSPC;
	inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
	if (!inode)
		goto close_file;

	d_instantiate(dentry, inode);
	inode->i_nlink = 0; /* It is unlinked */

	/* Wire the file up to the ramfs inode and operations. */
	file->f_path.mnt = mntget(shm_mnt);
	file->f_path.dentry = dentry;
	file->f_mapping = inode->i_mapping;
	file->f_op = &ramfs_file_operations;
	file->f_mode = FMODE_WRITE | FMODE_READ;

	/* notify everyone as to the change of file size */
	error = do_truncate(dentry, size, 0, file);
	if (error < 0)
		goto close_file;

	return file;

	/* Unwind in reverse order of acquisition. */
close_file:
	put_filp(file);
put_dentry:
	dput(dentry);
put_memory:
	return ERR_PTR(error);
}
|
||||
|
||||
/*
|
||||
* shmem_zero_setup - setup a shared anonymous mapping
|
||||
*
|
||||
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
|
||||
*/
|
||||
int shmem_zero_setup(struct vm_area_struct *vma)
|
||||
{
|
||||
struct file *file;
|
||||
loff_t size = vma->vm_end - vma->vm_start;
|
||||
|
||||
file = shmem_file_setup("dev/zero", size, vma->vm_flags);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
|
||||
if (vma->vm_file)
|
||||
fput(vma->vm_file);
|
||||
vma->vm_file = file;
|
||||
vma->vm_ops = &generic_file_vm_ops;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* tiny-shmem has no swap backing, so there is never anything to unuse. */
int shmem_unuse(swp_entry_t entry, struct page *page)
{
	return 0;
}
|
||||
|
||||
#if 0
|
||||
int shmem_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
file_accessed(file);
|
||||
#ifndef CONFIG_MMU
|
||||
return ramfs_nommu_mmap(file, vma);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
#endif /* 0 */
|
||||
|
||||
#ifndef CONFIG_MMU
/* On nommu kernels, delegate unmapped-area placement to ramfs. */
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr,
				      unsigned long len,
				      unsigned long pgoff,
				      unsigned long flags)
{
	return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif
|
||||
444
mm/truncate.c
Normal file
444
mm/truncate.c
Normal file
@@ -0,0 +1,444 @@
|
||||
/*
|
||||
* mm/truncate.c - code for taking down pages from address_spaces
|
||||
*
|
||||
* Copyright (C) 2002, Linus Torvalds
|
||||
*
|
||||
* 10Sep2002 akpm@zip.com.au
|
||||
* Initial version.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
#include <linux/buffer_head.h> /* grr. try_to_release_page,
|
||||
do_invalidatepage */
|
||||
|
||||
|
||||
/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned long offset)
{
	void (*invalidatepage)(struct page *, unsigned long);
	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	/* Fall back to the generic buffer-head helper. */
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset);
}
|
||||
|
||||
/* Zero the tail of a partially-truncated page and drop fs metadata there. */
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
	if (PagePrivate(page))
		do_invalidatepage(page, partial);
}
|
||||
|
||||
/*
 * This cancels just the dirty bit on the kernel page itself, it
 * does NOT actually remove dirty bits on any mmap's that may be
 * around. It also leaves the page tagged dirty, so any sync
 * activity will still find it on the dirty lists, and in particular,
 * clear_page_dirty_for_io() will still look at the dirty bits in
 * the VM.
 *
 * Doing this should *normally* only ever be done when a page
 * is truncated, and is not actually mapped anywhere at all. However,
 * fs/buffer.c does this when it notices that somebody has cleaned
 * out all the buffers on a page without actually doing it through
 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
 */
void cancel_dirty_page(struct page *page, unsigned int account_size)
{
	if (TestClearPageDirty(page)) {
		struct address_space *mapping = page->mapping;
		if (mapping && mapping_cap_account_dirty(mapping)) {
			/* Undo the dirty-page accounting charged earlier. */
			dec_zone_page_state(page, NR_FILE_DIRTY);
			if (account_size)
				task_io_account_cancelled_write(account_size);
		}
	}
}
EXPORT_SYMBOL(cancel_dirty_page);
|
||||
|
||||
/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes anonymous.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_nopage().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static void
truncate_complete_page(struct address_space *mapping, struct page *page)
{
	/* Page was reclaimed or swizzled under us - nothing to do. */
	if (page->mapping != mapping)
		return;

	cancel_dirty_page(page, PAGE_CACHE_SIZE);

	if (PagePrivate(page))
		do_invalidatepage(page, 0);

	ClearPageUptodate(page);
	ClearPageMappedToDisk(page);
	remove_from_page_cache(page);
	page_cache_release(page); /* pagecache ref */
}
|
||||
|
||||
/*
 * This is for invalidate_mapping_pages().  That function can be called at
 * any time, and is not supposed to throw away dirty pages.  But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;

	/* Leave the page alone if the fs can't drop its private state. */
	if (PagePrivate(page) && !try_to_release_page(page, 0))
		return 0;

	ret = remove_mapping(mapping, page);

	return ret;
}
|
||||
|
||||
/**
 * truncate_inode_pages_range - truncate range of pages specified by start and
 * end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial page
 * (if lstart is not page aligned)).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * When looking at page->index outside the page lock we need to be careful to
 * copy it into a local to avoid races (it could change at any time).
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	pgoff_t end;
	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t next;
	int i;

	if (mapping->nrpages == 0)
		return;

	/* The range must end one byte short of a page boundary. */
	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
	end = (lend >> PAGE_CACHE_SHIFT);

	pagevec_init(&pvec, 0);
	next = start;
	/* Pass 1: nonblocking - skip locked and under-writeback pages. */
	while (next <= end &&
	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index = page->index;

			if (page_index > end) {
				next = page_index;
				break;
			}

			if (page_index > next)
				next = page_index;
			next++;
			if (TestSetPageLocked(page))
				continue;
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_complete_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/* Zero out the tail of a partially-covered first page. */
	if (partial) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			wait_on_page_writeback(page);
			truncate_partial_page(page, partial);
			unlock_page(page);
			page_cache_release(page);
		}
	}

	/* Pass 2: blocking - wait on locks/writeback, loop until empty. */
	next = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			if (next == start)
				break;
			/* Restart the scan to catch pages we skipped. */
			next = start;
			continue;
		}
		if (pvec.pages[0]->index > end) {
			pagevec_release(&pvec);
			break;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			if (page->index > end)
				break;
			lock_page(page);
			wait_on_page_writeback(page);
			if (page->index > next)
				next = page->index;
			next++;
			truncate_complete_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
	}
}
EXPORT_SYMBOL(truncate_inode_pages_range);
|
||||
|
||||
/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	/* (loff_t)-1 ends exactly one byte short of a page boundary. */
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);
|
||||
|
||||
/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages, if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 *
 * Returns the number of pages successfully invalidated.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
				pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t next = start;
	unsigned long ret = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (next <= end &&
			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t index;
			int lock_failed;

			lock_failed = TestSetPageLocked(page);

			/*
			 * We really shouldn't be looking at the ->index of an
			 * unlocked page.  But we're not allowed to lock these
			 * pages.  So we rely upon nobody altering the ->index
			 * of this (pinned-by-us) page.
			 */
			index = page->index;
			if (index > next)
				next = index;
			next++;
			if (lock_failed)
				continue;

			/* Skip anything that still has work pending on it. */
			if (PageDirty(page) || PageWriteback(page))
				goto unlock;
			if (page_mapped(page))
				goto unlock;
			ret += invalidate_complete_page(mapping, page);
unlock:
			unlock_page(page);
			if (next > end)
				break;
		}
		pagevec_release(&pvec);
	}
	return ret;
}
EXPORT_SYMBOL(invalidate_mapping_pages);
|
||||
|
||||
/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_list() has a temp ref on them, or because they're transiently sitting
 * in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return 0;

	if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
		return 0;

	write_lock_irq(&mapping->tree_lock);
	/* Recheck dirtiness under the tree lock so no data is lost. */
	if (PageDirty(page))
		goto failed;

	/* try_to_release_page() above must have stripped the buffers. */
	BUG_ON(PagePrivate(page));
	__remove_from_page_cache(page);
	write_unlock_irq(&mapping->tree_lock);
	ClearPageUptodate(page);
	page_cache_release(page);	/* pagecache ref */
	return 1;
failed:
	write_unlock_irq(&mapping->tree_lock);
	return 0;
}
|
||||
|
||||
static int do_launder_page(struct address_space *mapping, struct page *page)
|
||||
{
|
||||
if (!PageDirty(page))
|
||||
return 0;
|
||||
if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
|
||||
return 0;
|
||||
return mapping->a_ops->launder_page(page);
|
||||
}
|
||||
|
||||
/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EIO if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t next;
	int i;
	int ret = 0;
	int did_range_unmap = 0;
	int wrapped = 0;		/* set when next overflows to 0 */

	pagevec_init(&pvec, 0);
	next = start;
	while (next <= end && !wrapped &&
		pagevec_lookup(&pvec, mapping, next,
			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index;

			lock_page(page);
			/* Page moved to another mapping while unlocked. */
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			page_index = page->index;
			next = page_index + 1;
			if (next == 0)
				wrapped = 1;
			if (page_index > end) {
				unlock_page(page);
				break;
			}
			wait_on_page_writeback(page);
			while (page_mapped(page)) {
				if (!did_range_unmap) {
					/*
					 * Zap the rest of the file in one hit.
					 */
					unmap_mapping_range(mapping,
					  (loff_t)page_index<<PAGE_CACHE_SHIFT,
					  (loff_t)(end - page_index + 1)
							<< PAGE_CACHE_SHIFT,
					    0);
					did_range_unmap = 1;
				} else {
					/*
					 * Just zap this page
					 */
					unmap_mapping_range(mapping,
					  (loff_t)page_index<<PAGE_CACHE_SHIFT,
					  PAGE_CACHE_SIZE, 0);
				}
			}
			ret = do_launder_page(mapping, page);
			if (ret == 0 && !invalidate_complete_page2(mapping, page))
				ret = -EIO;
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
|
||||
|
||||
/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EIO if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
	/* (pgoff_t)-1 covers every possible page offset. */
	return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
|
||||
94
mm/util.c
Normal file
94
mm/util.c
Normal file
@@ -0,0 +1,94 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/err.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
/**
|
||||
* __kzalloc - allocate memory. The memory is set to zero.
|
||||
* @size: how many bytes of memory are required.
|
||||
* @flags: the type of memory to allocate.
|
||||
*/
|
||||
void *__kzalloc(size_t size, gfp_t flags)
|
||||
{
|
||||
void *ret = kmalloc_track_caller(size, flags);
|
||||
if (ret)
|
||||
memset(ret, 0, size);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(__kzalloc);
|
||||
|
||||
/*
|
||||
* kstrdup - allocate space for and copy an existing string
|
||||
*
|
||||
* @s: the string to duplicate
|
||||
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
||||
*/
|
||||
char *kstrdup(const char *s, gfp_t gfp)
|
||||
{
|
||||
size_t len;
|
||||
char *buf;
|
||||
|
||||
if (!s)
|
||||
return NULL;
|
||||
|
||||
len = strlen(s) + 1;
|
||||
buf = kmalloc_track_caller(len, gfp);
|
||||
if (buf)
|
||||
memcpy(buf, s, len);
|
||||
return buf;
|
||||
}
|
||||
EXPORT_SYMBOL(kstrdup);
|
||||
|
||||
/**
|
||||
* kmemdup - duplicate region of memory
|
||||
*
|
||||
* @src: memory region to duplicate
|
||||
* @len: memory region length
|
||||
* @gfp: GFP mask to use
|
||||
*/
|
||||
void *kmemdup(const void *src, size_t len, gfp_t gfp)
|
||||
{
|
||||
void *p;
|
||||
|
||||
p = kmalloc_track_caller(len, gfp);
|
||||
if (p)
|
||||
memcpy(p, src, len);
|
||||
return p;
|
||||
}
|
||||
EXPORT_SYMBOL(kmemdup);
|
||||
|
||||
/*
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Returns the kernel copy, or an ERR_PTR():
 *   -EFAULT if the user pointer is unreadable,
 *   -EINVAL if the string (including its NUL) is longer than @n,
 *   -ENOMEM if the allocation fails.
 */
char *strndup_user(const char __user *s, long n)
{
	char *p;
	long length;

	/* strnlen_user() counts the terminating NUL; 0 means fault. */
	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	if (length > n)
		return ERR_PTR(-EINVAL);

	p = kmalloc(length, GFP_KERNEL);

	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, s, length)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}

	/* Force termination: userspace may have changed the string
	 * between strnlen_user() and copy_from_user(). */
	p[length - 1] = '\0';

	return p;
}
EXPORT_SYMBOL(strndup_user);
|
||||
749
mm/vmalloc.c
Normal file
749
mm/vmalloc.c
Normal file
@@ -0,0 +1,749 @@
|
||||
/*
|
||||
* linux/mm/vmalloc.c
|
||||
*
|
||||
* Copyright (C) 1993 Linus Torvalds
|
||||
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
|
||||
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
|
||||
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
|
||||
* Numa awareness, Christoph Lameter, SGI, June 2005
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/interrupt.h>
|
||||
|
||||
#include <linux/vmalloc.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
|
||||
/* Protects vmlist, the address-ordered singly-linked list of vm areas. */
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;

/* Forward declaration: used recursively by __vmalloc_area_node(). */
static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
			    int node);
|
||||
|
||||
/* Clear every PTE in [addr, end) within one pmd of the kernel page tables. */
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		/* Anything here should have been either empty or mapped. */
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
}
|
||||
|
||||
/* Walk the pmds of one pud, clearing the PTEs under each populated pmd. */
static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
						unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);
}
|
||||
|
||||
/* Walk the puds of one pgd, clearing the mappings under each populated pud. */
static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
						unsigned long end)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next);
	} while (pud++, addr = next, addr != end);
}
|
||||
|
||||
/* Tear down the kernel page-table mappings covering @area (incl. guard
 * page, since area->size still contains it here) and flush the TLB. */
void unmap_vm_area(struct vm_struct *area)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = (unsigned long) area->addr;
	unsigned long end = addr + area->size;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	flush_cache_vunmap(addr, end);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_pud_range(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
	flush_tlb_kernel_range((unsigned long) area->addr, end);
}
|
||||
|
||||
/*
 * Install PTEs for [addr, end) within one pmd, consuming pages from the
 * caller's cursor (*pages is advanced past each page used).
 */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page ***pages)
{
	pte_t *pte;

	pte = pte_alloc_kernel(pmd, addr);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = **pages;
		/* The slot must not already be mapped. */
		WARN_ON(!pte_none(*pte));
		if (!page)
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*pages)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	return 0;
}
|
||||
|
||||
/* Allocate/walk pmds for one pud and populate the PTEs beneath them. */
static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page ***pages)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc(&init_mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pte_range(pmd, addr, next, prot, pages))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
|
||||
|
||||
/* Allocate/walk puds for one pgd and populate the levels beneath them. */
static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page ***pages)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc(&init_mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pmd_range(pud, addr, next, prot, pages))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}
|
||||
|
||||
/*
 * Map the caller's pages into @area's virtual range.  The trailing guard
 * page is deliberately excluded (area->size - PAGE_SIZE).  @pages is a
 * cursor and is advanced past the pages consumed.  Returns 0 or -ENOMEM.
 */
int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = (unsigned long) area->addr;
	unsigned long end = addr + area->size - PAGE_SIZE;
	int err;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_pud_range(pgd, addr, next, prot, pages);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);
	flush_cache_vmap((unsigned long) area->addr, end);
	return err;
}
|
||||
|
||||
/*
 * Find a free gap of @size (+ guard page) in [start, end), allocate a
 * vm_struct for it and link it into the address-ordered vmlist.
 * Returns the new area, or NULL on failure.  Must not be called from
 * interrupt context (allocates and takes vmlist_lock for write).
 */
static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
					    unsigned long start, unsigned long end,
					    int node, gfp_t gfp_mask)
{
	struct vm_struct **p, *tmp, *area;
	unsigned long align = 1;
	unsigned long addr;

	BUG_ON(in_interrupt());
	if (flags & VM_IOREMAP) {
		/* ioremap regions get a size-proportional alignment,
		 * clamped to [PAGE_SHIFT, IOREMAP_MAX_ORDER]. */
		int bit = fls(size);

		if (bit > IOREMAP_MAX_ORDER)
			bit = IOREMAP_MAX_ORDER;
		else if (bit < PAGE_SHIFT)
			bit = PAGE_SHIFT;

		align = 1ul << bit;
	}
	addr = ALIGN(start, align);
	size = PAGE_ALIGN(size);
	if (unlikely(!size))
		return NULL;

	area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node);
	if (unlikely(!area))
		return NULL;

	/*
	 * We always allocate a guard page.
	 */
	size += PAGE_SIZE;

	write_lock(&vmlist_lock);
	for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
		if ((unsigned long)tmp->addr < addr) {
			if((unsigned long)tmp->addr + tmp->size >= addr)
				addr = ALIGN(tmp->size +
					     (unsigned long)tmp->addr, align);
			continue;
		}
		if ((size + addr) < addr)	/* overflow check */
			goto out;
		if (size + addr <= (unsigned long)tmp->addr)
			goto found;
		addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
		if (addr > end - size)
			goto out;
	}
	/*
	 * NOTE(review): falling out of the loop (empty list, or gap after
	 * the last entry) reaches "found" without re-checking that
	 * addr + size <= end - confirm this bound is guaranteed by callers.
	 */

found:
	/* Insert before *p, keeping vmlist sorted by address. */
	area->next = *p;
	*p = area;

	area->flags = flags;
	area->addr = (void *)addr;
	area->size = size;
	area->pages = NULL;
	area->nr_pages = 0;
	area->phys_addr = 0;
	write_unlock(&vmlist_lock);

	return area;

out:
	write_unlock(&vmlist_lock);
	kfree(area);
	if (printk_ratelimit())
		printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
	return NULL;
}
|
||||
|
||||
/* Node -1 means "any node"; internal allocations use GFP_KERNEL. */
struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
				unsigned long start, unsigned long end)
{
	return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
}
|
||||
|
||||
/**
 * get_vm_area - reserve a contiguous kernel virtual area
 * @size: size of the area
 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 * Search an area of @size in the kernel virtual mapping area,
 * and reserve it for our purposes.  Returns the area descriptor
 * on success or %NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
	return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
}
|
||||
|
||||
/* NUMA-aware variant of get_vm_area(): the vm_struct itself is
 * allocated on @node with @gfp_mask. */
struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
				   int node, gfp_t gfp_mask)
{
	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
				  gfp_mask);
}
|
||||
|
||||
/* Caller must hold vmlist_lock */
|
||||
static struct vm_struct *__find_vm_area(void *addr)
|
||||
{
|
||||
struct vm_struct *tmp;
|
||||
|
||||
for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
|
||||
if (tmp->addr == addr)
|
||||
break;
|
||||
}
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/* Caller must hold vmlist_lock.  Unmaps and unlinks the area based at
 * @addr; returns it (with the guard page subtracted from size) or NULL. */
static struct vm_struct *__remove_vm_area(void *addr)
{
	struct vm_struct **p, *tmp;

	for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
		if (tmp->addr == addr)
			goto found;
	}
	return NULL;

found:
	unmap_vm_area(tmp);
	*p = tmp->next;

	/*
	 * Remove the guard page.
	 */
	tmp->size -= PAGE_SIZE;
	return tmp;
}
|
||||
|
||||
/**
 * remove_vm_area - find and remove a contiguous kernel virtual area
 * @addr: base address
 *
 * Search for the kernel VM area starting at @addr, and remove it.
 * This function returns the found VM area, but using it is NOT safe
 * on SMP machines, except for its size or flags.
 */
struct vm_struct *remove_vm_area(void *addr)
{
	struct vm_struct *v;
	write_lock(&vmlist_lock);
	v = __remove_vm_area(addr);
	write_unlock(&vmlist_lock);
	return v;
}
|
||||
|
||||
/*
 * Common teardown for vfree()/vunmap(): unmap and unlink the area, and
 * when @deallocate_pages is set, also free the backing pages and the
 * pages[] array (which may itself live in vmalloc space - VM_VPAGES).
 */
void __vunmap(void *addr, int deallocate_pages)
{
	struct vm_struct *area;

	if (!addr)
		return;

	/* vmalloc addresses are always page aligned. */
	if ((PAGE_SIZE-1) & (unsigned long)addr) {
		printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
		WARN_ON(1);
		return;
	}

	area = remove_vm_area(addr);
	if (unlikely(!area)) {
		printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
				addr);
		WARN_ON(1);
		return;
	}

	debug_check_no_locks_freed(addr, area->size);

	if (deallocate_pages) {
		int i;

		for (i = 0; i < area->nr_pages; i++) {
			BUG_ON(!area->pages[i]);
			__free_page(area->pages[i]);
		}

		/* The pages[] array was vmalloc'ed when it was too big
		 * for kmalloc - free it the matching way. */
		if (area->flags & VM_VPAGES)
			vfree(area->pages);
		else
			kfree(area->pages);
	}

	kfree(area);
	return;
}
|
||||
|
||||
/**
 * vfree - release memory allocated by vmalloc()
 * @addr: memory base address
 *
 * Free the virtually contiguous memory area starting at @addr, as
 * obtained from vmalloc(), vmalloc_32() or __vmalloc().  If @addr is
 * NULL, no operation is performed.
 *
 * Must not be called in interrupt context.
 */
void vfree(void *addr)
{
	BUG_ON(in_interrupt());
	__vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
|
||||
|
||||
/**
 * vunmap - release virtual mapping obtained by vmap()
 * @addr: memory base address
 *
 * Free the virtually contiguous area starting at @addr which was
 * created from the page array passed to vmap().  The pages themselves
 * remain owned by the caller.
 *
 * Must not be called in interrupt context.
 */
void vunmap(void *addr)
{
	BUG_ON(in_interrupt());
	/* 0 == unmap only; the caller keeps ownership of the pages. */
	__vunmap(addr, 0);
}
EXPORT_SYMBOL(vunmap);
|
||||
|
||||
/**
|
||||
* vmap - map an array of pages into virtually contiguous space
|
||||
* @pages: array of page pointers
|
||||
* @count: number of pages to map
|
||||
* @flags: vm_area->flags
|
||||
* @prot: page protection for the mapping
|
||||
*
|
||||
* Maps @count pages from @pages into contiguous kernel virtual
|
||||
* space.
|
||||
*/
|
||||
void *vmap(struct page **pages, unsigned int count,
|
||||
unsigned long flags, pgprot_t prot)
|
||||
{
|
||||
struct vm_struct *area;
|
||||
|
||||
if (count > num_physpages)
|
||||
return NULL;
|
||||
|
||||
area = get_vm_area((count << PAGE_SHIFT), flags);
|
||||
if (!area)
|
||||
return NULL;
|
||||
if (map_vm_area(area, prot, &pages)) {
|
||||
vunmap(area->addr);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return area->addr;
|
||||
}
|
||||
EXPORT_SYMBOL(vmap);
|
||||
|
||||
void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
|
||||
pgprot_t prot, int node)
|
||||
{
|
||||
struct page **pages;
|
||||
unsigned int nr_pages, array_size, i;
|
||||
|
||||
nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
|
||||
array_size = (nr_pages * sizeof(struct page *));
|
||||
|
||||
area->nr_pages = nr_pages;
|
||||
/* Please note that the recursion is strictly bounded. */
|
||||
if (array_size > PAGE_SIZE) {
|
||||
pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
|
||||
area->flags |= VM_VPAGES;
|
||||
} else {
|
||||
pages = kmalloc_node(array_size,
|
||||
(gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)),
|
||||
node);
|
||||
}
|
||||
area->pages = pages;
|
||||
if (!area->pages) {
|
||||
remove_vm_area(area->addr);
|
||||
kfree(area);
|
||||
return NULL;
|
||||
}
|
||||
memset(area->pages, 0, array_size);
|
||||
|
||||
for (i = 0; i < area->nr_pages; i++) {
|
||||
if (node < 0)
|
||||
area->pages[i] = alloc_page(gfp_mask);
|
||||
else
|
||||
area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
|
||||
if (unlikely(!area->pages[i])) {
|
||||
/* Successfully allocated i pages, free them in __vunmap() */
|
||||
area->nr_pages = i;
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
if (map_vm_area(area, prot, &pages))
|
||||
goto fail;
|
||||
return area->addr;
|
||||
|
||||
fail:
|
||||
vfree(area->addr);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Node-agnostic wrapper: back @area from any node (-1 == no preference). */
void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
{
	return __vmalloc_area_node(area, gfp_mask, prot, -1);
}
|
||||
|
||||
/**
|
||||
* __vmalloc_node - allocate virtually contiguous memory
|
||||
* @size: allocation size
|
||||
* @gfp_mask: flags for the page level allocator
|
||||
* @prot: protection mask for the allocated pages
|
||||
* @node: node to use for allocation or -1
|
||||
*
|
||||
* Allocate enough pages to cover @size from the page level
|
||||
* allocator with @gfp_mask flags. Map them into contiguous
|
||||
* kernel virtual space, using a pagetable protection of @prot.
|
||||
*/
|
||||
static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
|
||||
int node)
|
||||
{
|
||||
struct vm_struct *area;
|
||||
|
||||
size = PAGE_ALIGN(size);
|
||||
if (!size || (size >> PAGE_SHIFT) > num_physpages)
|
||||
return NULL;
|
||||
|
||||
area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask);
|
||||
if (!area)
|
||||
return NULL;
|
||||
|
||||
return __vmalloc_area_node(area, gfp_mask, prot, node);
|
||||
}
|
||||
|
||||
/* Node-agnostic front end for __vmalloc_node(). */
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
	return __vmalloc_node(size, gfp_mask, prot, -1);
}
EXPORT_SYMBOL(__vmalloc);
|
||||
|
||||
/**
|
||||
* vmalloc - allocate virtually contiguous memory
|
||||
* @size: allocation size
|
||||
* Allocate enough pages to cover @size from the page level
|
||||
* allocator and map them into contiguous kernel virtual space.
|
||||
*
|
||||
* For tight control over page level allocator and protection flags
|
||||
* use __vmalloc() instead.
|
||||
*/
|
||||
void *vmalloc(unsigned long size)
|
||||
{
|
||||
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
|
||||
}
|
||||
EXPORT_SYMBOL(vmalloc);
|
||||
|
||||
/**
|
||||
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
|
||||
* @size: allocation size
|
||||
*
|
||||
* The resulting memory area is zeroed so it can be mapped to userspace
|
||||
* without leaking data.
|
||||
*/
|
||||
void *vmalloc_user(unsigned long size)
|
||||
{
|
||||
struct vm_struct *area;
|
||||
void *ret;
|
||||
|
||||
ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
|
||||
if (ret) {
|
||||
write_lock(&vmlist_lock);
|
||||
area = __find_vm_area(ret);
|
||||
area->flags |= VM_USERMAP;
|
||||
write_unlock(&vmlist_lock);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(vmalloc_user);
|
||||
|
||||
/**
|
||||
* vmalloc_node - allocate memory on a specific node
|
||||
* @size: allocation size
|
||||
* @node: numa node
|
||||
*
|
||||
* Allocate enough pages to cover @size from the page level
|
||||
* allocator and map them into contiguous kernel virtual space.
|
||||
*
|
||||
* For tight control over page level allocator and protection flags
|
||||
* use __vmalloc() instead.
|
||||
*/
|
||||
void *vmalloc_node(unsigned long size, int node)
|
||||
{
|
||||
return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
|
||||
}
|
||||
EXPORT_SYMBOL(vmalloc_node);
|
||||
|
||||
#ifndef PAGE_KERNEL_EXEC
|
||||
# define PAGE_KERNEL_EXEC PAGE_KERNEL
|
||||
#endif
|
||||
|
||||
/**
|
||||
* vmalloc_exec - allocate virtually contiguous, executable memory
|
||||
* @size: allocation size
|
||||
*
|
||||
* Kernel-internal function to allocate enough pages to cover @size
|
||||
* the page level allocator and map them into contiguous and
|
||||
* executable kernel virtual space.
|
||||
*
|
||||
* For tight control over page level allocator and protection flags
|
||||
* use __vmalloc() instead.
|
||||
*/
|
||||
|
||||
void *vmalloc_exec(unsigned long size)
|
||||
{
|
||||
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
|
||||
}
|
||||
|
||||
/**
|
||||
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
|
||||
* @size: allocation size
|
||||
*
|
||||
* Allocate enough 32bit PA addressable pages to cover @size from the
|
||||
* page level allocator and map them into contiguous kernel virtual space.
|
||||
*/
|
||||
void *vmalloc_32(unsigned long size)
|
||||
{
|
||||
return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
|
||||
}
|
||||
EXPORT_SYMBOL(vmalloc_32);
|
||||
|
||||
/**
|
||||
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
|
||||
* @size: allocation size
|
||||
*
|
||||
* The resulting memory area is 32bit addressable and zeroed so it can be
|
||||
* mapped to userspace without leaking data.
|
||||
*/
|
||||
void *vmalloc_32_user(unsigned long size)
|
||||
{
|
||||
struct vm_struct *area;
|
||||
void *ret;
|
||||
|
||||
ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
|
||||
if (ret) {
|
||||
write_lock(&vmlist_lock);
|
||||
area = __find_vm_area(ret);
|
||||
area->flags |= VM_USERMAP;
|
||||
write_unlock(&vmlist_lock);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(vmalloc_32_user);
|
||||
|
||||
/*
 * vread - copy up to @count bytes starting at kernel virtual @addr into @buf.
 *
 * Walks the vmlist under the reader lock.  Bytes that fall into gaps
 * between vm areas are returned as '\0'; each area's trailing guard
 * page is excluded from the copy.  Returns the number of bytes placed
 * in @buf (which may be less than @count if the walk runs out of
 * areas).  NOTE(review): addresses in @addr are read directly, so this
 * presumably must only be used on mapped vmalloc space — confirm with
 * callers (/dev/kcore etc.).
 */
long vread(char *buf, char *addr, unsigned long count)
{
	struct vm_struct *tmp;
	char *vaddr, *buf_start = buf;
	unsigned long n;

	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;

	read_lock(&vmlist_lock);
	for (tmp = vmlist; tmp; tmp = tmp->next) {
		vaddr = (char *) tmp->addr;
		/* Area ends (guard page excluded) before addr: skip it. */
		if (addr >= vaddr + tmp->size - PAGE_SIZE)
			continue;
		/* Zero-fill the unmapped gap between addr and this area. */
		while (addr < vaddr) {
			if (count == 0)
				goto finished;
			*buf = '\0';
			buf++;
			addr++;
			count--;
		}
		/* Bytes remaining inside this area, guard page excluded. */
		n = vaddr + tmp->size - PAGE_SIZE - addr;
		do {
			if (count == 0)
				goto finished;
			*buf = *addr;
			buf++;
			addr++;
			count--;
		} while (--n > 0);
	}
finished:
	read_unlock(&vmlist_lock);
	return buf - buf_start;
}
|
||||
|
||||
/*
 * vwrite - copy up to @count bytes from @buf into kernel virtual @addr.
 *
 * Mirror image of vread(): walks the vmlist under the reader lock,
 * silently skips (consumes @buf without writing) any bytes that fall
 * into gaps between vm areas, and excludes each area's trailing guard
 * page.  Returns the number of bytes consumed from @buf.
 */
long vwrite(char *buf, char *addr, unsigned long count)
{
	struct vm_struct *tmp;
	char *vaddr, *buf_start = buf;
	unsigned long n;

	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;

	read_lock(&vmlist_lock);
	for (tmp = vmlist; tmp; tmp = tmp->next) {
		vaddr = (char *) tmp->addr;
		/* Area ends (guard page excluded) before addr: skip it. */
		if (addr >= vaddr + tmp->size - PAGE_SIZE)
			continue;
		/* Gap before this area: advance without storing anything. */
		while (addr < vaddr) {
			if (count == 0)
				goto finished;
			buf++;
			addr++;
			count--;
		}
		/* Bytes remaining inside this area, guard page excluded. */
		n = vaddr + tmp->size - PAGE_SIZE - addr;
		do {
			if (count == 0)
				goto finished;
			*addr = *buf;
			buf++;
			addr++;
			count--;
		} while (--n > 0);
	}
finished:
	read_unlock(&vmlist_lock);
	return buf - buf_start;
}
|
||||
|
||||
/**
|
||||
* remap_vmalloc_range - map vmalloc pages to userspace
|
||||
* @vma: vma to cover (map full range of vma)
|
||||
* @addr: vmalloc memory
|
||||
* @pgoff: number of pages into addr before first page to map
|
||||
* @returns: 0 for success, -Exxx on failure
|
||||
*
|
||||
* This function checks that addr is a valid vmalloc'ed area, and
|
||||
* that it is big enough to cover the vma. Will return failure if
|
||||
* that criteria isn't met.
|
||||
*
|
||||
* Similar to remap_pfn_range() (see mm/memory.c)
|
||||
*/
|
||||
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
|
||||
unsigned long pgoff)
|
||||
{
|
||||
struct vm_struct *area;
|
||||
unsigned long uaddr = vma->vm_start;
|
||||
unsigned long usize = vma->vm_end - vma->vm_start;
|
||||
int ret;
|
||||
|
||||
if ((PAGE_SIZE-1) & (unsigned long)addr)
|
||||
return -EINVAL;
|
||||
|
||||
read_lock(&vmlist_lock);
|
||||
area = __find_vm_area(addr);
|
||||
if (!area)
|
||||
goto out_einval_locked;
|
||||
|
||||
if (!(area->flags & VM_USERMAP))
|
||||
goto out_einval_locked;
|
||||
|
||||
if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
|
||||
goto out_einval_locked;
|
||||
read_unlock(&vmlist_lock);
|
||||
|
||||
addr += pgoff << PAGE_SHIFT;
|
||||
do {
|
||||
struct page *page = vmalloc_to_page(addr);
|
||||
ret = vm_insert_page(vma, uaddr, page);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
uaddr += PAGE_SIZE;
|
||||
addr += PAGE_SIZE;
|
||||
usize -= PAGE_SIZE;
|
||||
} while (usize > 0);
|
||||
|
||||
/* Prevent "things" like memory migration? VM_flags need a cleanup... */
|
||||
vma->vm_flags |= VM_RESERVED;
|
||||
|
||||
return ret;
|
||||
|
||||
out_einval_locked:
|
||||
read_unlock(&vmlist_lock);
|
||||
return -EINVAL;
|
||||
}
|
||||
EXPORT_SYMBOL(remap_vmalloc_range);
|
||||
|
||||
1730
mm/vmscan.c
Normal file
1730
mm/vmscan.c
Normal file
File diff suppressed because it is too large
Load Diff
673
mm/vmstat.c
Normal file
673
mm/vmstat.c
Normal file
@@ -0,0 +1,673 @@
|
||||
/*
|
||||
* linux/mm/vmstat.c
|
||||
*
|
||||
* Manages VM statistics
|
||||
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
|
||||
*
|
||||
* zoned VM statistics
|
||||
* Copyright (C) 2006 Silicon Graphics, Inc.,
|
||||
* Christoph Lameter <christoph@lameter.com>
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/cpu.h>
|
||||
|
||||
#ifdef CONFIG_VM_EVENT_COUNTERS
|
||||
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
|
||||
EXPORT_PER_CPU_SYMBOL(vm_event_states);
|
||||
|
||||
/*
 * Sum the per-cpu VM event counters of every cpu in @cpumask into @ret.
 * The next cpu's state is prefetched while the current one is summed.
 */
static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	cpu = first_cpu(*cpumask);
	while (cpu < NR_CPUS) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		/* Locate the next cpu early so we can prefetch its state. */
		cpu = next_cpu(cpu, *cpumask);
		if (cpu < NR_CPUS)
			prefetch(&per_cpu(vm_event_states, cpu));

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}
|
||||
|
||||
/*
|
||||
* Accumulate the vm event counters across all CPUs.
|
||||
* The result is unavoidably approximate - it can change
|
||||
* during and after execution of this function.
|
||||
*/
|
||||
void all_vm_events(unsigned long *ret)
|
||||
{
|
||||
sum_vm_events(ret, &cpu_online_map);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(all_vm_events);
|
||||
|
||||
#ifdef CONFIG_HOTPLUG
|
||||
/*
|
||||
* Fold the foreign cpu events into our own.
|
||||
*
|
||||
* This is adding to the events on one processor
|
||||
* but keeps the global counts constant.
|
||||
*/
|
||||
void vm_events_fold_cpu(int cpu)
|
||||
{
|
||||
struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
|
||||
int i;
|
||||
|
||||
for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
|
||||
count_vm_events(i, fold_state->event[i]);
|
||||
fold_state->event[i] = 0;
|
||||
}
|
||||
}
|
||||
#endif /* CONFIG_HOTPLUG */
|
||||
|
||||
#endif /* CONFIG_VM_EVENT_COUNTERS */
|
||||
|
||||
/*
|
||||
* Manage combined zone based / global counters
|
||||
*
|
||||
* vm_stat contains the global counters
|
||||
*/
|
||||
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
|
||||
EXPORT_SYMBOL(vm_stat);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
/* Compute the per-cpu counter drift threshold for @zone (capped at 125). */
static int calculate_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	/* 27 == log2(128 MB); convert zone size to 128 MB units. */
	mem = zone->present_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
|
||||
|
||||
/*
|
||||
* Refresh the thresholds for each zone.
|
||||
*/
|
||||
static void refresh_zone_stat_thresholds(void)
|
||||
{
|
||||
struct zone *zone;
|
||||
int cpu;
|
||||
int threshold;
|
||||
|
||||
for_each_zone(zone) {
|
||||
|
||||
if (!zone->present_pages)
|
||||
continue;
|
||||
|
||||
threshold = calculate_threshold(zone);
|
||||
|
||||
for_each_online_cpu(cpu)
|
||||
zone_pcp(zone, cpu)->stat_threshold = threshold;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* For use when we know that interrupts are disabled.
|
||||
*/
|
||||
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
|
||||
int delta)
|
||||
{
|
||||
struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
|
||||
s8 *p = pcp->vm_stat_diff + item;
|
||||
long x;
|
||||
|
||||
x = delta + *p;
|
||||
|
||||
if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
|
||||
zone_page_state_add(x, zone, item);
|
||||
x = 0;
|
||||
}
|
||||
*p = x;
|
||||
}
|
||||
EXPORT_SYMBOL(__mod_zone_page_state);
|
||||
|
||||
/*
|
||||
* For an unknown interrupt state
|
||||
*/
|
||||
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
|
||||
int delta)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
__mod_zone_page_state(zone, item, delta);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
EXPORT_SYMBOL(mod_zone_page_state);
|
||||
|
||||
/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)++;

	if (unlikely(*p > pcp->stat_threshold)) {
		/*
		 * Overshoot by half a threshold when folding so the next
		 * fold is pushed further away (fewer global updates).
		 */
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p + overstep, zone, item);
		*p = -overstep;
	}
}
|
||||
|
||||
/* Increment a zone counter for @page; interrupts must be disabled. */
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);
|
||||
|
||||
/*
 * Decrement counterpart of __inc_zone_state(); interrupts must be
 * disabled.  See the block comment above __inc_zone_state() for why the
 * fold overshoots by half a threshold.
 */
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)--;

	if (unlikely(*p < - pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p - overstep, zone, item);
		*p = overstep;
	}
}
|
||||
|
||||
/* Decrement a zone counter for @page; interrupts must be disabled. */
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);
|
||||
|
||||
/* Increment a zone counter from any interrupt state. */
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
|
||||
|
||||
/*
 * Increment a zone counter for @page from any interrupt state.  The
 * zone is resolved before interrupts are disabled to keep the critical
 * section minimal.
 */
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);
|
||||
|
||||
/*
 * Decrement a zone counter for @page from any interrupt state.
 *
 * Consistency: structured as the mirror image of inc_zone_page_state()
 * — the zone is resolved before interrupts are disabled, keeping the
 * critical section minimal.  Behaviorally identical to calling
 * __dec_zone_page_state() under local_irq_save().
 */
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
|
||||
|
||||
/*
|
||||
* Update the zone counters for one cpu.
|
||||
*/
|
||||
void refresh_cpu_vm_stats(int cpu)
|
||||
{
|
||||
struct zone *zone;
|
||||
int i;
|
||||
unsigned long flags;
|
||||
|
||||
for_each_zone(zone) {
|
||||
struct per_cpu_pageset *pcp;
|
||||
|
||||
if (!populated_zone(zone))
|
||||
continue;
|
||||
|
||||
pcp = zone_pcp(zone, cpu);
|
||||
|
||||
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
|
||||
if (pcp->vm_stat_diff[i]) {
|
||||
local_irq_save(flags);
|
||||
zone_page_state_add(pcp->vm_stat_diff[i],
|
||||
zone, i);
|
||||
pcp->vm_stat_diff[i] = 0;
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* IPI callback: fold the executing cpu's own differentials. */
static void __refresh_cpu_vm_stats(void *dummy)
{
	refresh_cpu_vm_stats(smp_processor_id());
}
|
||||
|
||||
/*
|
||||
* Consolidate all counters.
|
||||
*
|
||||
* Note that the result is less inaccurate but still inaccurate
|
||||
* if concurrent processes are allowed to run.
|
||||
*/
|
||||
void refresh_vm_stats(void)
|
||||
{
|
||||
on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
|
||||
}
|
||||
EXPORT_SYMBOL(refresh_vm_stats);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
/*
|
||||
* zonelist = the list of zones passed to the allocator
|
||||
* z = the zone from which the allocation occurred.
|
||||
*
|
||||
* Must be called with interrupts disabled.
|
||||
*/
|
||||
void zone_statistics(struct zonelist *zonelist, struct zone *z)
|
||||
{
|
||||
if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
|
||||
__inc_zone_state(z, NUMA_HIT);
|
||||
} else {
|
||||
__inc_zone_state(z, NUMA_MISS);
|
||||
__inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
|
||||
}
|
||||
if (z->node == numa_node_id())
|
||||
__inc_zone_state(z, NUMA_LOCAL);
|
||||
else
|
||||
__inc_zone_state(z, NUMA_OTHER);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
|
||||
#include <linux/seq_file.h>
|
||||
|
||||
/*
 * seq_file start: return the *pos'th online node's pgdat, or NULL when
 * *pos is past the last online node.
 */
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat = first_online_pgdat();
	loff_t skip = *pos;

	while (pgdat && skip) {
		pgdat = next_online_pgdat(pgdat);
		--skip;
	}
	return pgdat;
}
|
||||
|
||||
/* seq_file next: advance to the following online node. */
static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	return next_online_pgdat((pg_data_t *)arg);
}
|
||||
|
||||
/* seq_file stop: nothing to release for the pgdat walk. */
static void frag_stop(struct seq_file *m, void *arg)
{
}
|
||||
|
||||
/*
|
||||
* This walks the free areas for each zone.
|
||||
*/
|
||||
static int frag_show(struct seq_file *m, void *arg)
|
||||
{
|
||||
pg_data_t *pgdat = (pg_data_t *)arg;
|
||||
struct zone *zone;
|
||||
struct zone *node_zones = pgdat->node_zones;
|
||||
unsigned long flags;
|
||||
int order;
|
||||
|
||||
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
|
||||
if (!populated_zone(zone))
|
||||
continue;
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
|
||||
for (order = 0; order < MAX_ORDER; ++order)
|
||||
seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
seq_putc(m, '\n');
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct seq_operations fragmentation_op = {
|
||||
.start = frag_start,
|
||||
.next = frag_next,
|
||||
.stop = frag_stop,
|
||||
.show = frag_show,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_ZONE_DMA
|
||||
#define TEXT_FOR_DMA(xx) xx "_dma",
|
||||
#else
|
||||
#define TEXT_FOR_DMA(xx)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_ZONE_DMA32
|
||||
#define TEXT_FOR_DMA32(xx) xx "_dma32",
|
||||
#else
|
||||
#define TEXT_FOR_DMA32(xx)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
|
||||
#else
|
||||
#define TEXT_FOR_HIGHMEM(xx)
|
||||
#endif
|
||||
|
||||
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
|
||||
TEXT_FOR_HIGHMEM(xx)
|
||||
|
||||
/*
 * Names for /proc/vmstat lines.  Order must match enum zone_stat_item
 * followed by enum vm_event_item — do not reorder independently of
 * those enums.
 */
static const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_active",
	"nr_inactive",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	/* TEXTS_FOR_ZONES expands to one entry per configured zone. */
	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
#endif
};
|
||||
|
||||
/*
 * Output information about zones in @pgdat (/proc/zoneinfo): watermarks,
 * per-item counters, lowmem protection, per-cpu pagesets and reclaim
 * state, one section per populated zone.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
		int i;

		if (!populated_zone(zone))
			continue;

		/* Zone lock keeps the snapshot of this zone coherent. */
		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
		seq_printf(m,
			   "\n pages free %lu"
			   "\n min %lu"
			   "\n low %lu"
			   "\n high %lu"
			   "\n scanned %lu (a: %lu i: %lu)"
			   "\n spanned %lu"
			   "\n present %lu",
			   zone_page_state(zone, NR_FREE_PAGES),
			   zone->pages_min,
			   zone->pages_low,
			   zone->pages_high,
			   zone->pages_scanned,
			   zone->nr_scan_active, zone->nr_scan_inactive,
			   zone->spanned_pages,
			   zone->present_pages);

		/* One line per zone_stat_item, named via vmstat_text[]. */
		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			seq_printf(m, "\n %-12s %lu", vmstat_text[i],
				   zone_page_state(zone, i));

		seq_printf(m,
			   "\n protection: (%lu",
			   zone->lowmem_reserve[0]);
		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
		seq_printf(m,
			   ")"
			   "\n pagesets");
		for_each_online_cpu(i) {
			struct per_cpu_pageset *pageset;
			int j;

			pageset = zone_pcp(zone, i);
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				seq_printf(m,
					   "\n cpu: %i pcp: %i"
					   "\n count: %i"
					   "\n high: %i"
					   "\n batch: %i",
					   i, j,
					   pageset->pcp[j].count,
					   pageset->pcp[j].high,
					   pageset->pcp[j].batch);
			}
#ifdef CONFIG_SMP
			seq_printf(m, "\n vm stats threshold: %d",
				   pageset->stat_threshold);
#endif
		}
		seq_printf(m,
			   "\n all_unreclaimable: %u"
			   "\n prev_priority: %i"
			   "\n start_pfn: %lu",
			   zone->all_unreclaimable,
			   zone->prev_priority,
			   zone->zone_start_pfn);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}
|
||||
|
||||
const struct seq_operations zoneinfo_op = {
|
||||
.start = frag_start, /* iterate over all zones. The same as in
|
||||
* fragmentation. */
|
||||
.next = frag_next,
|
||||
.stop = frag_stop,
|
||||
.show = zoneinfo_show,
|
||||
};
|
||||
|
||||
/*
 * seq_file start for /proc/vmstat: snapshot all zone counters (and,
 * when configured, all event counters) into one kmalloc'ed array kept
 * in m->private, then return the cursor for line *pos.
 */
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *stats;
#ifdef CONFIG_VM_EVENT_COUNTERS
	unsigned long *events;
#endif
	int i;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
	stats = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
			+ sizeof(struct vm_event_state), GFP_KERNEL);
#else
	stats = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
			GFP_KERNEL);
#endif
	m->private = stats;
	if (!stats)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		stats[i] = global_page_state(i);

#ifdef CONFIG_VM_EVENT_COUNTERS
	events = stats + NR_VM_ZONE_STAT_ITEMS;
	all_vm_events(events);
	events[PGPGIN] /= 2;	/* sectors -> kbytes */
	events[PGPGOUT] /= 2;
#endif
	return stats + *pos;
}
|
||||
|
||||
/* seq_file next: step to the following counter in the snapshot. */
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}
|
||||
|
||||
static int vmstat_show(struct seq_file *m, void *arg)
|
||||
{
|
||||
unsigned long *l = arg;
|
||||
unsigned long off = l - (unsigned long *)m->private;
|
||||
|
||||
seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void vmstat_stop(struct seq_file *m, void *arg)
|
||||
{
|
||||
kfree(m->private);
|
||||
m->private = NULL;
|
||||
}
|
||||
|
||||
const struct seq_operations vmstat_op = {
|
||||
.start = vmstat_start,
|
||||
.next = vmstat_next,
|
||||
.stop = vmstat_stop,
|
||||
.show = vmstat_show,
|
||||
};
|
||||
|
||||
#endif /* CONFIG_PROC_FS */
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
/*
|
||||
* Use the cpu notifier to insure that the thresholds are recalculated
|
||||
* when necessary.
|
||||
*/
|
||||
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
|
||||
unsigned long action,
|
||||
void *hcpu)
|
||||
{
|
||||
switch (action) {
|
||||
case CPU_UP_PREPARE:
|
||||
case CPU_UP_CANCELED:
|
||||
case CPU_DEAD:
|
||||
refresh_zone_stat_thresholds();
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
static struct notifier_block __cpuinitdata vmstat_notifier = {
	.notifier_call	= &vmstat_cpuup_callback,
};
|
||||
|
||||
int __init setup_vmstat(void)
|
||||
{
|
||||
refresh_zone_stat_thresholds();
|
||||
register_cpu_notifier(&vmstat_notifier);
|
||||
return 0;
|
||||
}
|
||||
module_init(setup_vmstat)
|
||||
#endif
|
||||
Reference in New Issue
Block a user