Creation of Cybook 2416 (actually Gen4) repository
This commit is contained in:
267
drivers/md/Kconfig
Normal file
267
drivers/md/Kconfig
Normal file
@@ -0,0 +1,267 @@
|
||||
#
|
||||
# Block device driver configuration
|
||||
#
|
||||
|
||||
if BLOCK
|
||||
|
||||
menu "Multi-device support (RAID and LVM)"
|
||||
|
||||
config MD
|
||||
bool "Multiple devices driver support (RAID and LVM)"
|
||||
help
|
||||
Support multiple physical spindles through a single logical device.
|
||||
Required for RAID and logical volume management.
|
||||
|
||||
config BLK_DEV_MD
|
||||
tristate "RAID support"
|
||||
depends on MD
|
||||
---help---
|
||||
This driver lets you combine several hard disk partitions into one
|
||||
logical block device. This can be used to simply append one
|
||||
partition to another one or to combine several redundant hard disks
|
||||
into a RAID1/4/5 device so as to provide protection against hard
|
||||
disk failures. This is called "Software RAID" since the combining of
|
||||
the partitions is done by the kernel. "Hardware RAID" means that the
|
||||
combining is done by a dedicated controller; if you have such a
|
||||
controller, you do not need to say Y here.
|
||||
|
||||
More information about Software RAID on Linux is contained in the
|
||||
Software RAID mini-HOWTO, available from
|
||||
<http://www.tldp.org/docs.html#howto>. There you will also learn
|
||||
where to get the supporting user space utilities raidtools.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config MD_LINEAR
|
||||
tristate "Linear (append) mode"
|
||||
depends on BLK_DEV_MD
|
||||
---help---
|
||||
If you say Y here, then your multiple devices driver will be able to
|
||||
use the so-called linear mode, i.e. it will combine the hard disk
|
||||
partitions by simply appending one to the other.
|
||||
|
||||
To compile this as a module, choose M here: the module
|
||||
will be called linear.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MD_RAID0
|
||||
tristate "RAID-0 (striping) mode"
|
||||
depends on BLK_DEV_MD
|
||||
---help---
|
||||
If you say Y here, then your multiple devices driver will be able to
|
||||
use the so-called raid0 mode, i.e. it will combine the hard disk
|
||||
partitions into one logical device in such a fashion as to fill them
|
||||
up evenly, one chunk here and one chunk there. This will increase
|
||||
the throughput rate if the partitions reside on distinct disks.
|
||||
|
||||
Information about Software RAID on Linux is contained in the
|
||||
Software-RAID mini-HOWTO, available from
|
||||
<http://www.tldp.org/docs.html#howto>. There you will also
|
||||
learn where to get the supporting user space utilities raidtools.
|
||||
|
||||
To compile this as a module, choose M here: the module
|
||||
will be called raid0.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MD_RAID1
|
||||
tristate "RAID-1 (mirroring) mode"
|
||||
depends on BLK_DEV_MD
|
||||
---help---
|
||||
A RAID-1 set consists of several disk drives which are exact copies
|
||||
of each other. In the event of a mirror failure, the RAID driver
|
||||
will continue to use the operational mirrors in the set, providing
|
||||
an error free MD (multiple device) to the higher levels of the
|
||||
kernel. In a set with N drives, the available space is the capacity
|
||||
of a single drive, and the set protects against a failure of (N - 1)
|
||||
drives.
|
||||
|
||||
Information about Software RAID on Linux is contained in the
|
||||
Software-RAID mini-HOWTO, available from
|
||||
<http://www.tldp.org/docs.html#howto>. There you will also
|
||||
learn where to get the supporting user space utilities raidtools.
|
||||
|
||||
If you want to use such a RAID-1 set, say Y. To compile this code
|
||||
as a module, choose M here: the module will be called raid1.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MD_RAID10
|
||||
tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)"
|
||||
depends on BLK_DEV_MD && EXPERIMENTAL
|
||||
---help---
|
||||
RAID-10 provides a combination of striping (RAID-0) and
|
||||
mirroring (RAID-1) with easier configuration and more flexible
|
||||
layout.
|
||||
Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
|
||||
be the same size (or at least, only as much as the smallest device
|
||||
will be used).
|
||||
RAID-10 provides a variety of layouts that provide different levels
|
||||
of redundancy and performance.
|
||||
|
||||
RAID-10 requires mdadm-1.7.0 or later, available at:
|
||||
|
||||
ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MD_RAID456
|
||||
tristate "RAID-4/RAID-5/RAID-6 mode"
|
||||
depends on BLK_DEV_MD
|
||||
---help---
|
||||
A RAID-5 set of N drives with a capacity of C MB per drive provides
|
||||
the capacity of C * (N - 1) MB, and protects against a failure
|
||||
of a single drive. For a given sector (row) number, (N - 1) drives
|
||||
contain data sectors, and one drive contains the parity protection.
|
||||
For a RAID-4 set, the parity blocks are present on a single drive,
|
||||
while a RAID-5 set distributes the parity across the drives in one
|
||||
of the available parity distribution methods.
|
||||
|
||||
A RAID-6 set of N drives with a capacity of C MB per drive
|
||||
provides the capacity of C * (N - 2) MB, and protects
|
||||
against a failure of any two drives. For a given sector
|
||||
(row) number, (N - 2) drives contain data sectors, and two
|
||||
drives contains two independent redundancy syndromes. Like
|
||||
RAID-5, RAID-6 distributes the syndromes across the drives
|
||||
in one of the available parity distribution methods.
|
||||
|
||||
Information about Software RAID on Linux is contained in the
|
||||
Software-RAID mini-HOWTO, available from
|
||||
<http://www.tldp.org/docs.html#howto>. There you will also
|
||||
learn where to get the supporting user space utilities raidtools.
|
||||
|
||||
If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To
|
||||
compile this code as a module, choose M here: the module
|
||||
will be called raid456.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MD_RAID5_RESHAPE
|
||||
bool "Support adding drives to a raid-5 array"
|
||||
depends on MD_RAID456
|
||||
default y
|
||||
---help---
|
||||
A RAID-5 set can be expanded by adding extra drives. This
|
||||
requires "restriping" the array which means (almost) every
|
||||
block must be written to a different place.
|
||||
|
||||
This option allows such restriping to be done while the array
|
||||
is online.
|
||||
|
||||
You will need mdadm version 2.4.1 or later to use this
|
||||
feature safely. During the early stage of reshape there is
|
||||
a critical section where live data is being over-written. A
|
||||
crash during this time needs extra care for recovery. The
|
||||
newer mdadm takes a copy of the data in the critical section
|
||||
and will restore it, if necessary, after a crash.
|
||||
|
||||
The mdadm usage is e.g.
|
||||
mdadm --grow /dev/md1 --raid-disks=6
|
||||
to grow '/dev/md1' to having 6 disks.
|
||||
|
||||
Note: The array can only be expanded, not contracted.
|
||||
There should be enough spares already present to make the new
|
||||
array workable.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MD_MULTIPATH
|
||||
tristate "Multipath I/O support"
|
||||
depends on BLK_DEV_MD
|
||||
help
|
||||
Multipath-IO is the ability of certain devices to address the same
|
||||
physical disk over multiple 'IO paths'. The code ensures that such
|
||||
paths can be defined and handled at runtime, and ensures that a
|
||||
transparent failover to the backup path(s) happens if an IO error
|
||||
arrives on the primary path.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config MD_FAULTY
|
||||
tristate "Faulty test module for MD"
|
||||
depends on BLK_DEV_MD
|
||||
help
|
||||
The "faulty" module allows for a block device that occasionally returns
|
||||
read or write errors. It is useful for testing.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config BLK_DEV_DM
|
||||
tristate "Device mapper support"
|
||||
depends on MD
|
||||
---help---
|
||||
Device-mapper is a low level volume manager. It works by allowing
|
||||
people to specify mappings for ranges of logical sectors. Various
|
||||
mapping types are available, in addition people may write their own
|
||||
modules containing custom mappings if they wish.
|
||||
|
||||
Higher level volume managers such as LVM2 use this driver.
|
||||
|
||||
To compile this as a module, choose M here: the module will be
|
||||
called dm-mod.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config DM_DEBUG
|
||||
boolean "Device mapper debugging support"
|
||||
depends on BLK_DEV_DM && EXPERIMENTAL
|
||||
---help---
|
||||
Enable this for messages that may help debug device-mapper problems.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config DM_CRYPT
|
||||
tristate "Crypt target support"
|
||||
depends on BLK_DEV_DM && EXPERIMENTAL
|
||||
select CRYPTO
|
||||
select CRYPTO_CBC
|
||||
---help---
|
||||
This device-mapper target allows you to create a device that
|
||||
transparently encrypts the data on it. You'll need to activate
|
||||
the ciphers you're going to use in the cryptoapi configuration.
|
||||
|
||||
Information on how to use dm-crypt can be found on
|
||||
|
||||
<http://www.saout.de/misc/dm-crypt/>
|
||||
|
||||
To compile this code as a module, choose M here: the module will
|
||||
be called dm-crypt.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config DM_SNAPSHOT
|
||||
tristate "Snapshot target (EXPERIMENTAL)"
|
||||
depends on BLK_DEV_DM && EXPERIMENTAL
|
||||
---help---
|
||||
Allow volume managers to take writable snapshots of a device.
|
||||
|
||||
config DM_MIRROR
|
||||
tristate "Mirror target (EXPERIMENTAL)"
|
||||
depends on BLK_DEV_DM && EXPERIMENTAL
|
||||
---help---
|
||||
Allow volume managers to mirror logical volumes, also
|
||||
needed for live data migration tools such as 'pvmove'.
|
||||
|
||||
config DM_ZERO
|
||||
tristate "Zero target (EXPERIMENTAL)"
|
||||
depends on BLK_DEV_DM && EXPERIMENTAL
|
||||
---help---
|
||||
A target that discards writes, and returns all zeroes for
|
||||
reads. Useful in some recovery situations.
|
||||
|
||||
config DM_MULTIPATH
|
||||
tristate "Multipath target (EXPERIMENTAL)"
|
||||
depends on BLK_DEV_DM && EXPERIMENTAL
|
||||
---help---
|
||||
Allow volume managers to support multipath hardware.
|
||||
|
||||
config DM_MULTIPATH_EMC
|
||||
tristate "EMC CX/AX multipath support (EXPERIMENTAL)"
|
||||
depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL
|
||||
---help---
|
||||
Multipath support for EMC CX/AX series hardware.
|
||||
|
||||
endmenu
|
||||
|
||||
endif
|
||||
107
drivers/md/Makefile
Normal file
107
drivers/md/Makefile
Normal file
@@ -0,0 +1,107 @@
|
||||
#
|
||||
# Makefile for the kernel software RAID and LVM drivers.
|
||||
#
|
||||
|
||||
dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
|
||||
dm-ioctl.o dm-io.o kcopyd.o
|
||||
dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
|
||||
dm-snapshot-objs := dm-snap.o dm-exception-store.o
|
||||
dm-mirror-objs := dm-log.o dm-raid1.o
|
||||
md-mod-objs := md.o bitmap.o
|
||||
raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
|
||||
raid6int1.o raid6int2.o raid6int4.o \
|
||||
raid6int8.o raid6int16.o raid6int32.o \
|
||||
raid6altivec1.o raid6altivec2.o raid6altivec4.o \
|
||||
raid6altivec8.o \
|
||||
raid6mmx.o raid6sse1.o raid6sse2.o
|
||||
hostprogs-y := mktables
|
||||
|
||||
# Note: link order is important. All raid personalities
|
||||
# and xor.o must come before md.o, as they each initialise
|
||||
# themselves, and md.o may use the personalities when it
|
||||
# auto-initialised.
|
||||
|
||||
obj-$(CONFIG_MD_LINEAR) += linear.o
|
||||
obj-$(CONFIG_MD_RAID0) += raid0.o
|
||||
obj-$(CONFIG_MD_RAID1) += raid1.o
|
||||
obj-$(CONFIG_MD_RAID10) += raid10.o
|
||||
obj-$(CONFIG_MD_RAID456) += raid456.o xor.o
|
||||
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
|
||||
obj-$(CONFIG_MD_FAULTY) += faulty.o
|
||||
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
|
||||
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
|
||||
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
|
||||
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
|
||||
obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o
|
||||
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
|
||||
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
|
||||
obj-$(CONFIG_DM_ZERO) += dm-zero.o
|
||||
|
||||
quiet_cmd_unroll = UNROLL $@
|
||||
cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
|
||||
< $< > $@ || ( rm -f $@ && exit 1 )
|
||||
|
||||
ifeq ($(CONFIG_ALTIVEC),y)
|
||||
altivec_flags := -maltivec -mabi=altivec
|
||||
endif
|
||||
|
||||
targets += raid6int1.c
|
||||
$(obj)/raid6int1.c: UNROLL := 1
|
||||
$(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
targets += raid6int2.c
|
||||
$(obj)/raid6int2.c: UNROLL := 2
|
||||
$(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
targets += raid6int4.c
|
||||
$(obj)/raid6int4.c: UNROLL := 4
|
||||
$(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
targets += raid6int8.c
|
||||
$(obj)/raid6int8.c: UNROLL := 8
|
||||
$(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
targets += raid6int16.c
|
||||
$(obj)/raid6int16.c: UNROLL := 16
|
||||
$(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
targets += raid6int32.c
|
||||
$(obj)/raid6int32.c: UNROLL := 32
|
||||
$(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
CFLAGS_raid6altivec1.o += $(altivec_flags)
|
||||
targets += raid6altivec1.c
|
||||
$(obj)/raid6altivec1.c: UNROLL := 1
|
||||
$(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
CFLAGS_raid6altivec2.o += $(altivec_flags)
|
||||
targets += raid6altivec2.c
|
||||
$(obj)/raid6altivec2.c: UNROLL := 2
|
||||
$(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
CFLAGS_raid6altivec4.o += $(altivec_flags)
|
||||
targets += raid6altivec4.c
|
||||
$(obj)/raid6altivec4.c: UNROLL := 4
|
||||
$(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
CFLAGS_raid6altivec8.o += $(altivec_flags)
|
||||
targets += raid6altivec8.c
|
||||
$(obj)/raid6altivec8.c: UNROLL := 8
|
||||
$(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
|
||||
$(call if_changed,unroll)
|
||||
|
||||
quiet_cmd_mktable = TABLE $@
|
||||
cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
|
||||
|
||||
targets += raid6tables.c
|
||||
$(obj)/raid6tables.c: $(obj)/mktables FORCE
|
||||
$(call if_changed,mktable)
|
||||
1533
drivers/md/bitmap.c
Normal file
1533
drivers/md/bitmap.c
Normal file
File diff suppressed because it is too large
Load Diff
85
drivers/md/dm-bio-list.h
Normal file
85
drivers/md/dm-bio-list.h
Normal file
@@ -0,0 +1,85 @@
|
||||
/*
|
||||
* Copyright (C) 2004 Red Hat UK Ltd.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#ifndef DM_BIO_LIST_H
|
||||
#define DM_BIO_LIST_H
|
||||
|
||||
#include <linux/bio.h>
|
||||
|
||||
/*
 * Singly linked chain of bios tracked by head and tail pointers so
 * that append and splice are O(1).  An empty list has both NULL.
 */
struct bio_list {
	struct bio *head;	/* first bio in the chain, or NULL */
	struct bio *tail;	/* last bio in the chain, or NULL */
};
|
||||
|
||||
static inline void bio_list_init(struct bio_list *bl)
|
||||
{
|
||||
bl->head = bl->tail = NULL;
|
||||
}
|
||||
|
||||
static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
|
||||
{
|
||||
bio->bi_next = NULL;
|
||||
|
||||
if (bl->tail)
|
||||
bl->tail->bi_next = bio;
|
||||
else
|
||||
bl->head = bio;
|
||||
|
||||
bl->tail = bio;
|
||||
}
|
||||
|
||||
static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
|
||||
{
|
||||
if (!bl2->head)
|
||||
return;
|
||||
|
||||
if (bl->tail)
|
||||
bl->tail->bi_next = bl2->head;
|
||||
else
|
||||
bl->head = bl2->head;
|
||||
|
||||
bl->tail = bl2->tail;
|
||||
}
|
||||
|
||||
static inline void bio_list_merge_head(struct bio_list *bl,
|
||||
struct bio_list *bl2)
|
||||
{
|
||||
if (!bl2->head)
|
||||
return;
|
||||
|
||||
if (bl->head)
|
||||
bl2->tail->bi_next = bl->head;
|
||||
else
|
||||
bl->tail = bl2->tail;
|
||||
|
||||
bl->head = bl2->head;
|
||||
}
|
||||
|
||||
static inline struct bio *bio_list_pop(struct bio_list *bl)
|
||||
{
|
||||
struct bio *bio = bl->head;
|
||||
|
||||
if (bio) {
|
||||
bl->head = bl->head->bi_next;
|
||||
if (!bl->head)
|
||||
bl->tail = NULL;
|
||||
|
||||
bio->bi_next = NULL;
|
||||
}
|
||||
|
||||
return bio;
|
||||
}
|
||||
|
||||
static inline struct bio *bio_list_get(struct bio_list *bl)
|
||||
{
|
||||
struct bio *bio = bl->head;
|
||||
|
||||
bl->head = bl->tail = NULL;
|
||||
|
||||
return bio;
|
||||
}
|
||||
|
||||
#endif
|
||||
45
drivers/md/dm-bio-record.h
Normal file
45
drivers/md/dm-bio-record.h
Normal file
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#ifndef DM_BIO_RECORD_H
|
||||
#define DM_BIO_RECORD_H
|
||||
|
||||
#include <linux/bio.h>
|
||||
|
||||
/*
|
||||
* There are lots of mutable fields in the bio struct that get
|
||||
* changed by the lower levels of the block layer. Some targets,
|
||||
* such as multipath, may wish to resubmit a bio on error. The
|
||||
* functions in this file help the target record and restore the
|
||||
* original bio state.
|
||||
*/
|
||||
/* Saved copy of the bio fields the lower block layers may mutate. */
struct dm_bio_details {
	sector_t bi_sector;		/* starting sector */
	struct block_device *bi_bdev;	/* target device */
	unsigned int bi_size;		/* remaining byte count */
	unsigned short bi_idx;		/* current index into bi_io_vec */
	unsigned long bi_flags;		/* bio state flags */
};
|
||||
|
||||
/* Snapshot the mutable fields of @bio into @bd so a target can
 * resubmit the bio after an error (see header comment above). */
static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
{
	bd->bi_sector = bio->bi_sector;
	bd->bi_bdev = bio->bi_bdev;
	bd->bi_size = bio->bi_size;
	bd->bi_idx = bio->bi_idx;
	bd->bi_flags = bio->bi_flags;
}
|
||||
|
||||
/* Restore into @bio the state previously captured by dm_bio_record(). */
static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
{
	bio->bi_sector = bd->bi_sector;
	bio->bi_bdev = bd->bi_bdev;
	bio->bi_size = bd->bi_size;
	bio->bi_idx = bd->bi_idx;
	bio->bi_flags = bd->bi_flags;
}
|
||||
|
||||
#endif
|
||||
1120
drivers/md/dm-crypt.c
Normal file
1120
drivers/md/dm-crypt.c
Normal file
File diff suppressed because it is too large
Load Diff
362
drivers/md/dm-emc.c
Normal file
362
drivers/md/dm-emc.c
Normal file
@@ -0,0 +1,362 @@
|
||||
/*
|
||||
* Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved.
|
||||
* Copyright (C) 2004 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*
|
||||
* Multipath support for EMC CLARiiON AX/CX-series hardware.
|
||||
*/
|
||||
|
||||
#include "dm.h"
|
||||
#include "dm-hw-handler.h"
|
||||
#include <scsi/scsi.h>
|
||||
#include <scsi/scsi_cmnd.h>
|
||||
|
||||
#define DM_MSG_PREFIX "multipath emc"
|
||||
|
||||
/* Per-instance handler state for EMC CLARiiON arrays. */
struct emc_handler {
	spinlock_t lock;

	/* Whether we should send the short trespass command (FC-series)
	 * or the long version (default for AX/CX CLARiiON arrays). */
	unsigned short_trespass;
	/* Whether or not to honor SCSI reservations when initiating a
	 * switch-over. Default: Don't. */
	unsigned hr;

	/* Sense buffer handed to the block layer for failover requests. */
	unsigned char sense[SCSI_SENSE_BUFFERSIZE];
};
|
||||
|
||||
#define TRESPASS_PAGE 0x22
|
||||
#define EMC_FAILOVER_TIMEOUT (60 * HZ)
|
||||
|
||||
/* Code borrowed from dm-lsi-rdac by Mike Christie */
|
||||
|
||||
static inline void free_bio(struct bio *bio)
|
||||
{
|
||||
__free_page(bio->bi_io_vec[0].bv_page);
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
static int emc_endio(struct bio *bio, unsigned int bytes_done, int error)
|
||||
{
|
||||
struct dm_path *path = bio->bi_private;
|
||||
|
||||
if (bio->bi_size)
|
||||
return 1;
|
||||
|
||||
/* We also need to look at the sense keys here whether or not to
|
||||
* switch to the next PG etc.
|
||||
*
|
||||
* For now simple logic: either it works or it doesn't.
|
||||
*/
|
||||
if (error)
|
||||
dm_pg_init_complete(path, MP_FAIL_PATH);
|
||||
else
|
||||
dm_pg_init_complete(path, 0);
|
||||
|
||||
/* request is freed in block layer */
|
||||
free_bio(bio);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct bio *get_failover_bio(struct dm_path *path, unsigned data_size)
|
||||
{
|
||||
struct bio *bio;
|
||||
struct page *page;
|
||||
|
||||
bio = bio_alloc(GFP_ATOMIC, 1);
|
||||
if (!bio) {
|
||||
DMERR("get_failover_bio: bio_alloc() failed.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bio->bi_rw |= (1 << BIO_RW);
|
||||
bio->bi_bdev = path->dev->bdev;
|
||||
bio->bi_sector = 0;
|
||||
bio->bi_private = path;
|
||||
bio->bi_end_io = emc_endio;
|
||||
|
||||
page = alloc_page(GFP_ATOMIC);
|
||||
if (!page) {
|
||||
DMERR("get_failover_bio: alloc_page() failed.");
|
||||
bio_put(bio);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (bio_add_page(bio, page, data_size, 0) != data_size) {
|
||||
DMERR("get_failover_bio: alloc_page() failed.");
|
||||
__free_page(page);
|
||||
bio_put(bio);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return bio;
|
||||
}
|
||||
|
||||
/*
 * Wrap @bio in a block-layer packet command (BLOCK_PC) request so that a
 * SCSI CDB can be attached and sent directly to the device.  Sense data
 * is captured into the handler's own buffer.  Returns NULL on failure.
 */
static struct request *get_failover_req(struct emc_handler *h,
			struct bio *bio, struct dm_path *path)
{
	struct request *rq;
	struct block_device *bdev = bio->bi_bdev;
	struct request_queue *q = bdev_get_queue(bdev);

	/* FIXME: Figure out why it fails with GFP_ATOMIC. */
	rq = blk_get_request(q, WRITE, __GFP_WAIT);
	if (!rq) {
		DMERR("get_failover_req: blk_get_request failed");
		return NULL;
	}

	rq->bio = rq->biotail = bio;
	blk_rq_bio_prep(q, rq, bio);

	/* issue against the whole disk, not a partition */
	rq->rq_disk = bdev->bd_contains->bd_disk;

	/* bio backed don't set data */
	rq->buffer = rq->data = NULL;
	/* rq data_len used for pc cmd's request_bufflen */
	rq->data_len = bio->bi_size;

	rq->sense = h->sense;
	memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
	rq->sense_len = 0;

	memset(&rq->cmd, 0, BLK_MAX_CDB);

	rq->timeout = EMC_FAILOVER_TIMEOUT;
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	/* fail fast and never merge: this is a control command */
	rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE;

	return rq;
}
|
||||
|
||||
/*
 * Build the complete MODE SELECT request that carries the trespass page
 * (0x22) telling the array to make @path's storage processor the owner.
 * Chooses the short or long page format per h->short_trespass and sets
 * the honor-reservation bit per h->hr.  Returns NULL on failure.
 */
static struct request *emc_trespass_get(struct emc_handler *h,
					struct dm_path *path)
{
	struct bio *bio;
	struct request *rq;
	unsigned char *page22;
	unsigned char long_trespass_pg[] = {
		0, 0, 0, 0,
		TRESPASS_PAGE,        /* Page code */
		0x09,			/* Page length - 2 */
		h->hr ? 0x01 : 0x81,	/* Trespass code + Honor reservation bit */
		0xff, 0xff,		/* Trespass target */
		0, 0, 0, 0, 0, 0	/* Reserved bytes / unknown */
	};
	unsigned char short_trespass_pg[] = {
		0, 0, 0, 0,
		TRESPASS_PAGE,        /* Page code */
		0x02,			/* Page length - 2 */
		h->hr ? 0x01 : 0x81,	/* Trespass code + Honor reservation bit */
		0xff,			/* Trespass target */
	};
	unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) :
				sizeof(long_trespass_pg);

	/* get bio backing */
	if (data_size > PAGE_SIZE)
		/* this should never happen */
		return NULL;

	bio = get_failover_bio(path, data_size);
	if (!bio) {
		DMERR("emc_trespass_get: no bio");
		return NULL;
	}

	/* copy the chosen trespass page into the bio's data page */
	page22 = (unsigned char *)bio_data(bio);
	memset(page22, 0, data_size);

	memcpy(page22, h->short_trespass ?
		short_trespass_pg : long_trespass_pg, data_size);

	/* get request for block layer packet command */
	rq = get_failover_req(h, bio, path);
	if (!rq) {
		DMERR("emc_trespass_get: no rq");
		free_bio(bio);
		return NULL;
	}

	/* Prepare the command. */
	rq->cmd[0] = MODE_SELECT;
	rq->cmd[1] = 0x10;		/* PF bit: page-format data */
	rq->cmd[4] = data_size;		/* parameter list length */
	rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);

	return rq;
}
|
||||
|
||||
/*
 * Initiate a path-group switch (trespass) so that @path becomes active.
 * Completion is reported asynchronously via emc_endio() ->
 * dm_pg_init_complete(); on any setup failure the path is failed
 * immediately.
 */
static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
			struct dm_path *path)
{
	struct request *rq;
	struct request_queue *q = bdev_get_queue(path->dev->bdev);

	/*
	 * We can either blindly init the pg (then look at the sense),
	 * or we can send some commands to get the state here (then
	 * possibly send the fo cmnd), or we can also have the
	 * initial state passed into us and then get an update here.
	 */
	if (!q) {
		DMINFO("emc_pg_init: no queue");
		goto fail_path;
	}

	/* FIXME: The request should be pre-allocated. */
	rq = emc_trespass_get(hwh->context, path);
	if (!rq) {
		DMERR("emc_pg_init: no rq");
		goto fail_path;
	}

	DMINFO("emc_pg_init: sending switch-over command");
	/* jump the queue: the switch-over must run before queued I/O */
	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
	return;

fail_path:
	dm_pg_init_complete(path, MP_FAIL_PATH);
}
|
||||
|
||||
static struct emc_handler *alloc_emc_handler(void)
|
||||
{
|
||||
struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL);
|
||||
|
||||
if (h) {
|
||||
memset(h, 0, sizeof(*h));
|
||||
spin_lock_init(&h->lock);
|
||||
}
|
||||
|
||||
return h;
|
||||
}
|
||||
|
||||
static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
|
||||
{
|
||||
struct emc_handler *h;
|
||||
unsigned hr, short_trespass;
|
||||
|
||||
if (argc == 0) {
|
||||
/* No arguments: use defaults */
|
||||
hr = 0;
|
||||
short_trespass = 0;
|
||||
} else if (argc != 2) {
|
||||
DMWARN("incorrect number of arguments");
|
||||
return -EINVAL;
|
||||
} else {
|
||||
if ((sscanf(argv[0], "%u", &short_trespass) != 1)
|
||||
|| (short_trespass > 1)) {
|
||||
DMWARN("invalid trespass mode selected");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if ((sscanf(argv[1], "%u", &hr) != 1)
|
||||
|| (hr > 1)) {
|
||||
DMWARN("invalid honor reservation flag selected");
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
h = alloc_emc_handler();
|
||||
if (!h)
|
||||
return -ENOMEM;
|
||||
|
||||
hwh->context = h;
|
||||
|
||||
if ((h->short_trespass = short_trespass))
|
||||
DMWARN("short trespass command will be send");
|
||||
else
|
||||
DMWARN("long trespass command will be send");
|
||||
|
||||
if ((h->hr = hr))
|
||||
DMWARN("honor reservation bit will be set");
|
||||
else
|
||||
DMWARN("honor reservation bit will not be set (default)");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void emc_destroy(struct hw_handler *hwh)
|
||||
{
|
||||
struct emc_handler *h = (struct emc_handler *) hwh->context;
|
||||
|
||||
kfree(h);
|
||||
hwh->context = NULL;
|
||||
}
|
||||
|
||||
/*
 * Classify an I/O error for dm-multipath.  Sense-key based classification
 * is stubbed out (#if 0) until the block layer can deliver sense data;
 * for now everything is deferred to the generic SCSI error handler.
 */
static unsigned emc_error(struct hw_handler *hwh, struct bio *bio)
{
	/* FIXME: Patch from axboe still missing */
#if 0
	int sense;

	if (bio->bi_error & BIO_SENSE) {
		sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */

		if (sense == 0x020403) {
			/* LUN Not Ready - Manual Intervention Required
			 * indicates this is a passive path.
			 *
			 * FIXME: However, if this is seen and EVPD C0
			 * indicates that this is due to a NDU in
			 * progress, we should set FAIL_PATH too.
			 * This indicates we might have to do a SCSI
			 * inquiry in the end_io path. Ugh. */
			return MP_BYPASS_PG | MP_RETRY_IO;
		} else if (sense == 0x052501) {
			/* An array based copy is in progress. Do not
			 * fail the path, do not bypass to another PG,
			 * do not retry. Fail the IO immediately.
			 * (Actually this is the same conclusion as in
			 * the default handler, but lets make sure.) */
			return 0;
		} else if (sense == 0x062900) {
			/* Unit Attention Code. This is the first IO
			 * to the new path, so just retry. */
			return MP_RETRY_IO;
		}
	}
#endif

	/* Try default handler */
	return dm_scsi_err_handler(hwh, bio);
}
|
||||
|
||||
/* Operations table registered with the dm-multipath hw-handler layer. */
static struct hw_handler_type emc_hwh = {
	.name = "emc",
	.module = THIS_MODULE,
	.create = emc_create,
	.destroy = emc_destroy,
	.pg_init = emc_pg_init,
	.error = emc_error,
};
|
||||
|
||||
/* Module init: register the EMC hardware handler with dm-multipath. */
static int __init dm_emc_init(void)
{
	int rc;

	rc = dm_register_hw_handler(&emc_hwh);
	if (rc < 0)
		DMERR("register failed %d", rc);

	DMINFO("version 0.0.3 loaded");

	return rc;
}
|
||||
|
||||
/* Module exit: unregister the EMC hardware handler. */
static void __exit dm_emc_exit(void)
{
	int rc;

	rc = dm_unregister_hw_handler(&emc_hwh);
	if (rc < 0)
		DMERR("unregister failed %d", rc);
}
|
||||
|
||||
/* Module registration glue and metadata. */
module_init(dm_emc_init);
module_exit(dm_emc_exit);

MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath");
MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>");
MODULE_LICENSE("GPL");
|
||||
664
drivers/md/dm-exception-store.c
Normal file
664
drivers/md/dm-exception-store.c
Normal file
@@ -0,0 +1,664 @@
|
||||
/*
|
||||
* dm-snapshot.c
|
||||
*
|
||||
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#include "dm.h"
|
||||
#include "dm-snap.h"
|
||||
#include "dm-io.h"
|
||||
#include "kcopyd.h"
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#define DM_MSG_PREFIX "snapshots"
|
||||
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Persistent snapshots, by persistent we mean that the snapshot
|
||||
* will survive a reboot.
|
||||
*---------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* We need to store a record of which parts of the origin have
|
||||
* been copied to the snapshot device. The snapshot code
|
||||
* requires that we copy exception chunks to chunk aligned areas
|
||||
* of the COW store. It makes sense therefore, to store the
|
||||
* metadata in chunk size blocks.
|
||||
*
|
||||
* There is no backward or forward compatibility implemented,
|
||||
* snapshots with different disk versions than the kernel will
|
||||
* not be usable. It is expected that "lvcreate" will blank out
|
||||
* the start of a fresh COW device before calling the snapshot
|
||||
* constructor.
|
||||
*
|
||||
* The first chunk of the COW device just contains the header.
|
||||
* After this there is a chunk filled with exception metadata,
|
||||
* followed by as many exception chunks as can fit in the
|
||||
* metadata areas.
|
||||
*
|
||||
* All on disk structures are in little-endian format. The end
|
||||
* of the exceptions info is indicated by an exception with a
|
||||
* new_chunk of 0, which is invalid since it would point to the
|
||||
* header chunk.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Magic for persistent snapshots: "SnAp" - Feeble isn't it.
|
||||
*/
|
||||
#define SNAP_MAGIC 0x70416e53
|
||||
|
||||
/*
|
||||
* The on-disk version of the metadata.
|
||||
*/
|
||||
#define SNAPSHOT_DISK_VERSION 1
|
||||
|
||||
struct disk_header {
|
||||
uint32_t magic;
|
||||
|
||||
/*
|
||||
* Is this snapshot valid. There is no way of recovering
|
||||
* an invalid snapshot.
|
||||
*/
|
||||
uint32_t valid;
|
||||
|
||||
/*
|
||||
* Simple, incrementing version. no backward
|
||||
* compatibility.
|
||||
*/
|
||||
uint32_t version;
|
||||
|
||||
/* In sectors */
|
||||
uint32_t chunk_size;
|
||||
};
|
||||
|
||||
/*
 * One on-disk exception record: maps an origin chunk to the COW chunk
 * holding its copied data.  Stored little-endian on disk.
 */
struct disk_exception {
        uint64_t old_chunk;     /* chunk index on the origin device */
        uint64_t new_chunk;     /* chunk index on the COW device; 0 marks end of list */
};
|
||||
|
||||
/*
 * A completion notification queued by persistent_commit() until the
 * metadata area containing the exception has been written to disk.
 */
struct commit_callback {
        void (*callback)(void *, int success);  /* run after the area flush */
        void *context;                          /* opaque argument for callback */
};
|
||||
|
||||
/*
|
||||
* The top level structure for a persistent exception store.
|
||||
*/
|
||||
/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
        struct dm_snapshot *snap;       /* up pointer to my snapshot */
        int version;                    /* on-disk format version */
        int valid;                      /* cleared once the snapshot is unusable */
        uint32_t exceptions_per_area;   /* disk_exceptions per metadata chunk */

        /*
         * Now that we have an asynchronous kcopyd there is no
         * need for large chunk sizes, so it wont hurt to have a
         * whole chunks worth of metadata in memory at once.
         */
        void *area;

        /*
         * Used to keep track of which metadata area the data in
         * 'chunk' refers to.
         */
        uint32_t current_area;

        /*
         * The next free chunk for an exception.
         */
        uint32_t next_free;

        /*
         * The index of next free exception in the current
         * metadata area.
         */
        uint32_t current_committed;

        atomic_t pending_count;         /* prepared but not yet committed exceptions */
        uint32_t callback_count;        /* callbacks queued in 'callbacks' */
        struct commit_callback *callbacks;  /* one slot per exception in an area */
};
|
||||
|
||||
static inline unsigned int sectors_to_pages(unsigned int sectors)
|
||||
{
|
||||
return sectors / (PAGE_SIZE >> 9);
|
||||
}
|
||||
|
||||
static int alloc_area(struct pstore *ps)
|
||||
{
|
||||
int r = -ENOMEM;
|
||||
size_t len;
|
||||
|
||||
len = ps->snap->chunk_size << SECTOR_SHIFT;
|
||||
|
||||
/*
|
||||
* Allocate the chunk_size block of memory that will hold
|
||||
* a single metadata area.
|
||||
*/
|
||||
ps->area = vmalloc(len);
|
||||
if (!ps->area)
|
||||
return r;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Release the in-core metadata area buffer; safe to call again afterwards. */
static void free_area(struct pstore *ps)
{
        vfree(ps->area);
        /* NULL the pointer so a repeated free_area() is a harmless no-op. */
        ps->area = NULL;
}
|
||||
|
||||
/*
|
||||
* Read or write a chunk aligned and sized block of data from a device.
|
||||
*/
|
||||
/*
 * Read or write a chunk aligned and sized block of data from a device.
 * Synchronous; the data travels through the vmalloc'd ps->area buffer.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
        struct io_region where;
        unsigned long bits;     /* out-param of dm_io_sync_vm; discarded here */

        where.bdev = ps->snap->cow->bdev;
        where.sector = ps->snap->chunk_size * chunk;
        where.count = ps->snap->chunk_size;

        return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
}
|
||||
|
||||
/*
|
||||
* Read or write a metadata area. Remembering to skip the first
|
||||
* chunk which holds the header.
|
||||
*/
|
||||
/*
 * Read or write metadata area 'area'.  Chunk 0 holds the header and each
 * metadata chunk is followed by exceptions_per_area data chunks, so the
 * area index is first converted into its chunk index.  On success,
 * remember which area is now resident in ps->area.
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
        uint32_t chunk = 1 + ((ps->exceptions_per_area + 1) * area);
        int r = chunk_io(ps, chunk, rw);

        if (!r)
                ps->current_area = area;

        return r;
}
|
||||
|
||||
/* Clear the in-core buffer and write it out as metadata area 'area'. */
static int zero_area(struct pstore *ps, uint32_t area)
{
        size_t len = ps->snap->chunk_size << SECTOR_SHIFT;

        memset(ps->area, 0, len);

        return area_io(ps, area, WRITE);
}
|
||||
|
||||
/*
 * Read and validate the on-disk header from chunk 0 of the COW device,
 * completing the snapshot's chunk geometry.  On success *new_snapshot is
 * set to 1 when the device is blank (magic == 0, needs a fresh header)
 * and 0 when valid metadata was found.
 */
static int read_header(struct pstore *ps, int *new_snapshot)
{
        int r;
        struct disk_header *dh;
        chunk_t chunk_size;
        int chunk_size_supplied = 1;

        /*
         * Use default chunk size (or hardsect_size, if larger) if none supplied
         */
        if (!ps->snap->chunk_size) {
                ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
                    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
                ps->snap->chunk_mask = ps->snap->chunk_size - 1;
                /* NOTE(review): ffs-derived shift assumes a power-of-2 size
                 * — presumably guaranteed by the constructor; confirm. */
                ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
                chunk_size_supplied = 0;
        }

        /* Reserve io pages sized for one chunk before touching the device. */
        r = dm_io_get(sectors_to_pages(ps->snap->chunk_size));
        if (r)
                return r;

        r = alloc_area(ps);
        if (r)
                goto bad1;

        /* Chunk 0 of the COW device holds the header. */
        r = chunk_io(ps, 0, READ);
        if (r)
                goto bad2;

        dh = (struct disk_header *) ps->area;

        /* A zeroed magic means a freshly blanked COW device. */
        if (le32_to_cpu(dh->magic) == 0) {
                *new_snapshot = 1;
                return 0;
        }

        if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
                DMWARN("Invalid or corrupt snapshot");
                r = -ENXIO;
                goto bad2;
        }

        *new_snapshot = 0;
        ps->valid = le32_to_cpu(dh->valid);
        ps->version = le32_to_cpu(dh->version);
        chunk_size = le32_to_cpu(dh->chunk_size);

        /* Done unless a table-supplied size disagrees with the metadata. */
        if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
                return 0;

        DMWARN("chunk size %llu in device metadata overrides "
               "table chunk size of %llu.",
               (unsigned long long)chunk_size,
               (unsigned long long)ps->snap->chunk_size);

        /* We had a bogus chunk_size. Fix stuff up. */
        dm_io_put(sectors_to_pages(ps->snap->chunk_size));
        free_area(ps);

        ps->snap->chunk_size = chunk_size;
        ps->snap->chunk_mask = chunk_size - 1;
        ps->snap->chunk_shift = ffs(chunk_size) - 1;

        /* Re-reserve the io pages and area buffer at the on-disk size. */
        r = dm_io_get(sectors_to_pages(chunk_size));
        if (r)
                return r;

        r = alloc_area(ps);
        if (r)
                goto bad1;

        return 0;

 bad2:
        free_area(ps);
 bad1:
        dm_io_put(sectors_to_pages(ps->snap->chunk_size));
        return r;
}
|
||||
|
||||
static int write_header(struct pstore *ps)
|
||||
{
|
||||
struct disk_header *dh;
|
||||
|
||||
memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
|
||||
|
||||
dh = (struct disk_header *) ps->area;
|
||||
dh->magic = cpu_to_le32(SNAP_MAGIC);
|
||||
dh->valid = cpu_to_le32(ps->valid);
|
||||
dh->version = cpu_to_le32(ps->version);
|
||||
dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
|
||||
|
||||
return chunk_io(ps, 0, WRITE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Access functions for the disk exceptions, these do the endian conversions.
|
||||
*/
|
||||
/*
 * Return a pointer to the 'index'th on-disk exception record within the
 * in-core metadata area.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
        struct disk_exception *base = ps->area;

        BUG_ON(index >= ps->exceptions_per_area);

        return base + index;
}
|
||||
|
||||
static void read_exception(struct pstore *ps,
|
||||
uint32_t index, struct disk_exception *result)
|
||||
{
|
||||
struct disk_exception *e = get_exception(ps, index);
|
||||
|
||||
/* copy it */
|
||||
result->old_chunk = le64_to_cpu(e->old_chunk);
|
||||
result->new_chunk = le64_to_cpu(e->new_chunk);
|
||||
}
|
||||
|
||||
static void write_exception(struct pstore *ps,
|
||||
uint32_t index, struct disk_exception *de)
|
||||
{
|
||||
struct disk_exception *e = get_exception(ps, index);
|
||||
|
||||
/* copy it */
|
||||
e->old_chunk = cpu_to_le64(de->old_chunk);
|
||||
e->new_chunk = cpu_to_le64(de->new_chunk);
|
||||
}
|
||||
|
||||
/*
|
||||
* Registers the exceptions that are present in the current area.
|
||||
* 'full' is filled in to indicate if the area has been
|
||||
* filled.
|
||||
*/
|
||||
/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate if the area has been
 * filled.  Also advances ps->next_free past every COW chunk seen.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
        int r;
        unsigned int i;
        struct disk_exception de;

        /* presume the area is full */
        *full = 1;

        for (i = 0; i < ps->exceptions_per_area; i++) {
                read_exception(ps, i, &de);

                /*
                 * If the new_chunk is pointing at the start of
                 * the COW device, where the first metadata area
                 * is we know that we've hit the end of the
                 * exceptions.  Therefore the area is not full.
                 */
                if (de.new_chunk == 0LL) {
                        /* The first free slot in this area. */
                        ps->current_committed = i;
                        *full = 0;
                        break;
                }

                /*
                 * Keep track of the start of the free chunks.
                 */
                if (ps->next_free <= de.new_chunk)
                        ps->next_free = de.new_chunk + 1;

                /*
                 * Otherwise we add the exception to the snapshot.
                 */
                r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
                if (r)
                        return r;
        }

        return 0;
}
|
||||
|
||||
static int read_exceptions(struct pstore *ps)
|
||||
{
|
||||
uint32_t area;
|
||||
int r, full = 1;
|
||||
|
||||
/*
|
||||
* Keeping reading chunks and inserting exceptions until
|
||||
* we find a partially full area.
|
||||
*/
|
||||
for (area = 0; full; area++) {
|
||||
r = area_io(ps, area, READ);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
r = insert_exceptions(ps, &full);
|
||||
if (r)
|
||||
return r;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline struct pstore *get_info(struct exception_store *store)
|
||||
{
|
||||
return (struct pstore *) store->context;
|
||||
}
|
||||
|
||||
/*
 * Report COW usage as a fraction: sectors handed out so far over the
 * total size of the COW device.
 */
static void persistent_fraction_full(struct exception_store *store,
                                     sector_t *numerator, sector_t *denominator)
{
        struct pstore *ps = get_info(store);

        *numerator = ps->next_free * store->snap->chunk_size;
        *denominator = get_dev_size(store->snap->cow->bdev);
}
|
||||
|
||||
/*
 * Tear down a persistent store: return the reserved io pages, then free
 * the callback array, the area buffer and finally the pstore itself.
 */
static void persistent_destroy(struct exception_store *store)
{
        struct pstore *ps = get_info(store);

        dm_io_put(sectors_to_pages(ps->snap->chunk_size));
        vfree(ps->callbacks);
        free_area(ps);
        kfree(ps);
}
|
||||
|
||||
/*
 * Load (or initialise) the on-disk state for a persistent snapshot:
 * read the header, size the in-core structures from the now-known chunk
 * size, then either write a fresh header plus empty first area, or
 * sanity-check and replay the existing exceptions.
 */
static int persistent_read_metadata(struct exception_store *store)
{
        int r, new_snapshot;
        struct pstore *ps = get_info(store);

        /*
         * Read the snapshot header.
         */
        r = read_header(ps, &new_snapshot);
        if (r)
                return r;

        /*
         * Now we know correct chunk_size, complete the initialisation.
         */
        ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
                                  sizeof(struct disk_exception);
        ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
                        sizeof(*ps->callbacks));
        if (!ps->callbacks)
                return -ENOMEM;

        /*
         * Do we need to setup a new snapshot ?
         */
        if (new_snapshot) {
                r = write_header(ps);
                if (r) {
                        DMWARN("write_header failed");
                        return r;
                }

                /* Terminate the (empty) exception list on disk. */
                r = zero_area(ps, 0);
                if (r) {
                        DMWARN("zero_area(0) failed");
                        return r;
                }

        } else {
                /*
                 * Sanity checks.
                 */
                if (!ps->valid) {
                        DMWARN("snapshot is marked invalid");
                        return -EINVAL;
                }

                if (ps->version != SNAPSHOT_DISK_VERSION) {
                        DMWARN("unable to handle snapshot disk version %d",
                               ps->version);
                        return -EINVAL;
                }

                /*
                 * Read the metadata.
                 */
                r = read_exceptions(ps);
                if (r)
                        return r;
        }

        return 0;
}
|
||||
|
||||
static int persistent_prepare(struct exception_store *store,
|
||||
struct exception *e)
|
||||
{
|
||||
struct pstore *ps = get_info(store);
|
||||
uint32_t stride;
|
||||
sector_t size = get_dev_size(store->snap->cow->bdev);
|
||||
|
||||
/* Is there enough room ? */
|
||||
if (size < ((ps->next_free + 1) * store->snap->chunk_size))
|
||||
return -ENOSPC;
|
||||
|
||||
e->new_chunk = ps->next_free;
|
||||
|
||||
/*
|
||||
* Move onto the next free pending, making sure to take
|
||||
* into account the location of the metadata chunks.
|
||||
*/
|
||||
stride = (ps->exceptions_per_area + 1);
|
||||
if ((++ps->next_free % stride) == 1)
|
||||
ps->next_free++;
|
||||
|
||||
atomic_inc(&ps->pending_count);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Record a completed exception in the current metadata area and queue
 * the caller's callback.  The area is flushed to disk once there are no
 * more exceptions in flight or the area is full; all queued callbacks
 * then fire with 'success' reflecting the io result.
 */
static void persistent_commit(struct exception_store *store,
                              struct exception *e,
                              void (*callback) (void *, int success),
                              void *callback_context)
{
        int r;
        unsigned int i;
        struct pstore *ps = get_info(store);
        struct disk_exception de;
        struct commit_callback *cb;

        de.old_chunk = e->old_chunk;
        de.new_chunk = e->new_chunk;
        write_exception(ps, ps->current_committed++, &de);

        /*
         * Add the callback to the back of the array.  This code
         * is the only place where the callback array is
         * manipulated, and we know that it will never be called
         * multiple times concurrently.
         */
        cb = ps->callbacks + ps->callback_count++;
        cb->callback = callback;
        cb->context = callback_context;

        /*
         * If there are no more exceptions in flight, or we have
         * filled this metadata area we commit the exceptions to
         * disk.
         */
        if (atomic_dec_and_test(&ps->pending_count) ||
            (ps->current_committed == ps->exceptions_per_area)) {
                r = area_io(ps, ps->current_area, WRITE);
                if (r)
                        /* A failed flush makes the snapshot unusable. */
                        ps->valid = 0;

                /*
                 * Have we completely filled the current area ?
                 */
                if (ps->current_committed == ps->exceptions_per_area) {
                        ps->current_committed = 0;
                        /* Write the next area's end-of-list marker. */
                        r = zero_area(ps, ps->current_area + 1);
                        if (r)
                                ps->valid = 0;
                }

                /* Flush done (or failed): run every queued callback. */
                for (i = 0; i < ps->callback_count; i++) {
                        cb = ps->callbacks + i;
                        cb->callback(cb->context, r == 0 ? 1 : 0);
                }

                ps->callback_count = 0;
        }
}
|
||||
|
||||
/*
 * Mark the snapshot invalid in core and persist that fact in the
 * on-disk header; there is no recovery from this state.
 */
static void persistent_drop(struct exception_store *store)
{
        struct pstore *ps = get_info(store);

        ps->valid = 0;
        if (write_header(ps))
                DMWARN("write header failed");
}
|
||||
|
||||
int dm_create_persistent(struct exception_store *store)
|
||||
{
|
||||
struct pstore *ps;
|
||||
|
||||
/* allocate the pstore */
|
||||
ps = kmalloc(sizeof(*ps), GFP_KERNEL);
|
||||
if (!ps)
|
||||
return -ENOMEM;
|
||||
|
||||
ps->snap = store->snap;
|
||||
ps->valid = 1;
|
||||
ps->version = SNAPSHOT_DISK_VERSION;
|
||||
ps->area = NULL;
|
||||
ps->next_free = 2; /* skipping the header and first area */
|
||||
ps->current_committed = 0;
|
||||
|
||||
ps->callback_count = 0;
|
||||
atomic_set(&ps->pending_count, 0);
|
||||
ps->callbacks = NULL;
|
||||
|
||||
store->destroy = persistent_destroy;
|
||||
store->read_metadata = persistent_read_metadata;
|
||||
store->prepare_exception = persistent_prepare;
|
||||
store->commit_exception = persistent_commit;
|
||||
store->drop_snapshot = persistent_drop;
|
||||
store->fraction_full = persistent_fraction_full;
|
||||
store->context = ps;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Implementation of the store for non-persistent snapshots.
|
||||
*---------------------------------------------------------------*/
|
||||
/* Context for a non-persistent store: a simple bump-allocator cursor. */
struct transient_c {
        sector_t next_free;     /* next unused sector on the COW device */
};
|
||||
|
||||
static void transient_destroy(struct exception_store *store)
|
||||
{
|
||||
kfree(store->context);
|
||||
}
|
||||
|
||||
/*
 * Transient snapshots have no on-disk metadata to load; always succeeds.
 */
static int transient_read_metadata(struct exception_store *store)
{
        return 0;
}
|
||||
|
||||
static int transient_prepare(struct exception_store *store, struct exception *e)
|
||||
{
|
||||
struct transient_c *tc = (struct transient_c *) store->context;
|
||||
sector_t size = get_dev_size(store->snap->cow->bdev);
|
||||
|
||||
if (size < (tc->next_free + store->snap->chunk_size))
|
||||
return -1;
|
||||
|
||||
e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
|
||||
tc->next_free += store->snap->chunk_size;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Nothing needs flushing for a transient snapshot: report success to
 * the caller immediately.
 */
static void transient_commit(struct exception_store *store,
                             struct exception *e,
                             void (*callback) (void *, int success),
                             void *callback_context)
{
        callback(callback_context, 1);
}
|
||||
|
||||
/* Report usage: sectors consumed so far over the COW device size. */
static void transient_fraction_full(struct exception_store *store,
                                    sector_t *numerator, sector_t *denominator)
{
        struct transient_c *tc = store->context;

        *numerator = tc->next_free;
        *denominator = get_dev_size(store->snap->cow->bdev);
}
|
||||
|
||||
int dm_create_transient(struct exception_store *store)
|
||||
{
|
||||
struct transient_c *tc;
|
||||
|
||||
store->destroy = transient_destroy;
|
||||
store->read_metadata = transient_read_metadata;
|
||||
store->prepare_exception = transient_prepare;
|
||||
store->commit_exception = transient_commit;
|
||||
store->drop_snapshot = NULL;
|
||||
store->fraction_full = transient_fraction_full;
|
||||
|
||||
tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
|
||||
if (!tc)
|
||||
return -ENOMEM;
|
||||
|
||||
tc->next_free = 0;
|
||||
store->context = tc;
|
||||
|
||||
return 0;
|
||||
}
|
||||
215
drivers/md/dm-hw-handler.c
Normal file
215
drivers/md/dm-hw-handler.c
Normal file
@@ -0,0 +1,215 @@
|
||||
/*
|
||||
* Copyright (C) 2004 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*
|
||||
* Multipath hardware handler registration.
|
||||
*/
|
||||
|
||||
#include "dm.h"
|
||||
#include "dm-hw-handler.h"
|
||||
|
||||
#include <linux/slab.h>
|
||||
|
||||
/*
 * A registered hardware handler type together with its registry
 * bookkeeping.
 */
struct hwh_internal {
        struct hw_handler_type hwht;    /* copy of the registered type */

        struct list_head list;          /* link on _hw_handlers */
        long use;                       /* number of active users of this type */
};
|
||||
|
||||
#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht)
|
||||
|
||||
static LIST_HEAD(_hw_handlers);
|
||||
static DECLARE_RWSEM(_hwh_lock);
|
||||
|
||||
static struct hwh_internal *__find_hw_handler_type(const char *name)
|
||||
{
|
||||
struct hwh_internal *hwhi;
|
||||
|
||||
list_for_each_entry(hwhi, &_hw_handlers, list) {
|
||||
if (!strcmp(name, hwhi->hwht.name))
|
||||
return hwhi;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct hwh_internal *get_hw_handler(const char *name)
|
||||
{
|
||||
struct hwh_internal *hwhi;
|
||||
|
||||
down_read(&_hwh_lock);
|
||||
hwhi = __find_hw_handler_type(name);
|
||||
if (hwhi) {
|
||||
if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module))
|
||||
hwhi = NULL;
|
||||
else
|
||||
hwhi->use++;
|
||||
}
|
||||
up_read(&_hwh_lock);
|
||||
|
||||
return hwhi;
|
||||
}
|
||||
|
||||
/*
 * Resolve a handler type by name, loading the "dm-<name>" module on
 * demand.  Returns a referenced type (release with dm_put_hw_handler)
 * or NULL.
 */
struct hw_handler_type *dm_get_hw_handler(const char *name)
{
        struct hwh_internal *hwhi;

        if (!name)
                return NULL;

        hwhi = get_hw_handler(name);
        if (!hwhi) {
                /* Not registered yet - try to pull in the module and retry. */
                request_module("dm-%s", name);
                hwhi = get_hw_handler(name);
        }

        return hwhi ? &hwhi->hwht : NULL;
}
|
||||
|
||||
void dm_put_hw_handler(struct hw_handler_type *hwht)
|
||||
{
|
||||
struct hwh_internal *hwhi;
|
||||
|
||||
if (!hwht)
|
||||
return;
|
||||
|
||||
down_read(&_hwh_lock);
|
||||
hwhi = __find_hw_handler_type(hwht->name);
|
||||
if (!hwhi)
|
||||
goto out;
|
||||
|
||||
if (--hwhi->use == 0)
|
||||
module_put(hwhi->hwht.module);
|
||||
|
||||
BUG_ON(hwhi->use < 0);
|
||||
|
||||
out:
|
||||
up_read(&_hwh_lock);
|
||||
}
|
||||
|
||||
static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht)
|
||||
{
|
||||
struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL);
|
||||
|
||||
if (hwhi) {
|
||||
memset(hwhi, 0, sizeof(*hwhi));
|
||||
hwhi->hwht = *hwht;
|
||||
}
|
||||
|
||||
return hwhi;
|
||||
}
|
||||
|
||||
int dm_register_hw_handler(struct hw_handler_type *hwht)
|
||||
{
|
||||
int r = 0;
|
||||
struct hwh_internal *hwhi = _alloc_hw_handler(hwht);
|
||||
|
||||
if (!hwhi)
|
||||
return -ENOMEM;
|
||||
|
||||
down_write(&_hwh_lock);
|
||||
|
||||
if (__find_hw_handler_type(hwht->name)) {
|
||||
kfree(hwhi);
|
||||
r = -EEXIST;
|
||||
} else
|
||||
list_add(&hwhi->list, &_hw_handlers);
|
||||
|
||||
up_write(&_hwh_lock);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
int dm_unregister_hw_handler(struct hw_handler_type *hwht)
|
||||
{
|
||||
struct hwh_internal *hwhi;
|
||||
|
||||
down_write(&_hwh_lock);
|
||||
|
||||
hwhi = __find_hw_handler_type(hwht->name);
|
||||
if (!hwhi) {
|
||||
up_write(&_hwh_lock);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (hwhi->use) {
|
||||
up_write(&_hwh_lock);
|
||||
return -ETXTBSY;
|
||||
}
|
||||
|
||||
list_del(&hwhi->list);
|
||||
|
||||
up_write(&_hwh_lock);
|
||||
|
||||
kfree(hwhi);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Generic SCSI error decoder for hardware handlers.  The sense-based
 * decoding below is compiled out (#if 0), so at present every error
 * maps to MP_FAIL_PATH.
 */
unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio)
{
#if 0
	int sense_key, asc, ascq;

	if (bio->bi_error & BIO_SENSE) {
		/* FIXME: This is just an initial guess. */
		/* key / asc / ascq */
		sense_key = (bio->bi_error >> 16) & 0xff;
		asc = (bio->bi_error >> 8) & 0xff;
		ascq = bio->bi_error & 0xff;

		switch (sense_key) {
			/* This block as a whole comes from the device.
			 * So no point retrying on another path. */
		case 0x03:	/* Medium error */
		case 0x05:	/* Illegal request */
		case 0x07:	/* Data protect */
		case 0x08:	/* Blank check */
		case 0x0a:	/* copy aborted */
		case 0x0c:	/* obsolete - no clue ;-) */
		case 0x0d:	/* volume overflow */
		case 0x0e:	/* data miscompare */
		case 0x0f:	/* reserved - no idea either. */
			return MP_ERROR_IO;

			/* For these errors it's unclear whether they
			 * come from the device or the controller.
			 * So just lets try a different path, and if
			 * it eventually succeeds, user-space will clear
			 * the paths again... */
		case 0x02:	/* Not ready */
		case 0x04:	/* Hardware error */
		case 0x09:	/* vendor specific */
		case 0x0b:	/* Aborted command */
			return MP_FAIL_PATH;

		case 0x06:	/* Unit attention - might want to decode */
			if (asc == 0x04 && ascq == 0x01)
				/* "Unit in the process of
				 * becoming ready" */
				return 0;
			return MP_FAIL_PATH;

			/* FIXME: For Unit Not Ready we may want
			 * to have a generic pg activation
			 * feature (START_UNIT). */

			/* Should these two ever end up in the
			 * error path? I don't think so. */
		case 0x00:	/* No sense */
		case 0x01:	/* Recovered error */
			return 0;
		}
	}
#endif

	/* We got no idea how to decode the other kinds of errors ->
	 * assume generic error condition. */
	return MP_FAIL_PATH;
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(dm_register_hw_handler);
|
||||
EXPORT_SYMBOL_GPL(dm_unregister_hw_handler);
|
||||
EXPORT_SYMBOL_GPL(dm_scsi_err_handler);
|
||||
61
drivers/md/dm-hw-handler.h
Normal file
61
drivers/md/dm-hw-handler.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright (C) 2004 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*
|
||||
* Multipath hardware handler registration.
|
||||
*/
|
||||
|
||||
#ifndef DM_HW_HANDLER_H
|
||||
#define DM_HW_HANDLER_H
|
||||
|
||||
#include <linux/device-mapper.h>
|
||||
|
||||
#include "dm-mpath.h"
|
||||
|
||||
struct hw_handler_type;
|
||||
/* A hardware handler instance bound to one multipath map. */
struct hw_handler {
        struct hw_handler_type *type;   /* the registered handler type */
        void *context;                  /* per-instance private data */
};
|
||||
|
||||
/*
|
||||
* Constructs a hardware handler object, takes custom arguments
|
||||
*/
|
||||
/* Information about a hardware handler type */
|
||||
/* Information about a hardware handler type */
struct hw_handler_type {
        char *name;                     /* unique registry key */
        struct module *module;          /* pinned while the type is in use */

        /* Construct an instance from table arguments. */
        int (*create) (struct hw_handler *handler, unsigned int argc,
                       char **argv);
        void (*destroy) (struct hw_handler *hwh);

        /* Activate a priority group through the given path. */
        void (*pg_init) (struct hw_handler *hwh, unsigned bypassed,
                         struct dm_path *path);
        /* Decode an io error into MP_* flags. */
        unsigned (*error) (struct hw_handler *hwh, struct bio *bio);
        int (*status) (struct hw_handler *hwh, status_type_t type,
                       char *result, unsigned int maxlen);
};
|
||||
|
||||
/* Register a hardware handler */
|
||||
int dm_register_hw_handler(struct hw_handler_type *type);
|
||||
|
||||
/* Unregister a hardware handler */
|
||||
int dm_unregister_hw_handler(struct hw_handler_type *type);
|
||||
|
||||
/* Returns a registered hardware handler type */
|
||||
struct hw_handler_type *dm_get_hw_handler(const char *name);
|
||||
|
||||
/* Releases a hardware handler */
|
||||
void dm_put_hw_handler(struct hw_handler_type *hwht);
|
||||
|
||||
/* Default err function */
|
||||
unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio);
|
||||
|
||||
/* Error flags for err and dm_pg_init_complete */
|
||||
#define MP_FAIL_PATH 1
|
||||
#define MP_BYPASS_PG 2
|
||||
#define MP_ERROR_IO 4 /* Don't retry this I/O */
|
||||
|
||||
#endif
|
||||
426
drivers/md/dm-io.c
Normal file
426
drivers/md/dm-io.c
Normal file
@@ -0,0 +1,426 @@
|
||||
/*
|
||||
* Copyright (C) 2003 Sistina Software
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#include "dm-io.h"
|
||||
|
||||
#include <linux/bio.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
static struct bio_set *_bios;
|
||||
|
||||
/* FIXME: can we shrink this ? */
|
||||
/* FIXME: can we shrink this ? */
/*
 * Tracks one logical io that may be split across several bios.
 * Synchronous callers set 'sleeper' and are woken on completion;
 * asynchronous callers get 'callback(context)' instead.
 */
struct io {
        unsigned long error;            /* bitset: one error bit per region */
        atomic_t count;                 /* outstanding bios (plus one extra ref) */
        struct task_struct *sleeper;    /* non-NULL for synchronous io */
        io_notify_fn callback;          /* async completion callback */
        void *context;                  /* argument for callback */
};
|
||||
|
||||
/*
|
||||
* io contexts are only dynamically allocated for asynchronous
|
||||
* io. Since async io is likely to be the majority of io we'll
|
||||
* have the same number of io contexts as buffer heads ! (FIXME:
|
||||
* must reduce this).
|
||||
*/
|
||||
static unsigned _num_ios;
|
||||
static mempool_t *_io_pool;
|
||||
|
||||
/*
 * Heuristic sizing of the io-context pool: four io structs per page of
 * client data.
 */
static unsigned int pages_to_ios(unsigned int pages)
{
        return pages * 4;       /* too many ? */
}
|
||||
|
||||
/*
 * Grow, shrink, create or destroy the global io mempool and bio set so
 * they can service 'new_ios' concurrent io structs.  new_ios == 0 tears
 * everything down.
 */
static int resize_pool(unsigned int new_ios)
{
        int r = 0;

        if (_io_pool) {
                if (new_ios == 0) {
                        /* free off the pool */
                        mempool_destroy(_io_pool);
                        _io_pool = NULL;
                        bioset_free(_bios);

                } else {
                        /* resize the pool */
                        r = mempool_resize(_io_pool, new_ios, GFP_KERNEL);
                }

        } else {
                /* create new pool */
                _io_pool = mempool_create_kmalloc_pool(new_ios,
                                                       sizeof(struct io));
                if (!_io_pool)
                        return -ENOMEM;

                _bios = bioset_create(16, 16, 4);
                if (!_bios) {
                        mempool_destroy(_io_pool);
                        _io_pool = NULL;
                        return -ENOMEM;
                }
        }

        /* Only record the new size if the resize/creation succeeded. */
        if (!r)
                _num_ios = new_ios;

        return r;
}
|
||||
|
||||
/* Grow the shared pools to cover 'num_pages' more pages of client io. */
int dm_io_get(unsigned int num_pages)
{
        return resize_pool(_num_ios + pages_to_ios(num_pages));
}
|
||||
|
||||
/* Return a previous dm_io_get() reservation; must balance prior gets. */
void dm_io_put(unsigned int num_pages)
{
        resize_pool(_num_ios - pages_to_ios(num_pages));
}
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* We need to keep track of which region a bio is doing io for.
|
||||
* In order to save a memory allocation we store this the last
|
||||
* bvec which we know is unused (blech).
|
||||
* XXX This is ugly and can OOPS with some configs... find another way.
|
||||
*---------------------------------------------------------------*/
|
||||
/* Stash the region number in the spare bvec (see the comment above). */
static inline void bio_set_region(struct bio *bio, unsigned region)
{
        bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
}
|
||||
|
||||
/* Retrieve the region number hidden by bio_set_region(). */
static inline unsigned bio_get_region(struct bio *bio)
{
        return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
}
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* We need an io object to keep track of the number of bios that
|
||||
* have been dispatched for a particular io.
|
||||
*---------------------------------------------------------------*/
|
||||
/*
 * Drop one reference from 'io', recording any error against 'region'.
 * When the last reference goes: wake the synchronous sleeper, or free
 * the io and invoke the asynchronous callback.
 */
static void dec_count(struct io *io, unsigned int region, int error)
{
        if (error)
                set_bit(region, &io->error);

        if (atomic_dec_and_test(&io->count)) {
                if (io->sleeper)
                        wake_up_process(io->sleeper);

                else {
                        /* Copy everything we need out of io before
                         * freeing it; the callback runs afterwards. */
                        int r = io->error;
                        io_notify_fn fn = io->callback;
                        void *context = io->context;

                        mempool_free(io, _io_pool);
                        fn(r, context);
                }
        }
}
|
||||
|
||||
/*
 * Per-bio completion: once the bio has fully completed, account it
 * against the owning io and release the bio.
 */
static int endio(struct bio *bio, unsigned int done, int error)
{
        struct io *io = (struct io *) bio->bi_private;

        /* keep going until we've finished */
        if (bio->bi_size)
                return 1;

        /* Failed reads must not leak stale buffer contents to the caller. */
        if (error && bio_data_dir(bio) == READ)
                zero_fill_bio(bio);

        dec_count(io, bio_get_region(bio), error);
        /* Restore the bvec hidden by do_region() before freeing the bio. */
        bio->bi_max_vecs++;
        bio_put(bio);

        return 0;
}
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* These little objects provide an abstraction for getting a new
|
||||
* destination page for io.
|
||||
*---------------------------------------------------------------*/
|
||||
/*
 * Iterator over a source/destination of io pages.  Each flavour
 * (page list, bvec, vmalloc range) supplies its own two methods.
 */
struct dpages {
        /* Yield the current page with its usable length and start offset. */
        void (*get_page)(struct dpages *dp,
                         struct page **p, unsigned long *len, unsigned *offset);
        /* Advance to the following page. */
        void (*next_page)(struct dpages *dp);

        unsigned context_u;     /* per-flavour scalar state (e.g. offset) */
        void *context_ptr;      /* per-flavour cursor (list / bvec / vm addr) */
};
|
||||
|
||||
/*
|
||||
* Functions for getting the pages from a list.
|
||||
*/
|
||||
static void list_get_page(struct dpages *dp,
|
||||
struct page **p, unsigned long *len, unsigned *offset)
|
||||
{
|
||||
unsigned o = dp->context_u;
|
||||
struct page_list *pl = (struct page_list *) dp->context_ptr;
|
||||
|
||||
*p = pl->page;
|
||||
*len = PAGE_SIZE - o;
|
||||
*offset = o;
|
||||
}
|
||||
|
||||
static void list_next_page(struct dpages *dp)
|
||||
{
|
||||
struct page_list *pl = (struct page_list *) dp->context_ptr;
|
||||
dp->context_ptr = pl->next;
|
||||
dp->context_u = 0;
|
||||
}
|
||||
|
||||
static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
|
||||
{
|
||||
dp->get_page = list_get_page;
|
||||
dp->next_page = list_next_page;
|
||||
dp->context_u = offset;
|
||||
dp->context_ptr = pl;
|
||||
}
|
||||
|
||||
/*
|
||||
* Functions for getting the pages from a bvec.
|
||||
*/
|
||||
static void bvec_get_page(struct dpages *dp,
|
||||
struct page **p, unsigned long *len, unsigned *offset)
|
||||
{
|
||||
struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
|
||||
*p = bvec->bv_page;
|
||||
*len = bvec->bv_len;
|
||||
*offset = bvec->bv_offset;
|
||||
}
|
||||
|
||||
static void bvec_next_page(struct dpages *dp)
|
||||
{
|
||||
struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
|
||||
dp->context_ptr = bvec + 1;
|
||||
}
|
||||
|
||||
static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
|
||||
{
|
||||
dp->get_page = bvec_get_page;
|
||||
dp->next_page = bvec_next_page;
|
||||
dp->context_ptr = bvec;
|
||||
}
|
||||
|
||||
static void vm_get_page(struct dpages *dp,
|
||||
struct page **p, unsigned long *len, unsigned *offset)
|
||||
{
|
||||
*p = vmalloc_to_page(dp->context_ptr);
|
||||
*offset = dp->context_u;
|
||||
*len = PAGE_SIZE - dp->context_u;
|
||||
}
|
||||
|
||||
static void vm_next_page(struct dpages *dp)
|
||||
{
|
||||
dp->context_ptr += PAGE_SIZE - dp->context_u;
|
||||
dp->context_u = 0;
|
||||
}
|
||||
|
||||
static void vm_dp_init(struct dpages *dp, void *data)
|
||||
{
|
||||
dp->get_page = vm_get_page;
|
||||
dp->next_page = vm_next_page;
|
||||
dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
|
||||
dp->context_ptr = data;
|
||||
}
|
||||
|
||||
/* Destructor returning our bios to the private bio_set. */
static void dm_bio_destructor(struct bio *bio)
{
        bio_free(bio, _bios);
}
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* IO routines that accept a list of pages.
|
||||
*---------------------------------------------------------------*/
|
||||
/*
 * Build and submit as many bios as needed to cover 'where', pulling
 * destination pages from 'dp' and accounting each bio against 'io'.
 */
static void do_region(int rw, unsigned int region, struct io_region *where,
                      struct dpages *dp, struct io *io)
{
        struct bio *bio;
        struct page *page;
        unsigned long len;
        unsigned offset;
        unsigned num_bvecs;
        sector_t remaining = where->count;

        while (remaining) {
                /*
                 * Allocate a suitably sized-bio: we add an extra
                 * bvec for bio_get/set_region() and decrement bi_max_vecs
                 * to hide it from bio_add_page().
                 */
                num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2;
                bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios);
                bio->bi_sector = where->sector + (where->count - remaining);
                bio->bi_bdev = where->bdev;
                bio->bi_end_io = endio;
                bio->bi_private = io;
                bio->bi_destructor = dm_bio_destructor;
                bio->bi_max_vecs--;
                bio_set_region(bio, region);

                /*
                 * Try and add as many pages as possible.
                 */
                while (remaining) {
                        dp->get_page(dp, &page, &len, &offset);
                        len = min(len, to_bytes(remaining));
                        if (!bio_add_page(bio, page, len, offset))
                                break;

                        offset = 0;
                        remaining -= to_sector(len);
                        dp->next_page(dp);
                }

                /* One io reference per submitted bio; see dec_count(). */
                atomic_inc(&io->count);
                submit_bio(rw, bio);
        }
}
|
||||
|
||||
static void dispatch_io(int rw, unsigned int num_regions,
|
||||
struct io_region *where, struct dpages *dp,
|
||||
struct io *io, int sync)
|
||||
{
|
||||
int i;
|
||||
struct dpages old_pages = *dp;
|
||||
|
||||
if (sync)
|
||||
rw |= (1 << BIO_RW_SYNC);
|
||||
|
||||
/*
|
||||
* For multiple regions we need to be careful to rewind
|
||||
* the dp object for each call to do_region.
|
||||
*/
|
||||
for (i = 0; i < num_regions; i++) {
|
||||
*dp = old_pages;
|
||||
if (where[i].count)
|
||||
do_region(rw, i, where + i, dp, io);
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop the extra reference that we were holding to avoid
|
||||
* the io being completed too early.
|
||||
*/
|
||||
dec_count(io, 0, 0);
|
||||
}
|
||||
|
||||
static int sync_io(unsigned int num_regions, struct io_region *where,
|
||||
int rw, struct dpages *dp, unsigned long *error_bits)
|
||||
{
|
||||
struct io io;
|
||||
|
||||
if (num_regions > 1 && rw != WRITE) {
|
||||
WARN_ON(1);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
io.error = 0;
|
||||
atomic_set(&io.count, 1); /* see dispatch_io() */
|
||||
io.sleeper = current;
|
||||
|
||||
dispatch_io(rw, num_regions, where, dp, &io, 1);
|
||||
|
||||
while (1) {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
|
||||
if (!atomic_read(&io.count) || signal_pending(current))
|
||||
break;
|
||||
|
||||
io_schedule();
|
||||
}
|
||||
set_current_state(TASK_RUNNING);
|
||||
|
||||
if (atomic_read(&io.count))
|
||||
return -EINTR;
|
||||
|
||||
*error_bits = io.error;
|
||||
return io.error ? -EIO : 0;
|
||||
}
|
||||
|
||||
/*
 * Dispatch the io and return immediately; 'fn' is called on completion.
 * Multi-region requests are only supported for writes.
 */
static int async_io(unsigned int num_regions, struct io_region *where, int rw,
		    struct dpages *dp, io_notify_fn fn, void *context)
{
	struct io *io;

	if (num_regions > 1 && rw != WRITE) {
		WARN_ON(1);
		fn(1, context);
		return -EIO;
	}

	io = mempool_alloc(_io_pool, GFP_NOIO);
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->error = 0;
	io->sleeper = NULL;
	io->callback = fn;
	io->context = context;

	dispatch_io(rw, num_regions, where, dp, io, 0);
	return 0;
}
|
||||
|
||||
int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
|
||||
struct page_list *pl, unsigned int offset,
|
||||
unsigned long *error_bits)
|
||||
{
|
||||
struct dpages dp;
|
||||
list_dp_init(&dp, pl, offset);
|
||||
return sync_io(num_regions, where, rw, &dp, error_bits);
|
||||
}
|
||||
|
||||
int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
|
||||
struct bio_vec *bvec, unsigned long *error_bits)
|
||||
{
|
||||
struct dpages dp;
|
||||
bvec_dp_init(&dp, bvec);
|
||||
return sync_io(num_regions, where, rw, &dp, error_bits);
|
||||
}
|
||||
|
||||
int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
|
||||
void *data, unsigned long *error_bits)
|
||||
{
|
||||
struct dpages dp;
|
||||
vm_dp_init(&dp, data);
|
||||
return sync_io(num_regions, where, rw, &dp, error_bits);
|
||||
}
|
||||
|
||||
int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
|
||||
struct page_list *pl, unsigned int offset,
|
||||
io_notify_fn fn, void *context)
|
||||
{
|
||||
struct dpages dp;
|
||||
list_dp_init(&dp, pl, offset);
|
||||
return async_io(num_regions, where, rw, &dp, fn, context);
|
||||
}
|
||||
|
||||
int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
|
||||
struct bio_vec *bvec, io_notify_fn fn, void *context)
|
||||
{
|
||||
struct dpages dp;
|
||||
bvec_dp_init(&dp, bvec);
|
||||
return async_io(num_regions, where, rw, &dp, fn, context);
|
||||
}
|
||||
|
||||
int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
|
||||
void *data, io_notify_fn fn, void *context)
|
||||
{
|
||||
struct dpages dp;
|
||||
vm_dp_init(&dp, data);
|
||||
return async_io(num_regions, where, rw, &dp, fn, context);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(dm_io_get);
|
||||
EXPORT_SYMBOL(dm_io_put);
|
||||
EXPORT_SYMBOL(dm_io_sync);
|
||||
EXPORT_SYMBOL(dm_io_async);
|
||||
EXPORT_SYMBOL(dm_io_sync_bvec);
|
||||
EXPORT_SYMBOL(dm_io_async_bvec);
|
||||
EXPORT_SYMBOL(dm_io_sync_vm);
|
||||
EXPORT_SYMBOL(dm_io_async_vm);
|
||||
74
drivers/md/dm-io.h
Normal file
74
drivers/md/dm-io.h
Normal file
@@ -0,0 +1,74 @@
|
||||
/*
 * Copyright (C) 2003 Sistina Software
 *
 * This file is released under the GPL.
 */

#ifndef _DM_IO_H
#define _DM_IO_H

#include "dm.h"

/* A contiguous range of sectors on one block device. */
struct io_region {
	struct block_device *bdev;
	sector_t sector;
	sector_t count;
};

/* Singly linked list of pages to do io on. */
struct page_list {
	struct page_list *next;
	struct page *page;
};


/*
 * 'error' is a bitset, with each bit indicating whether an error
 * occurred doing io to the corresponding region.
 */
typedef void (*io_notify_fn)(unsigned long error, void *context);


/*
 * Before anyone uses the IO interface they should call
 * dm_io_get(), specifying roughly how many pages they are
 * expecting to perform io on concurrently.
 *
 * This function may block.
 */
int dm_io_get(unsigned int num_pages);
void dm_io_put(unsigned int num_pages);

/*
 * Synchronous IO.
 *
 * Please ensure that the rw flag in the next two functions is
 * either READ or WRITE, ie. we don't take READA. Any
 * regions with a zero count field will be ignored.
 */
int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
	       struct page_list *pl, unsigned int offset,
	       unsigned long *error_bits);

int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
		    struct bio_vec *bvec, unsigned long *error_bits);

int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
		  void *data, unsigned long *error_bits);

/*
 * Asynchronous IO.
 *
 * The 'where' array may be safely allocated on the stack since
 * the function takes a copy.
 */
int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
		struct page_list *pl, unsigned int offset,
		io_notify_fn fn, void *context);

int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
		     struct bio_vec *bvec, io_notify_fn fn, void *context);

int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
		   void *data, io_notify_fn fn, void *context);

#endif
|
||||
1517
drivers/md/dm-ioctl.c
Normal file
1517
drivers/md/dm-ioctl.c
Normal file
File diff suppressed because it is too large
Load Diff
144
drivers/md/dm-linear.c
Normal file
144
drivers/md/dm-linear.c
Normal file
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
* Copyright (C) 2001-2003 Sistina Software (UK) Limited.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#include "dm.h"
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#define DM_MSG_PREFIX "linear"
|
||||
|
||||
/*
|
||||
* Linear: maps a linear range of a device.
|
||||
*/
|
||||
struct linear_c {
|
||||
struct dm_dev *dev;
|
||||
sector_t start;
|
||||
};
|
||||
|
||||
/*
|
||||
* Construct a linear mapping: <dev_path> <offset>
|
||||
*/
|
||||
static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
||||
{
|
||||
struct linear_c *lc;
|
||||
unsigned long long tmp;
|
||||
|
||||
if (argc != 2) {
|
||||
ti->error = "Invalid argument count";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
lc = kmalloc(sizeof(*lc), GFP_KERNEL);
|
||||
if (lc == NULL) {
|
||||
ti->error = "dm-linear: Cannot allocate linear context";
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
if (sscanf(argv[1], "%llu", &tmp) != 1) {
|
||||
ti->error = "dm-linear: Invalid device sector";
|
||||
goto bad;
|
||||
}
|
||||
lc->start = tmp;
|
||||
|
||||
if (dm_get_device(ti, argv[0], lc->start, ti->len,
|
||||
dm_table_get_mode(ti->table), &lc->dev)) {
|
||||
ti->error = "dm-linear: Device lookup failed";
|
||||
goto bad;
|
||||
}
|
||||
|
||||
ti->private = lc;
|
||||
return 0;
|
||||
|
||||
bad:
|
||||
kfree(lc);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static void linear_dtr(struct dm_target *ti)
|
||||
{
|
||||
struct linear_c *lc = (struct linear_c *) ti->private;
|
||||
|
||||
dm_put_device(ti, lc->dev);
|
||||
kfree(lc);
|
||||
}
|
||||
|
||||
static int linear_map(struct dm_target *ti, struct bio *bio,
|
||||
union map_info *map_context)
|
||||
{
|
||||
struct linear_c *lc = (struct linear_c *) ti->private;
|
||||
|
||||
bio->bi_bdev = lc->dev->bdev;
|
||||
bio->bi_sector = lc->start + (bio->bi_sector - ti->begin);
|
||||
|
||||
return DM_MAPIO_REMAPPED;
|
||||
}
|
||||
|
||||
/*
 * Report target status: nothing for INFO, "<dev> <start>" for TABLE.
 */
static int linear_status(struct dm_target *ti, status_type_t type,
			 char *result, unsigned int maxlen)
{
	struct linear_c *lc = (struct linear_c *) ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, "%s %llu", lc->dev->name,
			 (unsigned long long)lc->start);
		break;
	}
	return 0;
}
|
||||
|
||||
static int linear_ioctl(struct dm_target *ti, struct inode *inode,
|
||||
struct file *filp, unsigned int cmd,
|
||||
unsigned long arg)
|
||||
{
|
||||
struct linear_c *lc = (struct linear_c *) ti->private;
|
||||
struct block_device *bdev = lc->dev->bdev;
|
||||
struct file fake_file = {};
|
||||
struct dentry fake_dentry = {};
|
||||
|
||||
fake_file.f_mode = lc->dev->mode;
|
||||
fake_file.f_path.dentry = &fake_dentry;
|
||||
fake_dentry.d_inode = bdev->bd_inode;
|
||||
|
||||
return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg);
|
||||
}
|
||||
|
||||
static struct target_type linear_target = {
|
||||
.name = "linear",
|
||||
.version= {1, 0, 2},
|
||||
.module = THIS_MODULE,
|
||||
.ctr = linear_ctr,
|
||||
.dtr = linear_dtr,
|
||||
.map = linear_map,
|
||||
.status = linear_status,
|
||||
.ioctl = linear_ioctl,
|
||||
};
|
||||
|
||||
int __init dm_linear_init(void)
|
||||
{
|
||||
int r = dm_register_target(&linear_target);
|
||||
|
||||
if (r < 0)
|
||||
DMERR("register failed %d", r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void dm_linear_exit(void)
|
||||
{
|
||||
int r = dm_unregister_target(&linear_target);
|
||||
|
||||
if (r < 0)
|
||||
DMERR("unregister failed %d", r);
|
||||
}
|
||||
690
drivers/md/dm-log.c
Normal file
690
drivers/md/dm-log.c
Normal file
@@ -0,0 +1,690 @@
|
||||
/*
|
||||
* Copyright (C) 2003 Sistina Software
|
||||
*
|
||||
* This file is released under the LGPL.
|
||||
*/
|
||||
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/vmalloc.h>
|
||||
|
||||
#include "dm-log.h"
|
||||
#include "dm-io.h"
|
||||
|
||||
#define DM_MSG_PREFIX "mirror log"
|
||||
|
||||
static LIST_HEAD(_log_types);
|
||||
static DEFINE_SPINLOCK(_lock);
|
||||
|
||||
int dm_register_dirty_log_type(struct dirty_log_type *type)
|
||||
{
|
||||
spin_lock(&_lock);
|
||||
type->use_count = 0;
|
||||
list_add(&type->list, &_log_types);
|
||||
spin_unlock(&_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int dm_unregister_dirty_log_type(struct dirty_log_type *type)
|
||||
{
|
||||
spin_lock(&_lock);
|
||||
|
||||
if (type->use_count)
|
||||
DMWARN("Attempt to unregister a log type that is still in use");
|
||||
else
|
||||
list_del(&type->list);
|
||||
|
||||
spin_unlock(&_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct dirty_log_type *get_type(const char *type_name)
|
||||
{
|
||||
struct dirty_log_type *type;
|
||||
|
||||
spin_lock(&_lock);
|
||||
list_for_each_entry (type, &_log_types, list)
|
||||
if (!strcmp(type_name, type->name)) {
|
||||
if (!type->use_count && !try_module_get(type->module)){
|
||||
spin_unlock(&_lock);
|
||||
return NULL;
|
||||
}
|
||||
type->use_count++;
|
||||
spin_unlock(&_lock);
|
||||
return type;
|
||||
}
|
||||
|
||||
spin_unlock(&_lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void put_type(struct dirty_log_type *type)
|
||||
{
|
||||
spin_lock(&_lock);
|
||||
if (!--type->use_count)
|
||||
module_put(type->module);
|
||||
spin_unlock(&_lock);
|
||||
}
|
||||
|
||||
struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti,
|
||||
unsigned int argc, char **argv)
|
||||
{
|
||||
struct dirty_log_type *type;
|
||||
struct dirty_log *log;
|
||||
|
||||
log = kmalloc(sizeof(*log), GFP_KERNEL);
|
||||
if (!log)
|
||||
return NULL;
|
||||
|
||||
type = get_type(type_name);
|
||||
if (!type) {
|
||||
kfree(log);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
log->type = type;
|
||||
if (type->ctr(log, ti, argc, argv)) {
|
||||
kfree(log);
|
||||
put_type(type);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return log;
|
||||
}
|
||||
|
||||
void dm_destroy_dirty_log(struct dirty_log *log)
|
||||
{
|
||||
log->type->dtr(log);
|
||||
put_type(log->type);
|
||||
kfree(log);
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Persistent and core logs share a lot of their implementation.
|
||||
* FIXME: need a reload method to be called from a resume
|
||||
*---------------------------------------------------------------*/
|
||||
/*
|
||||
* Magic for persistent mirrors: "MiRr"
|
||||
*/
|
||||
#define MIRROR_MAGIC 0x4D695272
|
||||
|
||||
/*
|
||||
* The on-disk version of the metadata.
|
||||
*/
|
||||
#define MIRROR_DISK_VERSION 2
|
||||
#define LOG_OFFSET 2
|
||||
|
||||
struct log_header {
|
||||
uint32_t magic;
|
||||
|
||||
/*
|
||||
* Simple, incrementing version. no backward
|
||||
* compatibility.
|
||||
*/
|
||||
uint32_t version;
|
||||
sector_t nr_regions;
|
||||
};
|
||||
|
||||
struct log_c {
|
||||
struct dm_target *ti;
|
||||
int touched;
|
||||
uint32_t region_size;
|
||||
unsigned int region_count;
|
||||
region_t sync_count;
|
||||
|
||||
unsigned bitset_uint32_count;
|
||||
uint32_t *clean_bits;
|
||||
uint32_t *sync_bits;
|
||||
uint32_t *recovering_bits; /* FIXME: this seems excessive */
|
||||
|
||||
int sync_search;
|
||||
|
||||
/* Resync flag */
|
||||
enum sync {
|
||||
DEFAULTSYNC, /* Synchronize if necessary */
|
||||
NOSYNC, /* Devices known to be already in sync */
|
||||
FORCESYNC, /* Force a sync to happen */
|
||||
} sync;
|
||||
|
||||
/*
|
||||
* Disk log fields
|
||||
*/
|
||||
struct dm_dev *log_dev;
|
||||
struct log_header header;
|
||||
|
||||
struct io_region header_location;
|
||||
struct log_header *disk_header;
|
||||
};
|
||||
|
||||
/*
|
||||
* The touched member needs to be updated every time we access
|
||||
* one of the bitsets.
|
||||
*/
|
||||
static inline int log_test_bit(uint32_t *bs, unsigned bit)
|
||||
{
|
||||
return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0;
|
||||
}
|
||||
|
||||
static inline void log_set_bit(struct log_c *l,
|
||||
uint32_t *bs, unsigned bit)
|
||||
{
|
||||
ext2_set_bit(bit, (unsigned long *) bs);
|
||||
l->touched = 1;
|
||||
}
|
||||
|
||||
static inline void log_clear_bit(struct log_c *l,
|
||||
uint32_t *bs, unsigned bit)
|
||||
{
|
||||
ext2_clear_bit(bit, (unsigned long *) bs);
|
||||
l->touched = 1;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------
|
||||
* Header IO
|
||||
*--------------------------------------------------------------*/
|
||||
static void header_to_disk(struct log_header *core, struct log_header *disk)
|
||||
{
|
||||
disk->magic = cpu_to_le32(core->magic);
|
||||
disk->version = cpu_to_le32(core->version);
|
||||
disk->nr_regions = cpu_to_le64(core->nr_regions);
|
||||
}
|
||||
|
||||
static void header_from_disk(struct log_header *core, struct log_header *disk)
|
||||
{
|
||||
core->magic = le32_to_cpu(disk->magic);
|
||||
core->version = le32_to_cpu(disk->version);
|
||||
core->nr_regions = le64_to_cpu(disk->nr_regions);
|
||||
}
|
||||
|
||||
static int read_header(struct log_c *log)
|
||||
{
|
||||
int r;
|
||||
unsigned long ebits;
|
||||
|
||||
r = dm_io_sync_vm(1, &log->header_location, READ,
|
||||
log->disk_header, &ebits);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
header_from_disk(&log->header, log->disk_header);
|
||||
|
||||
/* New log required? */
|
||||
if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) {
|
||||
log->header.magic = MIRROR_MAGIC;
|
||||
log->header.version = MIRROR_DISK_VERSION;
|
||||
log->header.nr_regions = 0;
|
||||
}
|
||||
|
||||
#ifdef __LITTLE_ENDIAN
|
||||
if (log->header.version == 1)
|
||||
log->header.version = 2;
|
||||
#endif
|
||||
|
||||
if (log->header.version != MIRROR_DISK_VERSION) {
|
||||
DMWARN("incompatible disk log version");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int write_header(struct log_c *log)
|
||||
{
|
||||
unsigned long ebits;
|
||||
|
||||
header_to_disk(&log->header, log->disk_header);
|
||||
return dm_io_sync_vm(1, &log->header_location, WRITE,
|
||||
log->disk_header, &ebits);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------
|
||||
* core log constructor/destructor
|
||||
*
|
||||
* argv contains region_size followed optionally by [no]sync
|
||||
*--------------------------------------------------------------*/
|
||||
#define BYTE_SHIFT 3
|
||||
static int create_log_context(struct dirty_log *log, struct dm_target *ti,
|
||||
unsigned int argc, char **argv,
|
||||
struct dm_dev *dev)
|
||||
{
|
||||
enum sync sync = DEFAULTSYNC;
|
||||
|
||||
struct log_c *lc;
|
||||
uint32_t region_size;
|
||||
unsigned int region_count;
|
||||
size_t bitset_size, buf_size;
|
||||
|
||||
if (argc < 1 || argc > 2) {
|
||||
DMWARN("wrong number of arguments to mirror log");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (argc > 1) {
|
||||
if (!strcmp(argv[1], "sync"))
|
||||
sync = FORCESYNC;
|
||||
else if (!strcmp(argv[1], "nosync"))
|
||||
sync = NOSYNC;
|
||||
else {
|
||||
DMWARN("unrecognised sync argument to mirror log: %s",
|
||||
argv[1]);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
if (sscanf(argv[0], "%u", ®ion_size) != 1) {
|
||||
DMWARN("invalid region size string");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
region_count = dm_sector_div_up(ti->len, region_size);
|
||||
|
||||
lc = kmalloc(sizeof(*lc), GFP_KERNEL);
|
||||
if (!lc) {
|
||||
DMWARN("couldn't allocate core log");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
lc->ti = ti;
|
||||
lc->touched = 0;
|
||||
lc->region_size = region_size;
|
||||
lc->region_count = region_count;
|
||||
lc->sync = sync;
|
||||
|
||||
/*
|
||||
* Work out how many "unsigned long"s we need to hold the bitset.
|
||||
*/
|
||||
bitset_size = dm_round_up(region_count,
|
||||
sizeof(*lc->clean_bits) << BYTE_SHIFT);
|
||||
bitset_size >>= BYTE_SHIFT;
|
||||
|
||||
lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);
|
||||
|
||||
/*
|
||||
* Disk log?
|
||||
*/
|
||||
if (!dev) {
|
||||
lc->clean_bits = vmalloc(bitset_size);
|
||||
if (!lc->clean_bits) {
|
||||
DMWARN("couldn't allocate clean bitset");
|
||||
kfree(lc);
|
||||
return -ENOMEM;
|
||||
}
|
||||
lc->disk_header = NULL;
|
||||
} else {
|
||||
lc->log_dev = dev;
|
||||
lc->header_location.bdev = lc->log_dev->bdev;
|
||||
lc->header_location.sector = 0;
|
||||
|
||||
/*
|
||||
* Buffer holds both header and bitset.
|
||||
*/
|
||||
buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
|
||||
bitset_size, ti->limits.hardsect_size);
|
||||
lc->header_location.count = buf_size >> SECTOR_SHIFT;
|
||||
|
||||
lc->disk_header = vmalloc(buf_size);
|
||||
if (!lc->disk_header) {
|
||||
DMWARN("couldn't allocate disk log buffer");
|
||||
kfree(lc);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
lc->clean_bits = (void *)lc->disk_header +
|
||||
(LOG_OFFSET << SECTOR_SHIFT);
|
||||
}
|
||||
|
||||
memset(lc->clean_bits, -1, bitset_size);
|
||||
|
||||
lc->sync_bits = vmalloc(bitset_size);
|
||||
if (!lc->sync_bits) {
|
||||
DMWARN("couldn't allocate sync bitset");
|
||||
if (!dev)
|
||||
vfree(lc->clean_bits);
|
||||
vfree(lc->disk_header);
|
||||
kfree(lc);
|
||||
return -ENOMEM;
|
||||
}
|
||||
memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
|
||||
lc->sync_count = (sync == NOSYNC) ? region_count : 0;
|
||||
|
||||
lc->recovering_bits = vmalloc(bitset_size);
|
||||
if (!lc->recovering_bits) {
|
||||
DMWARN("couldn't allocate sync bitset");
|
||||
vfree(lc->sync_bits);
|
||||
if (!dev)
|
||||
vfree(lc->clean_bits);
|
||||
vfree(lc->disk_header);
|
||||
kfree(lc);
|
||||
return -ENOMEM;
|
||||
}
|
||||
memset(lc->recovering_bits, 0, bitset_size);
|
||||
lc->sync_search = 0;
|
||||
log->context = lc;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int core_ctr(struct dirty_log *log, struct dm_target *ti,
|
||||
unsigned int argc, char **argv)
|
||||
{
|
||||
return create_log_context(log, ti, argc, argv, NULL);
|
||||
}
|
||||
|
||||
static void destroy_log_context(struct log_c *lc)
|
||||
{
|
||||
vfree(lc->sync_bits);
|
||||
vfree(lc->recovering_bits);
|
||||
kfree(lc);
|
||||
}
|
||||
|
||||
static void core_dtr(struct dirty_log *log)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
|
||||
vfree(lc->clean_bits);
|
||||
destroy_log_context(lc);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------
|
||||
* disk log constructor/destructor
|
||||
*
|
||||
* argv contains log_device region_size followed optionally by [no]sync
|
||||
*--------------------------------------------------------------*/
|
||||
static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
|
||||
unsigned int argc, char **argv)
|
||||
{
|
||||
int r;
|
||||
struct dm_dev *dev;
|
||||
|
||||
if (argc < 2 || argc > 3) {
|
||||
DMWARN("wrong number of arguments to disk mirror log");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */,
|
||||
FMODE_READ | FMODE_WRITE, &dev);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
r = create_log_context(log, ti, argc - 1, argv + 1, dev);
|
||||
if (r) {
|
||||
dm_put_device(ti, dev);
|
||||
return r;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void disk_dtr(struct dirty_log *log)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
|
||||
dm_put_device(lc->ti, lc->log_dev);
|
||||
vfree(lc->disk_header);
|
||||
destroy_log_context(lc);
|
||||
}
|
||||
|
||||
/*
 * Count the set bits in an array of 'size' uint32_t words.
 *
 * Fix: the loop index was a signed int compared against the unsigned
 * 'size' parameter; use an unsigned index to avoid the signed/unsigned
 * comparison (and any theoretical overflow for huge bitsets).
 */
static int count_bits32(uint32_t *addr, unsigned size)
{
	unsigned i;
	int count = 0;

	for (i = 0; i < size; i++)
		count += hweight32(addr[i]);

	return count;
}
|
||||
|
||||
static int disk_resume(struct dirty_log *log)
|
||||
{
|
||||
int r;
|
||||
unsigned i;
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
size_t size = lc->bitset_uint32_count * sizeof(uint32_t);
|
||||
|
||||
/* read the disk header */
|
||||
r = read_header(lc);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
/* set or clear any new bits -- device has grown */
|
||||
if (lc->sync == NOSYNC)
|
||||
for (i = lc->header.nr_regions; i < lc->region_count; i++)
|
||||
/* FIXME: amazingly inefficient */
|
||||
log_set_bit(lc, lc->clean_bits, i);
|
||||
else
|
||||
for (i = lc->header.nr_regions; i < lc->region_count; i++)
|
||||
/* FIXME: amazingly inefficient */
|
||||
log_clear_bit(lc, lc->clean_bits, i);
|
||||
|
||||
/* clear any old bits -- device has shrunk */
|
||||
for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++)
|
||||
log_clear_bit(lc, lc->clean_bits, i);
|
||||
|
||||
/* copy clean across to sync */
|
||||
memcpy(lc->sync_bits, lc->clean_bits, size);
|
||||
lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
|
||||
lc->sync_search = 0;
|
||||
|
||||
/* set the correct number of regions in the header */
|
||||
lc->header.nr_regions = lc->region_count;
|
||||
|
||||
/* write the new header */
|
||||
return write_header(lc);
|
||||
}
|
||||
|
||||
static uint32_t core_get_region_size(struct dirty_log *log)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
return lc->region_size;
|
||||
}
|
||||
|
||||
static int core_resume(struct dirty_log *log)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
lc->sync_search = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int core_is_clean(struct dirty_log *log, region_t region)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
return log_test_bit(lc->clean_bits, region);
|
||||
}
|
||||
|
||||
static int core_in_sync(struct dirty_log *log, region_t region, int block)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
return log_test_bit(lc->sync_bits, region);
|
||||
}
|
||||
|
||||
static int core_flush(struct dirty_log *log)
|
||||
{
|
||||
/* no op */
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int disk_flush(struct dirty_log *log)
|
||||
{
|
||||
int r;
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
|
||||
/* only write if the log has changed */
|
||||
if (!lc->touched)
|
||||
return 0;
|
||||
|
||||
r = write_header(lc);
|
||||
if (!r)
|
||||
lc->touched = 0;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static void core_mark_region(struct dirty_log *log, region_t region)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
log_clear_bit(lc, lc->clean_bits, region);
|
||||
}
|
||||
|
||||
static void core_clear_region(struct dirty_log *log, region_t region)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
log_set_bit(lc, lc->clean_bits, region);
|
||||
}
|
||||
|
||||
/*
 * Find the next out-of-sync region that is not already being
 * recovered, mark it as recovering and return it in *region.
 * Returns 1 if work was found, 0 if the scan is complete.
 */
static int core_get_resync_work(struct dirty_log *log, region_t *region)
{
	struct log_c *lc = (struct log_c *) log->context;

	if (lc->sync_search >= lc->region_count)
		return 0;

	do {
		*region = ext2_find_next_zero_bit(
			(unsigned long *) lc->sync_bits,
			lc->region_count,
			lc->sync_search);
		lc->sync_search = *region + 1;

		if (*region >= lc->region_count)
			return 0;

	} while (log_test_bit(lc->recovering_bits, *region));

	log_set_bit(lc, lc->recovering_bits, *region);
	return 1;
}
|
||||
|
||||
static void core_set_region_sync(struct dirty_log *log, region_t region,
|
||||
int in_sync)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
|
||||
log_clear_bit(lc, lc->recovering_bits, region);
|
||||
if (in_sync) {
|
||||
log_set_bit(lc, lc->sync_bits, region);
|
||||
lc->sync_count++;
|
||||
} else if (log_test_bit(lc->sync_bits, region)) {
|
||||
lc->sync_count--;
|
||||
log_clear_bit(lc, lc->sync_bits, region);
|
||||
}
|
||||
}
|
||||
|
||||
static region_t core_get_sync_count(struct dirty_log *log)
|
||||
{
|
||||
struct log_c *lc = (struct log_c *) log->context;
|
||||
|
||||
return lc->sync_count;
|
||||
}
|
||||
|
||||
#define DMEMIT_SYNC \
|
||||
if (lc->sync != DEFAULTSYNC) \
|
||||
DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "")
|
||||
|
||||
static int core_status(struct dirty_log *log, status_type_t status,
|
||||
char *result, unsigned int maxlen)
|
||||
{
|
||||
int sz = 0;
|
||||
struct log_c *lc = log->context;
|
||||
|
||||
switch(status) {
|
||||
case STATUSTYPE_INFO:
|
||||
break;
|
||||
|
||||
case STATUSTYPE_TABLE:
|
||||
DMEMIT("%s %u %u ", log->type->name,
|
||||
lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size);
|
||||
DMEMIT_SYNC;
|
||||
}
|
||||
|
||||
return sz;
|
||||
}
|
||||
|
||||
static int disk_status(struct dirty_log *log, status_type_t status,
|
||||
char *result, unsigned int maxlen)
|
||||
{
|
||||
int sz = 0;
|
||||
char buffer[16];
|
||||
struct log_c *lc = log->context;
|
||||
|
||||
switch(status) {
|
||||
case STATUSTYPE_INFO:
|
||||
break;
|
||||
|
||||
case STATUSTYPE_TABLE:
|
||||
format_dev_t(buffer, lc->log_dev->bdev->bd_dev);
|
||||
DMEMIT("%s %u %s %u ", log->type->name,
|
||||
lc->sync == DEFAULTSYNC ? 2 : 3, buffer,
|
||||
lc->region_size);
|
||||
DMEMIT_SYNC;
|
||||
}
|
||||
|
||||
return sz;
|
||||
}
|
||||
|
||||
static struct dirty_log_type _core_type = {
|
||||
.name = "core",
|
||||
.module = THIS_MODULE,
|
||||
.ctr = core_ctr,
|
||||
.dtr = core_dtr,
|
||||
.resume = core_resume,
|
||||
.get_region_size = core_get_region_size,
|
||||
.is_clean = core_is_clean,
|
||||
.in_sync = core_in_sync,
|
||||
.flush = core_flush,
|
||||
.mark_region = core_mark_region,
|
||||
.clear_region = core_clear_region,
|
||||
.get_resync_work = core_get_resync_work,
|
||||
.set_region_sync = core_set_region_sync,
|
||||
.get_sync_count = core_get_sync_count,
|
||||
.status = core_status,
|
||||
};
|
||||
|
||||
static struct dirty_log_type _disk_type = {
|
||||
.name = "disk",
|
||||
.module = THIS_MODULE,
|
||||
.ctr = disk_ctr,
|
||||
.dtr = disk_dtr,
|
||||
.suspend = disk_flush,
|
||||
.resume = disk_resume,
|
||||
.get_region_size = core_get_region_size,
|
||||
.is_clean = core_is_clean,
|
||||
.in_sync = core_in_sync,
|
||||
.flush = disk_flush,
|
||||
.mark_region = core_mark_region,
|
||||
.clear_region = core_clear_region,
|
||||
.get_resync_work = core_get_resync_work,
|
||||
.set_region_sync = core_set_region_sync,
|
||||
.get_sync_count = core_get_sync_count,
|
||||
.status = disk_status,
|
||||
};
|
||||
|
||||
/*
 * Register the built-in "core" and "disk" dirty log types.
 *
 * Fix: previously a failure registering the core type was only
 * warned about and then overwritten by the disk registration's
 * result, so a broken core log could be silently masked.  Bail out
 * immediately instead.
 */
int __init dm_dirty_log_init(void)
{
	int r;

	r = dm_register_dirty_log_type(&_core_type);
	if (r) {
		DMWARN("couldn't register core log");
		return r;
	}

	r = dm_register_dirty_log_type(&_disk_type);
	if (r) {
		DMWARN("couldn't register disk type");
		dm_unregister_dirty_log_type(&_core_type);
	}

	return r;
}
|
||||
|
||||
void dm_dirty_log_exit(void)
|
||||
{
|
||||
dm_unregister_dirty_log_type(&_disk_type);
|
||||
dm_unregister_dirty_log_type(&_core_type);
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(dm_register_dirty_log_type);
|
||||
EXPORT_SYMBOL(dm_unregister_dirty_log_type);
|
||||
EXPORT_SYMBOL(dm_create_dirty_log);
|
||||
EXPORT_SYMBOL(dm_destroy_dirty_log);
|
||||
130
drivers/md/dm-log.h
Normal file
130
drivers/md/dm-log.h
Normal file
@@ -0,0 +1,130 @@
|
||||
/*
|
||||
* Copyright (C) 2003 Sistina Software
|
||||
*
|
||||
* This file is released under the LGPL.
|
||||
*/
|
||||
|
||||
#ifndef DM_DIRTY_LOG
|
||||
#define DM_DIRTY_LOG
|
||||
|
||||
#include "dm.h"
|
||||
|
||||
/* A region index within the mirrored device. */
typedef sector_t region_t;

struct dirty_log_type;

/* A dirty log instance: its type vtable plus per-instance context. */
struct dirty_log {
	struct dirty_log_type *type;
	void *context;
};
|
||||
|
||||
struct dirty_log_type {
|
||||
struct list_head list;
|
||||
const char *name;
|
||||
struct module *module;
|
||||
unsigned int use_count;
|
||||
|
||||
int (*ctr)(struct dirty_log *log, struct dm_target *ti,
|
||||
unsigned int argc, char **argv);
|
||||
void (*dtr)(struct dirty_log *log);
|
||||
|
||||
/*
|
||||
* There are times when we don't want the log to touch
|
||||
* the disk.
|
||||
*/
|
||||
int (*suspend)(struct dirty_log *log);
|
||||
int (*resume)(struct dirty_log *log);
|
||||
|
||||
/*
|
||||
* Retrieves the smallest size of region that the log can
|
||||
* deal with.
|
||||
*/
|
||||
uint32_t (*get_region_size)(struct dirty_log *log);
|
||||
|
||||
/*
|
||||
* A predicate to say whether a region is clean or not.
|
||||
* May block.
|
||||
*/
|
||||
int (*is_clean)(struct dirty_log *log, region_t region);
|
||||
|
||||
/*
|
||||
* Returns: 0, 1, -EWOULDBLOCK, < 0
|
||||
*
|
||||
* A predicate function to check the area given by
|
||||
* [sector, sector + len) is in sync.
|
||||
*
|
||||
* If -EWOULDBLOCK is returned the state of the region is
|
||||
* unknown, typically this will result in a read being
|
||||
* passed to a daemon to deal with, since a daemon is
|
||||
* allowed to block.
|
||||
*/
|
||||
int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
|
||||
|
||||
/*
|
||||
* Flush the current log state (eg, to disk). This
|
||||
* function may block.
|
||||
*/
|
||||
int (*flush)(struct dirty_log *log);
|
||||
|
||||
/*
|
||||
* Mark an area as clean or dirty. These functions may
|
||||
* block, though for performance reasons blocking should
|
||||
* be extremely rare (eg, allocating another chunk of
|
||||
* memory for some reason).
|
||||
*/
|
||||
void (*mark_region)(struct dirty_log *log, region_t region);
|
||||
void (*clear_region)(struct dirty_log *log, region_t region);
|
||||
|
||||
/*
|
||||
* Returns: <0 (error), 0 (no region), 1 (region)
|
||||
*
|
||||
* The mirrord will need perform recovery on regions of
|
||||
* the mirror that are in the NOSYNC state. This
|
||||
* function asks the log to tell the caller about the
|
||||
* next region that this machine should recover.
|
||||
*
|
||||
* Do not confuse this function with 'in_sync()', one
|
||||
* tells you if an area is synchronised, the other
|
||||
* assigns recovery work.
|
||||
*/
|
||||
int (*get_resync_work)(struct dirty_log *log, region_t *region);
|
||||
|
||||
/*
|
||||
* This notifies the log that the resync status of a region
|
||||
* has changed. It also clears the region from the recovering
|
||||
* list (if present).
|
||||
*/
|
||||
void (*set_region_sync)(struct dirty_log *log,
|
||||
region_t region, int in_sync);
|
||||
|
||||
/*
|
||||
* Returns the number of regions that are in sync.
|
||||
*/
|
||||
region_t (*get_sync_count)(struct dirty_log *log);
|
||||
|
||||
/*
|
||||
* Support function for mirror status requests.
|
||||
*/
|
||||
int (*status)(struct dirty_log *log, status_type_t status_type,
|
||||
char *result, unsigned int maxlen);
|
||||
};
|
||||
|
||||
int dm_register_dirty_log_type(struct dirty_log_type *type);
|
||||
int dm_unregister_dirty_log_type(struct dirty_log_type *type);
|
||||
|
||||
|
||||
/*
|
||||
* Make sure you use these two functions, rather than calling
|
||||
* type->constructor/destructor() directly.
|
||||
*/
|
||||
struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti,
|
||||
unsigned int argc, char **argv);
|
||||
void dm_destroy_dirty_log(struct dirty_log *log);
|
||||
|
||||
/*
|
||||
* init/exit functions.
|
||||
*/
|
||||
int dm_dirty_log_init(void);
|
||||
void dm_dirty_log_exit(void);
|
||||
|
||||
#endif
|
||||
1394
drivers/md/dm-mpath.c
Normal file
1394
drivers/md/dm-mpath.c
Normal file
File diff suppressed because it is too large
Load Diff
25
drivers/md/dm-mpath.h
Normal file
25
drivers/md/dm-mpath.h
Normal file
@@ -0,0 +1,25 @@
|
||||
/*
|
||||
* Copyright (C) 2004 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*
|
||||
* Multipath.
|
||||
*/
|
||||
|
||||
#ifndef DM_MPATH_H
|
||||
#define DM_MPATH_H
|
||||
|
||||
struct dm_dev;
|
||||
|
||||
struct dm_path {
|
||||
struct dm_dev *dev; /* Read-only */
|
||||
unsigned is_active; /* Read-only */
|
||||
|
||||
void *pscontext; /* For path-selector use */
|
||||
void *hwhcontext; /* For hw-handler use */
|
||||
};
|
||||
|
||||
/* Callback for hwh_pg_init_fn to use when complete */
|
||||
void dm_pg_init_complete(struct dm_path *path, unsigned err_flags);
|
||||
|
||||
#endif
|
||||
155
drivers/md/dm-path-selector.c
Normal file
155
drivers/md/dm-path-selector.c
Normal file
@@ -0,0 +1,155 @@
|
||||
/*
|
||||
* Copyright (C) 2003 Sistina Software.
|
||||
* Copyright (C) 2004 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* Module Author: Heinz Mauelshagen
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*
|
||||
* Path selector registration.
|
||||
*/
|
||||
|
||||
#include "dm.h"
|
||||
#include "dm-path-selector.h"
|
||||
|
||||
#include <linux/slab.h>
|
||||
|
||||
struct ps_internal {
|
||||
struct path_selector_type pst;
|
||||
|
||||
struct list_head list;
|
||||
long use;
|
||||
};
|
||||
|
||||
#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
|
||||
|
||||
static LIST_HEAD(_path_selectors);
|
||||
static DECLARE_RWSEM(_ps_lock);
|
||||
|
||||
static struct ps_internal *__find_path_selector_type(const char *name)
|
||||
{
|
||||
struct ps_internal *psi;
|
||||
|
||||
list_for_each_entry(psi, &_path_selectors, list) {
|
||||
if (!strcmp(name, psi->pst.name))
|
||||
return psi;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct ps_internal *get_path_selector(const char *name)
|
||||
{
|
||||
struct ps_internal *psi;
|
||||
|
||||
down_read(&_ps_lock);
|
||||
psi = __find_path_selector_type(name);
|
||||
if (psi) {
|
||||
if ((psi->use == 0) && !try_module_get(psi->pst.module))
|
||||
psi = NULL;
|
||||
else
|
||||
psi->use++;
|
||||
}
|
||||
up_read(&_ps_lock);
|
||||
|
||||
return psi;
|
||||
}
|
||||
|
||||
struct path_selector_type *dm_get_path_selector(const char *name)
|
||||
{
|
||||
struct ps_internal *psi;
|
||||
|
||||
if (!name)
|
||||
return NULL;
|
||||
|
||||
psi = get_path_selector(name);
|
||||
if (!psi) {
|
||||
request_module("dm-%s", name);
|
||||
psi = get_path_selector(name);
|
||||
}
|
||||
|
||||
return psi ? &psi->pst : NULL;
|
||||
}
|
||||
|
||||
void dm_put_path_selector(struct path_selector_type *pst)
|
||||
{
|
||||
struct ps_internal *psi;
|
||||
|
||||
if (!pst)
|
||||
return;
|
||||
|
||||
down_read(&_ps_lock);
|
||||
psi = __find_path_selector_type(pst->name);
|
||||
if (!psi)
|
||||
goto out;
|
||||
|
||||
if (--psi->use == 0)
|
||||
module_put(psi->pst.module);
|
||||
|
||||
BUG_ON(psi->use < 0);
|
||||
|
||||
out:
|
||||
up_read(&_ps_lock);
|
||||
}
|
||||
|
||||
static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst)
|
||||
{
|
||||
struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL);
|
||||
|
||||
if (psi) {
|
||||
memset(psi, 0, sizeof(*psi));
|
||||
psi->pst = *pst;
|
||||
}
|
||||
|
||||
return psi;
|
||||
}
|
||||
|
||||
int dm_register_path_selector(struct path_selector_type *pst)
|
||||
{
|
||||
int r = 0;
|
||||
struct ps_internal *psi = _alloc_path_selector(pst);
|
||||
|
||||
if (!psi)
|
||||
return -ENOMEM;
|
||||
|
||||
down_write(&_ps_lock);
|
||||
|
||||
if (__find_path_selector_type(pst->name)) {
|
||||
kfree(psi);
|
||||
r = -EEXIST;
|
||||
} else
|
||||
list_add(&psi->list, &_path_selectors);
|
||||
|
||||
up_write(&_ps_lock);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
int dm_unregister_path_selector(struct path_selector_type *pst)
|
||||
{
|
||||
struct ps_internal *psi;
|
||||
|
||||
down_write(&_ps_lock);
|
||||
|
||||
psi = __find_path_selector_type(pst->name);
|
||||
if (!psi) {
|
||||
up_write(&_ps_lock);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (psi->use) {
|
||||
up_write(&_ps_lock);
|
||||
return -ETXTBSY;
|
||||
}
|
||||
|
||||
list_del(&psi->list);
|
||||
|
||||
up_write(&_ps_lock);
|
||||
|
||||
kfree(psi);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(dm_register_path_selector);
|
||||
EXPORT_SYMBOL_GPL(dm_unregister_path_selector);
|
||||
93
drivers/md/dm-path-selector.h
Normal file
93
drivers/md/dm-path-selector.h
Normal file
@@ -0,0 +1,93 @@
|
||||
/*
|
||||
* Copyright (C) 2003 Sistina Software.
|
||||
* Copyright (C) 2004 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* Module Author: Heinz Mauelshagen
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*
|
||||
* Path-Selector registration.
|
||||
*/
|
||||
|
||||
#ifndef DM_PATH_SELECTOR_H
|
||||
#define DM_PATH_SELECTOR_H
|
||||
|
||||
#include <linux/device-mapper.h>
|
||||
|
||||
#include "dm-mpath.h"
|
||||
|
||||
/*
|
||||
* We provide an abstraction for the code that chooses which path
|
||||
* to send some io down.
|
||||
*/
|
||||
struct path_selector_type;
|
||||
struct path_selector {
|
||||
struct path_selector_type *type;
|
||||
void *context;
|
||||
};
|
||||
|
||||
/* Information about a path selector type */
|
||||
struct path_selector_type {
|
||||
char *name;
|
||||
struct module *module;
|
||||
|
||||
unsigned int table_args;
|
||||
unsigned int info_args;
|
||||
|
||||
/*
|
||||
* Constructs a path selector object, takes custom arguments
|
||||
*/
|
||||
int (*create) (struct path_selector *ps, unsigned argc, char **argv);
|
||||
void (*destroy) (struct path_selector *ps);
|
||||
|
||||
/*
|
||||
* Add an opaque path object, along with some selector specific
|
||||
* path args (eg, path priority).
|
||||
*/
|
||||
int (*add_path) (struct path_selector *ps, struct dm_path *path,
|
||||
int argc, char **argv, char **error);
|
||||
|
||||
/*
|
||||
* Chooses a path for this io, if no paths are available then
|
||||
* NULL will be returned.
|
||||
*
|
||||
* repeat_count is the number of times to use the path before
|
||||
* calling the function again. 0 means don't call it again unless
|
||||
* the path fails.
|
||||
*/
|
||||
struct dm_path *(*select_path) (struct path_selector *ps,
|
||||
unsigned *repeat_count);
|
||||
|
||||
/*
|
||||
* Notify the selector that a path has failed.
|
||||
*/
|
||||
void (*fail_path) (struct path_selector *ps, struct dm_path *p);
|
||||
|
||||
/*
|
||||
* Ask selector to reinstate a path.
|
||||
*/
|
||||
int (*reinstate_path) (struct path_selector *ps, struct dm_path *p);
|
||||
|
||||
/*
|
||||
* Table content based on parameters added in ps_add_path_fn
|
||||
* or path selector status
|
||||
*/
|
||||
int (*status) (struct path_selector *ps, struct dm_path *path,
|
||||
status_type_t type, char *result, unsigned int maxlen);
|
||||
|
||||
int (*end_io) (struct path_selector *ps, struct dm_path *path);
|
||||
};
|
||||
|
||||
/* Register a path selector */
|
||||
int dm_register_path_selector(struct path_selector_type *type);
|
||||
|
||||
/* Unregister a path selector */
|
||||
int dm_unregister_path_selector(struct path_selector_type *type);
|
||||
|
||||
/* Returns a registered path selector type */
|
||||
struct path_selector_type *dm_get_path_selector(const char *name);
|
||||
|
||||
/* Releases a path selector */
|
||||
void dm_put_path_selector(struct path_selector_type *pst);
|
||||
|
||||
#endif
|
||||
1310
drivers/md/dm-raid1.c
Normal file
1310
drivers/md/dm-raid1.c
Normal file
File diff suppressed because it is too large
Load Diff
216
drivers/md/dm-round-robin.c
Normal file
216
drivers/md/dm-round-robin.c
Normal file
@@ -0,0 +1,216 @@
|
||||
/*
|
||||
* Copyright (C) 2003 Sistina Software.
|
||||
* Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* Module Author: Heinz Mauelshagen
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*
|
||||
* Round-robin path selector.
|
||||
*/
|
||||
|
||||
#include "dm.h"
|
||||
#include "dm-path-selector.h"
|
||||
|
||||
#include <linux/slab.h>
|
||||
|
||||
#define DM_MSG_PREFIX "multipath round-robin"
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Path-handling code, paths are held in lists
|
||||
*---------------------------------------------------------------*/
|
||||
struct path_info {
|
||||
struct list_head list;
|
||||
struct dm_path *path;
|
||||
unsigned repeat_count;
|
||||
};
|
||||
|
||||
static void free_paths(struct list_head *paths)
|
||||
{
|
||||
struct path_info *pi, *next;
|
||||
|
||||
list_for_each_entry_safe(pi, next, paths, list) {
|
||||
list_del(&pi->list);
|
||||
kfree(pi);
|
||||
}
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Round-robin selector
|
||||
*---------------------------------------------------------------*/
|
||||
|
||||
#define RR_MIN_IO 1000
|
||||
|
||||
struct selector {
|
||||
struct list_head valid_paths;
|
||||
struct list_head invalid_paths;
|
||||
};
|
||||
|
||||
static struct selector *alloc_selector(void)
|
||||
{
|
||||
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
|
||||
|
||||
if (s) {
|
||||
INIT_LIST_HEAD(&s->valid_paths);
|
||||
INIT_LIST_HEAD(&s->invalid_paths);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
|
||||
{
|
||||
struct selector *s;
|
||||
|
||||
s = alloc_selector();
|
||||
if (!s)
|
||||
return -ENOMEM;
|
||||
|
||||
ps->context = s;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void rr_destroy(struct path_selector *ps)
|
||||
{
|
||||
struct selector *s = (struct selector *) ps->context;
|
||||
|
||||
free_paths(&s->valid_paths);
|
||||
free_paths(&s->invalid_paths);
|
||||
kfree(s);
|
||||
ps->context = NULL;
|
||||
}
|
||||
|
||||
static int rr_status(struct path_selector *ps, struct dm_path *path,
|
||||
status_type_t type, char *result, unsigned int maxlen)
|
||||
{
|
||||
struct path_info *pi;
|
||||
int sz = 0;
|
||||
|
||||
if (!path)
|
||||
DMEMIT("0 ");
|
||||
else {
|
||||
switch(type) {
|
||||
case STATUSTYPE_INFO:
|
||||
break;
|
||||
case STATUSTYPE_TABLE:
|
||||
pi = path->pscontext;
|
||||
DMEMIT("%u ", pi->repeat_count);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return sz;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called during initialisation to register each path with an
|
||||
* optional repeat_count.
|
||||
*/
|
||||
static int rr_add_path(struct path_selector *ps, struct dm_path *path,
|
||||
int argc, char **argv, char **error)
|
||||
{
|
||||
struct selector *s = (struct selector *) ps->context;
|
||||
struct path_info *pi;
|
||||
unsigned repeat_count = RR_MIN_IO;
|
||||
|
||||
if (argc > 1) {
|
||||
*error = "round-robin ps: incorrect number of arguments";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* First path argument is number of I/Os before switching path */
|
||||
if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
|
||||
*error = "round-robin ps: invalid repeat count";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* allocate the path */
|
||||
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
|
||||
if (!pi) {
|
||||
*error = "round-robin ps: Error allocating path context";
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
pi->path = path;
|
||||
pi->repeat_count = repeat_count;
|
||||
|
||||
path->pscontext = pi;
|
||||
|
||||
list_add_tail(&pi->list, &s->valid_paths);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
|
||||
{
|
||||
struct selector *s = (struct selector *) ps->context;
|
||||
struct path_info *pi = p->pscontext;
|
||||
|
||||
list_move(&pi->list, &s->invalid_paths);
|
||||
}
|
||||
|
||||
static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
|
||||
{
|
||||
struct selector *s = (struct selector *) ps->context;
|
||||
struct path_info *pi = p->pscontext;
|
||||
|
||||
list_move(&pi->list, &s->valid_paths);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct dm_path *rr_select_path(struct path_selector *ps,
|
||||
unsigned *repeat_count)
|
||||
{
|
||||
struct selector *s = (struct selector *) ps->context;
|
||||
struct path_info *pi = NULL;
|
||||
|
||||
if (!list_empty(&s->valid_paths)) {
|
||||
pi = list_entry(s->valid_paths.next, struct path_info, list);
|
||||
list_move_tail(&pi->list, &s->valid_paths);
|
||||
*repeat_count = pi->repeat_count;
|
||||
}
|
||||
|
||||
return pi ? pi->path : NULL;
|
||||
}
|
||||
|
||||
static struct path_selector_type rr_ps = {
|
||||
.name = "round-robin",
|
||||
.module = THIS_MODULE,
|
||||
.table_args = 1,
|
||||
.info_args = 0,
|
||||
.create = rr_create,
|
||||
.destroy = rr_destroy,
|
||||
.status = rr_status,
|
||||
.add_path = rr_add_path,
|
||||
.fail_path = rr_fail_path,
|
||||
.reinstate_path = rr_reinstate_path,
|
||||
.select_path = rr_select_path,
|
||||
};
|
||||
|
||||
static int __init dm_rr_init(void)
|
||||
{
|
||||
int r = dm_register_path_selector(&rr_ps);
|
||||
|
||||
if (r < 0)
|
||||
DMERR("register failed %d", r);
|
||||
|
||||
DMINFO("version 1.0.0 loaded");
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static void __exit dm_rr_exit(void)
|
||||
{
|
||||
int r = dm_unregister_path_selector(&rr_ps);
|
||||
|
||||
if (r < 0)
|
||||
DMERR("round-robin: unregister failed %d", r);
|
||||
}
|
||||
|
||||
module_init(dm_rr_init);
|
||||
module_exit(dm_rr_exit);
|
||||
|
||||
MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector");
|
||||
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
|
||||
MODULE_LICENSE("GPL");
|
||||
1330
drivers/md/dm-snap.c
Normal file
1330
drivers/md/dm-snap.c
Normal file
File diff suppressed because it is too large
Load Diff
174
drivers/md/dm-snap.h
Normal file
174
drivers/md/dm-snap.h
Normal file
@@ -0,0 +1,174 @@
|
||||
/*
|
||||
* dm-snapshot.c
|
||||
*
|
||||
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#ifndef DM_SNAPSHOT_H
|
||||
#define DM_SNAPSHOT_H
|
||||
|
||||
#include "dm.h"
|
||||
#include "dm-bio-list.h"
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
struct exception_table {
|
||||
uint32_t hash_mask;
|
||||
struct list_head *table;
|
||||
};
|
||||
|
||||
/*
|
||||
* The snapshot code deals with largish chunks of the disk at a
|
||||
* time. Typically 64k - 256k.
|
||||
*/
|
||||
/* FIXME: can we get away with limiting these to a uint32_t ? */
|
||||
typedef sector_t chunk_t;
|
||||
|
||||
/*
|
||||
* An exception is used where an old chunk of data has been
|
||||
* replaced by a new one.
|
||||
*/
|
||||
struct exception {
|
||||
struct list_head hash_list;
|
||||
|
||||
chunk_t old_chunk;
|
||||
chunk_t new_chunk;
|
||||
};
|
||||
|
||||
/*
|
||||
* Abstraction to handle the meta/layout of exception stores (the
|
||||
* COW device).
|
||||
*/
|
||||
struct exception_store {
|
||||
|
||||
/*
|
||||
* Destroys this object when you've finished with it.
|
||||
*/
|
||||
void (*destroy) (struct exception_store *store);
|
||||
|
||||
/*
|
||||
* The target shouldn't read the COW device until this is
|
||||
* called.
|
||||
*/
|
||||
int (*read_metadata) (struct exception_store *store);
|
||||
|
||||
/*
|
||||
* Find somewhere to store the next exception.
|
||||
*/
|
||||
int (*prepare_exception) (struct exception_store *store,
|
||||
struct exception *e);
|
||||
|
||||
/*
|
||||
* Update the metadata with this exception.
|
||||
*/
|
||||
void (*commit_exception) (struct exception_store *store,
|
||||
struct exception *e,
|
||||
void (*callback) (void *, int success),
|
||||
void *callback_context);
|
||||
|
||||
/*
|
||||
* The snapshot is invalid, note this in the metadata.
|
||||
*/
|
||||
void (*drop_snapshot) (struct exception_store *store);
|
||||
|
||||
/*
|
||||
* Return how full the snapshot is.
|
||||
*/
|
||||
void (*fraction_full) (struct exception_store *store,
|
||||
sector_t *numerator,
|
||||
sector_t *denominator);
|
||||
|
||||
struct dm_snapshot *snap;
|
||||
void *context;
|
||||
};
|
||||
|
||||
struct dm_snapshot {
|
||||
struct rw_semaphore lock;
|
||||
struct dm_table *table;
|
||||
|
||||
struct dm_dev *origin;
|
||||
struct dm_dev *cow;
|
||||
|
||||
/* List of snapshots per Origin */
|
||||
struct list_head list;
|
||||
|
||||
/* Size of data blocks saved - must be a power of 2 */
|
||||
chunk_t chunk_size;
|
||||
chunk_t chunk_mask;
|
||||
chunk_t chunk_shift;
|
||||
|
||||
/* You can't use a snapshot if this is 0 (e.g. if full) */
|
||||
int valid;
|
||||
|
||||
/* Origin writes don't trigger exceptions until this is set */
|
||||
int active;
|
||||
|
||||
/* Used for display of table */
|
||||
char type;
|
||||
|
||||
/* The last percentage we notified */
|
||||
int last_percent;
|
||||
|
||||
struct exception_table pending;
|
||||
struct exception_table complete;
|
||||
|
||||
/*
|
||||
* pe_lock protects all pending_exception operations and access
|
||||
* as well as the snapshot_bios list.
|
||||
*/
|
||||
spinlock_t pe_lock;
|
||||
|
||||
/* The on disk metadata handler */
|
||||
struct exception_store store;
|
||||
|
||||
struct kcopyd_client *kcopyd_client;
|
||||
|
||||
/* Queue of snapshot writes for ksnapd to flush */
|
||||
struct bio_list queued_bios;
|
||||
struct work_struct queued_bios_work;
|
||||
};
|
||||
|
||||
/*
|
||||
* Used by the exception stores to load exceptions hen
|
||||
* initialising.
|
||||
*/
|
||||
int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
|
||||
|
||||
/*
|
||||
* Constructor and destructor for the default persistent
|
||||
* store.
|
||||
*/
|
||||
int dm_create_persistent(struct exception_store *store);
|
||||
|
||||
int dm_create_transient(struct exception_store *store);
|
||||
|
||||
/*
|
||||
* Return the number of sectors in the device.
|
||||
*/
|
||||
static inline sector_t get_dev_size(struct block_device *bdev)
|
||||
{
|
||||
return bdev->bd_inode->i_size >> SECTOR_SHIFT;
|
||||
}
|
||||
|
||||
static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
|
||||
{
|
||||
return (sector & ~s->chunk_mask) >> s->chunk_shift;
|
||||
}
|
||||
|
||||
static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
|
||||
{
|
||||
return chunk << s->chunk_shift;
|
||||
}
|
||||
|
||||
static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs)
|
||||
{
|
||||
/*
|
||||
* There is only ever one instance of a particular block
|
||||
* device so we can compare pointers safely.
|
||||
*/
|
||||
return lhs == rhs;
|
||||
}
|
||||
|
||||
#endif
|
||||
242
drivers/md/dm-stripe.c
Normal file
242
drivers/md/dm-stripe.c
Normal file
@@ -0,0 +1,242 @@
|
||||
/*
|
||||
* Copyright (C) 2001-2003 Sistina Software (UK) Limited.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#include "dm.h"
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#define DM_MSG_PREFIX "striped"
|
||||
|
||||
struct stripe {
|
||||
struct dm_dev *dev;
|
||||
sector_t physical_start;
|
||||
};
|
||||
|
||||
struct stripe_c {
|
||||
uint32_t stripes;
|
||||
|
||||
/* The size of this target / num. stripes */
|
||||
sector_t stripe_width;
|
||||
|
||||
/* stripe chunk size */
|
||||
uint32_t chunk_shift;
|
||||
sector_t chunk_mask;
|
||||
|
||||
struct stripe stripe[0];
|
||||
};
|
||||
|
||||
static inline struct stripe_c *alloc_context(unsigned int stripes)
|
||||
{
|
||||
size_t len;
|
||||
|
||||
if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
|
||||
stripes))
|
||||
return NULL;
|
||||
|
||||
len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
|
||||
|
||||
return kmalloc(len, GFP_KERNEL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse a single <dev> <sector> pair
|
||||
*/
|
||||
static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
|
||||
unsigned int stripe, char **argv)
|
||||
{
|
||||
unsigned long long start;
|
||||
|
||||
if (sscanf(argv[1], "%llu", &start) != 1)
|
||||
return -EINVAL;
|
||||
|
||||
if (dm_get_device(ti, argv[0], start, sc->stripe_width,
|
||||
dm_table_get_mode(ti->table),
|
||||
&sc->stripe[stripe].dev))
|
||||
return -ENXIO;
|
||||
|
||||
sc->stripe[stripe].physical_start = start;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct a striped mapping.
|
||||
* <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
|
||||
*/
|
||||
static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
||||
{
|
||||
struct stripe_c *sc;
|
||||
sector_t width;
|
||||
uint32_t stripes;
|
||||
uint32_t chunk_size;
|
||||
char *end;
|
||||
int r;
|
||||
unsigned int i;
|
||||
|
||||
if (argc < 2) {
|
||||
ti->error = "Not enough arguments";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
stripes = simple_strtoul(argv[0], &end, 10);
|
||||
if (*end) {
|
||||
ti->error = "Invalid stripe count";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
chunk_size = simple_strtoul(argv[1], &end, 10);
|
||||
if (*end) {
|
||||
ti->error = "Invalid chunk_size";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* chunk_size is a power of two
|
||||
*/
|
||||
if (!chunk_size || (chunk_size & (chunk_size - 1)) ||
|
||||
(chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
|
||||
ti->error = "Invalid chunk size";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (ti->len & (chunk_size - 1)) {
|
||||
ti->error = "Target length not divisible by "
|
||||
"chunk size";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
width = ti->len;
|
||||
if (sector_div(width, stripes)) {
|
||||
ti->error = "Target length not divisible by "
|
||||
"number of stripes";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do we have enough arguments for that many stripes ?
|
||||
*/
|
||||
if (argc != (2 + 2 * stripes)) {
|
||||
ti->error = "Not enough destinations "
|
||||
"specified";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
sc = alloc_context(stripes);
|
||||
if (!sc) {
|
||||
ti->error = "Memory allocation for striped context "
|
||||
"failed";
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
sc->stripes = stripes;
|
||||
sc->stripe_width = width;
|
||||
ti->split_io = chunk_size;
|
||||
|
||||
sc->chunk_mask = ((sector_t) chunk_size) - 1;
|
||||
for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
|
||||
chunk_size >>= 1;
|
||||
sc->chunk_shift--;
|
||||
|
||||
/*
|
||||
* Get the stripe destinations.
|
||||
*/
|
||||
for (i = 0; i < stripes; i++) {
|
||||
argv += 2;
|
||||
|
||||
r = get_stripe(ti, sc, i, argv);
|
||||
if (r < 0) {
|
||||
ti->error = "Couldn't parse stripe destination";
|
||||
while (i--)
|
||||
dm_put_device(ti, sc->stripe[i].dev);
|
||||
kfree(sc);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
ti->private = sc;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void stripe_dtr(struct dm_target *ti)
|
||||
{
|
||||
unsigned int i;
|
||||
struct stripe_c *sc = (struct stripe_c *) ti->private;
|
||||
|
||||
for (i = 0; i < sc->stripes; i++)
|
||||
dm_put_device(ti, sc->stripe[i].dev);
|
||||
|
||||
kfree(sc);
|
||||
}
|
||||
|
||||
static int stripe_map(struct dm_target *ti, struct bio *bio,
|
||||
union map_info *map_context)
|
||||
{
|
||||
struct stripe_c *sc = (struct stripe_c *) ti->private;
|
||||
|
||||
sector_t offset = bio->bi_sector - ti->begin;
|
||||
sector_t chunk = offset >> sc->chunk_shift;
|
||||
uint32_t stripe = sector_div(chunk, sc->stripes);
|
||||
|
||||
bio->bi_bdev = sc->stripe[stripe].dev->bdev;
|
||||
bio->bi_sector = sc->stripe[stripe].physical_start +
|
||||
(chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
|
||||
return DM_MAPIO_REMAPPED;
|
||||
}
|
||||
|
||||
static int stripe_status(struct dm_target *ti,
|
||||
status_type_t type, char *result, unsigned int maxlen)
|
||||
{
|
||||
struct stripe_c *sc = (struct stripe_c *) ti->private;
|
||||
unsigned int sz = 0;
|
||||
unsigned int i;
|
||||
|
||||
switch (type) {
|
||||
case STATUSTYPE_INFO:
|
||||
result[0] = '\0';
|
||||
break;
|
||||
|
||||
case STATUSTYPE_TABLE:
|
||||
DMEMIT("%d %llu", sc->stripes,
|
||||
(unsigned long long)sc->chunk_mask + 1);
|
||||
for (i = 0; i < sc->stripes; i++)
|
||||
DMEMIT(" %s %llu", sc->stripe[i].dev->name,
|
||||
(unsigned long long)sc->stripe[i].physical_start);
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct target_type stripe_target = {
|
||||
.name = "striped",
|
||||
.version= {1, 0, 2},
|
||||
.module = THIS_MODULE,
|
||||
.ctr = stripe_ctr,
|
||||
.dtr = stripe_dtr,
|
||||
.map = stripe_map,
|
||||
.status = stripe_status,
|
||||
};
|
||||
|
||||
int __init dm_stripe_init(void)
|
||||
{
|
||||
int r;
|
||||
|
||||
r = dm_register_target(&stripe_target);
|
||||
if (r < 0)
|
||||
DMWARN("target registration failed");
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void dm_stripe_exit(void)
|
||||
{
|
||||
if (dm_unregister_target(&stripe_target))
|
||||
DMWARN("target unregistration failed");
|
||||
|
||||
return;
|
||||
}
|
||||
1044
drivers/md/dm-table.c
Normal file
1044
drivers/md/dm-table.c
Normal file
File diff suppressed because it is too large
Load Diff
197
drivers/md/dm-target.c
Normal file
197
drivers/md/dm-target.c
Normal file
@@ -0,0 +1,197 @@
|
||||
/*
|
||||
* Copyright (C) 2001 Sistina Software (UK) Limited
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#include "dm.h"
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kmod.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#define DM_MSG_PREFIX "target"
|
||||
|
||||
struct tt_internal {
|
||||
struct target_type tt;
|
||||
|
||||
struct list_head list;
|
||||
long use;
|
||||
};
|
||||
|
||||
static LIST_HEAD(_targets);
|
||||
static DECLARE_RWSEM(_lock);
|
||||
|
||||
#define DM_MOD_NAME_SIZE 32
|
||||
|
||||
static inline struct tt_internal *__find_target_type(const char *name)
|
||||
{
|
||||
struct tt_internal *ti;
|
||||
|
||||
list_for_each_entry (ti, &_targets, list)
|
||||
if (!strcmp(name, ti->tt.name))
|
||||
return ti;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct tt_internal *get_target_type(const char *name)
|
||||
{
|
||||
struct tt_internal *ti;
|
||||
|
||||
down_read(&_lock);
|
||||
|
||||
ti = __find_target_type(name);
|
||||
if (ti) {
|
||||
if ((ti->use == 0) && !try_module_get(ti->tt.module))
|
||||
ti = NULL;
|
||||
else
|
||||
ti->use++;
|
||||
}
|
||||
|
||||
up_read(&_lock);
|
||||
return ti;
|
||||
}
|
||||
|
||||
static void load_module(const char *name)
|
||||
{
|
||||
request_module("dm-%s", name);
|
||||
}
|
||||
|
||||
struct target_type *dm_get_target_type(const char *name)
|
||||
{
|
||||
struct tt_internal *ti = get_target_type(name);
|
||||
|
||||
if (!ti) {
|
||||
load_module(name);
|
||||
ti = get_target_type(name);
|
||||
}
|
||||
|
||||
return ti ? &ti->tt : NULL;
|
||||
}
|
||||
|
||||
void dm_put_target_type(struct target_type *t)
|
||||
{
|
||||
struct tt_internal *ti = (struct tt_internal *) t;
|
||||
|
||||
down_read(&_lock);
|
||||
if (--ti->use == 0)
|
||||
module_put(ti->tt.module);
|
||||
|
||||
BUG_ON(ti->use < 0);
|
||||
up_read(&_lock);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
static struct tt_internal *alloc_target(struct target_type *t)
|
||||
{
|
||||
struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
|
||||
|
||||
if (ti) {
|
||||
memset(ti, 0, sizeof(*ti));
|
||||
ti->tt = *t;
|
||||
}
|
||||
|
||||
return ti;
|
||||
}
|
||||
|
||||
|
||||
int dm_target_iterate(void (*iter_func)(struct target_type *tt,
|
||||
void *param), void *param)
|
||||
{
|
||||
struct tt_internal *ti;
|
||||
|
||||
down_read(&_lock);
|
||||
list_for_each_entry (ti, &_targets, list)
|
||||
iter_func(&ti->tt, param);
|
||||
up_read(&_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Register a new target type.
 *
 * Returns 0 on success, -ENOMEM if the wrapper cannot be allocated, or
 * -EEXIST if a target with the same name is already registered (the
 * wrapper is freed again in that case).
 */
int dm_register_target(struct target_type *t)
{
	int rv = 0;
	struct tt_internal *ti = alloc_target(t);

	if (!ti)
		return -ENOMEM;

	down_write(&_lock);
	/* target names must be unique */
	if (__find_target_type(t->name))
		rv = -EEXIST;
	else
		list_add(&ti->list, &_targets);

	up_write(&_lock);
	if (rv)
		kfree(ti);
	return rv;
}
|
||||
|
||||
/*
 * Remove a previously registered target type.
 *
 * Returns -EINVAL if the target is not registered, -ETXTBSY if it
 * still has users (ti->use != 0); otherwise unlinks and frees the
 * wrapper and returns 0.
 */
int dm_unregister_target(struct target_type *t)
{
	struct tt_internal *ti;

	down_write(&_lock);
	if (!(ti = __find_target_type(t->name))) {
		up_write(&_lock);
		return -EINVAL;
	}

	if (ti->use) {
		up_write(&_lock);
		return -ETXTBSY;
	}

	list_del(&ti->list);
	kfree(ti);

	up_write(&_lock);
	return 0;
}
|
||||
|
||||
/*
|
||||
* io-err: always fails an io, useful for bringing
|
||||
* up LVs that have holes in them.
|
||||
*/
|
||||
/* Constructor for the "error" target: nothing to set up, always succeeds. */
static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
{
	return 0;
}
|
||||
|
||||
/* Destructor for the "error" target: nothing to tear down. */
static void io_err_dtr(struct dm_target *ti)
{
	/* empty */
}
|
||||
|
||||
/* Map function for the "error" target: fail every bio with -EIO. */
static int io_err_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	return -EIO;
}
|
||||
|
||||
/*
 * Built-in "error" target table.
 * NOTE(review): .module is deliberately unset -- this target is part
 * of dm core itself, so there is no separate module to pin; confirm
 * try_module_get(NULL) is treated as success by get_target_type().
 */
static struct target_type error_target = {
	.name = "error",
	.version = {1, 0, 1},
	.ctr = io_err_ctr,
	.dtr = io_err_dtr,
	.map = io_err_map,
};
|
||||
|
||||
/* Register the built-in "error" target during dm initialisation. */
int __init dm_target_init(void)
{
	return dm_register_target(&error_target);
}
|
||||
|
||||
void dm_target_exit(void)
|
||||
{
|
||||
if (dm_unregister_target(&error_target))
|
||||
DMWARN("error target unregistration failed");
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(dm_register_target);
|
||||
EXPORT_SYMBOL(dm_unregister_target);
|
||||
83
drivers/md/dm-zero.c
Normal file
83
drivers/md/dm-zero.c
Normal file
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#include "dm.h"
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/bio.h>
|
||||
|
||||
#define DM_MSG_PREFIX "zero"
|
||||
|
||||
/*
|
||||
* Construct a dummy mapping that only returns zeros
|
||||
*/
|
||||
static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
||||
{
|
||||
if (argc != 0) {
|
||||
ti->error = "No arguments required";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return zeros only on reads
|
||||
*/
|
||||
/*
 * Map function for the zero target.
 *
 * Reads are completed immediately with zero-filled data, writes are
 * silently discarded, and readahead is rejected with -EIO so the page
 * cache is not filled with zero pages.  The bio is finished here, so
 * DM_MAPIO_SUBMITTED tells dm core not to issue it further.
 */
static int zero_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	switch(bio_rw(bio)) {
	case READ:
		zero_fill_bio(bio);
		break;
	case READA:
		/* readahead of null bytes only wastes buffer cache */
		return -EIO;
	case WRITE:
		/* writes get silently dropped */
		break;
	}

	bio_endio(bio, bio->bi_size, 0);

	/* accepted bio, don't make new request */
	return DM_MAPIO_SUBMITTED;
}
|
||||
|
||||
/*
 * Target registration table for "zero".  No .dtr is needed because the
 * constructor allocates nothing.
 */
static struct target_type zero_target = {
	.name = "zero",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr   = zero_ctr,
	.map   = zero_map,
};
|
||||
|
||||
/* Module init: register the zero target, logging on failure. */
static int __init dm_zero_init(void)
{
	int rc;

	rc = dm_register_target(&zero_target);
	if (rc < 0)
		DMERR("register failed %d", rc);

	return rc;
}
|
||||
|
||||
/* Module exit: unregister the zero target, logging on failure. */
static void __exit dm_zero_exit(void)
{
	int rc;

	rc = dm_unregister_target(&zero_target);
	if (rc < 0)
		DMERR("unregister failed %d", rc);
}
|
||||
|
||||
module_init(dm_zero_init)
|
||||
module_exit(dm_zero_exit)
|
||||
|
||||
MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
|
||||
MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros");
|
||||
MODULE_LICENSE("GPL");
|
||||
1564
drivers/md/dm.c
Normal file
1564
drivers/md/dm.c
Normal file
File diff suppressed because it is too large
Load Diff
154
drivers/md/dm.h
Normal file
154
drivers/md/dm.h
Normal file
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Internal header file for device mapper
|
||||
*
|
||||
* Copyright (C) 2001, 2002 Sistina Software
|
||||
* Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* This file is released under the LGPL.
|
||||
*/
|
||||
|
||||
#ifndef DM_INTERNAL_H
|
||||
#define DM_INTERNAL_H
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/device-mapper.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/hdreg.h>
|
||||
|
||||
#define DM_NAME "device-mapper"
|
||||
|
||||
#define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
|
||||
#define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
|
||||
#define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
|
||||
#ifdef CONFIG_DM_DEBUG
|
||||
# define DMDEBUG(f, arg...) printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX " DEBUG: " f "\n", ## arg)
|
||||
#else
|
||||
# define DMDEBUG(f, arg...) do {} while (0)
|
||||
#endif
|
||||
|
||||
#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
|
||||
0 : scnprintf(result + sz, maxlen - sz, x))
|
||||
|
||||
#define SECTOR_SHIFT 9
|
||||
|
||||
/*
|
||||
* Definitions of return values from target end_io function.
|
||||
*/
|
||||
#define DM_ENDIO_INCOMPLETE 1
|
||||
#define DM_ENDIO_REQUEUE 2
|
||||
|
||||
/*
|
||||
* Definitions of return values from target map function.
|
||||
*/
|
||||
#define DM_MAPIO_SUBMITTED 0
|
||||
#define DM_MAPIO_REMAPPED 1
|
||||
#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE
|
||||
|
||||
/*
|
||||
* Suspend feature flags
|
||||
*/
|
||||
#define DM_SUSPEND_LOCKFS_FLAG (1 << 0)
|
||||
#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
|
||||
|
||||
/*
|
||||
* List of devices that a metadevice uses and should open/close.
|
||||
*/
|
||||
struct dm_dev {
|
||||
struct list_head list;
|
||||
|
||||
atomic_t count;
|
||||
int mode;
|
||||
struct block_device *bdev;
|
||||
char name[16];
|
||||
};
|
||||
|
||||
struct dm_table;
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Internal table functions.
|
||||
*---------------------------------------------------------------*/
|
||||
void dm_table_event_callback(struct dm_table *t,
|
||||
void (*fn)(void *), void *context);
|
||||
struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
|
||||
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
|
||||
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
|
||||
struct list_head *dm_table_get_devices(struct dm_table *t);
|
||||
void dm_table_presuspend_targets(struct dm_table *t);
|
||||
void dm_table_postsuspend_targets(struct dm_table *t);
|
||||
int dm_table_resume_targets(struct dm_table *t);
|
||||
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
|
||||
void dm_table_unplug_all(struct dm_table *t);
|
||||
int dm_table_flush_all(struct dm_table *t);
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* A registry of target types.
|
||||
*---------------------------------------------------------------*/
|
||||
int dm_target_init(void);
|
||||
void dm_target_exit(void);
|
||||
struct target_type *dm_get_target_type(const char *name);
|
||||
void dm_put_target_type(struct target_type *t);
|
||||
int dm_target_iterate(void (*iter_func)(struct target_type *tt,
|
||||
void *param), void *param);
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Useful inlines.
|
||||
*---------------------------------------------------------------*/
|
||||
/*
 * Overflow check: returns non-zero if 'fixed' bytes plus 'num' objects
 * of 'obj' bytes each would exceed ULONG_MAX.
 */
static inline int array_too_big(unsigned long fixed, unsigned long obj,
				unsigned long num)
{
	return (num > (ULONG_MAX - fixed) / obj);
}
|
||||
|
||||
/*
|
||||
* Ceiling(n / sz)
|
||||
*/
|
||||
#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz))
|
||||
|
||||
#define dm_sector_div_up(n, sz) ( \
|
||||
{ \
|
||||
sector_t _r = ((n) + (sz) - 1); \
|
||||
sector_div(_r, (sz)); \
|
||||
_r; \
|
||||
} \
|
||||
)
|
||||
|
||||
/*
|
||||
* ceiling(n / size) * size
|
||||
*/
|
||||
#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz))
|
||||
|
||||
/* Convert a byte count to 512-byte sectors (9 == SECTOR_SHIFT above). */
static inline sector_t to_sector(unsigned long n)
{
	return (n >> 9);
}
|
||||
|
||||
/*
 * Convert sectors to bytes (9 == SECTOR_SHIFT above).
 * NOTE(review): result is unsigned long, so this can truncate for very
 * large sector counts on 32-bit -- presumably callers keep n small.
 */
static inline unsigned long to_bytes(sector_t n)
{
	return (n << 9);
}
|
||||
|
||||
int dm_split_args(int *argc, char ***argvp, char *input);
|
||||
|
||||
/*
|
||||
* The device-mapper can be driven through one of two interfaces;
|
||||
* ioctl or filesystem, depending which patch you have applied.
|
||||
*/
|
||||
int dm_interface_init(void);
|
||||
void dm_interface_exit(void);
|
||||
|
||||
/*
|
||||
* Targets for linear and striped mappings
|
||||
*/
|
||||
int dm_linear_init(void);
|
||||
void dm_linear_exit(void);
|
||||
|
||||
int dm_stripe_init(void);
|
||||
void dm_stripe_exit(void);
|
||||
|
||||
void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
|
||||
union map_info *dm_get_mapinfo(struct bio *bio);
|
||||
int dm_open_count(struct mapped_device *md);
|
||||
int dm_lock_for_deletion(struct mapped_device *md);
|
||||
|
||||
#endif
|
||||
346
drivers/md/faulty.c
Normal file
346
drivers/md/faulty.c
Normal file
@@ -0,0 +1,346 @@
|
||||
/*
|
||||
* faulty.c : Multiple Devices driver for Linux
|
||||
*
|
||||
* Copyright (C) 2004 Neil Brown
|
||||
*
|
||||
* fautly-device-simulator personality for md
|
||||
*
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* (for example /usr/src/linux/COPYING); if not, write to the Free
|
||||
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* The "faulty" personality causes some requests to fail.
|
||||
*
|
||||
* Possible failure modes are:
|
||||
* reads fail "randomly" but succeed on retry
|
||||
* writes fail "randomly" but succeed on retry
|
||||
* reads for some address fail and then persist until a write
|
||||
* reads for some address fail and then persist irrespective of write
|
||||
* writes for some address fail and persist
|
||||
* all writes fail
|
||||
*
|
||||
* Different modes can be active at a time, but only
|
||||
* one can be set at array creation. Others can be added later.
|
||||
* A mode can be one-shot or recurrent with the recurrance being
|
||||
* once in every N requests.
|
||||
* The bottom 5 bits of the "layout" indicate the mode. The
|
||||
* remainder indicate a period, or 0 for one-shot.
|
||||
*
|
||||
* There is an implementation limit on the number of concurrently
|
||||
* persisting-faulty blocks. When a new fault is requested that would
|
||||
* exceed the limit, it is ignored.
|
||||
* All current faults can be clear using a layout of "0".
|
||||
*
|
||||
* Requests are always sent to the device. If they are to fail,
|
||||
* we clone the bio and insert a new b_end_io into the chain.
|
||||
*/
|
||||
|
||||
#define WriteTransient 0
|
||||
#define ReadTransient 1
|
||||
#define WritePersistent 2
|
||||
#define ReadPersistent 3
|
||||
#define WriteAll 4 /* doesn't go to device */
|
||||
#define ReadFixable 5
|
||||
#define Modes 6
|
||||
|
||||
#define ClearErrors 31
|
||||
#define ClearFaults 30
|
||||
|
||||
#define AllPersist 100 /* internal use only */
|
||||
#define NoPersist 101
|
||||
|
||||
#define ModeMask 0x1f
|
||||
#define ModeShift 5
|
||||
|
||||
#define MaxFault 50
|
||||
#include <linux/raid/md.h>
|
||||
|
||||
|
||||
/*
 * bi_end_io handler for a cloned bio that is being failed on purpose.
 *
 * Copies the residual size/sector back to the original bio (kept in
 * bi_private), drops the clone once its residual byte count reaches
 * zero, clears BIO_UPTODATE on the original and completes it with -EIO.
 */
static int faulty_fail(struct bio *bio, unsigned int bytes_done, int error)
{
	struct bio *b = bio->bi_private;

	b->bi_size = bio->bi_size;
	b->bi_sector = bio->bi_sector;

	/* clone fully completed: release it */
	if (bio->bi_size == 0)
		bio_put(bio);

	clear_bit(BIO_UPTODATE, &b->bi_flags);
	return (b->bi_end_io)(b, bytes_done, -EIO);
}
|
||||
|
||||
typedef struct faulty_conf {
|
||||
int period[Modes];
|
||||
atomic_t counters[Modes];
|
||||
sector_t faults[MaxFault];
|
||||
int modes[MaxFault];
|
||||
int nfaults;
|
||||
mdk_rdev_t *rdev;
|
||||
} conf_t;
|
||||
|
||||
/*
 * Decide whether a fault of type 'mode' should fire for this request.
 *
 * Returns 1 when the mode's countdown hits zero (re-arming it if the
 * mode is periodic), 0 otherwise.  A mode with no period and a
 * non-positive counter is inactive.
 */
static int check_mode(conf_t *conf, int mode)
{
	if (conf->period[mode] == 0 &&
	    atomic_read(&conf->counters[mode]) <= 0)
		return 0; /* no failure, no decrement */


	if (atomic_dec_and_test(&conf->counters[mode])) {
		/* periodic modes re-arm; one-shot modes stay at zero */
		if (conf->period[mode])
			atomic_set(&conf->counters[mode], conf->period[mode]);
		return 1;
	}
	return 0;
}
|
||||
|
||||
/*
 * Check whether any recorded fault covers a sector in [start, end) in
 * direction 'dir' (READ or WRITE).  Returns 1 if the request should
 * fail.  Writing to a ReadFixable sector "fixes" it by downgrading the
 * entry to NoPersist.
 */
static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir)
{
	/* If we find a ReadFixable sector, we fix it ... */
	int i;
	for (i=0; i<conf->nfaults; i++)
		if (conf->faults[i] >= start &&
		    conf->faults[i] < end) {
			/* found it ... */
			/* mode*2+dir encodes (fault mode, io direction) pairs */
			switch (conf->modes[i] * 2 + dir) {
			case WritePersistent*2+WRITE: return 1;
			case ReadPersistent*2+READ: return 1;
			case ReadFixable*2+READ: return 1;
			case ReadFixable*2+WRITE:
				conf->modes[i] = NoPersist;
				return 0;
			case AllPersist*2+READ:
			case AllPersist*2+WRITE: return 1;
			default:
				return 0;
			}
		}
	return 0;
}
|
||||
|
||||
/*
 * Record a persistent fault at sector 'start', merging with an
 * existing entry for the same sector (a read fault plus a write fault
 * combines into AllPersist).  Slots freed by NoPersist entries are
 * re-used; if the table is full (MaxFault) the new fault is silently
 * dropped.
 */
static void add_sector(conf_t *conf, sector_t start, int mode)
{
	int i;
	int n = conf->nfaults;	/* default slot: append at the end */
	for (i=0; i<conf->nfaults; i++)
		if (conf->faults[i] == start) {
			switch(mode) {
			case NoPersist: conf->modes[i] = mode; return;
			case WritePersistent:
				if (conf->modes[i] == ReadPersistent ||
				    conf->modes[i] == ReadFixable)
					conf->modes[i] = AllPersist;
				else
					conf->modes[i] = WritePersistent;
				return;
			case ReadPersistent:
				if (conf->modes[i] == WritePersistent)
					conf->modes[i] = AllPersist;
				else
					conf->modes[i] = ReadPersistent;
				return;
			case ReadFixable:
				if (conf->modes[i] == WritePersistent ||
				    conf->modes[i] == ReadPersistent)
					conf->modes[i] = AllPersist;
				else
					conf->modes[i] = ReadFixable;
				return;
			}
		} else if (conf->modes[i] == NoPersist)
			/* remember a reusable slot */
			n = i;

	/* table full: ignore the new fault */
	if (n >= MaxFault)
		return;
	conf->faults[n] = start;
	conf->modes[n] = mode;
	if (conf->nfaults == n)
		conf->nfaults = n+1;
}
|
||||
|
||||
/*
 * md make_request hook for the faulty personality.
 *
 * Consults the active fault modes and the persistent fault table to
 * decide whether this request should fail.  A failing request is
 * cloned and sent to the underlying device with faulty_fail() as its
 * completion handler (return 0: submitted here); otherwise the bio is
 * redirected to the backing rdev and 1 is returned so the caller
 * resubmits it.
 */
static int make_request(request_queue_t *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = (conf_t*)mddev->private;
	int failit = 0;

	if (bio_data_dir(bio) == WRITE) {
		/* write request */
		if (atomic_read(&conf->counters[WriteAll])) {
			/* special case - don't decrement, don't generic_make_request,
			 * just fail immediately
			 */
			bio_endio(bio, bio->bi_size, -EIO);
			return 0;
		}

		if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
				 WRITE))
			failit = 1;
		if (check_mode(conf, WritePersistent)) {
			add_sector(conf, bio->bi_sector, WritePersistent);
			failit = 1;
		}
		if (check_mode(conf, WriteTransient))
			failit = 1;
	} else {
		/* read request */
		if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
				 READ))
			failit = 1;
		if (check_mode(conf, ReadTransient))
			failit = 1;
		if (check_mode(conf, ReadPersistent)) {
			add_sector(conf, bio->bi_sector, ReadPersistent);
			failit = 1;
		}
		if (check_mode(conf, ReadFixable)) {
			add_sector(conf, bio->bi_sector, ReadFixable);
			failit = 1;
		}
	}
	if (failit) {
		/* send a clone down; faulty_fail() fails the original */
		struct bio *b = bio_clone(bio, GFP_NOIO);
		b->bi_bdev = conf->rdev->bdev;
		b->bi_private = bio;
		b->bi_end_io = faulty_fail;
		generic_make_request(b);
		return 0;
	} else {
		bio->bi_bdev = conf->rdev->bdev;
		return 1;
	}
}
|
||||
|
||||
/*
 * /proc/mdstat status line: for each fault mode with a live countdown
 * print "Mode=count(period)", plus the number of recorded persistent
 * faults.
 */
static void status(struct seq_file *seq, mddev_t *mddev)
{
	conf_t *conf = (conf_t*)mddev->private;
	int n;

	if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
		seq_printf(seq, " WriteTransient=%d(%d)",
			   n, conf->period[WriteTransient]);

	if ((n=atomic_read(&conf->counters[ReadTransient])) != 0)
		seq_printf(seq, " ReadTransient=%d(%d)",
			   n, conf->period[ReadTransient]);

	if ((n=atomic_read(&conf->counters[WritePersistent])) != 0)
		seq_printf(seq, " WritePersistent=%d(%d)",
			   n, conf->period[WritePersistent]);

	if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0)
		seq_printf(seq, " ReadPersistent=%d(%d)",
			   n, conf->period[ReadPersistent]);


	if ((n=atomic_read(&conf->counters[ReadFixable])) != 0)
		seq_printf(seq, " ReadFixable=%d(%d)",
			   n, conf->period[ReadFixable]);

	if ((n=atomic_read(&conf->counters[WriteAll])) != 0)
		seq_printf(seq, " WriteAll");

	seq_printf(seq, " nfaults=%d", conf->nfaults);
}
|
||||
|
||||
|
||||
/*
 * Apply a new "layout" to the faulty device: the low ModeShift bits
 * select a mode (or ClearFaults/ClearErrors), the remaining bits give
 * the recurrence period (0 = one-shot).  chunk_size must be -1 (it is
 * not used by this personality).  mddev->layout is reset to -1 so a
 * subsequent identical layout is still seen as a change.
 */
static int reconfig(mddev_t *mddev, int layout, int chunk_size)
{
	int mode = layout & ModeMask;
	int count = layout >> ModeShift;
	conf_t *conf = mddev->private;

	if (chunk_size != -1)
		return -EINVAL;

	/* new layout */
	if (mode == ClearFaults)
		conf->nfaults = 0;
	else if (mode == ClearErrors) {
		int i;
		for (i=0 ; i < Modes ; i++) {
			conf->period[i] = 0;
			atomic_set(&conf->counters[i], 0);
		}
	} else if (mode < Modes) {
		conf->period[mode] = count;
		/* one-shot modes still need a single count to fire once */
		if (!count) count++;
		atomic_set(&conf->counters[mode], count);
	} else
		return -EINVAL;
	mddev->layout = -1; /* makes sure further changes come through */
	return 0;
}
|
||||
|
||||
/*
 * md "run" hook: allocate and initialise the per-array fault state.
 *
 * Fixes an unchecked kmalloc() in the original code: a failed
 * allocation would have been dereferenced immediately.  Returns 0 on
 * success or -ENOMEM.
 */
static int run(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *tmp;
	int i;

	conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL);

	if (!conf)
		return -ENOMEM;

	/* all fault modes start disarmed */
	for (i=0; i<Modes; i++) {
		atomic_set(&conf->counters[i], 0);
		conf->period[i] = 0;
	}
	conf->nfaults = 0;

	/* a faulty array has a single member device; remember it */
	ITERATE_RDEV(mddev, rdev, tmp)
		conf->rdev = rdev;

	mddev->array_size = mddev->size;
	mddev->private = conf;

	/* apply the initial layout (fault mode and period) */
	reconfig(mddev, mddev->layout, -1);

	return 0;
}
|
||||
|
||||
/* md "stop" hook: release the personality's private state. */
static int stop(mddev_t *mddev)
{
	conf_t *cfg = (conf_t *)mddev->private;

	mddev->private = NULL;
	kfree(cfg);
	return 0;
}
|
||||
|
||||
/* md personality table for the fault-injection device (LEVEL_FAULTY). */
static struct mdk_personality faulty_personality =
{
	.name = "faulty",
	.level = LEVEL_FAULTY,
	.owner = THIS_MODULE,
	.make_request = make_request,
	.run = run,
	.stop = stop,
	.status = status,
	.reconfig = reconfig,
};
|
||||
|
||||
/* Register the "faulty" personality with the md core. */
static int __init raid_init(void)
{
	return register_md_personality(&faulty_personality);
}
|
||||
|
||||
/* Unregister the "faulty" personality on module exit. */
static void raid_exit(void)
{
	unregister_md_personality(&faulty_personality);
}
|
||||
|
||||
module_init(raid_init);
|
||||
module_exit(raid_exit);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS("md-personality-10"); /* faulty */
|
||||
MODULE_ALIAS("md-faulty");
|
||||
MODULE_ALIAS("md-level--5");
|
||||
703
drivers/md/kcopyd.c
Normal file
703
drivers/md/kcopyd.c
Normal file
@@ -0,0 +1,703 @@
|
||||
/*
|
||||
* Copyright (C) 2002 Sistina Software (UK) Limited.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*
|
||||
* Kcopyd provides a simple interface for copying an area of one
|
||||
* block-device to one or more other block-devices, with an asynchronous
|
||||
* completion notification.
|
||||
*/
|
||||
|
||||
#include <asm/types.h>
|
||||
#include <asm/atomic.h>
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/mutex.h>
|
||||
|
||||
#include "kcopyd.h"
|
||||
|
||||
static struct workqueue_struct *_kcopyd_wq;
|
||||
static struct work_struct _kcopyd_work;
|
||||
|
||||
/* Kick the kcopyd workqueue so do_work() runs the job lists. */
static inline void wake(void)
{
	queue_work(_kcopyd_wq, &_kcopyd_work);
}
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Each kcopyd client has its own little pool of preallocated
|
||||
* pages for kcopyd io.
|
||||
*---------------------------------------------------------------*/
|
||||
struct kcopyd_client {
|
||||
struct list_head list;
|
||||
|
||||
spinlock_t lock;
|
||||
struct page_list *pages;
|
||||
unsigned int nr_pages;
|
||||
unsigned int nr_free_pages;
|
||||
|
||||
wait_queue_head_t destroyq;
|
||||
atomic_t nr_jobs;
|
||||
};
|
||||
|
||||
static struct page_list *alloc_pl(void)
|
||||
{
|
||||
struct page_list *pl;
|
||||
|
||||
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
|
||||
if (!pl)
|
||||
return NULL;
|
||||
|
||||
pl->page = alloc_page(GFP_KERNEL);
|
||||
if (!pl->page) {
|
||||
kfree(pl);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return pl;
|
||||
}
|
||||
|
||||
/* Free one page_list entry and the page it carries. */
static void free_pl(struct page_list *pl)
{
	__free_page(pl->page);
	kfree(pl);
}
|
||||
|
||||
/*
 * Take 'nr' pages from the client's preallocated pool.
 *
 * On success returns 0 and hands back a NULL-terminated chain of 'nr'
 * page_list entries through *pages.  Returns -ENOMEM if the pool does
 * not currently hold enough free pages (the caller retries later).
 */
static int kcopyd_get_pages(struct kcopyd_client *kc,
			    unsigned int nr, struct page_list **pages)
{
	struct page_list *pl;

	spin_lock(&kc->lock);
	if (kc->nr_free_pages < nr) {
		spin_unlock(&kc->lock);
		return -ENOMEM;
	}

	kc->nr_free_pages -= nr;
	/* walk nr entries along the free list; *pages keeps the head */
	for (*pages = pl = kc->pages; --nr; pl = pl->next)
		;

	/* detach the chain from the pool */
	kc->pages = pl->next;
	pl->next = NULL;

	spin_unlock(&kc->lock);

	return 0;
}
|
||||
|
||||
/*
 * Return a chain of pages to the client's pool, bumping the free count
 * once per entry (the loop covers all but the last, counted below).
 */
static void kcopyd_put_pages(struct kcopyd_client *kc, struct page_list *pl)
{
	struct page_list *cursor;

	spin_lock(&kc->lock);
	for (cursor = pl; cursor->next; cursor = cursor->next)
		kc->nr_free_pages++;

	/* count the final entry and splice the chain onto the pool */
	kc->nr_free_pages++;
	cursor->next = kc->pages;
	kc->pages = pl;
	spin_unlock(&kc->lock);
}
|
||||
|
||||
/*
|
||||
* These three functions resize the page pool.
|
||||
*/
|
||||
static void drop_pages(struct page_list *pl)
|
||||
{
|
||||
struct page_list *next;
|
||||
|
||||
while (pl) {
|
||||
next = pl->next;
|
||||
free_pl(pl);
|
||||
pl = next;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Grow the client's page pool by 'nr' freshly allocated pages.
 *
 * On allocation failure everything allocated so far is released and
 * -ENOMEM returned; on success the new chain is donated to the pool
 * via kcopyd_put_pages() and the pool size accounted.
 */
static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
{
	unsigned int i;
	struct page_list *pl = NULL, *next;

	for (i = 0; i < nr; i++) {
		next = alloc_pl();
		if (!next) {
			if (pl)
				drop_pages(pl);
			return -ENOMEM;
		}
		/* build the chain head-first */
		next->next = pl;
		pl = next;
	}

	kcopyd_put_pages(kc, pl);
	kc->nr_pages += nr;
	return 0;
}
|
||||
|
||||
/*
 * Free the whole page pool.  Every page must already have been
 * returned (nr_free_pages == nr_pages); anything else is a bug.
 */
static void client_free_pages(struct kcopyd_client *kc)
{
	BUG_ON(kc->nr_free_pages != kc->nr_pages);
	drop_pages(kc->pages);
	kc->pages = NULL;
	kc->nr_free_pages = kc->nr_pages = 0;
}
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* kcopyd_jobs need to be allocated by the *clients* of kcopyd,
|
||||
* for this reason we use a mempool to prevent the client from
|
||||
* ever having to do io (which could cause a deadlock).
|
||||
*---------------------------------------------------------------*/
|
||||
struct kcopyd_job {
|
||||
struct kcopyd_client *kc;
|
||||
struct list_head list;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* Error state of the job.
|
||||
*/
|
||||
int read_err;
|
||||
unsigned int write_err;
|
||||
|
||||
/*
|
||||
* Either READ or WRITE
|
||||
*/
|
||||
int rw;
|
||||
struct io_region source;
|
||||
|
||||
/*
|
||||
* The destinations for the transfer.
|
||||
*/
|
||||
unsigned int num_dests;
|
||||
struct io_region dests[KCOPYD_MAX_REGIONS];
|
||||
|
||||
sector_t offset;
|
||||
unsigned int nr_pages;
|
||||
struct page_list *pages;
|
||||
|
||||
/*
|
||||
* Set this to ensure you are notified when the job has
|
||||
* completed. 'context' is for callback to use.
|
||||
*/
|
||||
kcopyd_notify_fn fn;
|
||||
void *context;
|
||||
|
||||
/*
|
||||
* These fields are only used if the job has been split
|
||||
* into more manageable parts.
|
||||
*/
|
||||
struct semaphore lock;
|
||||
atomic_t sub_jobs;
|
||||
sector_t progress;
|
||||
};
|
||||
|
||||
/* FIXME: this should scale with the number of pages */
|
||||
#define MIN_JOBS 512
|
||||
|
||||
static struct kmem_cache *_job_cache;
|
||||
static mempool_t *_job_pool;
|
||||
|
||||
/*
|
||||
* We maintain three lists of jobs:
|
||||
*
|
||||
* i) jobs waiting for pages
|
||||
* ii) jobs that have pages, and are waiting for the io to be issued.
|
||||
* iii) jobs that have completed.
|
||||
*
|
||||
* All three of these are protected by job_lock.
|
||||
*/
|
||||
static DEFINE_SPINLOCK(_job_lock);
|
||||
|
||||
static LIST_HEAD(_complete_jobs);
|
||||
static LIST_HEAD(_io_jobs);
|
||||
static LIST_HEAD(_pages_jobs);
|
||||
|
||||
/*
 * Create the slab cache and mempool used for kcopyd_job allocation.
 * Returns 0 or -ENOMEM (the cache is destroyed again if the mempool
 * cannot be created).
 */
static int jobs_init(void)
{
	_job_cache = kmem_cache_create("kcopyd-jobs",
				       sizeof(struct kcopyd_job),
				       __alignof__(struct kcopyd_job),
				       0, NULL, NULL);
	if (!_job_cache)
		return -ENOMEM;

	_job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
	if (!_job_pool) {
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}

	return 0;
}
|
||||
|
||||
/*
 * Tear down the job mempool and cache.  All job lists must already be
 * empty; anything left would be a leak or use-after-free.
 */
static void jobs_exit(void)
{
	BUG_ON(!list_empty(&_complete_jobs));
	BUG_ON(!list_empty(&_io_jobs));
	BUG_ON(!list_empty(&_pages_jobs));

	mempool_destroy(_job_pool);
	kmem_cache_destroy(_job_cache);
	_job_pool = NULL;
	_job_cache = NULL;
}
|
||||
|
||||
/*
|
||||
* Functions to push and pop a job onto the head of a given job
|
||||
* list.
|
||||
*/
|
||||
static inline struct kcopyd_job *pop(struct list_head *jobs)
|
||||
{
|
||||
struct kcopyd_job *job = NULL;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&_job_lock, flags);
|
||||
|
||||
if (!list_empty(jobs)) {
|
||||
job = list_entry(jobs->next, struct kcopyd_job, list);
|
||||
list_del(&job->list);
|
||||
}
|
||||
spin_unlock_irqrestore(&_job_lock, flags);
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
static inline void push(struct list_head *jobs, struct kcopyd_job *job)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&_job_lock, flags);
|
||||
list_add_tail(&job->list, jobs);
|
||||
spin_unlock_irqrestore(&_job_lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* These three functions process 1 item from the corresponding
|
||||
* job list.
|
||||
*
|
||||
* They return:
|
||||
* < 0: error
|
||||
* 0: success
|
||||
* > 0: can't process yet.
|
||||
*/
|
||||
/*
 * Finish a completed job: return its pages to the pool, free the job
 * and invoke the client's notify callback.  Always returns 0.
 */
static int run_complete_job(struct kcopyd_job *job)
{
	/* copy out everything needed after the job is freed */
	void *context = job->context;
	int read_err = job->read_err;
	unsigned int write_err = job->write_err;
	kcopyd_notify_fn fn = job->fn;
	struct kcopyd_client *kc = job->kc;

	kcopyd_put_pages(kc, job->pages);
	mempool_free(job, _job_pool);
	fn(read_err, write_err, context);

	/* the last job out wakes anyone blocked in the client destructor */
	if (atomic_dec_and_test(&kc->nr_jobs))
		wake_up(&kc->destroyq);

	return 0;
}
|
||||
|
||||
/*
 * dm_io completion callback for a kcopyd job.
 *
 * On error, record it (write errors accumulate as a bitmask of failed
 * destinations, read errors are a flag) and, unless the client set
 * KCOPYD_IGNORE_ERROR, complete the job immediately.  Otherwise a
 * finished READ becomes the WRITE phase and is re-queued on the io
 * list; a finished WRITE completes the job.
 */
static void complete_io(unsigned long error, void *context)
{
	struct kcopyd_job *job = (struct kcopyd_job *) context;

	if (error) {
		if (job->rw == WRITE)
			job->write_err |= error;
		else
			job->read_err = 1;

		if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
			push(&_complete_jobs, job);
			wake();
			return;
		}
	}

	if (job->rw == WRITE)
		push(&_complete_jobs, job);

	else {
		/* read done: switch to the write phase */
		job->rw = WRITE;
		push(&_io_jobs, job);
	}

	wake();
}
|
||||
|
||||
/*
|
||||
* Request io on as many buffer heads as we can currently get for
|
||||
* a particular job.
|
||||
*/
|
||||
static int run_io_job(struct kcopyd_job *job)
|
||||
{
|
||||
int r;
|
||||
|
||||
if (job->rw == READ)
|
||||
r = dm_io_async(1, &job->source, job->rw,
|
||||
job->pages,
|
||||
job->offset, complete_io, job);
|
||||
|
||||
else
|
||||
r = dm_io_async(job->num_dests, job->dests, job->rw,
|
||||
job->pages,
|
||||
job->offset, complete_io, job);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
 * Allocate the pages a job needs for its io.
 *
 * Computes the page count from the first destination's size plus the
 * page offset, then moves the job onto the io list on success.
 * Returns 1 ("can't process yet") when the pool is temporarily empty,
 * < 0 on hard error, 0 on success.
 */
static int run_pages_job(struct kcopyd_job *job)
{
	int r;

	job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
				  PAGE_SIZE >> 9);
	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
	if (!r) {
		/* this job is ready for io */
		push(&_io_jobs, job);
		return 0;
	}

	if (r == -ENOMEM)
		/* can't complete now */
		return 1;

	return r;
}
|
||||
|
||||
/*
|
||||
* Run through a list for as long as possible. Returns the count
|
||||
* of successful jobs.
|
||||
*/
|
||||
/*
 * Apply fn() to jobs popped from 'jobs' until one fails (< 0), one
 * defers (> 0) or the list empties.  Returns the number of jobs
 * processed successfully.
 */
static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
{
	struct kcopyd_job *job;
	int r, count = 0;

	while ((job = pop(jobs))) {

		r = fn(job);

		if (r < 0) {
			/* error this rogue job */
			if (job->rw == WRITE)
				job->write_err = (unsigned int) -1;
			else
				job->read_err = 1;
			push(&_complete_jobs, job);
			break;
		}

		if (r > 0) {
			/*
			 * We couldn't service this job ATM, so
			 * push this job back onto the list.
			 */
			push(jobs, job);
			break;
		}

		count++;
	}

	return count;
}
|
||||
|
||||
/*
|
||||
* kcopyd does this every time it's woken up.
|
||||
*/
|
||||
/* Workqueue handler: drain the three job lists, one pass per wake(). */
static void do_work(struct work_struct *ignored)
{
	/*
	 * The order that these are called is *very* important.
	 * complete jobs can free some pages for pages jobs.
	 * Pages jobs when successful will jump onto the io jobs
	 * list. io jobs call wake when they complete and it all
	 * starts again.
	 */
	process_jobs(&_complete_jobs, run_complete_job);
	process_jobs(&_pages_jobs, run_pages_job);
	process_jobs(&_io_jobs, run_io_job);
}
|
||||
|
||||
/*
|
||||
* If we are copying a small region we just dispatch a single job
|
||||
* to do the copy, otherwise the io has to be split up into many
|
||||
* jobs.
|
||||
*/
|
||||
/*
 * Queue a job for processing: account it against the client, put it on
 * the pages list and wake the worker.
 */
static void dispatch_job(struct kcopyd_job *job)
{
	atomic_inc(&job->kc->nr_jobs);
	push(&_pages_jobs, job);
	wake();
}
|
||||
|
||||
#define SUB_JOB_SIZE 128
|
||||
/*
 * Completion callback for one sub-job of a split copy.
 *
 * Accumulates errors into the parent job, then under the parent's lock
 * claims the next SUB_JOB_SIZE chunk of work (unless an error occurred
 * and errors are not being ignored).  If there is more work, the
 * parent is cloned into a fresh sub-job covering that chunk; otherwise
 * the last sub-job to finish fires the parent's notify callback and
 * frees it.
 */
static void segment_complete(int read_err,
			     unsigned int write_err, void *context)
{
	/* FIXME: tidy this function */
	sector_t progress = 0;
	sector_t count = 0;
	struct kcopyd_job *job = (struct kcopyd_job *) context;

	down(&job->lock);

	/* update the error */
	if (read_err)
		job->read_err = 1;

	if (write_err)
		job->write_err |= write_err;

	/*
	 * Only dispatch more work if there hasn't been an error.
	 */
	if ((!job->read_err && !job->write_err) ||
	    test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
		/* get the next chunk of work */
		progress = job->progress;
		count = job->source.count - progress;
		if (count) {
			if (count > SUB_JOB_SIZE)
				count = SUB_JOB_SIZE;

			job->progress += count;
		}
	}
	up(&job->lock);

	if (count) {
		int i;
		struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);

		/* copy the parent, then narrow source/dests to this chunk */
		*sub_job = *job;
		sub_job->source.sector += progress;
		sub_job->source.count = count;

		for (i = 0; i < job->num_dests; i++) {
			sub_job->dests[i].sector += progress;
			sub_job->dests[i].count = count;
		}

		sub_job->fn = segment_complete;
		sub_job->context = job;
		dispatch_job(sub_job);

	} else if (atomic_dec_and_test(&job->sub_jobs)) {

		/*
		 * To avoid a race we must keep the job around
		 * until after the notify function has completed.
		 * Otherwise the client may try and stop the job
		 * after we've completed.
		 */
		job->fn(read_err, write_err, job->context);
		mempool_free(job, _job_pool);
	}
}
|
||||
|
||||
/*
|
||||
* Create some little jobs that will do the move between
|
||||
* them.
|
||||
*/
|
||||
#define SPLIT_COUNT 8
|
||||
static void split_job(struct kcopyd_job *job)
|
||||
{
|
||||
int i;
|
||||
|
||||
atomic_set(&job->sub_jobs, SPLIT_COUNT);
|
||||
for (i = 0; i < SPLIT_COUNT; i++)
|
||||
segment_complete(0, 0u, job);
|
||||
}
|
||||
|
||||
/*
 * Submit an asynchronous copy of @from to the @num_dests regions in
 * @dests.  @fn is invoked with @context once every destination has
 * been written (or errored).  Copies smaller than SUB_JOB_SIZE go out
 * as one job; larger ones are split into sub-jobs.
 *
 * NOTE(review): @dests is copied without checking @num_dests against
 * KCOPYD_MAX_REGIONS -- callers are assumed to respect that limit;
 * verify.
 */
int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
		unsigned int num_dests, struct io_region *dests,
		unsigned int flags, kcopyd_notify_fn fn, void *context)
{
	struct kcopyd_job *job;

	/* Allocate and set up a new job for the read phase. */
	job = mempool_alloc(_job_pool, GFP_NOIO);

	job->kc = kc;
	job->flags = flags;
	job->read_err = 0;
	job->write_err = 0;
	job->rw = READ;

	job->source = *from;

	job->num_dests = num_dests;
	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);

	job->offset = 0;
	job->nr_pages = 0;
	job->pages = NULL;

	job->fn = fn;
	job->context = context;

	if (job->source.count < SUB_JOB_SIZE)
		dispatch_job(job);
	else {
		init_MUTEX(&job->lock);
		job->progress = 0;
		split_job(job);
	}

	return 0;
}
|
||||
|
||||
/*
 * Cancels a kcopyd job, eg. someone might be deactivating a
 * mirror.  Unimplemented -- compiled out until finished.
 */
#if 0
int kcopyd_cancel(struct kcopyd_job *job, int block)
{
	/* FIXME: finish */
	return -1;
}
#endif /* 0 */
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Unit setup
|
||||
*---------------------------------------------------------------*/
|
||||
static DEFINE_MUTEX(_client_lock);
|
||||
static LIST_HEAD(_clients);
|
||||
|
||||
static void client_add(struct kcopyd_client *kc)
|
||||
{
|
||||
mutex_lock(&_client_lock);
|
||||
list_add(&kc->list, &_clients);
|
||||
mutex_unlock(&_client_lock);
|
||||
}
|
||||
|
||||
static void client_del(struct kcopyd_client *kc)
|
||||
{
|
||||
mutex_lock(&_client_lock);
|
||||
list_del(&kc->list);
|
||||
mutex_unlock(&_client_lock);
|
||||
}
|
||||
|
||||
static DEFINE_MUTEX(kcopyd_init_lock);
|
||||
static int kcopyd_clients = 0;
|
||||
|
||||
static int kcopyd_init(void)
|
||||
{
|
||||
int r;
|
||||
|
||||
mutex_lock(&kcopyd_init_lock);
|
||||
|
||||
if (kcopyd_clients) {
|
||||
/* Already initialized. */
|
||||
kcopyd_clients++;
|
||||
mutex_unlock(&kcopyd_init_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
r = jobs_init();
|
||||
if (r) {
|
||||
mutex_unlock(&kcopyd_init_lock);
|
||||
return r;
|
||||
}
|
||||
|
||||
_kcopyd_wq = create_singlethread_workqueue("kcopyd");
|
||||
if (!_kcopyd_wq) {
|
||||
jobs_exit();
|
||||
mutex_unlock(&kcopyd_init_lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
kcopyd_clients++;
|
||||
INIT_WORK(&_kcopyd_work, do_work);
|
||||
mutex_unlock(&kcopyd_init_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void kcopyd_exit(void)
|
||||
{
|
||||
mutex_lock(&kcopyd_init_lock);
|
||||
kcopyd_clients--;
|
||||
if (!kcopyd_clients) {
|
||||
jobs_exit();
|
||||
destroy_workqueue(_kcopyd_wq);
|
||||
_kcopyd_wq = NULL;
|
||||
}
|
||||
mutex_unlock(&kcopyd_init_lock);
|
||||
}
|
||||
|
||||
int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
|
||||
{
|
||||
int r = 0;
|
||||
struct kcopyd_client *kc;
|
||||
|
||||
r = kcopyd_init();
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
kc = kmalloc(sizeof(*kc), GFP_KERNEL);
|
||||
if (!kc) {
|
||||
kcopyd_exit();
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
spin_lock_init(&kc->lock);
|
||||
kc->pages = NULL;
|
||||
kc->nr_pages = kc->nr_free_pages = 0;
|
||||
r = client_alloc_pages(kc, nr_pages);
|
||||
if (r) {
|
||||
kfree(kc);
|
||||
kcopyd_exit();
|
||||
return r;
|
||||
}
|
||||
|
||||
r = dm_io_get(nr_pages);
|
||||
if (r) {
|
||||
client_free_pages(kc);
|
||||
kfree(kc);
|
||||
kcopyd_exit();
|
||||
return r;
|
||||
}
|
||||
|
||||
init_waitqueue_head(&kc->destroyq);
|
||||
atomic_set(&kc->nr_jobs, 0);
|
||||
|
||||
client_add(kc);
|
||||
*result = kc;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void kcopyd_client_destroy(struct kcopyd_client *kc)
|
||||
{
|
||||
/* Wait for completion of all jobs submitted by this client. */
|
||||
wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
|
||||
|
||||
dm_io_put(kc->nr_pages);
|
||||
client_free_pages(kc);
|
||||
client_del(kc);
|
||||
kfree(kc);
|
||||
kcopyd_exit();
|
||||
}
|
||||
|
||||
/* Public kcopyd entry points. */
EXPORT_SYMBOL(kcopyd_copy);
EXPORT_SYMBOL(kcopyd_client_create);
EXPORT_SYMBOL(kcopyd_client_destroy);
|
||||
42
drivers/md/kcopyd.h
Normal file
42
drivers/md/kcopyd.h
Normal file
@@ -0,0 +1,42 @@
|
||||
/*
 * Copyright (C) 2001 Sistina Software
 *
 * This file is released under the GPL.
 *
 * Kcopyd provides a simple interface for copying an area of one
 * block-device to one or more other block-devices, with an asynchronous
 * completion notification.
 */

#ifndef DM_KCOPYD_H
#define DM_KCOPYD_H

#include "dm-io.h"

/* Upper bound on destination regions per copy.
 * FIXME: make this configurable. */
#define KCOPYD_MAX_REGIONS 8

/* Job flag: keep copying remaining chunks even after an I/O error. */
#define KCOPYD_IGNORE_ERROR 1

/*
 * To use kcopyd you must first create a kcopyd client object.
 */
struct kcopyd_client;
int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
void kcopyd_client_destroy(struct kcopyd_client *kc);

/*
 * Submit a copy job to kcopyd.
 *
 * read_err is a boolean,
 * write_err is a bitset, with 1 bit for each destination region
 */
typedef void (*kcopyd_notify_fn)(int read_err,
				 unsigned int write_err, void *context);

int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
		unsigned int num_dests, struct io_region *dests,
		unsigned int flags, kcopyd_notify_fn fn, void *context);

#endif
|
||||
444
drivers/md/linear.c
Normal file
444
drivers/md/linear.c
Normal file
@@ -0,0 +1,444 @@
|
||||
/*
|
||||
linear.c : Multiple Devices driver for Linux
|
||||
Copyright (C) 1994-96 Marc ZYNGIER
|
||||
<zyngier@ufr-info-p7.ibp.fr> or
|
||||
<maz@gloups.fdn.fr>
|
||||
|
||||
Linear mode management functions.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2, or (at your option)
|
||||
any later version.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
(for example /usr/src/linux/COPYING); if not, write to the Free
|
||||
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <linux/raid/md.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/raid/linear.h>
|
||||
|
||||
#define MAJOR_NR MD_MAJOR
|
||||
#define MD_DRIVER
|
||||
#define MD_PERSONALITY
|
||||
|
||||
/*
|
||||
* find which device holds a particular offset
|
||||
*/
|
||||
static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
|
||||
{
|
||||
dev_info_t *hash;
|
||||
linear_conf_t *conf = mddev_to_conf(mddev);
|
||||
sector_t block = sector >> 1;
|
||||
|
||||
/*
|
||||
* sector_div(a,b) returns the remainer and sets a to a/b
|
||||
*/
|
||||
block >>= conf->preshift;
|
||||
(void)sector_div(block, conf->hash_spacing);
|
||||
hash = conf->hash_table[block];
|
||||
|
||||
while ((sector>>1) >= (hash->size + hash->offset))
|
||||
hash++;
|
||||
return hash;
|
||||
}
|
||||
|
||||
/**
|
||||
* linear_mergeable_bvec -- tell bio layer if two requests can be merged
|
||||
* @q: request queue
|
||||
* @bio: the buffer head that's been built up so far
|
||||
* @biovec: the request that could be merged to it.
|
||||
*
|
||||
* Return amount of bytes we can take at this offset
|
||||
*/
|
||||
static int linear_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
|
||||
{
|
||||
mddev_t *mddev = q->queuedata;
|
||||
dev_info_t *dev0;
|
||||
unsigned long maxsectors, bio_sectors = bio->bi_size >> 9;
|
||||
sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
|
||||
|
||||
dev0 = which_dev(mddev, sector);
|
||||
maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1));
|
||||
|
||||
if (maxsectors < bio_sectors)
|
||||
maxsectors = 0;
|
||||
else
|
||||
maxsectors -= bio_sectors;
|
||||
|
||||
if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
|
||||
return biovec->bv_len;
|
||||
/* The bytes available at this offset could be really big,
|
||||
* so we cap at 2^31 to avoid overflow */
|
||||
if (maxsectors > (1 << (31-9)))
|
||||
return 1<<31;
|
||||
return maxsectors << 9;
|
||||
}
|
||||
|
||||
/* Propagate an unplug to every member device's queue. */
static void linear_unplug(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;
	linear_conf_t *conf = mddev_to_conf(mddev);
	int d;

	for (d = 0; d < mddev->raid_disks; d++) {
		request_queue_t *member_q =
			bdev_get_queue(conf->disks[d].rdev->bdev);

		if (member_q->unplug_fn)
			member_q->unplug_fn(member_q);
	}
}
|
||||
|
||||
static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
|
||||
sector_t *error_sector)
|
||||
{
|
||||
mddev_t *mddev = q->queuedata;
|
||||
linear_conf_t *conf = mddev_to_conf(mddev);
|
||||
int i, ret = 0;
|
||||
|
||||
for (i=0; i < mddev->raid_disks && ret == 0; i++) {
|
||||
struct block_device *bdev = conf->disks[i].rdev->bdev;
|
||||
request_queue_t *r_queue = bdev_get_queue(bdev);
|
||||
|
||||
if (!r_queue->issue_flush_fn)
|
||||
ret = -EOPNOTSUPP;
|
||||
else
|
||||
ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int linear_congested(void *data, int bits)
|
||||
{
|
||||
mddev_t *mddev = data;
|
||||
linear_conf_t *conf = mddev_to_conf(mddev);
|
||||
int i, ret = 0;
|
||||
|
||||
for (i = 0; i < mddev->raid_disks && !ret ; i++) {
|
||||
request_queue_t *q = bdev_get_queue(conf->disks[i].rdev->bdev);
|
||||
ret |= bdi_congested(&q->backing_dev_info, bits);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Build a configuration for @raid_disks member devices: validate the
 * rdevs, accumulate the array size, and construct the hash table that
 * which_dev() uses to map a sector onto a member.  Returns NULL on
 * failure (the partially-built conf is freed).
 *
 * Two fixes versus the previous version:
 *  - the raid_disk bound check used "j > raid_disks"; conf->disks[]
 *    holds exactly raid_disks entries, so j == raid_disks wrote past
 *    the allocation.  Use ">=".
 *  - "mddev->private = conf" is no longer assigned here.  The callers
 *    install the conf themselves; doing it here made linear_add()'s
 *    "newconf->prev = mddev_to_conf(mddev)" point at newconf itself,
 *    creating a self-referential chain that linear_stop() would then
 *    free in an endless loop.
 */
static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
{
	linear_conf_t *conf;
	dev_info_t **table;
	mdk_rdev_t *rdev;
	int i, nb_zone, cnt;
	sector_t min_spacing;
	sector_t curr_offset;
	struct list_head *tmp;

	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
			GFP_KERNEL);
	if (!conf)
		return NULL;

	cnt = 0;
	conf->array_size = 0;

	ITERATE_RDEV(mddev,rdev,tmp) {
		int j = rdev->raid_disk;
		dev_info_t *disk = conf->disks + j;

		if (j < 0 || j >= raid_disks || disk->rdev) {
			printk("linear: disk numbering problem. Aborting!\n");
			goto out;
		}

		disk->rdev = rdev;

		blk_queue_stack_limits(mddev->queue,
				       rdev->bdev->bd_disk->queue);
		/* as we don't honour merge_bvec_fn, we must never risk
		 * violating it, so limit ->max_sector to one PAGE, as
		 * a one page request is never in violation.
		 */
		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

		disk->size = rdev->size;
		conf->array_size += rdev->size;

		cnt++;
	}
	if (cnt != raid_disks) {
		printk("linear: not enough drives present. Aborting!\n");
		goto out;
	}

	min_spacing = conf->array_size;
	sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));

	/* min_spacing is the minimum spacing that will fit the hash
	 * table in one PAGE. This may be much smaller than needed.
	 * We find the smallest non-terminal set of consecutive devices
	 * that is larger than min_spacing and use the size of that as
	 * the actual spacing.
	 */
	conf->hash_spacing = conf->array_size;
	for (i=0; i < cnt-1 ; i++) {
		sector_t sz = 0;
		int j;
		for (j = i; j < cnt - 1 && sz < min_spacing; j++)
			sz += conf->disks[j].size;
		if (sz >= min_spacing && sz < conf->hash_spacing)
			conf->hash_spacing = sz;
	}

	/* hash_spacing may be too large for sector_div to work with,
	 * so we might need to pre-shift
	 */
	conf->preshift = 0;
	if (sizeof(sector_t) > sizeof(u32)) {
		sector_t space = conf->hash_spacing;
		while (space > (sector_t)(~(u32)0)) {
			space >>= 1;
			conf->preshift++;
		}
	}
	/*
	 * This code was restructured to work around a gcc-2.95.3 internal
	 * compiler error. Alter it with care.
	 */
	{
		sector_t sz;
		unsigned round;
		unsigned long base;

		sz = conf->array_size >> conf->preshift;
		sz += 1; /* force round-up */
		base = conf->hash_spacing >> conf->preshift;
		round = sector_div(sz, base);
		nb_zone = sz + (round ? 1 : 0);
	}
	BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));

	conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
				    GFP_KERNEL);
	if (!conf->hash_table)
		goto out;

	/*
	 * Here we generate the linear hash table
	 * First calculate the device offsets.
	 */
	conf->disks[0].offset = 0;
	for (i=1; i<mddev->raid_disks; i++)
		conf->disks[i].offset =
			conf->disks[i-1].offset +
			conf->disks[i-1].size;

	table = conf->hash_table;
	i = 0;
	for (curr_offset = 0;
	     curr_offset < conf->array_size;
	     curr_offset += conf->hash_spacing) {

		while (i < mddev->raid_disks-1 &&
		       curr_offset >= conf->disks[i+1].offset)
			i++;

		*table ++ = conf->disks + i;
	}

	if (conf->preshift) {
		conf->hash_spacing >>= conf->preshift;
		/* round hash_spacing up so that when we divide by it,
		 * we err on the side of "too-low", which is safest.
		 */
		conf->hash_spacing++;
	}

	BUG_ON(table - conf->hash_table > nb_zone);

	return conf;

out:
	kfree(conf);
	return NULL;
}
|
||||
|
||||
/*
 * Personality ->run hook: build the configuration, publish the array
 * size, and wire the queue callbacks into the md request queue.
 */
static int linear_run (mddev_t *mddev)
{
	linear_conf_t *conf = linear_conf(mddev, mddev->raid_disks);

	if (!conf)
		return 1;

	mddev->private = conf;
	mddev->array_size = conf->array_size;

	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
	mddev->queue->unplug_fn = linear_unplug;
	mddev->queue->issue_flush_fn = linear_issue_flush;
	mddev->queue->backing_dev_info.congested_fn = linear_congested;
	mddev->queue->backing_dev_info.congested_data = mddev;
	return 0;
}
|
||||
|
||||
static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
{
	/* Adding a drive to a linear array allows the array to grow.
	 * It is permitted if the new drive has a matching superblock
	 * already on it, with raid_disk equal to raid_disks.
	 * It is achieved by creating a new linear_private_data structure
	 * and swapping it in in-place of the current one.
	 * The current one is never freed until the array is stopped.
	 * This avoids races.
	 *
	 * NOTE(review): linear_conf() also assigns mddev->private, which
	 * would make the prev link below point at newconf itself --
	 * verify against linear_conf().
	 */
	linear_conf_t *newconf;

	if (rdev->raid_disk != mddev->raid_disks)
		return -EINVAL;

	newconf = linear_conf(mddev,mddev->raid_disks+1);
	if (!newconf)
		return -ENOMEM;

	/* Chain the old conf behind the new one, then publish the
	 * grown geometry. */
	newconf->prev = mddev_to_conf(mddev);
	mddev->private = newconf;
	mddev->raid_disks++;
	mddev->array_size = newconf->array_size;
	set_capacity(mddev->gendisk, mddev->array_size << 1);
	return 0;
}
|
||||
|
||||
/*
 * Personality ->stop hook: free the whole chain of configurations
 * accumulated by linear_add().
 */
static int linear_stop (mddev_t *mddev)
{
	linear_conf_t *conf = mddev_to_conf(mddev);

	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/

	while (conf) {
		linear_conf_t *prev = conf->prev;

		kfree(conf->hash_table);
		kfree(conf);
		conf = prev;
	}

	return 0;
}
|
||||
|
||||
/*
 * Map a bio onto the member device holding its start sector, splitting
 * it first if it straddles a device boundary.  Returns 1 when the
 * (remapped) bio should be passed further down the stack, 0 when it
 * has been fully handled here.
 */
static int linear_make_request (request_queue_t *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	mddev_t *mddev = q->queuedata;
	dev_info_t *tmp_dev;
	sector_t block;

	/* Barriers are not supported by this personality. */
	if (unlikely(bio_barrier(bio))) {
		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
		return 0;
	}

	disk_stat_inc(mddev->gendisk, ios[rw]);
	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));

	tmp_dev = which_dev(mddev, bio->bi_sector);
	block = bio->bi_sector >> 1;

	/* Sanity check: the mapped device must actually cover block. */
	if (unlikely(block >= (tmp_dev->size + tmp_dev->offset)
		     || block < tmp_dev->offset)) {
		char b[BDEVNAME_SIZE];

		printk("linear_make_request: Block %llu out of bounds on "
		       "dev %s size %llu offset %llu\n",
		       (unsigned long long)block,
		       bdevname(tmp_dev->rdev->bdev, b),
		       (unsigned long long)tmp_dev->size,
		       (unsigned long long)tmp_dev->offset);
		bio_io_error(bio, bio->bi_size);
		return 0;
	}
	if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
		     (tmp_dev->offset + tmp_dev->size)<<1)) {
		/* This bio crosses a device boundary, so we have to
		 * split it.  Each half is remapped (recursively) and
		 * submitted on its own.
		 */
		struct bio_pair *bp;
		bp = bio_split(bio, bio_split_pool,
			       ((tmp_dev->offset + tmp_dev->size)<<1) - bio->bi_sector);
		if (linear_make_request(q, &bp->bio1))
			generic_make_request(&bp->bio1);
		if (linear_make_request(q, &bp->bio2))
			generic_make_request(&bp->bio2);
		bio_pair_release(bp);
		return 0;
	}

	/* Remap onto the underlying device; caller submits. */
	bio->bi_bdev = tmp_dev->rdev->bdev;
	bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1) + tmp_dev->rdev->data_offset;

	return 1;
}
|
||||
|
||||
/*
 * Emit the linear personality's portion of /proc/mdstat.  The verbose
 * per-zone map is only built when MD_DEBUG is defined (it is force-
 * undefined just below, so it is normally compiled out).
 *
 * Fix: the debug section used "while (...) ... else ...", which is
 * not valid C (an else must attach to an if) and broke the build
 * whenever MD_DEBUG was enabled.  The intent is a two-way choice, so
 * use if/else.
 */
static void linear_status (struct seq_file *seq, mddev_t *mddev)
{

#undef MD_DEBUG
#ifdef MD_DEBUG
	int j;
	linear_conf_t *conf = mddev_to_conf(mddev);
	sector_t s = 0;

	seq_printf(seq, "      ");
	for (j = 0; j < mddev->raid_disks; j++)
	{
		char b[BDEVNAME_SIZE];
		s += conf->smallest_size;
		seq_printf(seq, "[%s",
			   bdevname(conf->hash_table[j][0].rdev->bdev,b));

		if (s > conf->hash_table[j][0].offset +
		    conf->hash_table[j][0].size)
			seq_printf(seq, "/%s] ",
				   bdevname(conf->hash_table[j][1].rdev->bdev,b));
		else
			seq_printf(seq, "] ");
	}
	seq_printf(seq, "\n");
#endif
	seq_printf(seq, " %dk rounding", mddev->chunk_size/1024);
}
|
||||
|
||||
|
||||
static struct mdk_personality linear_personality =
|
||||
{
|
||||
.name = "linear",
|
||||
.level = LEVEL_LINEAR,
|
||||
.owner = THIS_MODULE,
|
||||
.make_request = linear_make_request,
|
||||
.run = linear_run,
|
||||
.stop = linear_stop,
|
||||
.status = linear_status,
|
||||
.hot_add_disk = linear_add,
|
||||
};
|
||||
|
||||
static int __init linear_init (void)
|
||||
{
|
||||
return register_md_personality (&linear_personality);
|
||||
}
|
||||
|
||||
static void linear_exit (void)
|
||||
{
|
||||
unregister_md_personality (&linear_personality);
|
||||
}
|
||||
|
||||
|
||||
module_init(linear_init);
|
||||
module_exit(linear_exit);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
|
||||
MODULE_ALIAS("md-linear");
|
||||
MODULE_ALIAS("md-level--1");
|
||||
5759
drivers/md/md.c
Normal file
5759
drivers/md/md.c
Normal file
File diff suppressed because it is too large
Load Diff
125
drivers/md/mktables.c
Normal file
125
drivers/md/mktables.c
Normal file
@@ -0,0 +1,125 @@
|
||||
#ident "$Id: mktables.c,v 1.1.1.1 2007/06/12 07:27:10 eyryu Exp $"
|
||||
/* ----------------------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* mktables.c
|
||||
*
|
||||
* Make RAID-6 tables. This is a host user space program to be run at
|
||||
* compile time.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
/*
 * Multiply two elements of GF(2^8) using the polynomial 0x11d
 * (x^8+x^4+x^3+x^2+1), the generator used by the RAID-6 code:
 * classic shift-and-add with conditional reduction.
 */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
	uint8_t product = 0;

	for (; b; b >>= 1) {
		if (b & 1)
			product ^= a;
		a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
	}
	return product;
}
|
||||
|
||||
/*
 * Raise @a to the power @b in GF(2^8) by square-and-multiply.  The
 * exponent is reduced mod 255 (the order of the multiplicative
 * group), so negative exponents yield inverses.
 */
static uint8_t gfpow(uint8_t a, int b)
{
	uint8_t result = 1;

	b %= 255;
	if (b < 0)
		b += 255;

	for (; b; b >>= 1) {
		if (b & 1)
			result = gfmul(result, a);
		a = gfmul(a, a);
	}
	return result;
}
|
||||
|
||||
/*
 * Generate the GF(2^8) lookup tables used by the RAID-6 code and
 * write them to stdout as C source (run at build time on the host).
 */
int main(int argc, char *argv[])
{
	int i, j, k;
	uint8_t v;
	uint8_t exptbl[256], invtbl[256];

	printf("#include \"raid6.h\"\n");

	/* Compute multiplication table */
	printf("\nconst u8 __attribute__((aligned(256)))\n"
	       "raid6_gfmul[256][256] =\n"
	       "{\n");
	for (i = 0; i < 256; i++) {
		printf("\t{\n");
		for (j = 0; j < 256; j += 8) {
			printf("\t\t");
			for (k = 0; k < 8; k++)
				printf("0x%02x, ", gfmul(i, j + k));
			printf("\n");
		}
		printf("\t},\n");
	}
	printf("};\n");

	/* Compute power-of-2 table (exponent) */
	v = 1;
	printf("\nconst u8 __attribute__((aligned(256)))\n"
	       "raid6_gfexp[256] =\n"
	       "{\n");
	for (i = 0; i < 256; i += 8) {
		printf("\t");
		for (j = 0; j < 8; j++) {
			exptbl[i + j] = v;
			printf("0x%02x, ", v);
			v = gfmul(v, 2);
			if (v == 1)
				v = 0;	/* For entry 255, not a real entry */
		}
		printf("\n");
	}
	printf("};\n");

	/* Compute inverse table x^-1 == x^254 */
	printf("\nconst u8 __attribute__((aligned(256)))\n"
	       "raid6_gfinv[256] =\n"
	       "{\n");
	for (i = 0; i < 256; i += 8) {
		printf("\t");
		for (j = 0; j < 8; j++) {
			invtbl[i + j] = v = gfpow(i + j, 254);
			printf("0x%02x, ", v);
		}
		printf("\n");
	}
	printf("};\n");

	/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
	printf("\nconst u8 __attribute__((aligned(256)))\n"
	       "raid6_gfexi[256] =\n"
	       "{\n");
	for (i = 0; i < 256; i += 8) {
		printf("\t");
		for (j = 0; j < 8; j++)
			printf("0x%02x, ", invtbl[exptbl[i + j] ^ 1]);
		printf("\n");
	}
	printf("};\n\n");

	return 0;
}
|
||||
595
drivers/md/multipath.c
Normal file
595
drivers/md/multipath.c
Normal file
@@ -0,0 +1,595 @@
|
||||
/*
|
||||
* multipath.c : Multiple Devices driver for Linux
|
||||
*
|
||||
* Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
|
||||
*
|
||||
* Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
|
||||
*
|
||||
* MULTIPATH management functions.
|
||||
*
|
||||
* derived from raid1.c.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* (for example /usr/src/linux/COPYING); if not, write to the Free
|
||||
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/raid/multipath.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <asm/atomic.h>
|
||||
|
||||
#define MAJOR_NR MD_MAJOR
|
||||
#define MD_DRIVER
|
||||
#define MD_PERSONALITY
|
||||
|
||||
#define MAX_WORK_PER_DISK 128
|
||||
|
||||
#define NR_RESERVED_BUFS 32
|
||||
|
||||
|
||||
/*
 * Pick a path for an I/O.  Takes a nr_pending reference on the chosen
 * rdev.  Returns the path index, or -1 when no operational path
 * remains.
 */
static int multipath_map (multipath_conf_t *conf)
{
	int path, disks = conf->raid_disks;

	/*
	 * Later we do read balancing on the read side
	 * now we use the first available disk.
	 */
	rcu_read_lock();
	for (path = 0; path < disks; path++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[path].rdev);

		if (rdev && test_bit(In_sync, &rdev->flags)) {
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			return path;
		}
	}
	rcu_read_unlock();

	printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
	return (-1);
}
|
||||
|
||||
static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
|
||||
{
|
||||
unsigned long flags;
|
||||
mddev_t *mddev = mp_bh->mddev;
|
||||
multipath_conf_t *conf = mddev_to_conf(mddev);
|
||||
|
||||
spin_lock_irqsave(&conf->device_lock, flags);
|
||||
list_add(&mp_bh->retry_list, &conf->retry_list);
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* multipath_end_bh_io() is called when we have finished servicing a multipathed
|
||||
* operation and are ready to return a success/failure code to the buffer
|
||||
* cache layer.
|
||||
*/
|
||||
static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
|
||||
{
|
||||
struct bio *bio = mp_bh->master_bio;
|
||||
multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
|
||||
|
||||
bio_endio(bio, bio->bi_size, err);
|
||||
mempool_free(mp_bh, conf->pool);
|
||||
}
|
||||
|
||||
static int multipath_end_request(struct bio *bio, unsigned int bytes_done,
|
||||
int error)
|
||||
{
|
||||
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
|
||||
struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
|
||||
multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
|
||||
mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
|
||||
|
||||
if (bio->bi_size)
|
||||
return 1;
|
||||
|
||||
if (uptodate)
|
||||
multipath_end_bh_io(mp_bh, 0);
|
||||
else if (!bio_rw_ahead(bio)) {
|
||||
/*
|
||||
* oops, IO error:
|
||||
*/
|
||||
char b[BDEVNAME_SIZE];
|
||||
md_error (mp_bh->mddev, rdev);
|
||||
printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
|
||||
bdevname(rdev->bdev,b),
|
||||
(unsigned long long)bio->bi_sector);
|
||||
multipath_reschedule_retry(mp_bh);
|
||||
} else
|
||||
multipath_end_bh_io(mp_bh, error);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Run the unplug function of every active member queue.  A temporary
 * nr_pending reference keeps each rdev alive while the RCU lock is
 * dropped around the callback.
 */
static void unplug_slaves(mddev_t *mddev)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);
	int i;

	rcu_read_lock();
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags)
		    && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();

			if (r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);

			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
}
|
||||
|
||||
/* Queue unplug callback: fan out to every active path. */
static void multipath_unplug(request_queue_t *q)
{
	unplug_slaves(q->queuedata);
}
|
||||
|
||||
|
||||
/*
 * Clone the incoming bio onto the first operational path and submit
 * it.  Fails the bio with -EIO when no path is available, and with
 * -EOPNOTSUPP for barriers (unsupported).
 */
static int multipath_make_request (request_queue_t *q, struct bio * bio)
{
	mddev_t *mddev = q->queuedata;
	multipath_conf_t *conf = mddev_to_conf(mddev);
	struct multipath_bh * mp_bh;
	struct multipath_info *multipath;
	const int rw = bio_data_dir(bio);

	if (unlikely(bio_barrier(bio))) {
		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
		return 0;
	}

	mp_bh = mempool_alloc(conf->pool, GFP_NOIO);

	mp_bh->master_bio = bio;
	mp_bh->mddev = mddev;

	disk_stat_inc(mddev->gendisk, ios[rw]);
	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));

	mp_bh->path = multipath_map(conf);
	if (mp_bh->path < 0) {
		/* No operational path left. */
		bio_endio(bio, bio->bi_size, -EIO);
		mempool_free(mp_bh, conf->pool);
		return 0;
	}
	multipath = conf->multipaths + mp_bh->path;

	/* Redirect a private copy of the bio at the chosen path. */
	mp_bh->bio = *bio;
	mp_bh->bio.bi_sector += multipath->rdev->data_offset;
	mp_bh->bio.bi_bdev = multipath->rdev->bdev;
	mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST);
	mp_bh->bio.bi_end_io = multipath_end_request;
	mp_bh->bio.bi_private = mp_bh;
	generic_make_request(&mp_bh->bio);
	return 0;
}
|
||||
|
||||
/* Report path states for /proc/mdstat: "U" = in sync, "_" = down. */
static void multipath_status (struct seq_file *seq, mddev_t *mddev)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);
	int i;

	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
		    conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		seq_printf (seq, "%s",
			    conf->multipaths[i].rdev &&
			    test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_");
	seq_printf (seq, "]");
}
|
||||
|
||||
static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
|
||||
sector_t *error_sector)
|
||||
{
|
||||
mddev_t *mddev = q->queuedata;
|
||||
multipath_conf_t *conf = mddev_to_conf(mddev);
|
||||
int i, ret = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
for (i=0; i<mddev->raid_disks && ret == 0; i++) {
|
||||
mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
|
||||
if (rdev && !test_bit(Faulty, &rdev->flags)) {
|
||||
struct block_device *bdev = rdev->bdev;
|
||||
request_queue_t *r_queue = bdev_get_queue(bdev);
|
||||
|
||||
if (!r_queue->issue_flush_fn)
|
||||
ret = -EOPNOTSUPP;
|
||||
else {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
|
||||
error_sector);
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
rcu_read_lock();
|
||||
}
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
static int multipath_congested(void *data, int bits)
|
||||
{
|
||||
mddev_t *mddev = data;
|
||||
multipath_conf_t *conf = mddev_to_conf(mddev);
|
||||
int i, ret = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < mddev->raid_disks ; i++) {
|
||||
mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
|
||||
if (rdev && !test_bit(Faulty, &rdev->flags)) {
|
||||
request_queue_t *q = bdev_get_queue(rdev->bdev);
|
||||
|
||||
ret |= bdi_congested(&q->backing_dev_info, bits);
|
||||
/* Just like multipath_map, we just check the
|
||||
* first available device
|
||||
*/
|
||||
break;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Careful, this can execute in IRQ contexts as well!
|
||||
*/
|
||||
static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
|
||||
{
|
||||
multipath_conf_t *conf = mddev_to_conf(mddev);
|
||||
|
||||
if (conf->working_disks <= 1) {
|
||||
/*
|
||||
* Uh oh, we can do nothing if this is our last path, but
|
||||
* first check if this is a queued request for a device
|
||||
* which has just failed.
|
||||
*/
|
||||
printk(KERN_ALERT
|
||||
"multipath: only one IO path left and IO error.\n");
|
||||
/* leave it active... it's all we have */
|
||||
} else {
|
||||
/*
|
||||
* Mark disk as unusable
|
||||
*/
|
||||
if (!test_bit(Faulty, &rdev->flags)) {
|
||||
char b[BDEVNAME_SIZE];
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
||||
conf->working_disks--;
|
||||
mddev->degraded++;
|
||||
printk(KERN_ALERT "multipath: IO failure on %s,"
|
||||
" disabling IO path. \n Operation continuing"
|
||||
" on %d IO paths.\n",
|
||||
bdevname (rdev->bdev,b),
|
||||
conf->working_disks);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Debug dump of the multipath configuration: working/total counts and
 * the operational state ("o:") of each attached path.
 */
static void print_multipath_conf (multipath_conf_t *conf)
{
	int i;

	printk("MULTIPATH conf printout:\n");
	if (!conf) {
		printk("(conf==NULL)\n");
		return;
	}
	printk(" --- wd:%d rd:%d\n", conf->working_disks,
	       conf->raid_disks);

	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		struct multipath_info *tmp = conf->multipaths + i;

		if (tmp->rdev)
			printk(" disk%d, o:%d, dev:%s\n",
			       i, !test_bit(Faulty, &tmp->rdev->flags),
			       bdevname(tmp->rdev->bdev,b));
	}
}
|
||||
|
||||
|
||||
/*
 * Hot-add a new path to the array: install @rdev into the first empty
 * slot, stack the queue limits, and mark it In_sync.
 *
 * Returns 1 if a slot was found, 0 otherwise.
 *
 * Fix: the original loop had no break, so a single rdev was linked into
 * EVERY empty slot, incrementing working_disks and decrementing
 * degraded once per slot and leaving several slots aliasing the same
 * device.  Stop after the first empty slot (as later mainline does).
 */
static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	multipath_conf_t *conf = mddev->private;
	struct request_queue *q;
	int found = 0;
	int path;
	struct multipath_info *p;

	print_multipath_conf(conf);

	for (path = 0; path < mddev->raid_disks; path++)
		if ((p = conf->multipaths + path)->rdev == NULL) {
			q = rdev->bdev->bd_disk->queue;
			blk_queue_stack_limits(mddev->queue, q);

			/* as we don't honour merge_bvec_fn, we must never risk
			 * violating it, so limit ->max_sector to one PAGE, as
			 * a one page request is never in violation.
			 * (Note: it is very unlikely that a device with
			 * merge_bvec_fn will be involved in multipath.)
			 */
			if (q->merge_bvec_fn &&
			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

			conf->working_disks++;
			mddev->degraded--;
			rdev->raid_disk = path;
			set_bit(In_sync, &rdev->flags);
			rcu_assign_pointer(p->rdev, rdev);
			found = 1;
			break;	/* one device, one slot */
		}

	print_multipath_conf(conf);
	return found;
}
|
||||
|
||||
/*
 * Hot-remove path @number.  Refuses if the path is still In_sync or has
 * I/O pending; after clearing the slot it waits an RCU grace period and
 * re-checks nr_pending to close the race with readers that grabbed the
 * pointer just before it was cleared.
 */
static int multipath_remove_disk(mddev_t *mddev, int number)
{
	multipath_conf_t *conf = mddev->private;
	int err = 0;
	mdk_rdev_t *rdev;
	struct multipath_info *p = conf->multipaths + number;

	print_multipath_conf(conf);

	rdev = p->rdev;
	if (rdev) {
		if (test_bit(In_sync, &rdev->flags) ||
		    atomic_read(&rdev->nr_pending)) {
			printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number);
			err = -EBUSY;
			goto abort;
		}
		p->rdev = NULL;
		synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
			err = -EBUSY;
			p->rdev = rdev;
		}
	}
abort:

	print_multipath_conf(conf);
	return err;
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* This is a kernel thread which:
|
||||
*
|
||||
* 1. Retries failed read operations on working multipaths.
|
||||
* 2. Updates the raid superblock when problems encounter.
|
||||
* 3. Performs writes following reads for array syncronising.
|
||||
*/
|
||||
|
||||
static void multipathd (mddev_t *mddev)
|
||||
{
|
||||
struct multipath_bh *mp_bh;
|
||||
struct bio *bio;
|
||||
unsigned long flags;
|
||||
multipath_conf_t *conf = mddev_to_conf(mddev);
|
||||
struct list_head *head = &conf->retry_list;
|
||||
|
||||
md_check_recovery(mddev);
|
||||
for (;;) {
|
||||
char b[BDEVNAME_SIZE];
|
||||
spin_lock_irqsave(&conf->device_lock, flags);
|
||||
if (list_empty(head))
|
||||
break;
|
||||
mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
|
||||
list_del(head->prev);
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
|
||||
bio = &mp_bh->bio;
|
||||
bio->bi_sector = mp_bh->master_bio->bi_sector;
|
||||
|
||||
if ((mp_bh->path = multipath_map (conf))<0) {
|
||||
printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
|
||||
" error for block %llu\n",
|
||||
bdevname(bio->bi_bdev,b),
|
||||
(unsigned long long)bio->bi_sector);
|
||||
multipath_end_bh_io(mp_bh, -EIO);
|
||||
} else {
|
||||
printk(KERN_ERR "multipath: %s: redirecting sector %llu"
|
||||
" to another IO path\n",
|
||||
bdevname(bio->bi_bdev,b),
|
||||
(unsigned long long)bio->bi_sector);
|
||||
*bio = *(mp_bh->master_bio);
|
||||
bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset;
|
||||
bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
|
||||
bio->bi_rw |= (1 << BIO_RW_FAILFAST);
|
||||
bio->bi_end_io = multipath_end_request;
|
||||
bio->bi_private = mp_bh;
|
||||
generic_make_request(bio);
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
}
|
||||
|
||||
/*
 * Personality start-up: allocate the private conf, adopt the verified
 * member devices, create the retry mempool and the multipathd thread,
 * and wire the queue callbacks.  Everything allocated here is freed in
 * multipath_stop().  Returns 0 on success, -EIO on any failure.
 */
static int multipath_run (mddev_t *mddev)
{
	multipath_conf_t *conf;
	int disk_idx;
	struct multipath_info *disk;
	mdk_rdev_t *rdev;
	struct list_head *tmp;

	if (mddev->level != LEVEL_MULTIPATH) {
		printk("multipath: %s: raid level not set to multipath IO (%d)\n",
		       mdname(mddev), mddev->level);
		goto out;
	}
	/*
	 * copy the already verified devices into our private MULTIPATH
	 * bookkeeping area. [whatever we allocate in multipath_run(),
	 * should be freed in multipath_stop()]
	 */

	conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
	mddev->private = conf;
	if (!conf) {
		printk(KERN_ERR
			"multipath: couldn't allocate memory for %s\n",
			mdname(mddev));
		goto out;
	}

	conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
				   GFP_KERNEL);
	if (!conf->multipaths) {
		printk(KERN_ERR
			"multipath: couldn't allocate memory for %s\n",
			mdname(mddev));
		goto out_free_conf;
	}

	conf->working_disks = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		disk_idx = rdev->raid_disk;
		if (disk_idx < 0 ||
		    disk_idx >= mddev->raid_disks)
			continue;

		disk = conf->multipaths + disk_idx;
		disk->rdev = rdev;

		blk_queue_stack_limits(mddev->queue,
				       rdev->bdev->bd_disk->queue);
		/* as we don't honour merge_bvec_fn, we must never risk
		 * violating it, not that we ever expect a device with
		 * a merge_bvec_fn to be involved in multipath */
		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

		if (!test_bit(Faulty, &rdev->flags))
			conf->working_disks++;
	}

	conf->raid_disks = mddev->raid_disks;
	conf->mddev = mddev;
	spin_lock_init(&conf->device_lock);
	INIT_LIST_HEAD(&conf->retry_list);

	if (!conf->working_disks) {
		printk(KERN_ERR "multipath: no operational IO paths for %s\n",
			mdname(mddev));
		goto out_free_conf;
	}
	mddev->degraded = conf->raid_disks - conf->working_disks;

	conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS,
						 sizeof(struct multipath_bh));
	if (conf->pool == NULL) {
		printk(KERN_ERR
			"multipath: couldn't allocate memory for %s\n",
			mdname(mddev));
		goto out_free_conf;
	}

	mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath");
	if (!mddev->thread) {
		printk(KERN_ERR "multipath: couldn't allocate thread"
			" for %s\n", mdname(mddev));
		goto out_free_conf;
	}

	printk(KERN_INFO
		"multipath: array %s active with %d out of %d IO paths\n",
		mdname(mddev), conf->working_disks, mddev->raid_disks);
	/*
	 * Ok, everything is just fine now
	 */
	mddev->array_size = mddev->size;

	mddev->queue->unplug_fn = multipath_unplug;
	mddev->queue->issue_flush_fn = multipath_issue_flush;
	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
	mddev->queue->backing_dev_info.congested_data = mddev;

	return 0;

out_free_conf:
	if (conf->pool)
		mempool_destroy(conf->pool);
	kfree(conf->multipaths);
	kfree(conf);
	mddev->private = NULL;
out:
	return -EIO;
}
|
||||
|
||||
|
||||
/*
 * Tear down everything multipath_run() built: the worker thread, the
 * mempool, the path table and the conf itself.
 */
static int multipath_stop (mddev_t *mddev)
{
	multipath_conf_t *conf = mddev_to_conf(mddev);

	md_unregister_thread(mddev->thread);
	mddev->thread = NULL;
	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
	mempool_destroy(conf->pool);
	kfree(conf->multipaths);
	kfree(conf);
	mddev->private = NULL;
	return 0;
}
|
||||
|
||||
/* md personality descriptor registered with the md core. */
static struct mdk_personality multipath_personality =
{
	.name		= "multipath",
	.level		= LEVEL_MULTIPATH,
	.owner		= THIS_MODULE,
	.make_request	= multipath_make_request,
	.run		= multipath_run,
	.stop		= multipath_stop,
	.status		= multipath_status,
	.error_handler	= multipath_error,
	.hot_add_disk	= multipath_add_disk,
	.hot_remove_disk= multipath_remove_disk,
};
|
||||
|
||||
/* Module init: register the multipath personality with the md core. */
static int __init multipath_init (void)
{
	return register_md_personality (&multipath_personality);
}
|
||||
|
||||
/* Module exit: unregister the multipath personality. */
static void __exit multipath_exit (void)
{
	unregister_md_personality (&multipath_personality);
}
|
||||
|
||||
module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
MODULE_ALIAS("md-multipath");
/* LEVEL_MULTIPATH is -4, hence the double dash in the alias. */
MODULE_ALIAS("md-level--4");
|
||||
551
drivers/md/raid0.c
Normal file
551
drivers/md/raid0.c
Normal file
@@ -0,0 +1,551 @@
|
||||
/*
|
||||
raid0.c : Multiple Devices driver for Linux
|
||||
Copyright (C) 1994-96 Marc ZYNGIER
|
||||
<zyngier@ufr-info-p7.ibp.fr> or
|
||||
<maz@gloups.fdn.fr>
|
||||
Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
|
||||
|
||||
|
||||
RAID-0 management functions.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2, or (at your option)
|
||||
any later version.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
(for example /usr/src/linux/COPYING); if not, write to the Free
|
||||
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/raid/raid0.h>
|
||||
|
||||
#define MAJOR_NR MD_MAJOR
|
||||
#define MD_DRIVER
|
||||
#define MD_PERSONALITY
|
||||
|
||||
/*
 * Queue unplug callback: forward the unplug to every member device.
 * Zone 0's device list covers all raid_disks devices, so it is used as
 * the master list.
 */
static void raid0_unplug(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;
	raid0_conf_t *conf = mddev_to_conf(mddev);
	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
	int i;

	for (i = 0; i < mddev->raid_disks; i++) {
		request_queue_t *member_q = bdev_get_queue(devlist[i]->bdev);

		if (member_q->unplug_fn)
			member_q->unplug_fn(member_q);
	}
}
|
||||
|
||||
static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
|
||||
sector_t *error_sector)
|
||||
{
|
||||
mddev_t *mddev = q->queuedata;
|
||||
raid0_conf_t *conf = mddev_to_conf(mddev);
|
||||
mdk_rdev_t **devlist = conf->strip_zone[0].dev;
|
||||
int i, ret = 0;
|
||||
|
||||
for (i=0; i<mddev->raid_disks && ret == 0; i++) {
|
||||
struct block_device *bdev = devlist[i]->bdev;
|
||||
request_queue_t *r_queue = bdev_get_queue(bdev);
|
||||
|
||||
if (!r_queue->issue_flush_fn)
|
||||
ret = -EOPNOTSUPP;
|
||||
else
|
||||
ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int raid0_congested(void *data, int bits)
|
||||
{
|
||||
mddev_t *mddev = data;
|
||||
raid0_conf_t *conf = mddev_to_conf(mddev);
|
||||
mdk_rdev_t **devlist = conf->strip_zone[0].dev;
|
||||
int i, ret = 0;
|
||||
|
||||
for (i = 0; i < mddev->raid_disks && !ret ; i++) {
|
||||
request_queue_t *q = bdev_get_queue(devlist[i]->bdev);
|
||||
|
||||
ret |= bdi_congested(&q->backing_dev_info, bits);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Build the strip-zone geometry for a raid0 array.
 *
 * Devices of equal size form one "zone"; smaller devices drop out of
 * later zones, so each zone stripes over the devices that still have
 * capacity at that offset.  Also computes hash_spacing for the lookup
 * table built by raid0_run() and installs the queue callbacks.
 *
 * Returns 0 on success, 1 on any failure (allocation or bad geometry).
 * Offsets/sizes here are in 1K blocks (rdev->size units), not sectors.
 */
static int create_strip_zones (mddev_t *mddev)
{
	int i, c, j;
	sector_t current_offset, curr_zone_offset;
	sector_t min_spacing;
	raid0_conf_t *conf = mddev_to_conf(mddev);
	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
	struct list_head *tmp1, *tmp2;
	struct strip_zone *zone;
	int cnt;
	char b[BDEVNAME_SIZE];

	/*
	 * The number of 'same size groups'
	 */
	conf->nr_strip_zones = 0;

	ITERATE_RDEV(mddev,rdev1,tmp1) {
		printk("raid0: looking at %s\n",
			bdevname(rdev1->bdev,b));
		c = 0;
		/* count rdev1 as a new zone only if no earlier rdev has
		 * the same size */
		ITERATE_RDEV(mddev,rdev2,tmp2) {
			printk("raid0:   comparing %s(%llu)",
			       bdevname(rdev1->bdev,b),
			       (unsigned long long)rdev1->size);
			printk(" with %s(%llu)\n",
			       bdevname(rdev2->bdev,b),
			       (unsigned long long)rdev2->size);
			if (rdev2 == rdev1) {
				printk("raid0:   END\n");
				break;
			}
			if (rdev2->size == rdev1->size)
			{
				/*
				 * Not unique, don't count it as a new
				 * group
				 */
				printk("raid0:   EQUAL\n");
				c = 1;
				break;
			}
			printk("raid0:   NOT EQUAL\n");
		}
		if (!c) {
			printk("raid0:   ==> UNIQUE\n");
			conf->nr_strip_zones++;
			printk("raid0: %d zones\n", conf->nr_strip_zones);
		}
	}
	printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);

	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
				conf->nr_strip_zones, GFP_KERNEL);
	if (!conf->strip_zone)
		return 1;
	conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
				conf->nr_strip_zones*mddev->raid_disks,
				GFP_KERNEL);
	if (!conf->devlist)
		return 1;

	/* The first zone must contain all devices, so here we check that
	 * there is a proper alignment of slots to devices and find them all
	 */
	zone = &conf->strip_zone[0];
	cnt = 0;
	smallest = NULL;
	zone->dev = conf->devlist;
	ITERATE_RDEV(mddev, rdev1, tmp1) {
		int j = rdev1->raid_disk;

		if (j < 0 || j >= mddev->raid_disks) {
			printk("raid0: bad disk number %d - aborting!\n", j);
			goto abort;
		}
		if (zone->dev[j]) {
			printk("raid0: multiple devices for %d - aborting!\n",
				j);
			goto abort;
		}
		zone->dev[j] = rdev1;

		blk_queue_stack_limits(mddev->queue,
				       rdev1->bdev->bd_disk->queue);
		/* as we don't honour merge_bvec_fn, we must never risk
		 * violating it, so limit ->max_sector to one PAGE, as
		 * a one page request is never in violation.
		 */

		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

		if (!smallest || (rdev1->size <smallest->size))
			smallest = rdev1;
		cnt++;
	}
	if (cnt != mddev->raid_disks) {
		printk("raid0: too few disks (%d of %d) - aborting!\n",
			cnt, mddev->raid_disks);
		goto abort;
	}
	zone->nb_dev = cnt;
	zone->size = smallest->size * cnt;
	zone->zone_offset = 0;

	current_offset = smallest->size;
	curr_zone_offset = zone->size;

	/* now do the other zones */
	for (i = 1; i < conf->nr_strip_zones; i++)
	{
		zone = conf->strip_zone + i;
		zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;

		printk("raid0: zone %d\n", i);
		zone->dev_offset = current_offset;
		smallest = NULL;
		c = 0;

		/* a device belongs to this zone iff it still has
		 * capacity beyond current_offset */
		for (j=0; j<cnt; j++) {
			char b[BDEVNAME_SIZE];
			rdev = conf->strip_zone[0].dev[j];
			printk("raid0: checking %s ...", bdevname(rdev->bdev,b));
			if (rdev->size > current_offset)
			{
				printk(" contained as device %d\n", c);
				zone->dev[c] = rdev;
				c++;
				if (!smallest || (rdev->size <smallest->size)) {
					smallest = rdev;
					printk("  (%llu) is smallest!.\n",
						(unsigned long long)rdev->size);
				}
			} else
				printk(" nope.\n");
		}

		zone->nb_dev = c;
		zone->size = (smallest->size - current_offset) * c;
		printk("raid0: zone->nb_dev: %d, size: %llu\n",
			zone->nb_dev, (unsigned long long)zone->size);

		zone->zone_offset = curr_zone_offset;
		curr_zone_offset += zone->size;

		current_offset = smallest->size;
		printk("raid0: current zone offset: %llu\n",
			(unsigned long long)current_offset);
	}

	/* Now find appropriate hash spacing.
	 * We want a number which causes most hash entries to cover
	 * at most two strips, but the hash table must be at most
	 * 1 PAGE.  We choose the smallest strip, or contiguous collection
	 * of strips, that has big enough size.  We never consider the last
	 * strip though as it's size has no bearing on the efficacy of the hash
	 * table.
	 */
	conf->hash_spacing = curr_zone_offset;
	min_spacing = curr_zone_offset;
	sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
	for (i=0; i < conf->nr_strip_zones-1; i++) {
		sector_t sz = 0;
		for (j=i; j<conf->nr_strip_zones-1 &&
			     sz < min_spacing ; j++)
			sz += conf->strip_zone[j].size;
		if (sz >= min_spacing && sz < conf->hash_spacing)
			conf->hash_spacing = sz;
	}

	mddev->queue->unplug_fn = raid0_unplug;

	mddev->queue->issue_flush_fn = raid0_issue_flush;
	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
	mddev->queue->backing_dev_info.congested_data = mddev;

	printk("raid0: done.\n");
	return 0;
abort:
	return 1;
}
|
||||
|
||||
/**
|
||||
* raid0_mergeable_bvec -- tell bio layer if a two requests can be merged
|
||||
* @q: request queue
|
||||
* @bio: the buffer head that's been built up so far
|
||||
* @biovec: the request that could be merged to it.
|
||||
*
|
||||
* Return amount of bytes we can accept at this offset
|
||||
*/
|
||||
static int raid0_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
|
||||
{
|
||||
mddev_t *mddev = q->queuedata;
|
||||
sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
|
||||
int max;
|
||||
unsigned int chunk_sectors = mddev->chunk_size >> 9;
|
||||
unsigned int bio_sectors = bio->bi_size >> 9;
|
||||
|
||||
max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
|
||||
if (max < 0) max = 0; /* bio_add cannot handle a negative return */
|
||||
if (max <= biovec->bv_len && bio_sectors == 0)
|
||||
return biovec->bv_len;
|
||||
else
|
||||
return max;
|
||||
}
|
||||
|
||||
/*
 * Personality start-up for raid0: set queue limits from the chunk size,
 * build the strip zones, compute the array size, and create the
 * hash_table that maps a (block / hash_spacing) index to the first zone
 * that may contain that block.  Returns 0 on success, negative errno on
 * failure.
 */
static int raid0_run (mddev_t *mddev)
{
	unsigned cur=0, i=0, nb_zone;
	s64 size;
	raid0_conf_t *conf;
	mdk_rdev_t *rdev;
	struct list_head *tmp;

	if (mddev->chunk_size == 0) {
		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
		return -EINVAL;
	}
	printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
	       mdname(mddev),
	       mddev->chunk_size >> 9,
	       (mddev->chunk_size>>1)-1);
	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
	blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);

	conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
	if (!conf)
		goto out;
	mddev->private = (void *)conf;

	conf->strip_zone = NULL;
	conf->devlist = NULL;
	if (create_strip_zones (mddev))
		goto out_free_conf;

	/* calculate array device size: sum of all member sizes */
	mddev->array_size = 0;
	ITERATE_RDEV(mddev,rdev,tmp)
		mddev->array_size += rdev->size;

	printk("raid0 : md_size is %llu blocks.\n",
		(unsigned long long)mddev->array_size);
	printk("raid0 : conf->hash_spacing is %llu blocks.\n",
		(unsigned long long)conf->hash_spacing);
	{
		sector_t s = mddev->array_size;
		sector_t space = conf->hash_spacing;
		int round;
		conf->preshift = 0;
		if (sizeof(sector_t) > sizeof(u32)) {
			/*shift down space and s so that sector_div will work */
			while (space > (sector_t) (~(u32)0)) {
				s >>= 1;
				space >>= 1;
				s += 1; /* force round-up */
				conf->preshift++;
			}
		}
		round = sector_div(s, (u32)space) ? 1 : 0;
		nb_zone = s + round;
	}
	printk("raid0 : nb_zone is %d.\n", nb_zone);

	printk("raid0 : Allocating %Zd bytes for hash.\n",
				nb_zone*sizeof(struct strip_zone*));
	conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
	if (!conf->hash_table)
		goto out_free_conf;
	size = conf->strip_zone[cur].size;

	/* each hash slot points at the zone containing the start of its
	 * hash_spacing-sized span of the array */
	conf->hash_table[0] = conf->strip_zone + cur;
	for (i=1; i< nb_zone; i++) {
		while (size <= conf->hash_spacing) {
			cur++;
			size += conf->strip_zone[cur].size;
		}
		size -= conf->hash_spacing;
		conf->hash_table[i] = conf->strip_zone + cur;
	}
	if (conf->preshift) {
		conf->hash_spacing >>= conf->preshift;
		/* round hash_spacing up so when we divide by it, we
		 * err on the side of too-low, which is safest
		 */
		conf->hash_spacing++;
	}

	/* calculate the max read-ahead size.
	 * For read-ahead of large files to be effective, we need to
	 * readahead at least twice a whole stripe. i.e. number of devices
	 * multiplied by chunk size times 2.
	 * If an individual device has an ra_pages greater than the
	 * chunk size, then we will not drive that device as hard as it
	 * wants.  We consider this a configuration error: a larger
	 * chunksize should be used in that case.
	 */
	{
		int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
	}

	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
	return 0;

out_free_conf:
	kfree(conf->strip_zone);
	kfree(conf->devlist);
	kfree(conf);
	mddev->private = NULL;
out:
	return -ENOMEM;
}
|
||||
|
||||
/*
 * Release everything raid0_run()/create_strip_zones() allocated.
 */
static int raid0_stop (mddev_t *mddev)
{
	raid0_conf_t *conf = mddev_to_conf(mddev);

	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
	kfree(conf->hash_table);
	conf->hash_table = NULL;
	kfree(conf->strip_zone);
	conf->strip_zone = NULL;
	kfree(conf);
	mddev->private = NULL;

	return 0;
}
|
||||
|
||||
/*
 * Map a bio onto the member device holding its chunk.  A bio that
 * crosses a chunk boundary is split into two halves which are each
 * re-submitted through this function.  Returns 1 when the bio has been
 * remapped and should be passed on by the block layer, 0 when it was
 * fully handled (ended or split) here.
 */
static int raid0_make_request (request_queue_t *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;
	unsigned int sect_in_chunk, chunksize_bits, chunk_size, chunk_sects;
	raid0_conf_t *conf = mddev_to_conf(mddev);
	struct strip_zone *zone;
	mdk_rdev_t *tmp_dev;
	sector_t chunk;
	sector_t block, rsect;
	const int rw = bio_data_dir(bio);

	if (unlikely(bio_barrier(bio))) {
		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
		return 0;
	}

	disk_stat_inc(mddev->gendisk, ios[rw]);
	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));

	chunk_size = mddev->chunk_size >> 10;	/* in 1K blocks */
	chunk_sects = mddev->chunk_size >> 9;	/* in sectors */
	chunksize_bits = ffz(~chunk_size);
	block = bio->bi_sector >> 1;		/* sector -> 1K block */

	if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) {
		struct bio_pair *bp;
		/* Sanity check -- queue functions should prevent this happening */
		if (bio->bi_vcnt != 1 ||
		    bio->bi_idx != 0)
			goto bad_map;
		/* This is a one page bio that upper layers
		 * refuse to split for us, so we need to split it.
		 */
		bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
		if (raid0_make_request(q, &bp->bio1))
			generic_make_request(&bp->bio1);
		if (raid0_make_request(q, &bp->bio2))
			generic_make_request(&bp->bio2);

		bio_pair_release(bp);
		return 0;
	}

	/* coarse zone lookup via the hash table ... */
	{
		sector_t x = block >> conf->preshift;
		sector_div(x, (u32)conf->hash_spacing);
		zone = conf->hash_table[x];
	}

	/* ... then walk forward to the exact zone */
	while (block >= (zone->zone_offset + zone->size))
		zone++;

	sect_in_chunk = bio->bi_sector & ((chunk_size<<1) -1);

	{
		sector_t x = (block - zone->zone_offset) >> chunksize_bits;

		sector_div(x, zone->nb_dev);
		chunk = x;

		x = block >> chunksize_bits;
		tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
	}
	rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1)
		+ sect_in_chunk;

	bio->bi_bdev = tmp_dev->bdev;
	bio->bi_sector = rsect + tmp_dev->data_offset;

	/*
	 * Let the main block layer submit the IO and resolve recursion:
	 */
	return 1;

bad_map:
	printk("raid0_make_request bug: can't convert block across chunks"
		" or bigger than %dk %llu %d\n", chunk_size,
		(unsigned long long)bio->bi_sector, bio->bi_size >> 10);

	bio_io_error(bio, bio->bi_size);
	return 0;
}
|
||||
|
||||
/*
 * /proc/mdstat status line for raid0: chunk size, plus (when MD_DEBUG
 * is enabled) a dump of every strip zone.
 *
 * Fix: in the MD_DEBUG branch, `seq_printf("(h%d)", h++)` was missing
 * the mandatory `seq` first argument and would not compile if MD_DEBUG
 * were ever defined (the same fix was made upstream later).  The code
 * is currently dead due to the #undef, but is now correct.
 */
static void raid0_status (struct seq_file *seq, mddev_t *mddev)
{
#undef MD_DEBUG
#ifdef MD_DEBUG
	int j, k, h;
	char b[BDEVNAME_SIZE];
	raid0_conf_t *conf = mddev_to_conf(mddev);

	h = 0;
	for (j = 0; j < conf->nr_strip_zones; j++) {
		seq_printf(seq, "      z%d", j);
		if (conf->hash_table[h] == conf->strip_zone+j)
			seq_printf(seq, "(h%d)", h++);	/* was missing 'seq' */
		seq_printf(seq, "=[");
		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
			seq_printf (seq, "%s/", bdevname(
				conf->strip_zone[j].dev[k]->bdev,b));

		seq_printf (seq, "] zo=%d do=%d s=%d\n",
				conf->strip_zone[j].zone_offset,
				conf->strip_zone[j].dev_offset,
				conf->strip_zone[j].size);
	}
#endif
	seq_printf(seq, " %dk chunks", mddev->chunk_size/1024);
	return;
}
|
||||
|
||||
/* md personality descriptor for RAID0 (no error handler or hot
 * add/remove: raid0 has no redundancy). */
static struct mdk_personality raid0_personality=
{
	.name		= "raid0",
	.level		= 0,
	.owner		= THIS_MODULE,
	.make_request	= raid0_make_request,
	.run		= raid0_run,
	.stop		= raid0_stop,
	.status		= raid0_status,
};
|
||||
|
||||
/* Module init: register the raid0 personality with the md core. */
static int __init raid0_init (void)
{
	return register_md_personality (&raid0_personality);
}
|
||||
|
||||
/* Module exit: unregister the raid0 personality. */
static void raid0_exit (void)
{
	unregister_md_personality (&raid0_personality);
}
|
||||
|
||||
module_init(raid0_init);
module_exit(raid0_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-2"); /* RAID0 */
MODULE_ALIAS("md-raid0");
MODULE_ALIAS("md-level-0");
|
||||
2225
drivers/md/raid1.c
Normal file
2225
drivers/md/raid1.c
Normal file
File diff suppressed because it is too large
Load Diff
2221
drivers/md/raid10.c
Normal file
2221
drivers/md/raid10.c
Normal file
File diff suppressed because it is too large
Load Diff
4157
drivers/md/raid5.c
Normal file
4157
drivers/md/raid5.c
Normal file
File diff suppressed because it is too large
Load Diff
139
drivers/md/raid6.h
Normal file
139
drivers/md/raid6.h
Normal file
@@ -0,0 +1,139 @@
|
||||
/* -*- linux-c -*- ------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2003 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LINUX_RAID_RAID6_H
|
||||
#define LINUX_RAID_RAID6_H
|
||||
|
||||
#ifdef __KERNEL__
|
||||
|
||||
/* Set to 1 to use kernel-wide empty_zero_page */
|
||||
#define RAID6_USE_EMPTY_ZERO_PAGE 0
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/stddef.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/raid/md.h>
|
||||
#include <linux/raid/raid5.h>
|
||||
|
||||
typedef raid5_conf_t raid6_conf_t; /* Same configuration */
|
||||
|
||||
/* Additional compute_parity mode -- updates the parity w/o LOCKING */
|
||||
#define UPDATE_PARITY 4
|
||||
|
||||
/* We need a pre-zeroed page... if we don't want to use the kernel-provided
|
||||
one define it here */
|
||||
#if RAID6_USE_EMPTY_ZERO_PAGE
|
||||
# define raid6_empty_zero_page empty_zero_page
|
||||
#else
|
||||
extern const char raid6_empty_zero_page[PAGE_SIZE];
|
||||
#endif
|
||||
|
||||
#else /* ! __KERNEL__ */
|
||||
/* Used for testing in user space */
|
||||
|
||||
#include <errno.h>
|
||||
#include <inttypes.h>
|
||||
#include <limits.h>
|
||||
#include <stddef.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
/* Not standard, but glibc defines it */
|
||||
#define BITS_PER_LONG __WORDSIZE
|
||||
|
||||
typedef uint8_t u8;
|
||||
typedef uint16_t u16;
|
||||
typedef uint32_t u32;
|
||||
typedef uint64_t u64;
|
||||
|
||||
#ifndef PAGE_SIZE
|
||||
# define PAGE_SIZE 4096
|
||||
#endif
|
||||
extern const char raid6_empty_zero_page[PAGE_SIZE];
|
||||
|
||||
#define __init
|
||||
#define __exit
|
||||
#define __attribute_const__ __attribute__((const))
|
||||
#define noinline __attribute__((noinline))
|
||||
|
||||
#define preempt_enable()
|
||||
#define preempt_disable()
|
||||
#define cpu_has_feature(x) 1
|
||||
#define enable_kernel_altivec()
|
||||
#define disable_kernel_altivec()
|
||||
|
||||
#endif /* __KERNEL__ */
|
||||
|
||||
/* Routine choices */
|
||||
struct raid6_calls {
|
||||
void (*gen_syndrome)(int, size_t, void **);
|
||||
int (*valid)(void); /* Returns 1 if this routine set is usable */
|
||||
const char *name; /* Name of this routine set */
|
||||
int prefer; /* Has special performance attribute */
|
||||
};
|
||||
|
||||
/* Selected algorithm */
|
||||
extern struct raid6_calls raid6_call;
|
||||
|
||||
/* Algorithm list */
|
||||
extern const struct raid6_calls * const raid6_algos[];
|
||||
int raid6_select_algo(void);
|
||||
|
||||
/* Return values from chk_syndrome */
|
||||
#define RAID6_OK 0
|
||||
#define RAID6_P_BAD 1
|
||||
#define RAID6_Q_BAD 2
|
||||
#define RAID6_PQ_BAD 3
|
||||
|
||||
/* Galois field tables */
|
||||
extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
|
||||
extern const u8 raid6_gfexp[256] __attribute__((aligned(256)));
|
||||
extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
|
||||
extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));
|
||||
|
||||
/* Recovery routines */
|
||||
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
|
||||
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
|
||||
void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
|
||||
|
||||
/* Some definitions to allow code to be compiled for testing in userspace */
|
||||
#ifndef __KERNEL__
|
||||
|
||||
# define jiffies raid6_jiffies()
|
||||
# define printk printf
|
||||
# define GFP_KERNEL 0
|
||||
# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0))
|
||||
# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE)
|
||||
|
||||
static inline void cpu_relax(void)
|
||||
{
|
||||
/* Nothing */
|
||||
}
|
||||
|
||||
#undef HZ
|
||||
#define HZ 1000
|
||||
static inline uint32_t raid6_jiffies(void)
|
||||
{
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
return tv.tv_sec*1000 + tv.tv_usec/1000;
|
||||
}
|
||||
|
||||
#endif /* ! __KERNEL__ */
|
||||
|
||||
#endif /* LINUX_RAID_RAID6_H */
|
||||
153
drivers/md/raid6algos.c
Normal file
153
drivers/md/raid6algos.c
Normal file
@@ -0,0 +1,153 @@
|
||||
/* -*- linux-c -*- ------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* raid6algos.c
|
||||
*
|
||||
* Algorithm list and algorithm selection for RAID-6
|
||||
*/
|
||||
|
||||
#include "raid6.h"
|
||||
#ifndef __KERNEL__
|
||||
#include <sys/mman.h>
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
struct raid6_calls raid6_call;
|
||||
|
||||
/* Various routine sets */
|
||||
extern const struct raid6_calls raid6_intx1;
|
||||
extern const struct raid6_calls raid6_intx2;
|
||||
extern const struct raid6_calls raid6_intx4;
|
||||
extern const struct raid6_calls raid6_intx8;
|
||||
extern const struct raid6_calls raid6_intx16;
|
||||
extern const struct raid6_calls raid6_intx32;
|
||||
extern const struct raid6_calls raid6_mmxx1;
|
||||
extern const struct raid6_calls raid6_mmxx2;
|
||||
extern const struct raid6_calls raid6_sse1x1;
|
||||
extern const struct raid6_calls raid6_sse1x2;
|
||||
extern const struct raid6_calls raid6_sse2x1;
|
||||
extern const struct raid6_calls raid6_sse2x2;
|
||||
extern const struct raid6_calls raid6_sse2x4;
|
||||
extern const struct raid6_calls raid6_altivec1;
|
||||
extern const struct raid6_calls raid6_altivec2;
|
||||
extern const struct raid6_calls raid6_altivec4;
|
||||
extern const struct raid6_calls raid6_altivec8;
|
||||
|
||||
const struct raid6_calls * const raid6_algos[] = {
|
||||
&raid6_intx1,
|
||||
&raid6_intx2,
|
||||
&raid6_intx4,
|
||||
&raid6_intx8,
|
||||
#if defined(__ia64__)
|
||||
&raid6_intx16,
|
||||
&raid6_intx32,
|
||||
#endif
|
||||
#if defined(__i386__)
|
||||
&raid6_mmxx1,
|
||||
&raid6_mmxx2,
|
||||
&raid6_sse1x1,
|
||||
&raid6_sse1x2,
|
||||
&raid6_sse2x1,
|
||||
&raid6_sse2x2,
|
||||
#endif
|
||||
#if defined(__x86_64__)
|
||||
&raid6_sse2x1,
|
||||
&raid6_sse2x2,
|
||||
&raid6_sse2x4,
|
||||
#endif
|
||||
#ifdef CONFIG_ALTIVEC
|
||||
&raid6_altivec1,
|
||||
&raid6_altivec2,
|
||||
&raid6_altivec4,
|
||||
&raid6_altivec8,
|
||||
#endif
|
||||
NULL
|
||||
};
|
||||
|
||||
#ifdef __KERNEL__
|
||||
#define RAID6_TIME_JIFFIES_LG2 4
|
||||
#else
|
||||
/* Need more time to be stable in userspace */
|
||||
#define RAID6_TIME_JIFFIES_LG2 9
|
||||
#endif
|
||||
|
||||
/* Try to pick the best algorithm */
|
||||
/* This code uses the gfmul table as convenient data set to abuse */
|
||||
|
||||
int __init raid6_select_algo(void)
|
||||
{
|
||||
const struct raid6_calls * const * algo;
|
||||
const struct raid6_calls * best;
|
||||
char *syndromes;
|
||||
void *dptrs[(65536/PAGE_SIZE)+2];
|
||||
int i, disks;
|
||||
unsigned long perf, bestperf;
|
||||
int bestprefer;
|
||||
unsigned long j0, j1;
|
||||
|
||||
disks = (65536/PAGE_SIZE)+2;
|
||||
for ( i = 0 ; i < disks-2 ; i++ ) {
|
||||
dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
|
||||
}
|
||||
|
||||
/* Normal code - use a 2-page allocation to avoid D$ conflict */
|
||||
syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
|
||||
|
||||
if ( !syndromes ) {
|
||||
printk("raid6: Yikes! No memory available.\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
dptrs[disks-2] = syndromes;
|
||||
dptrs[disks-1] = syndromes + PAGE_SIZE;
|
||||
|
||||
bestperf = 0; bestprefer = 0; best = NULL;
|
||||
|
||||
for ( algo = raid6_algos ; *algo ; algo++ ) {
|
||||
if ( !(*algo)->valid || (*algo)->valid() ) {
|
||||
perf = 0;
|
||||
|
||||
preempt_disable();
|
||||
j0 = jiffies;
|
||||
while ( (j1 = jiffies) == j0 )
|
||||
cpu_relax();
|
||||
while ( (jiffies-j1) < (1 << RAID6_TIME_JIFFIES_LG2) ) {
|
||||
(*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs);
|
||||
perf++;
|
||||
}
|
||||
preempt_enable();
|
||||
|
||||
if ( (*algo)->prefer > bestprefer ||
|
||||
((*algo)->prefer == bestprefer &&
|
||||
perf > bestperf) ) {
|
||||
best = *algo;
|
||||
bestprefer = best->prefer;
|
||||
bestperf = perf;
|
||||
}
|
||||
printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,
|
||||
(perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
|
||||
}
|
||||
}
|
||||
|
||||
if (best) {
|
||||
printk("raid6: using algorithm %s (%ld MB/s)\n",
|
||||
best->name,
|
||||
(bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
|
||||
raid6_call = *best;
|
||||
} else
|
||||
printk("raid6: Yikes! No algorithm found!\n");
|
||||
|
||||
free_pages((unsigned long)syndromes, 1);
|
||||
|
||||
return best ? 0 : -EINVAL;
|
||||
}
|
||||
130
drivers/md/raid6altivec.uc
Normal file
130
drivers/md/raid6altivec.uc
Normal file
@@ -0,0 +1,130 @@
|
||||
/* -*- linux-c -*- ------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* raid6altivec$#.c
|
||||
*
|
||||
* $#-way unrolled portable integer math RAID-6 instruction set
|
||||
*
|
||||
* This file is postprocessed using unroll.pl
|
||||
*
|
||||
* <benh> hpa: in process,
|
||||
* you can just "steal" the vec unit with enable_kernel_altivec() (but
|
||||
* bracked this with preempt_disable/enable or in a lock)
|
||||
*/
|
||||
|
||||
#include "raid6.h"
|
||||
|
||||
#ifdef CONFIG_ALTIVEC
|
||||
|
||||
#include <altivec.h>
|
||||
#ifdef __KERNEL__
|
||||
# include <asm/system.h>
|
||||
# include <asm/cputable.h>
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This is the C data type to use. We use a vector of
|
||||
* signed char so vec_cmpgt() will generate the right
|
||||
* instruction.
|
||||
*/
|
||||
|
||||
typedef vector signed char unative_t;
|
||||
|
||||
#define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
|
||||
#define NSIZE sizeof(unative_t)
|
||||
|
||||
/*
|
||||
* The SHLBYTE() operation shifts each byte left by 1, *not*
|
||||
* rolling over into the next byte
|
||||
*/
|
||||
static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
|
||||
{
|
||||
return vec_add(v,v);
|
||||
}
|
||||
|
||||
/*
|
||||
* The MASK() operation returns 0xFF in any byte for which the high
|
||||
* bit is 1, 0x00 for any byte for which the high bit is 0.
|
||||
*/
|
||||
static inline __attribute_const__ unative_t MASK(unative_t v)
|
||||
{
|
||||
unative_t zv = NBYTES(0);
|
||||
|
||||
/* vec_cmpgt returns a vector bool char; thus the need for the cast */
|
||||
return (unative_t)vec_cmpgt(zv, v);
|
||||
}
|
||||
|
||||
|
||||
/* This is noinline to make damned sure that gcc doesn't move any of the
|
||||
Altivec code around the enable/disable code */
|
||||
static void noinline
|
||||
raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs)
|
||||
{
|
||||
u8 **dptr = (u8 **)ptrs;
|
||||
u8 *p, *q;
|
||||
int d, z, z0;
|
||||
|
||||
unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
|
||||
unative_t x1d = NBYTES(0x1d);
|
||||
|
||||
z0 = disks - 3; /* Highest data disk */
|
||||
p = dptr[z0+1]; /* XOR parity */
|
||||
q = dptr[z0+2]; /* RS syndrome */
|
||||
|
||||
for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
|
||||
wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
|
||||
for ( z = z0-1 ; z >= 0 ; z-- ) {
|
||||
wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
|
||||
wp$$ = vec_xor(wp$$, wd$$);
|
||||
w2$$ = MASK(wq$$);
|
||||
w1$$ = SHLBYTE(wq$$);
|
||||
w2$$ = vec_and(w2$$, x1d);
|
||||
w1$$ = vec_xor(w1$$, w2$$);
|
||||
wq$$ = vec_xor(w1$$, wd$$);
|
||||
}
|
||||
*(unative_t *)&p[d+NSIZE*$$] = wp$$;
|
||||
*(unative_t *)&q[d+NSIZE*$$] = wq$$;
|
||||
}
|
||||
}
|
||||
|
||||
static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
|
||||
{
|
||||
preempt_disable();
|
||||
enable_kernel_altivec();
|
||||
|
||||
raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs);
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
int raid6_have_altivec(void);
|
||||
#if $# == 1
|
||||
int raid6_have_altivec(void)
|
||||
{
|
||||
/* This assumes either all CPUs have Altivec or none does */
|
||||
# ifdef __KERNEL__
|
||||
return cpu_has_feature(CPU_FTR_ALTIVEC);
|
||||
# else
|
||||
return 1;
|
||||
# endif
|
||||
}
|
||||
#endif
|
||||
|
||||
const struct raid6_calls raid6_altivec$# = {
|
||||
raid6_altivec$#_gen_syndrome,
|
||||
raid6_have_altivec,
|
||||
"altivecx$#",
|
||||
0
|
||||
};
|
||||
|
||||
#endif /* CONFIG_ALTIVEC */
|
||||
117
drivers/md/raid6int.uc
Normal file
117
drivers/md/raid6int.uc
Normal file
@@ -0,0 +1,117 @@
|
||||
/* -*- linux-c -*- ------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* raid6int$#.c
|
||||
*
|
||||
* $#-way unrolled portable integer math RAID-6 instruction set
|
||||
*
|
||||
* This file is postprocessed using unroll.pl
|
||||
*/
|
||||
|
||||
#include "raid6.h"
|
||||
|
||||
/*
|
||||
* This is the C data type to use
|
||||
*/
|
||||
|
||||
/* Change this from BITS_PER_LONG if there is something better... */
|
||||
#if BITS_PER_LONG == 64
|
||||
# define NBYTES(x) ((x) * 0x0101010101010101UL)
|
||||
# define NSIZE 8
|
||||
# define NSHIFT 3
|
||||
# define NSTRING "64"
|
||||
typedef u64 unative_t;
|
||||
#else
|
||||
# define NBYTES(x) ((x) * 0x01010101U)
|
||||
# define NSIZE 4
|
||||
# define NSHIFT 2
|
||||
# define NSTRING "32"
|
||||
typedef u32 unative_t;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* IA-64 wants insane amounts of unrolling. On other architectures that
|
||||
* is just a waste of space.
|
||||
*/
|
||||
#if ($# <= 8) || defined(__ia64__)
|
||||
|
||||
|
||||
/*
|
||||
* These sub-operations are separate inlines since they can sometimes be
|
||||
* specially optimized using architecture-specific hacks.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The SHLBYTE() operation shifts each byte left by 1, *not*
|
||||
* rolling over into the next byte
|
||||
*/
|
||||
static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
|
||||
{
|
||||
unative_t vv;
|
||||
|
||||
vv = (v << 1) & NBYTES(0xfe);
|
||||
return vv;
|
||||
}
|
||||
|
||||
/*
|
||||
* The MASK() operation returns 0xFF in any byte for which the high
|
||||
* bit is 1, 0x00 for any byte for which the high bit is 0.
|
||||
*/
|
||||
static inline __attribute_const__ unative_t MASK(unative_t v)
|
||||
{
|
||||
unative_t vv;
|
||||
|
||||
vv = v & NBYTES(0x80);
|
||||
vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */
|
||||
return vv;
|
||||
}
|
||||
|
||||
|
||||
static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
|
||||
{
|
||||
u8 **dptr = (u8 **)ptrs;
|
||||
u8 *p, *q;
|
||||
int d, z, z0;
|
||||
|
||||
unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
|
||||
|
||||
z0 = disks - 3; /* Highest data disk */
|
||||
p = dptr[z0+1]; /* XOR parity */
|
||||
q = dptr[z0+2]; /* RS syndrome */
|
||||
|
||||
for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
|
||||
wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
|
||||
for ( z = z0-1 ; z >= 0 ; z-- ) {
|
||||
wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
|
||||
wp$$ ^= wd$$;
|
||||
w2$$ = MASK(wq$$);
|
||||
w1$$ = SHLBYTE(wq$$);
|
||||
w2$$ &= NBYTES(0x1d);
|
||||
w1$$ ^= w2$$;
|
||||
wq$$ = w1$$ ^ wd$$;
|
||||
}
|
||||
*(unative_t *)&p[d+NSIZE*$$] = wp$$;
|
||||
*(unative_t *)&q[d+NSIZE*$$] = wq$$;
|
||||
}
|
||||
}
|
||||
|
||||
const struct raid6_calls raid6_intx$# = {
|
||||
raid6_int$#_gen_syndrome,
|
||||
NULL, /* always valid */
|
||||
"int" NSTRING "x$#",
|
||||
0
|
||||
};
|
||||
|
||||
#endif
|
||||
142
drivers/md/raid6mmx.c
Normal file
142
drivers/md/raid6mmx.c
Normal file
@@ -0,0 +1,142 @@
|
||||
/* -*- linux-c -*- ------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* raid6mmx.c
|
||||
*
|
||||
* MMX implementation of RAID-6 syndrome functions
|
||||
*/
|
||||
|
||||
#if defined(__i386__)
|
||||
|
||||
#include "raid6.h"
|
||||
#include "raid6x86.h"
|
||||
|
||||
/* Shared with raid6sse1.c */
|
||||
const struct raid6_mmx_constants {
|
||||
u64 x1d;
|
||||
} raid6_mmx_constants = {
|
||||
0x1d1d1d1d1d1d1d1dULL,
|
||||
};
|
||||
|
||||
static int raid6_have_mmx(void)
|
||||
{
|
||||
/* Not really "boot_cpu" but "all_cpus" */
|
||||
return boot_cpu_has(X86_FEATURE_MMX);
|
||||
}
|
||||
|
||||
/*
|
||||
* Plain MMX implementation
|
||||
*/
|
||||
static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs)
|
||||
{
|
||||
u8 **dptr = (u8 **)ptrs;
|
||||
u8 *p, *q;
|
||||
int d, z, z0;
|
||||
|
||||
z0 = disks - 3; /* Highest data disk */
|
||||
p = dptr[z0+1]; /* XOR parity */
|
||||
q = dptr[z0+2]; /* RS syndrome */
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
|
||||
asm volatile("pxor %mm5,%mm5"); /* Zero temp */
|
||||
|
||||
for ( d = 0 ; d < bytes ; d += 8 ) {
|
||||
asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
|
||||
asm volatile("movq %mm2,%mm4"); /* Q[0] */
|
||||
for ( z = z0-1 ; z >= 0 ; z-- ) {
|
||||
asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d]));
|
||||
asm volatile("pcmpgtb %mm4,%mm5");
|
||||
asm volatile("paddb %mm4,%mm4");
|
||||
asm volatile("pand %mm0,%mm5");
|
||||
asm volatile("pxor %mm5,%mm4");
|
||||
asm volatile("pxor %mm5,%mm5");
|
||||
asm volatile("pxor %mm6,%mm2");
|
||||
asm volatile("pxor %mm6,%mm4");
|
||||
}
|
||||
asm volatile("movq %%mm2,%0" : "=m" (p[d]));
|
||||
asm volatile("pxor %mm2,%mm2");
|
||||
asm volatile("movq %%mm4,%0" : "=m" (q[d]));
|
||||
asm volatile("pxor %mm4,%mm4");
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
const struct raid6_calls raid6_mmxx1 = {
|
||||
raid6_mmx1_gen_syndrome,
|
||||
raid6_have_mmx,
|
||||
"mmxx1",
|
||||
0
|
||||
};
|
||||
|
||||
/*
|
||||
* Unrolled-by-2 MMX implementation
|
||||
*/
|
||||
static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs)
|
||||
{
|
||||
u8 **dptr = (u8 **)ptrs;
|
||||
u8 *p, *q;
|
||||
int d, z, z0;
|
||||
|
||||
z0 = disks - 3; /* Highest data disk */
|
||||
p = dptr[z0+1]; /* XOR parity */
|
||||
q = dptr[z0+2]; /* RS syndrome */
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
|
||||
asm volatile("pxor %mm5,%mm5"); /* Zero temp */
|
||||
asm volatile("pxor %mm7,%mm7"); /* Zero temp */
|
||||
|
||||
for ( d = 0 ; d < bytes ; d += 16 ) {
|
||||
asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
|
||||
asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8]));
|
||||
asm volatile("movq %mm2,%mm4"); /* Q[0] */
|
||||
asm volatile("movq %mm3,%mm6"); /* Q[1] */
|
||||
for ( z = z0-1 ; z >= 0 ; z-- ) {
|
||||
asm volatile("pcmpgtb %mm4,%mm5");
|
||||
asm volatile("pcmpgtb %mm6,%mm7");
|
||||
asm volatile("paddb %mm4,%mm4");
|
||||
asm volatile("paddb %mm6,%mm6");
|
||||
asm volatile("pand %mm0,%mm5");
|
||||
asm volatile("pand %mm0,%mm7");
|
||||
asm volatile("pxor %mm5,%mm4");
|
||||
asm volatile("pxor %mm7,%mm6");
|
||||
asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d]));
|
||||
asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
|
||||
asm volatile("pxor %mm5,%mm2");
|
||||
asm volatile("pxor %mm7,%mm3");
|
||||
asm volatile("pxor %mm5,%mm4");
|
||||
asm volatile("pxor %mm7,%mm6");
|
||||
asm volatile("pxor %mm5,%mm5");
|
||||
asm volatile("pxor %mm7,%mm7");
|
||||
}
|
||||
asm volatile("movq %%mm2,%0" : "=m" (p[d]));
|
||||
asm volatile("movq %%mm3,%0" : "=m" (p[d+8]));
|
||||
asm volatile("movq %%mm4,%0" : "=m" (q[d]));
|
||||
asm volatile("movq %%mm6,%0" : "=m" (q[d+8]));
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
const struct raid6_calls raid6_mmxx2 = {
|
||||
raid6_mmx2_gen_syndrome,
|
||||
raid6_have_mmx,
|
||||
"mmxx2",
|
||||
0
|
||||
};
|
||||
|
||||
#endif
|
||||
133
drivers/md/raid6recov.c
Normal file
133
drivers/md/raid6recov.c
Normal file
@@ -0,0 +1,133 @@
|
||||
/* -*- linux-c -*- ------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* raid6recov.c
|
||||
*
|
||||
* RAID-6 data recovery in dual failure mode. In single failure mode,
|
||||
* use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct
|
||||
* the syndrome.)
|
||||
*/
|
||||
|
||||
#include "raid6.h"
|
||||
|
||||
/* Recover two failed data blocks. */
|
||||
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
|
||||
void **ptrs)
|
||||
{
|
||||
u8 *p, *q, *dp, *dq;
|
||||
u8 px, qx, db;
|
||||
const u8 *pbmul; /* P multiplier table for B data */
|
||||
const u8 *qmul; /* Q multiplier table (for both) */
|
||||
|
||||
p = (u8 *)ptrs[disks-2];
|
||||
q = (u8 *)ptrs[disks-1];
|
||||
|
||||
/* Compute syndrome with zero for the missing data pages
|
||||
Use the dead data pages as temporary storage for
|
||||
delta p and delta q */
|
||||
dp = (u8 *)ptrs[faila];
|
||||
ptrs[faila] = (void *)raid6_empty_zero_page;
|
||||
ptrs[disks-2] = dp;
|
||||
dq = (u8 *)ptrs[failb];
|
||||
ptrs[failb] = (void *)raid6_empty_zero_page;
|
||||
ptrs[disks-1] = dq;
|
||||
|
||||
raid6_call.gen_syndrome(disks, bytes, ptrs);
|
||||
|
||||
/* Restore pointer table */
|
||||
ptrs[faila] = dp;
|
||||
ptrs[failb] = dq;
|
||||
ptrs[disks-2] = p;
|
||||
ptrs[disks-1] = q;
|
||||
|
||||
/* Now, pick the proper data tables */
|
||||
pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
|
||||
qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
|
||||
|
||||
/* Now do it... */
|
||||
while ( bytes-- ) {
|
||||
px = *p ^ *dp;
|
||||
qx = qmul[*q ^ *dq];
|
||||
*dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
|
||||
*dp++ = db ^ px; /* Reconstructed A */
|
||||
p++; q++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* Recover failure of one data block plus the P block */
|
||||
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
|
||||
{
|
||||
u8 *p, *q, *dq;
|
||||
const u8 *qmul; /* Q multiplier table */
|
||||
|
||||
p = (u8 *)ptrs[disks-2];
|
||||
q = (u8 *)ptrs[disks-1];
|
||||
|
||||
/* Compute syndrome with zero for the missing data page
|
||||
Use the dead data page as temporary storage for delta q */
|
||||
dq = (u8 *)ptrs[faila];
|
||||
ptrs[faila] = (void *)raid6_empty_zero_page;
|
||||
ptrs[disks-1] = dq;
|
||||
|
||||
raid6_call.gen_syndrome(disks, bytes, ptrs);
|
||||
|
||||
/* Restore pointer table */
|
||||
ptrs[faila] = dq;
|
||||
ptrs[disks-1] = q;
|
||||
|
||||
/* Now, pick the proper data tables */
|
||||
qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
|
||||
|
||||
/* Now do it... */
|
||||
while ( bytes-- ) {
|
||||
*p++ ^= *dq = qmul[*q ^ *dq];
|
||||
q++; dq++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifndef __KERNEL__ /* Testing only */
|
||||
|
||||
/* Recover two failed blocks. */
|
||||
void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
|
||||
{
|
||||
if ( faila > failb ) {
|
||||
int tmp = faila;
|
||||
faila = failb;
|
||||
failb = tmp;
|
||||
}
|
||||
|
||||
if ( failb == disks-1 ) {
|
||||
if ( faila == disks-2 ) {
|
||||
/* P+Q failure. Just rebuild the syndrome. */
|
||||
raid6_call.gen_syndrome(disks, bytes, ptrs);
|
||||
} else {
|
||||
/* data+Q failure. Reconstruct data from P,
|
||||
then rebuild syndrome. */
|
||||
/* NOT IMPLEMENTED - equivalent to RAID-5 */
|
||||
}
|
||||
} else {
|
||||
if ( failb == disks-2 ) {
|
||||
/* data+P failure. */
|
||||
raid6_datap_recov(disks, bytes, faila, ptrs);
|
||||
} else {
|
||||
/* data+data failure. */
|
||||
raid6_2data_recov(disks, bytes, faila, failb, ptrs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
162
drivers/md/raid6sse1.c
Normal file
162
drivers/md/raid6sse1.c
Normal file
@@ -0,0 +1,162 @@
|
||||
/* -*- linux-c -*- ------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* raid6sse1.c
|
||||
*
|
||||
* SSE-1/MMXEXT implementation of RAID-6 syndrome functions
|
||||
*
|
||||
* This is really an MMX implementation, but it requires SSE-1 or
|
||||
* AMD MMXEXT for prefetch support and a few other features. The
|
||||
* support for nontemporal memory accesses is enough to make this
|
||||
* worthwhile as a separate implementation.
|
||||
*/
|
||||
|
||||
#if defined(__i386__)
|
||||
|
||||
#include "raid6.h"
|
||||
#include "raid6x86.h"
|
||||
|
||||
/* Defined in raid6mmx.c */
|
||||
extern const struct raid6_mmx_constants {
|
||||
u64 x1d;
|
||||
} raid6_mmx_constants;
|
||||
|
||||
static int raid6_have_sse1_or_mmxext(void)
|
||||
{
|
||||
/* Not really boot_cpu but "all_cpus" */
|
||||
return boot_cpu_has(X86_FEATURE_MMX) &&
|
||||
(boot_cpu_has(X86_FEATURE_XMM) ||
|
||||
boot_cpu_has(X86_FEATURE_MMXEXT));
|
||||
}
|
||||
|
||||
/*
|
||||
* Plain SSE1 implementation
|
||||
*/
|
||||
static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs)
|
||||
{
|
||||
u8 **dptr = (u8 **)ptrs;
|
||||
u8 *p, *q;
|
||||
int d, z, z0;
|
||||
|
||||
z0 = disks - 3; /* Highest data disk */
|
||||
p = dptr[z0+1]; /* XOR parity */
|
||||
q = dptr[z0+2]; /* RS syndrome */
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
|
||||
asm volatile("pxor %mm5,%mm5"); /* Zero temp */
|
||||
|
||||
for ( d = 0 ; d < bytes ; d += 8 ) {
|
||||
asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
|
||||
asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
|
||||
asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
|
||||
asm volatile("movq %mm2,%mm4"); /* Q[0] */
|
||||
asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d]));
|
||||
for ( z = z0-2 ; z >= 0 ; z-- ) {
|
||||
asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
|
||||
asm volatile("pcmpgtb %mm4,%mm5");
|
||||
asm volatile("paddb %mm4,%mm4");
|
||||
asm volatile("pand %mm0,%mm5");
|
||||
asm volatile("pxor %mm5,%mm4");
|
||||
asm volatile("pxor %mm5,%mm5");
|
||||
asm volatile("pxor %mm6,%mm2");
|
||||
asm volatile("pxor %mm6,%mm4");
|
||||
asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d]));
|
||||
}
|
||||
asm volatile("pcmpgtb %mm4,%mm5");
|
||||
asm volatile("paddb %mm4,%mm4");
|
||||
asm volatile("pand %mm0,%mm5");
|
||||
asm volatile("pxor %mm5,%mm4");
|
||||
asm volatile("pxor %mm5,%mm5");
|
||||
asm volatile("pxor %mm6,%mm2");
|
||||
asm volatile("pxor %mm6,%mm4");
|
||||
|
||||
asm volatile("movntq %%mm2,%0" : "=m" (p[d]));
|
||||
asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
|
||||
}
|
||||
|
||||
asm volatile("sfence" : : : "memory");
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
const struct raid6_calls raid6_sse1x1 = {
|
||||
raid6_sse11_gen_syndrome,
|
||||
raid6_have_sse1_or_mmxext,
|
||||
"sse1x1",
|
||||
1 /* Has cache hints */
|
||||
};
|
||||
|
||||
/*
|
||||
* Unrolled-by-2 SSE1 implementation
|
||||
*/
|
||||
static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs)
|
||||
{
|
||||
u8 **dptr = (u8 **)ptrs;
|
||||
u8 *p, *q;
|
||||
int d, z, z0;
|
||||
|
||||
z0 = disks - 3; /* Highest data disk */
|
||||
p = dptr[z0+1]; /* XOR parity */
|
||||
q = dptr[z0+2]; /* RS syndrome */
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
|
||||
asm volatile("pxor %mm5,%mm5"); /* Zero temp */
|
||||
asm volatile("pxor %mm7,%mm7"); /* Zero temp */
|
||||
|
||||
/* We uniformly assume a single prefetch covers at least 16 bytes */
|
||||
for ( d = 0 ; d < bytes ; d += 16 ) {
|
||||
asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
|
||||
asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
|
||||
asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */
|
||||
asm volatile("movq %mm2,%mm4"); /* Q[0] */
|
||||
asm volatile("movq %mm3,%mm6"); /* Q[1] */
|
||||
for ( z = z0-1 ; z >= 0 ; z-- ) {
|
||||
asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
|
||||
asm volatile("pcmpgtb %mm4,%mm5");
|
||||
asm volatile("pcmpgtb %mm6,%mm7");
|
||||
asm volatile("paddb %mm4,%mm4");
|
||||
asm volatile("paddb %mm6,%mm6");
|
||||
asm volatile("pand %mm0,%mm5");
|
||||
asm volatile("pand %mm0,%mm7");
|
||||
asm volatile("pxor %mm5,%mm4");
|
||||
asm volatile("pxor %mm7,%mm6");
|
||||
asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d]));
|
||||
asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
|
||||
asm volatile("pxor %mm5,%mm2");
|
||||
asm volatile("pxor %mm7,%mm3");
|
||||
asm volatile("pxor %mm5,%mm4");
|
||||
asm volatile("pxor %mm7,%mm6");
|
||||
asm volatile("pxor %mm5,%mm5");
|
||||
asm volatile("pxor %mm7,%mm7");
|
||||
}
|
||||
asm volatile("movntq %%mm2,%0" : "=m" (p[d]));
|
||||
asm volatile("movntq %%mm3,%0" : "=m" (p[d+8]));
|
||||
asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
|
||||
asm volatile("movntq %%mm6,%0" : "=m" (q[d+8]));
|
||||
}
|
||||
|
||||
asm volatile("sfence" : :: "memory");
|
||||
kernel_fpu_end();
|
||||
}
|
||||
|
||||
const struct raid6_calls raid6_sse1x2 = {
|
||||
raid6_sse12_gen_syndrome,
|
||||
raid6_have_sse1_or_mmxext,
|
||||
"sse1x2",
|
||||
1 /* Has cache hints */
|
||||
};
|
||||
|
||||
#endif
|
||||
262
drivers/md/raid6sse2.c
Normal file
262
drivers/md/raid6sse2.c
Normal file
@@ -0,0 +1,262 @@
|
||||
/* -*- linux-c -*- ------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* raid6sse2.c
|
||||
*
|
||||
* SSE-2 implementation of RAID-6 syndrome functions
|
||||
*
|
||||
*/
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
|
||||
#include "raid6.h"
|
||||
#include "raid6x86.h"
|
||||
|
||||
/*
 * 16-byte constant of repeated 0x1d bytes, the GF(2^8) reduction value
 * XORed in when a byte's doubling overflows (see the pcmpgtb/paddb/pand
 * sequences below).  16-byte aligned so it can be loaded with movdqa.
 */
static const struct raid6_sse_constants {
	u64 x1d[2];
} raid6_sse_constants __attribute__((aligned(16))) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
};
|
||||
|
||||
static int raid6_have_sse2(void)
|
||||
{
|
||||
/* Not really boot_cpu but "all_cpus" */
|
||||
return boot_cpu_has(X86_FEATURE_MMX) &&
|
||||
boot_cpu_has(X86_FEATURE_FXSR) &&
|
||||
boot_cpu_has(X86_FEATURE_XMM) &&
|
||||
boot_cpu_has(X86_FEATURE_XMM2);
|
||||
}
|
||||
|
||||
/*
|
||||
* Plain SSE2 implementation
|
||||
*/
|
||||
/*
 * Plain (non-unrolled) SSE-2 P/Q syndrome generator.
 *
 * ptrs[0..disks-3] are the data disks, ptrs[disks-2] receives P (plain
 * XOR parity), ptrs[disks-1] receives Q (Reed-Solomon syndrome).  bytes
 * must be a multiple of 16; buffers must be 16-byte aligned (movdqa).
 *
 * The pcmpgtb/paddb/pand/pxor group is a bytewise multiply-by-2 in
 * GF(2^8): pcmpgtb against zeroed xmm5 builds an all-ones mask in bytes
 * whose top bit is set, paddb doubles each byte, and the mask selects
 * which bytes get the 0x1d reduction constant XORed in.
 */
static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 16 ) {
		/* Seed P and Q with the highest data disk's chunk */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("movdqa %xmm2,%xmm4");	/* Q[0] */
		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
		/* Horner evaluation over the remaining disks:
		 * Q = Q*2 + data[z]; P = P ^ data[z].  xmm6 always holds
		 * the chunk loaded on the previous iteration. */
		for ( z = z0-2 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm6,%xmm2");
			asm volatile("pxor %xmm6,%xmm4");
			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
		}
		/* Final fold of the last-loaded chunk (still in xmm6) */
		asm volatile("pcmpgtb %xmm4,%xmm5");
		asm volatile("paddb %xmm4,%xmm4");
		asm volatile("pand %xmm0,%xmm5");
		asm volatile("pxor %xmm5,%xmm4");
		asm volatile("pxor %xmm5,%xmm5");
		asm volatile("pxor %xmm6,%xmm2");
		asm volatile("pxor %xmm6,%xmm4");

		/* Non-temporal stores: bypass the cache for the results */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
	}

	/* Make the movntdq stores globally visible before releasing the FPU */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
|
||||
|
||||
/*
 * Algorithm descriptor for the plain SSE-2 implementation.
 * Fields (positional): syndrome generator, runtime validity check,
 * human-readable name, and the cache-hint preference flag.
 */
const struct raid6_calls raid6_sse2x1 = {
	raid6_sse21_gen_syndrome,
	raid6_have_sse2,
	"sse2x1",
	1			/* Has cache hints */
};
|
||||
|
||||
/*
|
||||
* Unrolled-by-2 SSE2 implementation
|
||||
*/
|
||||
/*
 * Unrolled-by-2 SSE-2 P/Q syndrome generator: processes two 16-byte
 * lanes (32 bytes) per outer iteration to hide instruction latency.
 * Lane 0 uses xmm2/xmm4/xmm5 (P/Q/temp), lane 1 uses xmm3/xmm6/xmm7.
 * Same contract as raid6_sse21_gen_syndrome, but bytes must be a
 * multiple of 32.
 */
static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for ( d = 0 ; d < bytes ; d += 32 ) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
		/* Horner step per disk: Q = Q*2 ^ data, P ^= data.
		 * The GF(2^8) x2 is pcmpgtb (top-bit mask) + paddb (double)
		 * + pand 0x1d + pxor (conditional reduction). */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			/* Temps xmm5/xmm7 double as the data-load registers
			 * here; they are re-zeroed at the bottom of the loop */
			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
		}
		/* Non-temporal stores of both lanes of P and Q */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
	}

	/* Fence the movntdq stores before releasing the FPU */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
|
||||
|
||||
/*
 * Algorithm descriptor for the unrolled-by-2 SSE-2 implementation.
 * Fields (positional): syndrome generator, runtime validity check,
 * human-readable name, and the cache-hint preference flag.
 */
const struct raid6_calls raid6_sse2x2 = {
	raid6_sse22_gen_syndrome,
	raid6_have_sse2,
	"sse2x2",
	1			/* Has cache hints */
};
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __x86_64__
|
||||
|
||||
/*
|
||||
* Unrolled-by-4 SSE2 implementation
|
||||
*/
|
||||
/*
 * Unrolled-by-4 SSE-2 P/Q syndrome generator (x86-64 only: it needs
 * registers xmm8-xmm15).  Four 16-byte lanes (64 bytes) per outer
 * iteration; bytes must be a multiple of 64.
 *
 * Unlike the x1/x2 variants, P and Q accumulators start zeroed and the
 * disk loop runs from z0 all the way down to 0, so every data chunk is
 * folded in by the same Horner step: Q = Q*2 ^ data, P ^= data.
 * Lane registers: P in xmm2/3/10/11, Q in xmm4/6/12/14, zero temps
 * (also reused as data-load registers) in xmm5/7/13/15.
 */
static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
	asm volatile("pxor %xmm4,%xmm4");	/* Q[0] */
	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
	asm volatile("pxor %xmm6,%xmm6");	/* Q[1] */
	asm volatile("pxor %xmm7,%xmm7");	/* Zero temp */
	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
	asm volatile("pxor %xmm12,%xmm12");	/* Q[2] */
	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
	asm volatile("pxor %xmm14,%xmm14");	/* Q[3] */
	asm volatile("pxor %xmm15,%xmm15");	/* Zero temp */

	for ( d = 0 ; d < bytes ; d += 64 ) {
		for ( z = z0 ; z >= 0 ; z-- ) {
			/* The second prefetch seems to improve performance... */
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
			/* GF(2^8) x2 on all four Q lanes: mask, double, reduce */
			asm volatile("pcmpgtb %xmm4,%xmm5");
			asm volatile("pcmpgtb %xmm6,%xmm7");
			asm volatile("pcmpgtb %xmm12,%xmm13");
			asm volatile("pcmpgtb %xmm14,%xmm15");
			asm volatile("paddb %xmm4,%xmm4");
			asm volatile("paddb %xmm6,%xmm6");
			asm volatile("paddb %xmm12,%xmm12");
			asm volatile("paddb %xmm14,%xmm14");
			asm volatile("pand %xmm0,%xmm5");
			asm volatile("pand %xmm0,%xmm7");
			asm volatile("pand %xmm0,%xmm13");
			asm volatile("pand %xmm0,%xmm15");
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			/* Load this disk's four chunks into the (clobbered) temps */
			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
			/* Fold into P ... */
			asm volatile("pxor %xmm5,%xmm2");
			asm volatile("pxor %xmm7,%xmm3");
			asm volatile("pxor %xmm13,%xmm10");
			asm volatile("pxor %xmm15,%xmm11");
			/* ... and into Q */
			asm volatile("pxor %xmm5,%xmm4");
			asm volatile("pxor %xmm7,%xmm6");
			asm volatile("pxor %xmm13,%xmm12");
			asm volatile("pxor %xmm15,%xmm14");
			/* Re-zero the temps for the next pcmpgtb */
			asm volatile("pxor %xmm5,%xmm5");
			asm volatile("pxor %xmm7,%xmm7");
			asm volatile("pxor %xmm13,%xmm13");
			asm volatile("pxor %xmm15,%xmm15");
		}
		/* Stream out P and Q, re-zeroing each accumulator for the
		 * next 64-byte stripe segment */
		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
		asm volatile("pxor %xmm2,%xmm2");
		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
		asm volatile("pxor %xmm3,%xmm3");
		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
		asm volatile("pxor %xmm10,%xmm10");
		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
		asm volatile("pxor %xmm11,%xmm11");
		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
		asm volatile("pxor %xmm4,%xmm4");
		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
		asm volatile("pxor %xmm6,%xmm6");
		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
		asm volatile("pxor %xmm12,%xmm12");
		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
		asm volatile("pxor %xmm14,%xmm14");
	}

	/* Fence the non-temporal stores before releasing the FPU */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
|
||||
|
||||
/*
 * Algorithm descriptor for the unrolled-by-4 SSE-2 implementation
 * (x86-64 only).  Fields (positional): syndrome generator, runtime
 * validity check, human-readable name, cache-hint preference flag.
 */
const struct raid6_calls raid6_sse2x4 = {
	raid6_sse24_gen_syndrome,
	raid6_have_sse2,
	"sse2x4",
	1			/* Has cache hints */
};
|
||||
|
||||
#endif
|
||||
75
drivers/md/raid6test/Makefile
Normal file
75
drivers/md/raid6test/Makefile
Normal file
@@ -0,0 +1,75 @@
|
||||
#
# This is a simple Makefile to test some of the RAID-6 code
# from userspace.
#

CC	 = gcc
OPTFLAGS = -O2		# Adjust as desired
CFLAGS	 = -I.. -g $(OPTFLAGS)
LD	 = ld
PERL	 = perl
AR	 = ar
RANLIB	 = ranlib

.c.o:
	$(CC) $(CFLAGS) -c -o $@ $<

# Kernel sources are copied into this directory so the suffix rule
# above compiles local copies rather than the originals in ..
%.c: ../%.c
	cp -f $< $@

%.uc: ../%.uc
	cp -f $< $@

all:	raid6.a raid6test

# Static library with every syndrome implementation plus recovery,
# algorithm-selection and lookup-table code.
raid6.a: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \
	 raid6int32.o \
	 raid6mmx.o raid6sse1.o raid6sse2.o \
	 raid6altivec1.o raid6altivec2.o raid6altivec4.o raid6altivec8.o \
	 raid6recov.o raid6algos.o \
	 raid6tables.o
	 rm -f $@
	 $(AR) cq $@ $^
	 $(RANLIB) $@

raid6test: test.c raid6.a
	$(CC) $(CFLAGS) -o raid6test $^

# The .uc sources are templates; unroll.pl expands them with the
# given unrolling factor (the number after the script name).
raid6altivec1.c: raid6altivec.uc ../unroll.pl
	$(PERL) ../unroll.pl 1 < raid6altivec.uc > $@

raid6altivec2.c: raid6altivec.uc ../unroll.pl
	$(PERL) ../unroll.pl 2 < raid6altivec.uc > $@

raid6altivec4.c: raid6altivec.uc ../unroll.pl
	$(PERL) ../unroll.pl 4 < raid6altivec.uc > $@

raid6altivec8.c: raid6altivec.uc ../unroll.pl
	$(PERL) ../unroll.pl 8 < raid6altivec.uc > $@

raid6int1.c: raid6int.uc ../unroll.pl
	$(PERL) ../unroll.pl 1 < raid6int.uc > $@

raid6int2.c: raid6int.uc ../unroll.pl
	$(PERL) ../unroll.pl 2 < raid6int.uc > $@

raid6int4.c: raid6int.uc ../unroll.pl
	$(PERL) ../unroll.pl 4 < raid6int.uc > $@

raid6int8.c: raid6int.uc ../unroll.pl
	$(PERL) ../unroll.pl 8 < raid6int.uc > $@

raid6int16.c: raid6int.uc ../unroll.pl
	$(PERL) ../unroll.pl 16 < raid6int.uc > $@

raid6int32.c: raid6int.uc ../unroll.pl
	$(PERL) ../unroll.pl 32 < raid6int.uc > $@

# The GF(2^8) tables are generated at build time by the mktables tool
raid6tables.c: mktables
	./mktables > raid6tables.c

clean:
	rm -f *.o *.a mktables mktables.c raid6int.uc raid6*.c raid6test

spotless: clean
	rm -f *~
|
||||
103
drivers/md/raid6test/test.c
Normal file
103
drivers/md/raid6test/test.c
Normal file
@@ -0,0 +1,103 @@
|
||||
/* -*- linux-c -*- ------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* raid6test.c
|
||||
*
|
||||
* Test RAID-6 recovery with various algorithms
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "raid6.h"
|
||||
|
||||
#define NDISKS 16 /* Including P and Q */
|
||||
|
||||
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
|
||||
struct raid6_calls raid6_call;
|
||||
|
||||
char *dataptrs[NDISKS];
|
||||
char data[NDISKS][PAGE_SIZE];
|
||||
char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
|
||||
|
||||
void makedata(void)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
for ( i = 0 ; i < NDISKS ; i++ ) {
|
||||
for ( j = 0 ; j < PAGE_SIZE ; j++ ) {
|
||||
data[i][j] = rand();
|
||||
}
|
||||
dataptrs[i] = data[i];
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Exercise every available RAID-6 algorithm: generate a reference P/Q
 * syndrome, then for each pair (i, j) of simulated failed disks run the
 * dual recovery path and compare the recovered pages against the
 * originals.  Output is one status line per tested failure pair.
 */
int main(int argc, char *argv[])
{
	const struct raid6_calls * const * algo;
	int i, j;
	int erra, errb;	/* nonzero -> recovery of disk i / j mismatched */

	makedata();

	for ( algo = raid6_algos ; *algo ; algo++ ) {
		/* A NULL valid hook means "always usable" */
		if ( !(*algo)->valid || (*algo)->valid() ) {
			raid6_call = **algo;

			/* Nuke syndromes */
			/* NOTE(review): this single memset spans both the P page
			 * (data[NDISKS-2]) and the Q page (data[NDISKS-1]) — it
			 * relies on the rows of data[][] being contiguous. */
			memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);

			/* Generate assumed good syndrome */
			raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, (void **)&dataptrs);

			/* Try every ordered failure pair i < j */
			for ( i = 0 ; i < NDISKS-1 ; i++ ) {
				for ( j = i+1 ; j < NDISKS ; j++ ) {
					/* Poison the recovery buffers so stale data
					 * can't masquerade as a successful recovery */
					memset(recovi, 0xf0, PAGE_SIZE);
					memset(recovj, 0xba, PAGE_SIZE);

					/* Swap the "failed" disks for scratch pages */
					dataptrs[i] = recovi;
					dataptrs[j] = recovj;

					raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs);

					erra = memcmp(data[i], recovi, PAGE_SIZE);
					errb = memcmp(data[j], recovj, PAGE_SIZE);

					if ( i < NDISKS-2 && j == NDISKS-1 ) {
						/* We don't implement the DQ failure scenario, since it's
						   equivalent to a RAID-5 failure (XOR, then recompute Q) */
					} else {
						printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n",
						       raid6_call.name,
						       i, (i==NDISKS-2)?'P':'D',
						       j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D',
						       (!erra && !errb) ? "OK" :
						       !erra ? "ERRB" :
						       !errb ? "ERRA" :
						       "ERRAB");
					}

					/* Restore the real data pointers for the next pair */
					dataptrs[i] = data[i];
					dataptrs[j] = data[j];
				}
			}
		}
		printf("\n");
	}

	printf("\n");
	/* Pick the best algorithm test */
	raid6_select_algo();

	return 0;
}
|
||||
61
drivers/md/raid6x86.h
Normal file
61
drivers/md/raid6x86.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/* ----------------------------------------------------------------------- *
|
||||
*
|
||||
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
|
||||
* Bostom MA 02111-1307, USA; either version 2 of the License, or
|
||||
* (at your option) any later version; incorporated herein by reference.
|
||||
*
|
||||
* ----------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* raid6x86.h
|
||||
*
|
||||
* Definitions common to x86 and x86-64 RAID-6 code only
|
||||
*/
|
||||
|
||||
#ifndef LINUX_RAID_RAID6X86_H
|
||||
#define LINUX_RAID_RAID6X86_H
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
|
||||
#ifdef __KERNEL__ /* Real code */
|
||||
|
||||
#include <asm/i387.h>
|
||||
|
||||
#else /* Dummy code for user space testing */
|
||||
|
||||
/* User-space builds have no kernel FPU-state management; these are
 * deliberate no-ops so the shared SIMD code compiles unchanged. */
static inline void kernel_fpu_begin(void)
{
}

static inline void kernel_fpu_end(void)
{
}

/* CPUID feature bits, encoded as (leaf-word * 32 + bit) to mirror the
 * kernel's X86_FEATURE_* numbering.  Word 0 = CPUID leaf 1 EDX, word 1
 * = CPUID leaf 0x80000001 EDX (AMD extensions). */
#define X86_FEATURE_MMX		(0*32+23) /* Multimedia Extensions */
#define X86_FEATURE_FXSR	(0*32+24) /* FXSAVE and FXRSTOR instructions
					   * (fast save and restore) */
#define X86_FEATURE_XMM		(0*32+25) /* Streaming SIMD Extensions */
#define X86_FEATURE_XMM2	(0*32+26) /* Streaming SIMD Extensions-2 */
#define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */

/* Should work well enough on modern CPUs for testing */
static inline int boot_cpu_has(int flag)
{
	/* Word index selects the CPUID leaf: word 1 -> 0x80000001 */
	u32 eax = (flag >> 5) ? 0x80000001 : 1;
	u32 edx;

	asm volatile("cpuid"
		     : "+a" (eax), "=d" (edx)
		     : : "ecx", "ebx");

	return (edx >> (flag & 31)) & 1;
}
||||
|
||||
#endif /* ndef __KERNEL__ */
|
||||
|
||||
#endif
|
||||
#endif
|
||||
24
drivers/md/unroll.pl
Normal file
24
drivers/md/unroll.pl
Normal file
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/perl
#
# Take a piece of C code and for each line which contains the sequence $$
# repeat n times with $ replaced by 0...n-1; the sequence $# is replaced
# by the unrolling factor, and $* with a single $
#

# Unrolling factor from the command line; "+= 0" forces numeric context
($n) = @ARGV;
$n += 0;

while ( defined($line = <STDIN>) ) {
	# Lines containing $$ are emitted once per unroll index;
	# everything else passes through exactly once.
	if ( $line =~ /\$\$/ ) {
		$rep = $n;
	} else {
		$rep = 1;
	}
	for ( $i = 0 ; $i < $rep ; $i++ ) {
		$tmp = $line;
		$tmp =~ s/\$\$/$i/g;	# $$ -> current index
		$tmp =~ s/\$\#/$n/g;	# $# -> unrolling factor
		$tmp =~ s/\$\*/\$/g;	# $* -> literal $
		print $tmp;
	}
}
|
||||
154
drivers/md/xor.c
Normal file
154
drivers/md/xor.c
Normal file
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
* xor.c : Multiple Devices driver for Linux
|
||||
*
|
||||
* Copyright (C) 1996, 1997, 1998, 1999, 2000,
|
||||
* Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
|
||||
*
|
||||
* Dispatch optimized RAID-5 checksumming functions.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* (for example /usr/src/linux/COPYING); if not, write to the Free
|
||||
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#define BH_TRACE 0
|
||||
#include <linux/module.h>
|
||||
#include <linux/raid/md.h>
|
||||
#include <linux/raid/xor.h>
|
||||
#include <asm/xor.h>
|
||||
|
||||
/* The xor routines to use. */
|
||||
static struct xor_block_template *active_template;
|
||||
|
||||
void
|
||||
xor_block(unsigned int count, unsigned int bytes, void **ptr)
|
||||
{
|
||||
unsigned long *p0, *p1, *p2, *p3, *p4;
|
||||
|
||||
p0 = (unsigned long *) ptr[0];
|
||||
p1 = (unsigned long *) ptr[1];
|
||||
if (count == 2) {
|
||||
active_template->do_2(bytes, p0, p1);
|
||||
return;
|
||||
}
|
||||
|
||||
p2 = (unsigned long *) ptr[2];
|
||||
if (count == 3) {
|
||||
active_template->do_3(bytes, p0, p1, p2);
|
||||
return;
|
||||
}
|
||||
|
||||
p3 = (unsigned long *) ptr[3];
|
||||
if (count == 4) {
|
||||
active_template->do_4(bytes, p0, p1, p2, p3);
|
||||
return;
|
||||
}
|
||||
|
||||
p4 = (unsigned long *) ptr[4];
|
||||
active_template->do_5(bytes, p0, p1, p2, p3, p4);
|
||||
}
|
||||
|
||||
/* Set of all registered templates. */
|
||||
static struct xor_block_template *template_list;
|
||||
|
||||
#define BENCH_SIZE (PAGE_SIZE)
|
||||
|
||||
/*
 * Benchmark one XOR implementation and link it onto template_list.
 * Measures how many 2-source XOR passes over BENCH_SIZE bytes complete
 * in a single jiffy (best of 5 runs), stores the derived KB/s figure in
 * tmpl->speed, and prints it.  The mb() fences keep the compiler/CPU
 * from reordering work across the jiffies sampling.
 */
static void
do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
{
	int speed;
	unsigned long now;
	int i, count, max;

	/* Register the template so calibrate_xor_block() can scan it */
	tmpl->next = template_list;
	template_list = tmpl;

	/*
	 * Count the number of XORs done during a whole jiffy, and use
	 * this to calculate the speed of checksumming. We use a 2-page
	 * allocation to have guaranteed color L1-cache layout.
	 */
	max = 0;
	for (i = 0; i < 5; i++) {
		now = jiffies;
		count = 0;
		/* Spin until the jiffy counter ticks over */
		while (jiffies == now) {
			mb();
			tmpl->do_2(BENCH_SIZE, b1, b2);
			mb();
			count++;
			mb();
		}
		if (count > max)
			max = count;
	}

	/* passes-per-jiffy * HZ = passes/sec; * BENCH_SIZE/1024 = KB/s */
	speed = max * (HZ * BENCH_SIZE / 1024);
	tmpl->speed = speed;

	printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
	       speed / 1000, speed % 1000);
}
|
||||
|
||||
/*
 * Module init: pick the fastest XOR routine for this machine.
 * Either the architecture short-circuits the choice through
 * XOR_SELECT_TEMPLATE, or every candidate from XOR_TRY_TEMPLATES is
 * benchmarked via do_xor_speed() and the fastest wins.  The winner is
 * stored in active_template for xor_block() to dispatch through.
 * Returns 0 on success, -ENOMEM if the benchmark buffer can't be
 * allocated.
 */
static int
calibrate_xor_block(void)
{
	void *b1, *b2;
	struct xor_block_template *f, *fastest;

	/* Order-2 allocation (4 pages): two benchmark buffers at a fixed
	 * 2-page offset, giving a deterministic L1-cache coloring */
	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
	if (! b1) {
		printk("raid5: Yikes! No memory available.\n");
		return -ENOMEM;
	}
	b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;

	/*
	 * If this arch/cpu has a short-circuited selection, don't loop through all
	 * the possible functions, just test the best one
	 */

	fastest = NULL;

#ifdef XOR_SELECT_TEMPLATE
	fastest = XOR_SELECT_TEMPLATE(fastest);
#endif

#define xor_speed(templ)	do_xor_speed((templ), b1, b2)

	if (fastest) {
		printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n",
		       fastest->name);
		xor_speed(fastest);
	} else {
		printk(KERN_INFO "raid5: measuring checksumming speed\n");
		/* Arch-provided macro expands to xor_speed() calls, which
		 * also push each template onto template_list */
		XOR_TRY_TEMPLATES;
		fastest = template_list;
		for (f = fastest; f; f = f->next)
			if (f->speed > fastest->speed)
				fastest = f;
	}

	printk("raid5: using function: %s (%d.%03d MB/sec)\n",
	       fastest->name, fastest->speed / 1000, fastest->speed % 1000);

#undef xor_speed

	free_pages((unsigned long)b1, 2);

	active_template = fastest;
	return 0;
}
|
||||
|
||||
/* Nothing to tear down on module unload; active_template needs no cleanup */
static __exit void xor_exit(void) { }
|
||||
|
||||
EXPORT_SYMBOL(xor_block);
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
module_init(calibrate_xor_block);
|
||||
module_exit(xor_exit);
|
||||
Reference in New Issue
Block a user