Creation of Cybook 2416 (actually Gen4) repository

This commit is contained in:
mlt
2009-12-18 17:10:00 +00:00
committed by godzil
commit 76f20f4d40
13791 changed files with 6812321 additions and 0 deletions

54
block/Kconfig Normal file
View File

@@ -0,0 +1,54 @@
#
# Block layer core configuration
#
config BLOCK
bool "Enable the block layer" if EMBEDDED
default y
help
This permits the block layer to be removed from the kernel if it's not
needed (on some embedded devices for example). If this option is
disabled, then blockdev files will become unusable and some
filesystems (such as ext3) will become unavailable.
This option will also disable SCSI character devices and USB storage
since they make use of various block layer definitions and
facilities.
Say Y here unless you know you really don't want to mount disks and
suchlike.
if BLOCK
config LBD
bool "Support for Large Block Devices"
depends on !64BIT
help
Say Y here if you want to attach large (bigger than 2TB) discs to
your machine, or if you want to have a raid or loopback device
bigger than 2TB. Otherwise say N.
config BLK_DEV_IO_TRACE
bool "Support for tracing block io actions"
depends on SYSFS
select RELAY
select DEBUG_FS
help
Say Y here, if you want to be able to trace the block layer actions
on a given queue. Tracing allows you to see any traffic happening
on a block device queue. For more information (and the user space
support tools needed), fetch the blktrace app from:
git://brick.kernel.dk/data/git/blktrace.git
config LSF
bool "Support for Large Single Files"
depends on !64BIT
help
Say Y here if you want to be able to handle very large files (bigger
than 2TB), otherwise say N.
If unsure, say Y.
endif
source block/Kconfig.iosched

73
block/Kconfig.iosched Normal file
View File

@@ -0,0 +1,73 @@
if BLOCK
menu "IO Schedulers"
config IOSCHED_NOOP
bool
default y
---help---
The no-op I/O scheduler is a minimal scheduler that does basic merging
and sorting. Its main uses include non-disk based block devices like
memory devices, and specialised software or hardware environments
that do their own scheduling and require only minimal assistance from
the kernel.
config IOSCHED_AS
tristate "Anticipatory I/O scheduler"
default y
---help---
The anticipatory I/O scheduler is generally a good choice for most
environments, but is quite large and complex when compared to the
deadline I/O scheduler. It can also be slower in some cases,
especially with some database loads.
config IOSCHED_DEADLINE
tristate "Deadline I/O scheduler"
default y
---help---
The deadline I/O scheduler is simple and compact, and is often as
good as the anticipatory I/O scheduler, and in some database
workloads, better. In the case of a single process performing I/O to
a disk at any one time, its behaviour is almost identical to the
anticipatory I/O scheduler and so is a good choice.
config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
default y
---help---
The CFQ I/O scheduler tries to distribute bandwidth equally
among all processes in the system. It should provide a fair
working environment, suitable for desktop systems.
This is the default I/O scheduler.
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
help
Select the I/O scheduler which will be used by default for all
block devices.
config DEFAULT_AS
bool "Anticipatory" if IOSCHED_AS=y
config DEFAULT_DEADLINE
bool "Deadline" if IOSCHED_DEADLINE=y
config DEFAULT_CFQ
bool "CFQ" if IOSCHED_CFQ=y
config DEFAULT_NOOP
bool "No-op"
endchoice
config DEFAULT_IOSCHED
string
default "anticipatory" if DEFAULT_AS
default "deadline" if DEFAULT_DEADLINE
default "cfq" if DEFAULT_CFQ
default "noop" if DEFAULT_NOOP
endmenu
endif

12
block/Makefile Normal file
View File

@@ -0,0 +1,12 @@
#
# Makefile for the kernel block layer
#
obj-$(CONFIG_BLOCK) := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o

1485
block/as-iosched.c Normal file

File diff suppressed because it is too large Load Diff

562
block/blktrace.c Normal file
View File

@@ -0,0 +1,562 @@
/*
* Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <linux/time.h>
#include <asm/uaccess.h>
/*
 * Per-CPU value subtracted from sched_clock() when a trace record is
 * timestamped; filled in by blk_check_time() at init time.
 */
static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
/*
 * Bumped every time a trace is started. A task whose btrace_seq differs
 * from this value gets a process note emitted before its next event.
 */
static unsigned int blktrace_seq __read_mostly = 1;
/*
 * Emit a notify record: a blk_io_trace header followed by @len bytes of
 * payload copied from @data. If no space can be reserved in the relay
 * channel the note is silently dropped.
 */
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
		       const void *data, size_t len)
{
	struct blk_io_trace *t = relay_reserve(bt->rchan, sizeof(*t) + len);
	int cpu;

	if (!t)
		return;

	cpu = smp_processor_id();
	t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
	t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
	t->device = bt->dev;
	t->action = action;
	t->pid = pid;
	t->cpu = cpu;
	t->pdu_len = len;
	/* the payload lives directly behind the fixed-size header */
	memcpy(t + 1, data, len);
}
/*
* Send out a notify for this process, if we haven't done so since a trace
* started
*/
static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
{
tsk->btrace_seq = blktrace_seq;
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
}
static void trace_note_time(struct blk_trace *bt)
{
struct timespec now;
unsigned long flags;
u32 words[2];
getnstimeofday(&now);
words[0] = now.tv_sec;
words[1] = now.tv_nsec;
local_irq_save(flags);
trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
local_irq_restore(flags);
}
/*
 * Decide whether an event must be filtered out.
 *
 * Returns 1 (skip the event) when its action category is not enabled in
 * the trace's mask, when @sector lies outside [start_lba, end_lba], or when
 * the trace is restricted to a pid other than @pid. Returns 0 otherwise.
 */
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
			 pid_t pid)
{
	const int category_masked = !((bt->act_mask << BLK_TC_SHIFT) & what);
	const int outside_range = sector < bt->start_lba ||
				  sector > bt->end_lba;
	const int foreign_pid = bt->pid && pid != bt->pid;

	return category_masked || outside_range || foreign_pid;
}
/*
 * Data direction bit lookup: indexed by (rw & WRITE) in __blk_add_trace(),
 * slot 0 yields the read category and slot 1 the write category.
 */
static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
/*
 * Bio action bits of interest. Indexed by the value produced by the
 * trace_*_bit() macros: slot 1 = barrier, 2 = sync, 4 = read-ahead,
 * 8 = meta. All other slots (including 0, used when the flag is clear)
 * contribute nothing.
 */
static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
/*
 * Each macro isolates one BIO_RW_* flag in @rw and shifts it to a distinct
 * bit position (barrier -> bit 0, sync -> bit 1, ahead -> bit 2,
 * meta -> bit 3), so the result is either 0 or the matching bio_act[] index.
 *
 * More could be added as needed; each new macro must target the next free
 * bit position and bio_act[] must grow accordingly.
 * NOTE(review): the shift amounts assume particular BIO_RW_* values (e.g.
 * BIO_RW_AHEAD <= 2, BIO_RW_META >= 3) -- confirm against the bio header.
 */
#define trace_barrier_bit(rw) \
(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
#define trace_sync_bit(rw) \
(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
#define trace_ahead_bit(rw) \
(((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
#define trace_meta_bit(rw) \
(((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
/*
 * The worker for the various blk_add_trace*() types. Fills out a
 * blk_io_trace structure and places it in the relay channel of @bt.
 *
 * @rw is folded into @what as direction and bio-flag categories; the event
 * is then subject to the trace's filters. @pdu_data, if @pdu_len is
 * non-zero, is appended behind the fixed-size record.
 */
void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
{
	struct task_struct *tsk = current;
	const pid_t pid = tsk->pid;
	struct blk_io_trace *t;
	unsigned long *sequence;
	unsigned long flags;
	int cpu;

	if (unlikely(bt->trace_state != Blktrace_running))
		return;

	what |= ddir_act[rw & WRITE];
	what |= bio_act[trace_barrier_bit(rw)] |
		bio_act[trace_sync_bit(rw)] |
		bio_act[trace_ahead_bit(rw)] |
		bio_act[trace_meta_bit(rw)];

	if (unlikely(act_log_check(bt, what, sector, pid)))
		return;

	/*
	 * A word about the locking here - interrupts stay disabled from the
	 * moment space is reserved in the relay buffer until the record is
	 * completely filled in, so an irq cannot come in and step on our
	 * toes in between.
	 */
	local_irq_save(flags);

	/* first event of this task since the trace started: announce it */
	if (unlikely(tsk->btrace_seq != blktrace_seq))
		trace_note_tsk(bt, tsk);

	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
	if (!t)
		goto out;

	cpu = smp_processor_id();
	sequence = per_cpu_ptr(bt->sequence, cpu);

	t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
	t->sequence = ++(*sequence);
	t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
	t->sector = sector;
	t->bytes = bytes;
	t->action = what;
	t->pid = pid;
	t->device = bt->dev;
	t->cpu = cpu;
	t->error = error;
	t->pdu_len = pdu_len;

	if (pdu_len)
		memcpy(t + 1, pdu_data, pdu_len);
out:
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__blk_add_trace);
/* debugfs "block" directory shared by all traces; NULL while it does not exist */
static struct dentry *blk_tree_root;
/* serialises creation/removal of the tree; initialised in blk_trace_init() */
static struct mutex blk_tree_mutex;
/* number of per-device directories currently living below blk_tree_root */
static unsigned int root_users;
/* Remove the shared debugfs root directory, if it currently exists. */
static inline void blk_remove_root(void)
{
	if (!blk_tree_root)
		return;

	debugfs_remove(blk_tree_root);
	blk_tree_root = NULL;
}
/*
 * Remove a per-device directory and drop its reference on the shared root;
 * the root itself goes away together with its last user.
 */
static void blk_remove_tree(struct dentry *dir)
{
	mutex_lock(&blk_tree_mutex);

	debugfs_remove(dir);
	root_users--;
	if (!root_users)
		blk_remove_root();

	mutex_unlock(&blk_tree_mutex);
}
/*
 * Create the per-device debugfs directory @blk_name below the shared
 * "block" root, creating the root first if it does not exist yet.
 *
 * Returns the new directory, or NULL on failure. On failure the root is
 * removed only if this very call created it: tearing it down while other
 * traces still live below it would clear blk_tree_root although the
 * directory is still present, making every later setup fail.
 */
static struct dentry *blk_create_tree(const char *blk_name)
{
	struct dentry *dir = NULL;
	int created = 0;

	mutex_lock(&blk_tree_mutex);

	if (!blk_tree_root) {
		blk_tree_root = debugfs_create_dir("block", NULL);
		if (!blk_tree_root)
			goto err;
		created = 1;
	}

	dir = debugfs_create_dir(blk_name, blk_tree_root);
	if (dir)
		root_users++;
	else if (created)
		blk_remove_root();	/* undo only what we set up ourselves */

err:
	mutex_unlock(&blk_tree_mutex);
	return dir;
}
/*
 * Release everything owned by @bt: the relay channel and the "dropped"
 * file are removed first, then the per-device directory that contained
 * them, and finally the per-cpu sequence counters and @bt itself.
 */
static void blk_trace_cleanup(struct blk_trace *bt)
{
relay_close(bt->rchan);
debugfs_remove(bt->dropped_file);
blk_remove_tree(bt->dir);
free_percpu(bt->sequence);
kfree(bt);
}
/*
 * Detach the trace from @q and free it.
 *
 * Returns -EINVAL if no trace is attached, 0 otherwise. Cleanup only
 * happens for a trace in the setup or stopped state.
 * NOTE(review): a trace detached in any other state is not cleaned up
 * here -- confirm whether that can leak when tearing down a running trace.
 */
static int blk_trace_remove(request_queue_t *q)
{
	struct blk_trace *bt = xchg(&q->blk_trace, NULL);

	if (!bt)
		return -EINVAL;

	switch (bt->trace_state) {
	case Blktrace_setup:
	case Blktrace_stopped:
		blk_trace_cleanup(bt);
		break;
	default:
		break;
	}

	return 0;
}
/*
 * open() for the "dropped" debugfs file: hand the inode's private pointer
 * (the blk_trace passed to debugfs_create_file() in blk_trace_setup())
 * over to the file so blk_dropped_read() can find it. Always returns 0.
 */
static int blk_dropped_open(struct inode *inode, struct file *filp)
{
filp->private_data = inode->i_private;
return 0;
}
/*
 * read() for the "dropped" debugfs file: reports the trace's dropped
 * counter as a decimal number followed by a newline.
 */
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
				size_t count, loff_t *ppos)
{
	struct blk_trace *bt = filp->private_data;
	char text[16];
	size_t text_len;

	snprintf(text, sizeof(text), "%u\n", atomic_read(&bt->dropped));
	text_len = strlen(text);

	return simple_read_from_buffer(buffer, count, ppos, text, text_len);
}
/* File operations of the read-only "dropped" debugfs file. */
static const struct file_operations blk_dropped_fops = {
.owner = THIS_MODULE,
.open = blk_dropped_open,
.read = blk_dropped_read,
};
/*
 * Relay sub-buffer switch callback. Keeps track of how many times we
 * encountered a full buffer, to aid the user space app in telling how many
 * lost events there were.
 *
 * Returns 1 when the buffer is not full, 0 (after bumping the dropped
 * counter) when it is.
 */
static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
				     void *prev_subbuf, size_t prev_padding)
{
	if (relay_buf_full(buf)) {
		struct blk_trace *bt = buf->chan->private_data;

		atomic_inc(&bt->dropped);
		return 0;
	}

	return 1;
}
/* Relay callback: remove the debugfs file backing a channel buffer. Always returns 0. */
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
debugfs_remove(dentry);
return 0;
}
/*
 * Relay callback: create the debugfs file backing a channel buffer. The
 * file uses relay's own file operations with @buf as its private data.
 * @is_global is left untouched.
 */
static struct dentry *blk_create_buf_file_callback(const char *filename,
struct dentry *parent,
int mode,
struct rchan_buf *buf,
int *is_global)
{
return debugfs_create_file(filename, mode, parent, buf,
&relay_file_operations);
}
/* Callback set handed to relay_open() in blk_trace_setup(). */
static struct rchan_callbacks blk_relay_callbacks = {
.subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
};
/*
 * Setup everything required to start tracing.
 *
 * Reads a blk_user_trace_setup from @arg, fills in the (sanitised) device
 * name and writes it back, then allocates the trace, its debugfs directory,
 * the "dropped" file and the relay channel, and finally attaches the trace
 * to @q in the Blktrace_setup state.
 *
 * Returns 0 on success or a negative errno:
 *   -EFAULT  user memory could not be read/written
 *   -EINVAL  buf_size or buf_nr is zero
 *   -ENOMEM  allocation failure
 *   -ENOENT  the debugfs directory could not be created
 *   -EIO     the "dropped" file or the relay channel could not be created
 *   -EBUSY   the queue already has a trace attached
 */
static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
			   char __user *arg)
{
	struct blk_user_trace_setup buts;
	struct blk_trace *old_bt, *bt = NULL;
	struct dentry *dir = NULL;
	char b[BDEVNAME_SIZE];
	char *p;
	int ret;

	if (copy_from_user(&buts, arg, sizeof(buts)))
		return -EFAULT;

	if (!buts.buf_size || !buts.buf_nr)
		return -EINVAL;

	/* bounded copy: never write past the end of the name field */
	strlcpy(buts.name, bdevname(bdev, b), sizeof(buts.name));

	/*
	 * some device names have larger paths - convert the slashes
	 * to underscores for this to work as expected
	 */
	for (p = buts.name; *p; p++)
		if (*p == '/')
			*p = '_';

	if (copy_to_user(arg, &buts, sizeof(buts)))
		return -EFAULT;

	ret = -ENOMEM;
	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
	if (!bt)
		goto err;

	bt->sequence = alloc_percpu(unsigned long);
	if (!bt->sequence)
		goto err;

	ret = -ENOENT;
	dir = blk_create_tree(buts.name);
	if (!dir)
		goto err;

	bt->dir = dir;
	bt->dev = bdev->bd_dev;
	atomic_set(&bt->dropped, 0);

	ret = -EIO;
	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
	if (!bt->dropped_file)
		goto err;

	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks, bt);
	if (!bt->rchan)
		goto err;

	/* an empty mask means "trace every category" */
	bt->act_mask = buts.act_mask;
	if (!bt->act_mask)
		bt->act_mask = (u16) -1;

	/* an end_lba of zero means "no upper bound" */
	bt->start_lba = buts.start_lba;
	bt->end_lba = buts.end_lba;
	if (!bt->end_lba)
		bt->end_lba = -1ULL;

	bt->pid = buts.pid;
	bt->trace_state = Blktrace_setup;

	ret = -EBUSY;
	old_bt = xchg(&q->blk_trace, bt);
	if (old_bt) {
		/* somebody else owns the queue: put their trace back */
		(void) xchg(&q->blk_trace, old_bt);
		goto err;
	}

	return 0;
err:
	/*
	 * Tear down in the same order as blk_trace_cleanup(): the files
	 * living inside the directory have to go before the directory
	 * itself, otherwise removing the directory fails and it is leaked.
	 */
	if (bt) {
		if (bt->rchan)
			relay_close(bt->rchan);
		if (bt->dropped_file)
			debugfs_remove(bt->dropped_file);
	}
	if (dir)
		blk_remove_tree(dir);
	if (bt) {
		free_percpu(bt->sequence);
		kfree(bt);
	}
	return ret;
}
/*
 * Start (@start != 0) or stop (@start == 0) the trace attached to @q.
 *
 * For starting a trace, we can transition from a setup or stopped
 * trace. For stopping a trace, the state must be running.
 *
 * Returns 0 on a successful transition, -EINVAL if there is no trace or
 * the current state does not allow the requested transition.
 */
static int blk_trace_startstop(request_queue_t *q, int start)
{
	struct blk_trace *bt = q->blk_trace;

	if (bt == NULL)
		return -EINVAL;

	if (start) {
		if (bt->trace_state != Blktrace_setup &&
		    bt->trace_state != Blktrace_stopped)
			return -EINVAL;

		/* new sequence: every task gets re-announced in this run */
		blktrace_seq++;
		smp_mb();
		bt->trace_state = Blktrace_running;
		trace_note_time(bt);
		return 0;
	}

	if (bt->trace_state != Blktrace_running)
		return -EINVAL;

	bt->trace_state = Blktrace_stopped;
	relay_flush(bt->rchan);
	return 0;
}
/**
 * blk_trace_ioctl: - handle the ioctls associated with tracing
 * @bdev:	the block device
 * @cmd:	the ioctl cmd
 * @arg:	the argument data, if any
 *
 * Dispatches BLKTRACESETUP, BLKTRACESTART, BLKTRACESTOP and
 * BLKTRACETEARDOWN with the device's bd_mutex held. Returns -ENXIO if the
 * device has no queue and -ENOTTY for any other command; otherwise the
 * result of the selected operation.
 **/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
	request_queue_t *q = bdev_get_queue(bdev);
	int ret;

	if (!q)
		return -ENXIO;

	mutex_lock(&bdev->bd_mutex);

	switch (cmd) {
	case BLKTRACESETUP:
		ret = blk_trace_setup(q, bdev, arg);
		break;
	case BLKTRACESTART:
		ret = blk_trace_startstop(q, 1);
		break;
	case BLKTRACESTOP:
		ret = blk_trace_startstop(q, 0);
		break;
	case BLKTRACETEARDOWN:
		ret = blk_trace_remove(q);
		break;
	default:
		ret = -ENOTTY;
		break;
	}

	mutex_unlock(&bdev->bd_mutex);
	return ret;
}
/**
 * blk_trace_shutdown: - stop and cleanup trace structures
 * @q:    the request queue associated with the device
 *
 * Does nothing if no trace is attached to @q.
 **/
void blk_trace_shutdown(request_queue_t *q)
{
	if (!q->blk_trace)
		return;

	/* stop first so that the removal below finds a stopped trace */
	blk_trace_startstop(q, 0);
	blk_trace_remove(q);
}
/*
 * Store in *@t the offset between wall-clock time (in nanoseconds) and
 * sched_clock(): gettimeofday() is sampled between two sched_clock() calls
 * and the average of the two readings is subtracted.
 */
static void blk_check_time(unsigned long long *t)
{
	unsigned long long a, b;
	struct timeval tv;

	a = sched_clock();
	do_gettimeofday(&tv);
	b = sched_clock();

	/*
	 * Widen before multiplying: tv_sec is a long, and seconds * 10^9
	 * does not fit where long is 32 bits wide.
	 */
	*t = (unsigned long long) tv.tv_sec * 1000000000ULL +
	     (unsigned long long) tv.tv_usec * 1000ULL;
	*t -= (a + b) / 2;
}
/*
 * calibrate our inter-CPU timings: compute the clock offset of the CPU
 * this function runs on. @data is unused.
 */
static void blk_trace_check_cpu_time(void *data)
{
	const int cpu = get_cpu();
	unsigned long long *offset = &per_cpu(blk_trace_cpu_offset, cpu);
	int pass;

	/*
	 * Measure twice and keep the second result; hopefully the second
	 * call will be cache hot and a little more precise
	 */
	for (pass = 0; pass < 2; pass++)
		blk_check_time(offset);

	put_cpu();
}
/*
 * With CONFIG_SCHED_SMT, copy every online CPU's time offset to its
 * siblings so that HT siblings share the same offset. A no-op otherwise.
 */
static void blk_trace_set_ht_offsets(void)
{
#if defined(CONFIG_SCHED_SMT)
	int cpu, sibling;

	/*
	 * now make sure HT siblings have the same time offset
	 */
	preempt_disable();
	for_each_online_cpu(cpu) {
		unsigned long long *cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);

		for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) {
			if (sibling != cpu)
				per_cpu(blk_trace_cpu_offset, sibling) = *cpu_off;
		}
	}
	preempt_enable();
#endif
}
/*
 * Init-time setup: initialise the debugfs tree mutex, let every CPU
 * compute its time offset, then equalise the offsets of HT siblings.
 * Always returns 0.
 */
static __init int blk_trace_init(void)
{
mutex_init(&blk_tree_mutex);
on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
blk_trace_set_ht_offsets();
return 0;
}
module_init(blk_trace_init);

2238
block/cfq-iosched.c Normal file

File diff suppressed because it is too large Load Diff

485
block/deadline-iosched.c Normal file
View File

@@ -0,0 +1,485 @@
/*
* Deadline i/o scheduler.
*
* Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
/*
 * See Documentation/block/deadline-iosched.txt
 *
 * Defaults copied into each queue's deadline_data by deadline_init_queue().
 * The expire values are in jiffies.
 */
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
/*
 * Per-queue state of the deadline scheduler. The two-element arrays are
 * indexed by data direction (READ / WRITE).
 */
struct deadline_data {
/*
* run time data
*/
/*
* every queued request sits on both the sector-sorted rbtree (sort_list)
* and the fifo list (fifo_list) of its data direction
*/
struct rb_root sort_list[2];
struct list_head fifo_list[2];
/*
* next in sort order. read, write or both are NULL
*/
struct request *next_rq[2];
unsigned int batching; /* number of sequential requests made */
sector_t last_sector; /* end sector of the last dispatched request */
unsigned int starved; /* times reads have starved writes */
/*
* settings that change how the i/o scheduler behaves
*/
int fifo_expire[2]; /* per-direction expiry, in jiffies */
int fifo_batch; /* batch length limit compared against batching */
int writes_starved; /* limit compared against starved */
int front_merges; /* non-zero: deadline_merge() looks for front merges */
};
/* forward declaration: used by deadline_add_rq_rb() before its definition */
static void deadline_move_request(struct deadline_data *, struct request *);
/* the sort tree matching the data direction of @rq */
#define RQ_RB_ROOT(dd, rq) (&(dd)->sort_list[rq_data_dir((rq))])
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
struct rb_root *root = RQ_RB_ROOT(dd, rq);
struct request *__alias;
retry:
__alias = elv_rb_add(root, rq);
if (unlikely(__alias)) {
deadline_move_request(dd, __alias);
goto retry;
}
}
/*
 * Remove @rq from its sort tree. If @rq was the cached "next" request of
 * its direction, the cache advances to the tree successor (or NULL).
 */
static inline void
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
{
	const int dir = rq_data_dir(rq);

	if (dd->next_rq[dir] == rq) {
		struct rb_node *succ = rb_next(&rq->rb_node);

		dd->next_rq[dir] = succ ? rb_entry_rq(succ) : NULL;
	}

	elv_rb_del(RQ_RB_ROOT(dd, rq), rq);
}
/*
 * add rq to rbtree and fifo
 */
static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const int dir = rq_data_dir(rq);
	struct list_head *fifo = &dd->fifo_list[dir];

	deadline_add_rq_rb(dd, rq);

	/*
	 * stamp the request with its expiry (now + the per-direction
	 * timeout) and queue it at the tail of the fifo
	 */
	rq_set_fifo_time(rq, jiffies + dd->fifo_expire[dir]);
	list_add_tail(&rq->queuelist, fifo);
}
/*
 * remove rq from rbtree and fifo: rq_fifo_clear() handles the fifo side,
 * deadline_del_rq_rb() the sort tree (and the next_rq cache).
 */
static void deadline_remove_request(request_queue_t *q, struct request *rq)
{
struct deadline_data *dd = q->elevator->elevator_data;
rq_fifo_clear(rq);
deadline_del_rq_rb(dd, rq);
}
/*
 * Look for a queued request that @bio could be front-merged into, i.e.
 * one that starts exactly where @bio ends.
 *
 * Returns ELEVATOR_FRONT_MERGE and stores the candidate in *@req, or
 * ELEVATOR_NO_MERGE (leaving *@req untouched) when front merging is
 * disabled, no such request exists, or the merge is not permitted.
 */
static int
deadline_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	struct request *candidate;
	sector_t bio_end;

	if (!dd->front_merges)
		return ELEVATOR_NO_MERGE;

	bio_end = bio->bi_sector + bio_sectors(bio);
	candidate = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], bio_end);
	if (!candidate)
		return ELEVATOR_NO_MERGE;

	BUG_ON(bio_end != candidate->sector);

	if (!elv_rq_merge_ok(candidate, bio))
		return ELEVATOR_NO_MERGE;

	*req = candidate;
	return ELEVATOR_FRONT_MERGE;
}
/*
 * Called after a bio was merged into @req. A front merge changes the
 * request's start sector, so it has to be re-inserted into the sort tree;
 * other merge types need no action here.
 */
static void deadline_merged_request(request_queue_t *q, struct request *req,
				    int type)
{
	struct deadline_data *dd = q->elevator->elevator_data;

	if (type != ELEVATOR_FRONT_MERGE)
		return;

	elv_rb_del(RQ_RB_ROOT(dd, req), req);
	deadline_add_rq_rb(dd, req);
}
/*
 * Called after @next was merged into @req; @next is dropped from the
 * scheduler afterwards.
 */
static void
deadline_merged_requests(request_queue_t *q, struct request *req,
			 struct request *next)
{
	const int both_on_fifo = !list_empty(&req->queuelist) &&
				 !list_empty(&next->queuelist);

	/*
	 * if next expires before req, req inherits next's expire time and
	 * takes over next's position in the fifo (next will be deleted)
	 */
	if (both_on_fifo &&
	    time_before(rq_fifo_time(next), rq_fifo_time(req))) {
		list_move(&req->queuelist, &next->queuelist);
		rq_set_fifo_time(req, rq_fifo_time(next));
	}

	/*
	 * kill knowledge of next, this one is a goner
	 */
	deadline_remove_request(q, next);
}
/*
 * move request from sort list to dispatch queue: the request is taken off
 * the scheduler's tree and fifo, then appended to its queue's dispatch
 * list. @dd is not used by the body.
 */
static inline void
deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{
request_queue_t *q = rq->q;
deadline_remove_request(q, rq);
elv_dispatch_add_tail(q, rq);
}
/*
 * move an entry to dispatch queue, updating the batching state: the
 * next_rq cache is reset so that only @rq's direction points at @rq's
 * tree successor (if any), and last_sector records where @rq ends.
 */
static void
deadline_move_request(struct deadline_data *dd, struct request *rq)
{
	const int dir = rq_data_dir(rq);
	struct rb_node *succ = rb_next(&rq->rb_node);

	dd->next_rq[READ] = NULL;
	dd->next_rq[WRITE] = NULL;
	if (succ != NULL)
		dd->next_rq[dir] = rb_entry_rq(succ);

	dd->last_sector = rq->sector + rq->nr_sectors;

	/*
	 * take it off the sort and fifo list, move
	 * to dispatch queue
	 */
	deadline_move_to_dispatch(dd, rq);
}
/*
 * deadline_check_fifo returns 1 if the oldest request on the fifo of
 * direction @ddir has expired, 0 otherwise.
 * Requires !list_empty(&dd->fifo_list[ddir])
 */
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
{
	struct request *oldest = rq_entry_fifo(dd->fifo_list[ddir].next);

	return time_after(jiffies, rq_fifo_time(oldest)) ? 1 : 0;
}
/*
 * deadline_dispatch_requests selects the best request according to
 * read/write expire, fifo_batch, etc and moves it to the dispatch queue.
 *
 * Returns 1 if a request was dispatched, 0 if both fifo lists are empty.
 * @force is not used by the body.
 */
static int deadline_dispatch_requests(request_queue_t *q, int force)
{
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]);
struct request *rq;
int data_dir;
/*
* batches are currently reads XOR writes
*/
if (dd->next_rq[WRITE])
rq = dd->next_rq[WRITE];
else
rq = dd->next_rq[READ];
if (rq) {
/* we have a "next request" */
if (dd->last_sector != rq->sector)
/* end the batch on a non sequential request */
dd->batching += dd->fifo_batch;
if (dd->batching < dd->fifo_batch)
/* we are still entitled to batch */
goto dispatch_request;
}
/*
* at this point we are not running a batch. select the appropriate
* data direction (read / write)
*/
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
/* reads win unless pending writes were passed over often enough */
if (writes && (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
data_dir = READ;
goto dispatch_find_request;
}
/*
* there are either no reads or writes have been starved
*/
if (writes) {
dispatch_writes:
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
dd->starved = 0;
data_dir = WRITE;
goto dispatch_find_request;
}
/* nothing queued in either direction */
return 0;
dispatch_find_request:
/*
* we are not running a batch, find best request for selected data_dir
*/
if (deadline_check_fifo(dd, data_dir)) {
/* An expired request exists - satisfy it */
dd->batching = 0;
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
} else if (dd->next_rq[data_dir]) {
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
rq = dd->next_rq[data_dir];
} else {
struct rb_node *node;
/*
* The last req was the other direction or we have run out of
* higher-sectored requests. Go back to the lowest sectored
* request (1 way elevator) and start a new batch.
*/
dd->batching = 0;
node = rb_first(&dd->sort_list[data_dir]);
/* the BUG_ON checks above assert this tree is not empty */
if (node)
rq = rb_entry_rq(node);
}
dispatch_request:
/*
* rq is the selected appropriate request.
*/
dd->batching++;
deadline_move_request(dd, rq);
return 1;
}
/* Returns non-zero when neither fifo list holds a request. */
static int deadline_queue_empty(request_queue_t *q)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const int no_writes = list_empty(&dd->fifo_list[WRITE]);
	const int no_reads = list_empty(&dd->fifo_list[READ]);

	return no_writes && no_reads;
}
/*
 * Free the scheduler's private data. Both fifo lists are expected to be
 * empty at this point; anything else triggers BUG().
 */
static void deadline_exit_queue(elevator_t *e)
{
struct deadline_data *dd = e->elevator_data;
BUG_ON(!list_empty(&dd->fifo_list[READ]));
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
kfree(dd);
}
/*
 * initialize elevator private data (deadline_data): allocate it on the
 * queue's node, zero it and apply the default tunables. Returns NULL if
 * the allocation fails.
 */
static void *deadline_init_queue(request_queue_t *q)
{
	struct deadline_data *dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node);

	if (dd == NULL)
		return NULL;

	memset(dd, 0, sizeof(*dd));

	/* read side */
	INIT_LIST_HEAD(&dd->fifo_list[READ]);
	dd->sort_list[READ] = RB_ROOT;
	dd->fifo_expire[READ] = read_expire;

	/* write side */
	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
	dd->sort_list[WRITE] = RB_ROOT;
	dd->fifo_expire[WRITE] = write_expire;

	/* tunables */
	dd->writes_starved = writes_starved;
	dd->front_merges = 1;
	dd->fifo_batch = fifo_batch;

	return dd;
}
/*
 * sysfs parts below
 */

/* Format @var as a decimal line into @page; returns the number of characters written. */
static ssize_t
deadline_var_show(int var, char *page)
{
	const int written = sprintf(page, "%d\n", var);

	return written;
}
/*
 * Parse a base-10 integer from @page into *@var. Always reports the whole
 * input (@count) as consumed; trailing characters are ignored.
 */
static ssize_t
deadline_var_store(int *var, const char *page, size_t count)
{
	char *end;

	*var = simple_strtol(page, &end, 10);
	return count;
}
/*
 * Generate a sysfs show handler named __FUNC that prints __VAR (an
 * expression that may reference the local "dd"). With __CONV set, the
 * value is converted from jiffies to milliseconds first.
 */
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
static ssize_t __FUNC(elevator_t *e, char *page) \
{ \
struct deadline_data *dd = e->elevator_data; \
int __data = __VAR; \
if (__CONV) \
__data = jiffies_to_msecs(__data); \
return deadline_var_show(__data, (page)); \
}
SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
#undef SHOW_FUNCTION
/*
 * Generate a sysfs store handler named __FUNC: the parsed integer is
 * clamped to [MIN, MAX] and written through __PTR (an expression that may
 * reference the local "dd"). With __CONV set, the value is taken as
 * milliseconds and converted to jiffies before being stored.
 */
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t __FUNC(elevator_t *e, const char *page, size_t count) \
{ \
struct deadline_data *dd = e->elevator_data; \
int __data; \
int ret = deadline_var_store(&__data, (page), count); \
if (__data < (MIN)) \
__data = (MIN); \
else if (__data > (MAX)) \
__data = (MAX); \
if (__CONV) \
*(__PTR) = msecs_to_jiffies(__data); \
else \
*(__PTR) = __data; \
return ret; \
}
STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
#undef STORE_FUNCTION
/*
 * Build one attribute entry, readable by everyone and writable by the
 * owner, wired to the generated deadline_<name>_show/_store handlers.
 */
#define DD_ATTR(name) \
__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
deadline_##name##_store)
/* Tunables exported by the scheduler; the list is terminated by __ATTR_NULL. */
static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(read_expire),
DD_ATTR(write_expire),
DD_ATTR(writes_starved),
DD_ATTR(front_merges),
DD_ATTR(fifo_batch),
__ATTR_NULL
};
/*
 * Elevator descriptor registered under the name "deadline": hooks the
 * functions above (plus the generic rbtree former/latter helpers) into the
 * elevator core and exposes deadline_attrs.
 */
static struct elevator_type iosched_deadline = {
.ops = {
.elevator_merge_fn = deadline_merge,
.elevator_merged_fn = deadline_merged_request,
.elevator_merge_req_fn = deadline_merged_requests,
.elevator_dispatch_fn = deadline_dispatch_requests,
.elevator_add_req_fn = deadline_add_request,
.elevator_queue_empty_fn = deadline_queue_empty,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,
.elevator_init_fn = deadline_init_queue,
.elevator_exit_fn = deadline_exit_queue,
},
.elevator_attrs = deadline_attrs,
.elevator_name = "deadline",
.elevator_owner = THIS_MODULE,
};
/* Module init: register the scheduler and pass on elv_register()'s result. */
static int __init deadline_init(void)
{
return elv_register(&iosched_deadline);
}
/* Module exit: unregister the scheduler. */
static void __exit deadline_exit(void)
{
elv_unregister(&iosched_deadline);
}
module_init(deadline_init);
module_exit(deadline_exit);
MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("deadline IO scheduler");

1159
block/elevator.c Normal file

File diff suppressed because it is too large Load Diff

747
block/genhd.c Normal file
View File

@@ -0,0 +1,747 @@
/*
* gendisk handling
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/kobj_map.h>
#include <linux/buffer_head.h>
#include <linux/mutex.h>
/* The block subsystem object; its kset list is walked by the /proc iterator below. */
struct subsystem block_subsys;
/* Guards major_names[] and is held while the partition iterator runs. */
static DEFINE_MUTEX(block_subsys_lock);
/*
 * Hash table of registered block majors: each bucket is a singly linked
 * chain of (major, name) entries, selected by major_to_index().
 *
 * Can be deleted altogether. Later.
 */
static struct blk_major_name {
struct blk_major_name *next; /* next entry in the same bucket */
int major; /* registered major number */
char name[16]; /* driver name, truncated to fit */
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
/* Bucket of major_names[] for @major - for now: assume no multimajor ranges */
static inline int major_to_index(int major)
{
	const int bucket = major % BLKDEV_MAJOR_HASH_SIZE;

	return bucket;
}
#ifdef CONFIG_PROC_FS
/*
 * Print every registered major in hash bucket @offset to @f, one
 * "major name" line each. Offsets at or beyond the table size print
 * nothing.
 */
void blkdev_show(struct seq_file *f, off_t offset)
{
	struct blk_major_name *entry;

	if (offset >= BLKDEV_MAJOR_HASH_SIZE)
		return;

	mutex_lock(&block_subsys_lock);
	entry = major_names[offset];
	while (entry) {
		seq_printf(f, "%3d %s\n", entry->major, entry->name);
		entry = entry->next;
	}
	mutex_unlock(&block_subsys_lock);
}
#endif /* CONFIG_PROC_FS */
/*
 * Register @name as the owner of block major @major.
 *
 * If @major is 0 a free major is picked: the hash table is scanned from
 * its highest index down to 1 for an empty bucket, and that index becomes
 * the major. In this case the chosen major is returned on success.
 *
 * Returns 0 (or the allocated major, see above) on success, -EBUSY if the
 * major is already taken or no free major exists, -ENOMEM if the entry
 * cannot be allocated. @name is copied and silently truncated to the size
 * of the entry's name field.
 */
int register_blkdev(unsigned int major, const char *name)
{
struct blk_major_name **n, *p;
int index, ret = 0;
mutex_lock(&block_subsys_lock);
/* temporary */
if (major == 0) {
for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
if (major_names[index] == NULL)
break;
}
/* index 0 is never handed out: reaching it means no bucket was free */
if (index == 0) {
printk("register_blkdev: failed to get major for %s\n",
name);
ret = -EBUSY;
goto out;
}
major = index;
ret = major;
}
p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
if (p == NULL) {
ret = -ENOMEM;
goto out;
}
p->major = major;
strlcpy(p->name, name, sizeof(p->name));
p->next = NULL;
index = major_to_index(major);
/* walk the bucket: stop at an entry with the same major or at the tail */
for (n = &major_names[index]; *n; n = &(*n)->next) {
if ((*n)->major == major)
break;
}
if (!*n)
*n = p;
else
ret = -EBUSY;
if (ret < 0) {
printk("register_blkdev: cannot get major %d for %s\n",
major, name);
kfree(p);
}
out:
mutex_unlock(&block_subsys_lock);
return ret;
}
EXPORT_SYMBOL(register_blkdev);
/*
 * Drop the registration of @major, provided it is registered under @name.
 * Returns 0 on success, -EINVAL if the major is unknown or the name does
 * not match.
 *
 * todo: make void - error printk here
 */
int unregister_blkdev(unsigned int major, const char *name)
{
	const int index = major_to_index(major);
	struct blk_major_name **link;
	struct blk_major_name *victim = NULL;
	int ret = 0;

	mutex_lock(&block_subsys_lock);

	link = &major_names[index];
	while (*link && (*link)->major != major)
		link = &(*link)->next;

	if (*link && strcmp((*link)->name, name) == 0) {
		/* unlink now, free once the lock is dropped */
		victim = *link;
		*link = victim->next;
	} else {
		ret = -EINVAL;
	}

	mutex_unlock(&block_subsys_lock);
	kfree(victim);

	return ret;
}
EXPORT_SYMBOL(unregister_blkdev);
/* Device-number -> kobject map used by the region and lookup helpers below. */
static struct kobj_map *bdev_map;
/*
 * Register device numbers dev..(dev+range-1)
 * range must be nonzero
 * The hash chain is sorted on range, so that subranges can override.
 *
 * Thin wrapper: all arguments are forwarded unchanged to kobj_map() on
 * bdev_map, and its result is not reported back to the caller.
 */
void blk_register_region(dev_t dev, unsigned long range, struct module *module,
struct kobject *(*probe)(dev_t, int *, void *),
int (*lock)(dev_t, void *), void *data)
{
kobj_map(bdev_map, dev, range, module, probe, lock, data);
}
EXPORT_SYMBOL(blk_register_region);
/* Undo blk_register_region() for device numbers dev..(dev+range-1). */
void blk_unregister_region(dev_t dev, unsigned long range)
{
kobj_unmap(bdev_map, dev, range);
}
EXPORT_SYMBOL(blk_unregister_region);
/*
 * Probe callback registered by add_disk(): @data is the gendisk, whose
 * embedded kobject is returned. @dev and @part are ignored.
 */
static struct kobject *exact_match(dev_t dev, int *part, void *data)
{
	return &((struct gendisk *) data)->kobj;
}
/*
 * Lock callback registered by add_disk(): take a reference on the gendisk
 * passed as @data. Returns 0 on success, -1 if get_disk() fails.
 */
static int exact_lock(dev_t dev, void *data)
{
	struct gendisk *disk = data;

	return get_disk(disk) ? 0 : -1;
}
/**
 * add_disk - add partitioning information to kernel list
 * @disk: per-device partitioning information
 *
 * This function registers the partitioning information in @disk
 * with the kernel: the disk is flagged as up, its device-number range
 * (first_minor .. first_minor + minors - 1 on its major) is mapped to it,
 * and then the disk and its request queue are registered.
 */
void add_disk(struct gendisk *disk)
{
disk->flags |= GENHD_FL_UP;
blk_register_region(MKDEV(disk->major, disk->first_minor),
disk->minors, NULL, exact_match, exact_lock, disk);
register_disk(disk);
blk_register_queue(disk);
}
EXPORT_SYMBOL(add_disk);
EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */
/*
 * Reverse the queue and region registration done by add_disk(): unregister
 * the disk's queue, then unmap its device-number range.
 */
void unlink_gendisk(struct gendisk *disk)
{
blk_unregister_queue(disk);
blk_unregister_region(MKDEV(disk->major, disk->first_minor),
disk->minors);
}
#define to_disk(obj) container_of(obj,struct gendisk,kobj)
/**
 * get_gendisk - get partitioning information for a given device
 * @dev: device to get partitioning information for
 * @part: passed through to kobj_lookup()
 *
 * This function gets the structure containing partitioning
 * information for the given device @dev, or NULL if the lookup in the
 * device map finds nothing.
 */
struct gendisk *get_gendisk(dev_t dev, int *part)
{
	struct kobject *kobj = kobj_lookup(bdev_map, dev, part);

	if (!kobj)
		return NULL;

	return to_disk(kobj);
}
#ifdef CONFIG_PROC_FS
/*
 * seq_file iterator: start.
 *
 * Takes block_subsys_lock and returns the gendisk at index *pos on the
 * block subsystem's kset list, or NULL if the list is shorter than that.
 * The lock is still held on return in both cases; part_stop() drops it.
 */
static void *part_start(struct seq_file *part, loff_t *pos)
{
	struct list_head *entry;
	loff_t remaining = *pos;

	mutex_lock(&block_subsys_lock);
	list_for_each(entry, &block_subsys.kset.list) {
		if (remaining-- == 0)
			return list_entry(entry, struct gendisk, kobj.entry);
	}
	return NULL;
}
/*
 * seq_file iterator: advance.  Bumps *pos and returns the gendisk that
 * follows @v on the block subsystem's kset list, or NULL once the list
 * head is reached again.
 */
static void *part_next(struct seq_file *part, void *v, loff_t *pos)
{
	struct gendisk *cur = v;
	struct list_head *next = cur->kobj.entry.next;

	++*pos;
	if (next == &block_subsys.kset.list)
		return NULL;
	return list_entry(next, struct gendisk, kobj.entry);
}
/* seq_file iterator: stop.  Releases the lock taken in part_start(). */
static void part_stop(struct seq_file *part, void *v)
{
	mutex_unlock(&block_subsys_lock);
}
static int show_partition(struct seq_file *part, void *v)
{
struct gendisk *sgp = v;
int n;
char buf[BDEVNAME_SIZE];
if (&sgp->kobj.entry == block_subsys.kset.list.next)
seq_puts(part, "major minor #blocks name\n\n");
/* Don't show non-partitionable removeable devices or empty devices */
if (!get_capacity(sgp) ||
(sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
return 0;
/* show the full disk and all non-0 size partitions of it */
seq_printf(part, "%4d %4d %10llu %s\n",
sgp->major, sgp->first_minor,
(unsigned long long)get_capacity(sgp) >> 1,
disk_name(sgp, 0, buf));
for (n = 0; n < sgp->minors - 1; n++) {
if (!sgp->part[n])
continue;
if (sgp->part[n]->nr_sects == 0)
continue;
seq_printf(part, "%4d %4d %10llu %s\n",
sgp->major, n + 1 + sgp->first_minor,
(unsigned long long)sgp->part[n]->nr_sects >> 1 ,
disk_name(sgp, n + 1, buf));
}
return 0;
}
/* seq_file operations built from the part_* iterator and show_partition(). */
struct seq_operations partitions_op = {
	.start	= part_start,
	.next	= part_next,
	.stop	= part_stop,
	.show	= show_partition,
};
#endif
extern int blk_dev_init(void);
/*
 * Default probe handed to kobj_map_init(): ask for a module named after
 * the device number.  Always returns NULL.
 */
static struct kobject *base_probe(dev_t dev, int *part, void *data)
{
	int rc = request_module("block-major-%d-%d", MAJOR(dev), MINOR(dev));

	/* Make old-style 2.4 aliases work */
	if (rc > 0)
		request_module("block-major-%d", MAJOR(dev));

	return NULL;
}
/*
 * Subsystem initcall: create the block device map, run blk_dev_init()
 * and register the block subsystem.  Returns the result of
 * subsystem_register(), logging a warning when it is negative.
 */
static int __init genhd_device_init(void)
{
	int err;

	bdev_map = kobj_map_init(base_probe, &block_subsys_lock);
	blk_dev_init();

	err = subsystem_register(&block_subsys);
	if (err >= 0)
		return err;

	printk(KERN_WARNING "%s: subsystem_register error: %d\n",
	       __FUNCTION__, err);
	return err;
}

subsys_initcall(genhd_device_init);
/*
 * kobject & sysfs bindings for block devices
 */

/*
 * sysfs show dispatcher: forward to the disk_attribute's ->show handler.
 * Returns -EIO when the attribute has no ->show handler.
 */
static ssize_t disk_attr_show(struct kobject *kobj, struct attribute *attr,
			      char *page)
{
	struct disk_attribute *da =
		container_of(attr, struct disk_attribute, attr);
	struct gendisk *disk = to_disk(kobj);

	if (!da->show)
		return -EIO;
	return da->show(disk, page);
}
/*
 * sysfs store dispatcher: forward a write to the disk_attribute's
 * ->store handler.
 *
 * @kobj:  kobject embedded in the gendisk being written to
 * @attr:  attribute embedded in the disk_attribute being written
 * @page:  data written by userspace
 * @count: number of bytes in @page
 *
 * Returns whatever ->store returns.  When the attribute has no ->store
 * handler the write fails with -EIO, the same error disk_attr_show()
 * uses for a missing ->show handler.  Returning 0 here would report a
 * write that consumed nothing without signalling any error.
 */
static ssize_t disk_attr_store(struct kobject * kobj, struct attribute * attr,
			       const char *page, size_t count)
{
	struct gendisk *disk = to_disk(kobj);
	struct disk_attribute *disk_attr =
		container_of(attr,struct disk_attribute,attr);
	ssize_t ret = -EIO;

	if (disk_attr->store)
		ret = disk_attr->store(disk, page, count);
	return ret;
}
/* sysfs show/store dispatchers used by ktype_block. */
static struct sysfs_ops disk_sysfs_ops = {
	.show	= disk_attr_show,
	.store	= disk_attr_store,
};
/*
 * "uevent" store handler: any write emits a KOBJ_ADD uevent for the
 * disk.  The written data is ignored and @count is returned.
 */
static ssize_t disk_uevent_store(struct gendisk * disk,
				 const char *buf, size_t count)
{
	struct kobject *kobj = &disk->kobj;

	kobject_uevent(kobj, KOBJ_ADD);
	return count;
}
/* "dev" show handler: print the disk's base device number via print_dev_t(). */
static ssize_t disk_dev_read(struct gendisk * disk, char *page)
{
	return print_dev_t(page, MKDEV(disk->major, disk->first_minor));
}
/* "range" show handler: print disk->minors. */
static ssize_t disk_range_read(struct gendisk * disk, char *page)
{
	int minors = disk->minors;

	return sprintf(page, "%d\n", minors);
}
/* "removable" show handler: 1 if GENHD_FL_REMOVABLE is set, else 0. */
static ssize_t disk_removable_read(struct gendisk * disk, char *page)
{
	int removable = (disk->flags & GENHD_FL_REMOVABLE) ? 1 : 0;

	return sprintf(page, "%d\n", removable);
}
/* "size" show handler: print get_capacity() of the disk. */
static ssize_t disk_size_read(struct gendisk * disk, char *page)
{
	unsigned long long capacity = (unsigned long long)get_capacity(disk);

	return sprintf(page, "%llu\n", capacity);
}
/*
 * "stat" show handler.  Rounds the disk statistics with preemption
 * disabled, then prints one line: the READ counters (ios, merges,
 * sectors, ticks in ms), the same four for WRITE, then in_flight,
 * io_ticks in ms and time_in_queue in ms.
 */
static ssize_t disk_stats_read(struct gendisk * disk, char *page)
{
	preempt_disable();
	disk_round_stats(disk);
	preempt_enable();

	return sprintf(page,
		       "%8lu %8lu %8llu %8u "
		       "%8lu %8lu %8llu %8u "
		       "%8u %8u %8u"
		       "\n",
		       /* reads */
		       disk_stat_read(disk, ios[READ]),
		       disk_stat_read(disk, merges[READ]),
		       (unsigned long long)disk_stat_read(disk, sectors[READ]),
		       jiffies_to_msecs(disk_stat_read(disk, ticks[READ])),
		       /* writes */
		       disk_stat_read(disk, ios[WRITE]),
		       disk_stat_read(disk, merges[WRITE]),
		       (unsigned long long)disk_stat_read(disk, sectors[WRITE]),
		       jiffies_to_msecs(disk_stat_read(disk, ticks[WRITE])),
		       /* queue state */
		       disk->in_flight,
		       jiffies_to_msecs(disk_stat_read(disk, io_ticks)),
		       jiffies_to_msecs(disk_stat_read(disk, time_in_queue)));
}
/*
 * Per-disk sysfs attributes.  "uevent" is write-only for the owner;
 * the others are world-readable and have only a show handler.
 */
static struct disk_attribute disk_attr_uevent = {
	.attr	= { .name = "uevent", .mode = S_IWUSR },
	.store	= disk_uevent_store,
};

static struct disk_attribute disk_attr_dev = {
	.attr	= { .name = "dev", .mode = S_IRUGO },
	.show	= disk_dev_read,
};

static struct disk_attribute disk_attr_range = {
	.attr	= { .name = "range", .mode = S_IRUGO },
	.show	= disk_range_read,
};

static struct disk_attribute disk_attr_removable = {
	.attr	= { .name = "removable", .mode = S_IRUGO },
	.show	= disk_removable_read,
};

static struct disk_attribute disk_attr_size = {
	.attr	= { .name = "size", .mode = S_IRUGO },
	.show	= disk_size_read,
};

static struct disk_attribute disk_attr_stat = {
	.attr	= { .name = "stat", .mode = S_IRUGO },
	.show	= disk_stats_read,
};
#ifdef CONFIG_FAIL_MAKE_REQUEST
/*
 * "make-it-fail" store handler: parse an integer from @buf and set
 * GENHD_FL_FAIL when it is non-zero, clear it when it is zero.  Input
 * that does not parse leaves the flag untouched.  Always returns @count.
 */
static ssize_t disk_fail_store(struct gendisk * disk,
			       const char *buf, size_t count)
{
	int val;

	if (count == 0 || sscanf(buf, "%d", &val) <= 0)
		return count;

	if (val)
		disk->flags |= GENHD_FL_FAIL;
	else
		disk->flags &= ~GENHD_FL_FAIL;

	return count;
}
/* "make-it-fail" show handler: 1 if GENHD_FL_FAIL is set, else 0. */
static ssize_t disk_fail_read(struct gendisk * disk, char *page)
{
	int failing = (disk->flags & GENHD_FL_FAIL) ? 1 : 0;

	return sprintf(page, "%d\n", failing);
}
/* World-readable, owner-writable fault-injection switch for the disk. */
static struct disk_attribute disk_attr_fail = {
	.attr	= { .name = "make-it-fail", .mode = S_IRUGO | S_IWUSR },
	.store	= disk_fail_store,
	.show	= disk_fail_read,
};
#endif
/*
 * NULL-terminated list of attributes installed through ktype_block.
 * The fault-injection attribute is only present with
 * CONFIG_FAIL_MAKE_REQUEST.
 */
static struct attribute * default_attrs[] = {
	&disk_attr_uevent.attr,
	&disk_attr_dev.attr,
	&disk_attr_range.attr,
	&disk_attr_removable.attr,
	&disk_attr_size.attr,
	&disk_attr_stat.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&disk_attr_fail.attr,
#endif
	NULL,
};
/*
 * kobject release callback for a gendisk: free the ->random and ->part
 * allocations, the disk statistics, and finally the gendisk itself.
 */
static void disk_release(struct kobject * kobj)
{
	struct gendisk *gd = to_disk(kobj);

	kfree(gd->random);
	kfree(gd->part);
	free_disk_stats(gd);
	kfree(gd);
}
/* kobj_type for whole-disk kobjects. */
static struct kobj_type ktype_block = {
	.release	= disk_release,
	.sysfs_ops	= &disk_sysfs_ops,
	.default_attrs	= default_attrs,
};
extern struct kobj_type ktype_part;

/*
 * uevent filter: returns non-zero only for kobjects whose ktype is
 * ktype_block or ktype_part.
 */
static int block_uevent_filter(struct kset *kset, struct kobject *kobj)
{
	struct kobj_type *ktype = get_ktype(kobj);

	if (ktype == &ktype_block)
		return 1;
	return ktype == &ktype_part;
}
/*
 * uevent callback for the block subsystem: fill in the environment for
 * a disk or partition kobject.
 *
 * @kset:        unused
 * @kobj:        kobject the event is for; only ktype_block (disk) and
 *               ktype_part (partition) kobjects get any variables
 * @envp:        environment pointer array to fill
 * @num_envp:    number of slots in @envp
 * @buffer:      backing storage for the variable strings
 * @buffer_size: size of @buffer
 *
 * Adds MINOR and MAJOR, and, when the disk has a driverfs_dev,
 * PHYSDEVPATH, PHYSDEVBUS and PHYSDEVDRIVER where available.
 * PHYSDEVPATH is left out if kobject_get_path() cannot allocate the
 * path, rather than being emitted with a bogus value.  The environment
 * array is NULL-terminated before returning.
 *
 * Always returns 0; results of add_uevent_var() are not checked, as in
 * the original code.
 */
static int block_uevent(struct kset *kset, struct kobject *kobj, char **envp,
			int num_envp, char *buffer, int buffer_size)
{
	struct kobj_type *ktype = get_ktype(kobj);
	struct device *physdev;
	struct gendisk *disk;
	struct hd_struct *part;
	int length = 0;
	int i = 0;

	if (ktype == &ktype_block) {
		disk = container_of(kobj, struct gendisk, kobj);
		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
			       &length, "MINOR=%u", disk->first_minor);
	} else if (ktype == &ktype_part) {
		/* a partition's parent kobject is the disk's kobject */
		disk = container_of(kobj->parent, struct gendisk, kobj);
		part = container_of(kobj, struct hd_struct, kobj);
		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
			       &length, "MINOR=%u",
			       disk->first_minor + part->partno);
	} else
		return 0;

	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
		       "MAJOR=%u", disk->major);

	/* add physical device, backing this device */
	physdev = disk->driverfs_dev;
	if (physdev) {
		char *path = kobject_get_path(&physdev->kobj, GFP_KERNEL);

		/* kobject_get_path() returns NULL on allocation failure */
		if (path) {
			add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
				       &length, "PHYSDEVPATH=%s", path);
			kfree(path);
		}

		if (physdev->bus)
			add_uevent_var(envp, num_envp, &i,
				       buffer, buffer_size, &length,
				       "PHYSDEVBUS=%s",
				       physdev->bus->name);

		if (physdev->driver)
			add_uevent_var(envp, num_envp, &i,
				       buffer, buffer_size, &length,
				       "PHYSDEVDRIVER=%s",
				       physdev->driver->name);
	}

	/*
	 * Terminate the environment.  envp, num_envp, buffer and
	 * buffer_size are by-value parameters, so adjusting them here
	 * would have no effect on the caller.
	 */
	envp[i] = NULL;

	return 0;
}
/* uevent hooks for the block subsystem's kset. */
static struct kset_uevent_ops block_uevent_ops = {
	.filter	= block_uevent_filter,
	.uevent	= block_uevent,
};

decl_subsys(block, &ktype_block, &block_uevent_ops);
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 * Watching a few disks may be efficient through sysfs, but watching
 * all of them will be more efficient through this interface.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */

/*
 * seq_file iterator: start.  Takes block_subsys_lock and returns the
 * gendisk at index *pos on the block subsystem's kset list, or NULL if
 * there is none.  The lock stays held; diskstats_stop() releases it.
 */
static void *diskstats_start(struct seq_file *part, loff_t *pos)
{
	struct list_head *entry;
	loff_t remaining = *pos;

	mutex_lock(&block_subsys_lock);
	list_for_each(entry, &block_subsys.kset.list) {
		if (remaining-- == 0)
			return list_entry(entry, struct gendisk, kobj.entry);
	}
	return NULL;
}
/*
 * seq_file iterator: advance.  Bumps *pos and returns the gendisk after
 * @v on the kset list, or NULL when the list head comes next.
 */
static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
{
	struct gendisk *cur = v;
	struct list_head *next = cur->kobj.entry.next;

	++*pos;
	if (next == &block_subsys.kset.list)
		return NULL;
	return list_entry(next, struct gendisk, kobj.entry);
}
/* seq_file iterator: stop.  Releases the lock taken in diskstats_start(). */
static void diskstats_stop(struct seq_file *part, void *v)
{
	mutex_unlock(&block_subsys_lock);
}
/*
 * seq_file show callback: print the statistics line for one gendisk,
 * followed by a shorter line (ios and sectors for index 0 and 1) for
 * each of its partitions that exists and has a non-zero size.
 *
 * No column header is printed.
 */
static int diskstats_show(struct seq_file *s, void *v)
{
	struct gendisk *gp = v;
	char buf[BDEVNAME_SIZE];
	int idx;

	preempt_disable();
	disk_round_stats(gp);
	preempt_enable();

	/* the whole disk */
	seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
		   gp->major, gp->first_minor, disk_name(gp, 0, buf),
		   disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
		   (unsigned long long)disk_stat_read(gp, sectors[0]),
		   jiffies_to_msecs(disk_stat_read(gp, ticks[0])),
		   disk_stat_read(gp, ios[1]), disk_stat_read(gp, merges[1]),
		   (unsigned long long)disk_stat_read(gp, sectors[1]),
		   jiffies_to_msecs(disk_stat_read(gp, ticks[1])),
		   gp->in_flight,
		   jiffies_to_msecs(disk_stat_read(gp, io_ticks)),
		   jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));

	/* now show all non-0 size partitions of it */
	for (idx = 0; idx < gp->minors - 1; idx++) {
		struct hd_struct *hd = gp->part[idx];

		if (!hd || !hd->nr_sects)
			continue;
		seq_printf(s, "%4d %4d %s %u %u %u %u\n",
			   gp->major, idx + gp->first_minor + 1,
			   disk_name(gp, idx + 1, buf),
			   hd->ios[0], hd->sectors[0],
			   hd->ios[1], hd->sectors[1]);
	}

	return 0;
}
/* seq_file operations built from the diskstats_* callbacks. */
struct seq_operations diskstats_op = {
	.start	= diskstats_start,
	.next	= diskstats_next,
	.stop	= diskstats_stop,
	.show	= diskstats_show,
};
/*
 * Allocate a gendisk with room for @minors minors.  Same as
 * alloc_disk_node() with a node id of -1.
 */
struct gendisk *alloc_disk(int minors)
{
	const int any_node = -1;

	return alloc_disk_node(minors, any_node);
}
/*
 * Allocate and initialise a gendisk.
 *
 * @minors:  number of minors the disk spans; when greater than 1 a
 *           zeroed array of (minors - 1) partition pointers is
 *           allocated as disk->part
 * @node_id: node passed to kmalloc_node() for both allocations
 *
 * Returns the new gendisk with its kobject initialised and attached to
 * the block subsystem, or NULL if any allocation fails.  On failure
 * everything set up so far is released, including the disk statistics
 * when the partition array cannot be allocated.
 */
struct gendisk *alloc_disk_node(int minors, int node_id)
{
	struct gendisk *disk;

	disk = kmalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
	if (!disk)
		return NULL;
	memset(disk, 0, sizeof(struct gendisk));

	if (!init_disk_stats(disk))
		goto out_free_disk;

	if (minors > 1) {
		int size = (minors - 1) * sizeof(struct hd_struct *);

		disk->part = kmalloc_node(size, GFP_KERNEL, node_id);
		if (!disk->part)
			goto out_free_stats;
		memset(disk->part, 0, size);
	}

	disk->minors = minors;
	kobj_set_kset_s(disk,block_subsys);
	kobject_init(&disk->kobj);
	rand_initialize_disk(disk);
	return disk;

out_free_stats:
	/* undo init_disk_stats(); previously leaked on this path */
	free_disk_stats(disk);
out_free_disk:
	kfree(disk);
	return NULL;
}

EXPORT_SYMBOL(alloc_disk);
EXPORT_SYMBOL(alloc_disk_node);
/*
 * Take a reference on @disk: first on the module owning its fops (if
 * any), then on its kobject.  Returns the kobject, or NULL if the disk
 * has no fops, the module reference cannot be taken, or kobject_get()
 * returns NULL (in which case the module reference is dropped again).
 */
struct kobject *get_disk(struct gendisk *disk)
{
	struct module *owner;
	struct kobject *kobj;

	if (!disk->fops)
		return NULL;

	owner = disk->fops->owner;
	if (owner && !try_module_get(owner))
		return NULL;

	kobj = kobject_get(&disk->kobj);
	if (kobj)
		return kobj;

	module_put(owner);
	return NULL;
}

EXPORT_SYMBOL(get_disk);
/* Drop a reference on @disk's kobject.  A NULL @disk is ignored. */
void put_disk(struct gendisk *disk)
{
	if (!disk)
		return;
	kobject_put(&disk->kobj);
}

EXPORT_SYMBOL(put_disk);
/*
 * Store @flag as the policy of @bdev: on the disk when @bdev is the
 * whole device (bd_contains == bdev), otherwise on its partition.
 */
void set_device_ro(struct block_device *bdev, int flag)
{
	if (bdev->bd_contains == bdev)
		bdev->bd_disk->policy = flag;
	else
		bdev->bd_part->policy = flag;
}

EXPORT_SYMBOL(set_device_ro);
void set_disk_ro(struct gendisk *disk, int flag)
{
int i;
disk->policy = flag;
for (i = 0; i < disk->minors - 1; i++)
if (disk->part[i]) disk->part[i]->policy = flag;
}
EXPORT_SYMBOL(set_disk_ro);
/*
 * Return the policy value for @bdev: 0 for a NULL @bdev, the
 * partition's policy when @bdev is a partition, otherwise the disk's.
 */
int bdev_read_only(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	if (bdev->bd_contains == bdev)
		return bdev->bd_disk->policy;
	return bdev->bd_part->policy;
}

EXPORT_SYMBOL(bdev_read_only);
/*
 * Sync and invalidate the block device for partition @index of @disk.
 * Returns the result of __invalidate_device(), or 0 when bdget_disk()
 * yields no block device.
 */
int invalidate_partition(struct gendisk *disk, int index)
{
	struct block_device *bdev = bdget_disk(disk, index);
	int res;

	if (!bdev)
		return 0;

	fsync_bdev(bdev);
	res = __invalidate_device(bdev);
	bdput(bdev);
	return res;
}

EXPORT_SYMBOL(invalidate_partition);

304
block/ioctl.c Normal file
View File

@@ -0,0 +1,304 @@
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/smp_lock.h>
#include <linux/blktrace_api.h>
#include <asm/uaccess.h>
/*
 * Handle the BLKPG ioctl: add or delete one partition of a disk.
 *
 * @bdev: device the ioctl was issued on; must be the whole device
 *        (bdev == bdev->bd_contains)
 * @arg:  userspace struct blkpg_ioctl_arg; its ->data member points to
 *        a userspace struct blkpg_partition
 *
 * Returns 0 on success, or:
 *   -EACCES  caller lacks CAP_SYS_ADMIN
 *   -EFAULT  either user structure could not be copied in
 *   -EINVAL  @bdev is a partition, the partition number is out of
 *            range, start/length do not fit, or the op is unknown
 *   -EBUSY   (add) the slot is in use or the range overlaps an existing
 *            partition; (delete) the partition is open
 *   -ENXIO   (delete) the partition does not exist or has zero size
 *   -ENOMEM  (delete) bdget_disk() returned NULL
 */
static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
{
struct block_device *bdevp;
struct gendisk *disk;
struct blkpg_ioctl_arg a;
struct blkpg_partition p;
long long start, length;
int part;
int i;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&a, arg, sizeof(struct blkpg_ioctl_arg)))
return -EFAULT;
if (copy_from_user(&p, a.data, sizeof(struct blkpg_partition)))
return -EFAULT;
disk = bdev->bd_disk;
if (bdev != bdev->bd_contains)
return -EINVAL;
/* partition numbers are 1-based; slot part-1 of disk->part[] holds it */
part = p.pno;
if (part <= 0 || part >= disk->minors)
return -EINVAL;
switch (a.op) {
case BLKPG_ADD_PARTITION:
/* shift by 9 = divide by 512 (presumably bytes -> 512-byte sectors) */
start = p.start >> 9;
length = p.length >> 9;
/* check for fit in a hd_struct */
if (sizeof(sector_t) == sizeof(long) &&
sizeof(long long) > sizeof(long)) {
long pstart = start, plength = length;
if (pstart != start || plength != length
|| pstart < 0 || plength < 0)
return -EINVAL;
}
/* partition number in use? */
mutex_lock(&bdev->bd_mutex);
if (disk->part[part - 1]) {
mutex_unlock(&bdev->bd_mutex);
return -EBUSY;
}
/* overlap? */
for (i = 0; i < disk->minors - 1; i++) {
struct hd_struct *s = disk->part[i];
if (!s)
continue;
if (!(start+length <= s->start_sect ||
start >= s->start_sect + s->nr_sects)) {
mutex_unlock(&bdev->bd_mutex);
return -EBUSY;
}
}
/* all seems OK */
add_partition(disk, part, start, length, ADDPART_FLAG_NONE);
mutex_unlock(&bdev->bd_mutex);
return 0;
case BLKPG_DEL_PARTITION:
if (!disk->part[part-1])
return -ENXIO;
if (disk->part[part - 1]->nr_sects == 0)
return -ENXIO;
bdevp = bdget_disk(disk, part);
if (!bdevp)
return -ENOMEM;
/* the partition's own mutex is taken first ... */
mutex_lock(&bdevp->bd_mutex);
if (bdevp->bd_openers) {
mutex_unlock(&bdevp->bd_mutex);
bdput(bdevp);
return -EBUSY;
}
/* all seems OK */
fsync_bdev(bdevp);
invalidate_bdev(bdevp, 0);
/* ... then the whole device's mutex, nested with subclass 1 */
mutex_lock_nested(&bdev->bd_mutex, 1);
delete_partition(disk, part);
mutex_unlock(&bdev->bd_mutex);
mutex_unlock(&bdevp->bd_mutex);
bdput(bdevp);
return 0;
default:
return -EINVAL;
}
}
/*
 * Re-read the partition table of @bdev's disk (BLKRRPART).
 *
 * Returns -EINVAL for a single-minor disk or when @bdev is not the
 * whole device, -EACCES without CAP_SYS_ADMIN, -EBUSY if bd_mutex is
 * already held by someone else, otherwise the result of
 * rescan_partitions().
 */
static int blkdev_reread_part(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	int res;

	if (disk->minors == 1)
		return -EINVAL;
	if (bdev != bdev->bd_contains)
		return -EINVAL;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	/* never sleep on the mutex: report busy instead */
	if (!mutex_trylock(&bdev->bd_mutex))
		return -EBUSY;

	res = rescan_partitions(disk, bdev);
	mutex_unlock(&bdev->bd_mutex);
	return res;
}
static int put_ushort(unsigned long arg, unsigned short val)
{
return put_user(val, (unsigned short __user *)arg);
}
static int put_int(unsigned long arg, int val)
{
return put_user(val, (int __user *)arg);
}
static int put_long(unsigned long arg, long val)
{
return put_user(val, (long __user *)arg);
}
static int put_ulong(unsigned long arg, unsigned long val)
{
return put_user(val, (unsigned long __user *)arg);
}
/* Store @val at the user address @arg as a u64. */
static int put_u64(unsigned long arg, u64 val)
{
	u64 __user *uptr = (u64 __user *)arg;

	return put_user(val, uptr);
}
/*
 * Generic block device ioctls.  blkdev_ioctl() calls this between
 * lock_kernel() and unlock_kernel().
 *
 * @file: file the ioctl was issued on (used as the holder for bd_claim())
 * @bdev: block device being operated on
 * @cmd:  ioctl command
 * @arg:  ioctl argument; a user pointer for most commands, a plain
 *        value for BLKRASET/BLKFRASET
 *
 * Returns the result of the command, or -ENOIOCTLCMD when @cmd is not
 * one of the commands handled here, which makes blkdev_ioctl() pass the
 * command on to the driver.
 */
static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
unsigned cmd, unsigned long arg)
{
struct backing_dev_info *bdi;
int ret, n;
switch (cmd) {
case BLKRAGET:
case BLKFRAGET:
if (!arg)
return -EINVAL;
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
return -ENOTTY;
/* ra_pages is in pages; report it in 512-byte units */
return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
case BLKROGET:
return put_int(arg, bdev_read_only(bdev) != 0);
case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */
return put_int(arg, block_size(bdev));
case BLKSSZGET: /* get block device hardware sector size */
return put_int(arg, bdev_hardsect_size(bdev));
case BLKSECTGET:
return put_ushort(arg, bdev_get_queue(bdev)->max_sectors);
case BLKRASET:
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
return -EACCES;
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
return -ENOTTY;
/* here arg is the value itself (512-byte units), not a pointer */
bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
return 0;
case BLKBSZSET:
/* set the logical block size */
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (!arg)
return -EINVAL;
if (get_user(n, (int __user *) arg))
return -EFAULT;
/* the device is claimed only for the duration of set_blocksize() */
if (bd_claim(bdev, file) < 0)
return -EBUSY;
ret = set_blocksize(bdev, n);
bd_release(bdev);
return ret;
case BLKPG:
return blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
case BLKRRPART:
return blkdev_reread_part(bdev);
case BLKGETSIZE:
/* size in 512-byte units must fit in an unsigned long */
if ((bdev->bd_inode->i_size >> 9) > ~0UL)
return -EFBIG;
return put_ulong(arg, bdev->bd_inode->i_size >> 9);
case BLKGETSIZE64:
return put_u64(arg, bdev->bd_inode->i_size);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, (char __user *) arg);
}
return -ENOIOCTLCMD;
}
/*
 * Pass an ioctl to the disk's driver.
 *
 * ->unlocked_ioctl is preferred and called directly; otherwise ->ioctl
 * is called between lock_kernel() and unlock_kernel().  Returns -ENOTTY
 * when the driver provides neither.
 */
int blkdev_driver_ioctl(struct inode *inode, struct file *file,
			struct gendisk *disk, unsigned cmd, unsigned long arg)
{
	int ret;

	if (disk->fops->unlocked_ioctl)
		return disk->fops->unlocked_ioctl(file, cmd, arg);

	if (!disk->fops->ioctl)
		return -ENOTTY;

	lock_kernel();
	ret = disk->fops->ioctl(inode, file, cmd, arg);
	unlock_kernel();
	return ret;
}

EXPORT_SYMBOL_GPL(blkdev_driver_ioctl);
/*
 * Top-level ioctl entry point for block devices.
 *
 * BLKFLSBUF and BLKROSET are offered to the driver first and handled
 * generically only if the driver answers -EINVAL or -ENOTTY.
 * HDIO_GETGEO is served through the driver's ->getgeo method.  Every
 * other command goes to blkdev_locked_ioctl() under the big kernel
 * lock and, if that returns -ENOIOCTLCMD, to the driver.
 *
 * @inode: inode of the block device node (provides i_bdev)
 * @file:  file the ioctl was issued on
 * @cmd:   ioctl command
 * @arg:   ioctl argument
 */
int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
unsigned long arg)
{
struct block_device *bdev = inode->i_bdev;
struct gendisk *disk = bdev->bd_disk;
int ret, n;
switch(cmd) {
case BLKFLSBUF:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
/* -EINVAL to handle old uncorrected drivers */
if (ret != -EINVAL && ret != -ENOTTY)
return ret;
lock_kernel();
fsync_bdev(bdev);
invalidate_bdev(bdev, 0);
unlock_kernel();
return 0;
case BLKROSET:
/* note: the driver is asked before the capability check below */
ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
/* -EINVAL to handle old uncorrected drivers */
if (ret != -EINVAL && ret != -ENOTTY)
return ret;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (get_user(n, (int __user *)(arg)))
return -EFAULT;
lock_kernel();
set_device_ro(bdev, n);
unlock_kernel();
return 0;
case HDIO_GETGEO: {
struct hd_geometry geo;
if (!arg)
return -EINVAL;
if (!disk->fops->getgeo)
return -ENOTTY;
/*
* We need to set the startsect first, the driver may
* want to override it.
*/
geo.start = get_start_sect(bdev);
ret = disk->fops->getgeo(bdev, &geo);
if (ret)
return ret;
if (copy_to_user((struct hd_geometry __user *)arg, &geo,
sizeof(geo)))
return -EFAULT;
return 0;
}
}
/* everything else: generic handlers first, then the driver */
lock_kernel();
ret = blkdev_locked_ioctl(file, bdev, cmd, arg);
unlock_kernel();
if (ret != -ENOIOCTLCMD)
return ret;
return blkdev_driver_ioctl(inode, file, disk, cmd, arg);
}
/*
 * Most of the generic ioctls are handled in the normal fallback path.
 * This assumes the blkdev's low level compat_ioctl always returns
 * ENOIOCTLCMD for unknown ioctls.
 *
 * Calls the driver's ->compat_ioctl under the big kernel lock and
 * returns its result, or -ENOIOCTLCMD when the driver has none.
 */
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct block_device *bdev = file->f_path.dentry->d_inode->i_bdev;
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (!disk->fops->compat_ioctl)
		return -ENOIOCTLCMD;

	lock_kernel();
	ret = disk->fops->compat_ioctl(file, cmd, arg);
	unlock_kernel();
	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_ioctl);

4055
block/ll_rw_blk.c Normal file

File diff suppressed because it is too large Load Diff

118
block/noop-iosched.c Normal file
View File

@@ -0,0 +1,118 @@
/*
* elevator noop
*/
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/init.h>
/* Per-queue state of the noop elevator. */
struct noop_data {
	struct list_head queue;		/* requests linked via rq->queuelist */
};
/*
 * elevator_merge_req_fn hook: unlink @next from whatever list its
 * queuelist is on.  @q and @rq are not used.
 */
static void noop_merged_requests(request_queue_t *q, struct request *rq,
				 struct request *next)
{
	struct list_head *node = &next->queuelist;

	list_del_init(node);
}
/*
 * Move the request at the head of the noop list to the dispatch queue.
 * Returns 1 if a request was dispatched, 0 if the list was empty.
 * @force is not used.
 */
static int noop_dispatch(request_queue_t *q, int force)
{
	struct noop_data *nd = q->elevator->elevator_data;
	struct request *head;

	if (list_empty(&nd->queue))
		return 0;

	head = list_entry(nd->queue.next, struct request, queuelist);
	list_del_init(&head->queuelist);
	elv_dispatch_sort(q, head);
	return 1;
}
/* Append @rq to the tail of the noop list. */
static void noop_add_request(request_queue_t *q, struct request *rq)
{
	struct noop_data *data = q->elevator->elevator_data;

	list_add_tail(&rq->queuelist, &data->queue);
}
/* Returns non-zero when the noop list holds no requests. */
static int noop_queue_empty(request_queue_t *q)
{
	struct noop_data *data = q->elevator->elevator_data;
	struct list_head *head = &data->queue;

	return list_empty(head);
}
/*
 * Return the request linked immediately before @rq on the noop list,
 * or NULL when @rq is the first entry.
 */
static struct request *
noop_former_request(request_queue_t *q, struct request *rq)
{
	struct noop_data *data = q->elevator->elevator_data;
	struct list_head *prev = rq->queuelist.prev;

	if (prev == &data->queue)
		return NULL;
	return list_entry(prev, struct request, queuelist);
}
/*
 * Return the request linked immediately after @rq on the noop list,
 * or NULL when @rq is the last entry.
 */
static struct request *
noop_latter_request(request_queue_t *q, struct request *rq)
{
	struct noop_data *data = q->elevator->elevator_data;
	struct list_head *next = rq->queuelist.next;

	if (next == &data->queue)
		return NULL;
	return list_entry(next, struct request, queuelist);
}
/*
 * Allocate the noop state for @q on the queue's node and initialise
 * its request list.  Returns NULL if the allocation fails.
 */
static void *noop_init_queue(request_queue_t *q)
{
	struct noop_data *data = kmalloc_node(sizeof(*data), GFP_KERNEL, q->node);

	if (data)
		INIT_LIST_HEAD(&data->queue);
	return data;
}
/*
 * Free the noop state of elevator @e.  It is a bug for requests to
 * still be on the list at this point.
 */
static void noop_exit_queue(elevator_t *e)
{
	struct noop_data *data = e->elevator_data;

	BUG_ON(!list_empty(&data->queue));
	kfree(data);
}
/* Elevator description registered under the name "noop". */
static struct elevator_type elevator_noop = {
	.ops = {
		.elevator_merge_req_fn		= noop_merged_requests,
		.elevator_dispatch_fn		= noop_dispatch,
		.elevator_add_req_fn		= noop_add_request,
		.elevator_queue_empty_fn	= noop_queue_empty,
		.elevator_former_req_fn		= noop_former_request,
		.elevator_latter_req_fn		= noop_latter_request,
		.elevator_init_fn		= noop_init_queue,
		.elevator_exit_fn		= noop_exit_queue,
	},
	.elevator_name	= "noop",
	.elevator_owner	= THIS_MODULE,
};
/* Module init: register the noop elevator and return elv_register()'s result. */
static int __init noop_init(void)
{
	struct elevator_type *type = &elevator_noop;

	return elv_register(type);
}
/* Module exit: unregister the noop elevator. */
static void __exit noop_exit(void)
{
	struct elevator_type *type = &elevator_noop;

	elv_unregister(type);
}
module_init(noop_init);
module_exit(noop_exit);
MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("No-op IO scheduler");

652
block/scsi_ioctl.c Normal file
View File

@@ -0,0 +1,652 @@
/*
* Copyright (C) 2001 Jens Axboe <axboe@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
*
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/cdrom.h>
#include <linux/slab.h>
#include <linux/times.h>
#include <asm/uaccess.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsi_cmnd.h>
/*
 * Table of command lengths in bytes, one entry per command group.
 * Command group 3 is reserved and should never be used.
 */
const unsigned char scsi_command_size[8] = {
	[0] = 6,
	[1] = 10,
	[2] = 10,
	[3] = 12,
	[4] = 16,
	[5] = 12,
	[6] = 10,
	[7] = 10,
};
EXPORT_SYMBOL(scsi_command_size);
#define BLK_DEFAULT_TIMEOUT (60 * HZ)
#include <scsi/sg.h>
/* Copy the emulated sg driver version number (30527) to user pointer @p. */
static int sg_get_version(int __user *p)
{
	static const int sg_version_num = 30527;
	int err = put_user(sg_version_num, p);

	return err;
}
/* Always reports 0 to user pointer @p; @q is not consulted. */
static int scsi_get_idlun(request_queue_t *q, int __user *p)
{
	int err = put_user(0, p);

	return err;
}
/* Always reports 0 to user pointer @p; @q is not consulted. */
static int scsi_get_bus(request_queue_t *q, int __user *p)
{
	int err = put_user(0, p);

	return err;
}
/*
 * Return the queue's SG timeout converted from jiffies to USER_HZ
 * ticks.
 *
 * jiffies_to_clock_t() is used rather than dividing by (HZ / USER_HZ):
 * that quotient truncates when HZ is not a multiple of USER_HZ and is
 * zero (a division by zero) when HZ is smaller than USER_HZ.
 */
static int sg_get_timeout(request_queue_t *q)
{
	return jiffies_to_clock_t(q->sg_timeout);
}

/*
 * Read a timeout in USER_HZ ticks from user pointer @p and store it in
 * the queue converted to jiffies.
 *
 * Returns 0 on success or the get_user() error, in which case the
 * queue's timeout is left unchanged.  clock_t_to_jiffies() is used for
 * the same reason sg_get_timeout() uses jiffies_to_clock_t().
 */
static int sg_set_timeout(request_queue_t *q, int __user *p)
{
	int timeout, err = get_user(timeout, p);

	if (!err)
		q->sg_timeout = clock_t_to_jiffies(timeout);

	return err;
}
/* Copy the queue's sg_reserved_size to user pointer @p. */
static int sg_get_reserved_size(request_queue_t *q, int __user *p)
{
	int err = put_user(q->sg_reserved_size, p);

	return err;
}
/*
 * Read a reserved size from user pointer @p and store it in the queue,
 * capped at max_sectors << 9.  Returns the get_user() error if the read
 * fails, -EINVAL for a negative size, 0 otherwise.
 */
static int sg_set_reserved_size(request_queue_t *q, int __user *p)
{
	int size;
	int err;

	err = get_user(size, p);
	if (err)
		return err;

	if (size < 0)
		return -EINVAL;

	/* clamp to what the queue accepts per request */
	if (size > (q->max_sectors << 9))
		size = q->max_sectors << 9;

	q->sg_reserved_size = size;
	return 0;
}
/*
 * will always return that we are ATAPI even for a real SCSI drive, I'm not
 * so sure this is worth doing anything about (why would you care??)
 *
 * Concretely: always stores 1 to user pointer @p; @q is not consulted.
 */
static int sg_emulated_host(request_queue_t *q, int __user *p)
{
	int err = put_user(1, p);

	return err;
}
/* Per-opcode flags stored in verify_command()'s cmd_type[] table. */
#define CMD_READ_SAFE 0x01
#define CMD_WRITE_SAFE 0x02
#define CMD_WARNED 0x04
/* Designated-initializer helpers: mark one opcode in the table. */
#define safe_for_read(cmd) [cmd] = CMD_READ_SAFE
#define safe_for_write(cmd) [cmd] = CMD_WRITE_SAFE
/*
 * Decide whether the SCSI command in @cmd may be sent by the caller.
 *
 * @file: file the request came in on, or NULL; only its f_mode is used
 * @cmd:  command bytes; only the opcode in cmd[0] is examined
 *
 * Returns 0 if the command is allowed:
 *   - opcodes marked read-safe are always allowed,
 *   - opcodes marked write-safe are allowed when @file was opened for
 *     writing,
 *   - anything is allowed with CAP_SYS_RAWIO.
 * Otherwise returns -EPERM.  An opcode with no entry in the table is
 * logged once (it is then marked CMD_WARNED in the static table).
 */
static int verify_command(struct file *file, unsigned char *cmd)
{
static unsigned char cmd_type[256] = {
/* Basic read-only commands */
safe_for_read(TEST_UNIT_READY),
safe_for_read(REQUEST_SENSE),
safe_for_read(READ_6),
safe_for_read(READ_10),
safe_for_read(READ_12),
safe_for_read(READ_16),
safe_for_read(READ_BUFFER),
safe_for_read(READ_DEFECT_DATA),
safe_for_read(READ_LONG),
safe_for_read(INQUIRY),
safe_for_read(MODE_SENSE),
safe_for_read(MODE_SENSE_10),
safe_for_read(LOG_SENSE),
safe_for_read(START_STOP),
safe_for_read(GPCMD_VERIFY_10),
safe_for_read(VERIFY_16),
/* Audio CD commands */
safe_for_read(GPCMD_PLAY_CD),
safe_for_read(GPCMD_PLAY_AUDIO_10),
safe_for_read(GPCMD_PLAY_AUDIO_MSF),
safe_for_read(GPCMD_PLAY_AUDIO_TI),
safe_for_read(GPCMD_PAUSE_RESUME),
/* CD/DVD data reading */
safe_for_read(GPCMD_READ_BUFFER_CAPACITY),
safe_for_read(GPCMD_READ_CD),
safe_for_read(GPCMD_READ_CD_MSF),
safe_for_read(GPCMD_READ_DISC_INFO),
safe_for_read(GPCMD_READ_CDVD_CAPACITY),
safe_for_read(GPCMD_READ_DVD_STRUCTURE),
safe_for_read(GPCMD_READ_HEADER),
safe_for_read(GPCMD_READ_TRACK_RZONE_INFO),
safe_for_read(GPCMD_READ_SUBCHANNEL),
safe_for_read(GPCMD_READ_TOC_PMA_ATIP),
safe_for_read(GPCMD_REPORT_KEY),
safe_for_read(GPCMD_SCAN),
safe_for_read(GPCMD_GET_CONFIGURATION),
safe_for_read(GPCMD_READ_FORMAT_CAPACITIES),
safe_for_read(GPCMD_GET_EVENT_STATUS_NOTIFICATION),
safe_for_read(GPCMD_GET_PERFORMANCE),
safe_for_read(GPCMD_SEEK),
safe_for_read(GPCMD_STOP_PLAY_SCAN),
/* Basic writing commands */
safe_for_write(WRITE_6),
safe_for_write(WRITE_10),
safe_for_write(WRITE_VERIFY),
safe_for_write(WRITE_12),
safe_for_write(WRITE_VERIFY_12),
safe_for_write(WRITE_16),
safe_for_write(WRITE_LONG),
safe_for_write(WRITE_LONG_2),
safe_for_write(ERASE),
safe_for_write(GPCMD_MODE_SELECT_10),
safe_for_write(MODE_SELECT),
safe_for_write(LOG_SELECT),
safe_for_write(GPCMD_BLANK),
safe_for_write(GPCMD_CLOSE_TRACK),
safe_for_write(GPCMD_FLUSH_CACHE),
safe_for_write(GPCMD_FORMAT_UNIT),
safe_for_write(GPCMD_REPAIR_RZONE_TRACK),
safe_for_write(GPCMD_RESERVE_RZONE_TRACK),
safe_for_write(GPCMD_SEND_DVD_STRUCTURE),
safe_for_write(GPCMD_SEND_EVENT),
safe_for_write(GPCMD_SEND_KEY),
safe_for_write(GPCMD_SEND_OPC),
safe_for_write(GPCMD_SEND_CUE_SHEET),
safe_for_write(GPCMD_SET_SPEED),
safe_for_write(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL),
safe_for_write(GPCMD_LOAD_UNLOAD),
safe_for_write(GPCMD_SET_STREAMING),
};
/* opcodes not listed above have type 0 */
unsigned char type = cmd_type[cmd[0]];
int has_write_perm = 0;
/* Anybody who can open the device can do a read-safe command */
if (type & CMD_READ_SAFE)
return 0;
/*
* file can be NULL from ioctl_by_bdev()...
*/
if (file)
has_write_perm = file->f_mode & FMODE_WRITE;
/* Write-safe commands just require a writable open.. */
if ((type & CMD_WRITE_SAFE) && has_write_perm)
return 0;
/* And root can do any command.. */
if (capable(CAP_SYS_RAWIO))
return 0;
/* type == 0: unknown opcode not yet warned about; warn and remember */
if (!type) {
cmd_type[cmd[0]] = CMD_WARNED;
printk(KERN_WARNING "scsi: unknown opcode 0x%02x\n", cmd[0]);
}
/* Otherwise fail it with an "Operation not permitted" */
return -EPERM;
}
/*
 * Execute the SCSI command described by an sg_io_hdr on queue @q and
 * write the results back into @hdr.
 *
 * @file:    file the request came in on (may be NULL); passed to
 *           verify_command() for the permission check
 * @q:       request queue to execute the command on
 * @bd_disk: gendisk passed to blk_execute_rq()
 * @hdr:     SG_IO header; input members are read and the output members
 *           (status fields, info, resid, duration, sb_len_wr) are
 *           written once the request has executed
 *
 * Returns 0 once the request has been executed, even if the device
 * reported an error (see hdr->status and friends), or:
 *   -EINVAL  bad interface_id, cmd_len > BLK_MAX_CDB, or an unknown
 *            dxfer_direction with a non-zero dxfer_len
 *   -EFAULT  the command or iovec could not be copied from userspace,
 *            or blk_rq_unmap_user() failed
 *   -EPERM   verify_command() rejected the command
 *   -EIO     dxfer_len exceeds max_hw_sectors << 9
 *   -ENOMEM  no request or iovec memory
 * or the error returned by blk_rq_map_user()/blk_rq_map_user_iov().
 */
static int sg_io(struct file *file, request_queue_t *q,
struct gendisk *bd_disk, struct sg_io_hdr *hdr)
{
unsigned long start_time, timeout;
int writing = 0, ret = 0;
struct request *rq;
char sense[SCSI_SENSE_BUFFERSIZE];
unsigned char cmd[BLK_MAX_CDB];
struct bio *bio;
if (hdr->interface_id != 'S')
return -EINVAL;
if (hdr->cmd_len > BLK_MAX_CDB)
return -EINVAL;
/*
 * NOTE(review): with hdr->cmd_len == 0 nothing is copied into cmd[],
 * yet verify_command() below reads cmd[0] - confirm whether a
 * zero-length CDB should be rejected here.
 */
if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len))
return -EFAULT;
if (verify_command(file, cmd))
return -EPERM;
if (hdr->dxfer_len > (q->max_hw_sectors << 9))
return -EIO;
/* the transfer direction only matters when there is data to move */
if (hdr->dxfer_len)
switch (hdr->dxfer_direction) {
default:
return -EINVAL;
case SG_DXFER_TO_DEV:
writing = 1;
break;
case SG_DXFER_TO_FROM_DEV:
case SG_DXFER_FROM_DEV:
break;
}
rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
if (!rq)
return -ENOMEM;
/*
* fill in request structure
*/
rq->cmd_len = hdr->cmd_len;
memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
memcpy(rq->cmd, cmd, hdr->cmd_len);
/* sense data is collected in the on-stack buffer */
memset(sense, 0, sizeof(sense));
rq->sense = sense;
rq->sense_len = 0;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
/*
 * Timeout: hdr->timeout (milliseconds) capped at INT_MAX jiffies; if
 * that is 0 use the queue's sg_timeout, and if that is 0 too use
 * BLK_DEFAULT_TIMEOUT.
 */
timeout = msecs_to_jiffies(hdr->timeout);
rq->timeout = (timeout < INT_MAX) ? timeout : INT_MAX;
if (!rq->timeout)
rq->timeout = q->sg_timeout;
if (!rq->timeout)
rq->timeout = BLK_DEFAULT_TIMEOUT;
/* map the user data: scatter list if iovec_count is set, else one buffer */
if (hdr->iovec_count) {
const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
struct sg_iovec *iov;
iov = kmalloc(size, GFP_KERNEL);
if (!iov) {
ret = -ENOMEM;
goto out;
}
if (copy_from_user(iov, hdr->dxferp, size)) {
kfree(iov);
ret = -EFAULT;
goto out;
}
ret = blk_rq_map_user_iov(q, rq, iov, hdr->iovec_count,
hdr->dxfer_len);
kfree(iov);
} else if (hdr->dxfer_len)
ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len);
if (ret)
goto out;
/* remember the mapped bio now; it is what gets unmapped after execution */
bio = rq->bio;
rq->retries = 0;
start_time = jiffies;
/* ignore return value. All information is passed back to caller
* (if he doesn't check that is his problem).
* N.B. a non-zero SCSI status is _not_ necessarily an error.
*/
blk_execute_rq(q, bd_disk, rq, 0);
/* write to all output members */
hdr->status = 0xff & rq->errors;
hdr->masked_status = status_byte(rq->errors);
hdr->msg_status = msg_byte(rq->errors);
hdr->host_status = host_byte(rq->errors);
hdr->driver_status = driver_byte(rq->errors);
hdr->info = 0;
if (hdr->masked_status || hdr->host_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
hdr->resid = rq->data_len;
/* elapsed time in milliseconds */
hdr->duration = ((jiffies - start_time) * 1000) / HZ;
hdr->sb_len_wr = 0;
/* copy out at most mx_sb_len bytes of sense data, if any was produced */
if (rq->sense_len && hdr->sbp) {
int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
if (!copy_to_user(hdr->sbp, rq->sense, len))
hdr->sb_len_wr = len;
}
if (blk_rq_unmap_user(bio))
ret = -EFAULT;
/* may not have succeeded, but output values written to control
* structure (struct sg_io_hdr). */
out:
blk_put_request(rq);
return ret;
}
/**
* sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
* @file: file this ioctl operates on (optional)
* @q: request queue to send scsi commands down
* @disk: gendisk to operate on (optional)
* @sic: userspace structure describing the command to perform
*
* Send down the scsi command described by @sic to the device below
* the request queue @q. If @file is non-NULL it's used to perform
* fine-grained permission checks that allow users to send down
* non-destructive SCSI commands. If the caller has a struct gendisk
* available it should be passed in as @disk to allow the low level
* driver to use the information contained in it. A NULL @disk
* is only allowed if the caller knows that the low level driver doesn't
* need it (e.g. in the scsi subsystem).
*
* Notes:
* - This interface is deprecated - users should use the SG_IO
* interface instead, as this is a more flexible approach to
* performing SCSI commands on a device.
* - The SCSI command length is determined by examining the 1st byte
* of the given command. There is no way to override this.
* - Data transfers are limited to PAGE_SIZE
* - The length (x + y) must be at least OMAX_SB_LEN bytes long to
* accommodate the sense buffer when an error occurs.
* The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that
* old code will not be surprised.
* - If a Unix error occurs (e.g. ENOMEM) then the user will receive
* a negative return and the Unix error code in 'errno'.
* If the SCSI command succeeds then 0 is returned.
* Positive numbers returned are the compacted SCSI error codes (4
* bytes in one int) where the lowest byte is the SCSI status.
*/
#define OMAX_SB_LEN 16 /* For backward compatibility */
int sg_scsi_ioctl(struct file *file, struct request_queue *q,
struct gendisk *disk, struct scsi_ioctl_command __user *sic)
{
struct request *rq;
int err;
unsigned int in_len, out_len, bytes, opcode, cmdlen;
char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
if (!sic)
return -EINVAL;
/*
* get in an out lengths, verify they don't exceed a page worth of data
*/
if (get_user(in_len, &sic->inlen))
return -EFAULT;
if (get_user(out_len, &sic->outlen))
return -EFAULT;
if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
return -EINVAL;
if (get_user(opcode, sic->data))
return -EFAULT;
bytes = max(in_len, out_len);
if (bytes) {
buffer = kmalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN);
if (!buffer)
return -ENOMEM;
memset(buffer, 0, bytes);
}
rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
cmdlen = COMMAND_SIZE(opcode);
/*
* get command and data to send to device, if any
*/
err = -EFAULT;
rq->cmd_len = cmdlen;
if (copy_from_user(rq->cmd, sic->data, cmdlen))
goto error;
if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
goto error;
err = verify_command(file, rq->cmd);
if (err)
goto error;
/* default. possible overriden later */
rq->retries = 5;
switch (opcode) {
case SEND_DIAGNOSTIC:
case FORMAT_UNIT:
rq->timeout = FORMAT_UNIT_TIMEOUT;
rq->retries = 1;
break;
case START_STOP:
rq->timeout = START_STOP_TIMEOUT;
break;
case MOVE_MEDIUM:
rq->timeout = MOVE_MEDIUM_TIMEOUT;
break;
case READ_ELEMENT_STATUS:
rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
break;
case READ_DEFECT_DATA:
rq->timeout = READ_DEFECT_DATA_TIMEOUT;
rq->retries = 1;
break;
default:
rq->timeout = BLK_DEFAULT_TIMEOUT;
break;
}
if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
err = DRIVER_ERROR << 24;
goto out;
}
memset(sense, 0, sizeof(sense));
rq->sense = sense;
rq->sense_len = 0;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
blk_execute_rq(q, disk, rq, 0);
out:
err = rq->errors & 0xff; /* only 8 bit SCSI status */
if (err) {
if (rq->sense_len && rq->sense) {
bytes = (OMAX_SB_LEN > rq->sense_len) ?
rq->sense_len : OMAX_SB_LEN;
if (copy_to_user(sic->data, rq->sense, bytes))
err = -EFAULT;
}
} else {
if (copy_to_user(sic->data, buffer, out_len))
err = -EFAULT;
}
error:
kfree(buffer);
blk_put_request(rq);
return err;
}
EXPORT_SYMBOL_GPL(sg_scsi_ioctl);
/*
 * Send basic block requests.
 *
 * Builds a data-less REQ_TYPE_BLOCK_PC request with a 6-byte command whose
 * byte 0 is @cmd and byte 4 is @data (all other command bytes are zero),
 * executes it on @q and returns the result of blk_execute_rq().
 */
static int __blk_send_generic(request_queue_t *q, struct gendisk *bd_disk, int cmd, int data)
{
	struct request *req = blk_get_request(q, WRITE, __GFP_WAIT);
	int ret;

	/* no payload travels with this request */
	req->cmd_type = REQ_TYPE_BLOCK_PC;
	req->data_len = 0;
	req->data = NULL;
	req->timeout = BLK_DEFAULT_TIMEOUT;

	/* start from an all-zero command and fill in only what is needed */
	memset(req->cmd, 0, sizeof(req->cmd));
	req->cmd_len = 6;
	req->cmd[0] = cmd;
	req->cmd[4] = data;

	ret = blk_execute_rq(q, bd_disk, req, 0);
	blk_put_request(req);

	return ret;
}
/*
 * Issue a GPCMD_START_STOP_UNIT command on @q, passing @data through to
 * __blk_send_generic() unchanged.
 */
static inline int blk_send_start_stop(request_queue_t *q, struct gendisk *bd_disk, int data)
{
	const int opcode = GPCMD_START_STOP_UNIT;

	return __blk_send_generic(q, bd_disk, opcode, data);
}
/*
 * scsi_cmd_ioctl - dispatch the SG / SCSI / CDROM ioctls handled here
 * @file:	file the ioctl was issued on, passed through to the handlers
 * @bd_disk:	disk whose request queue receives the command
 * @cmd:	ioctl number
 * @arg:	userspace argument of the ioctl
 *
 * A reference on the disk's queue is held for the duration of the call.
 * Returns -ENXIO when the disk has no queue or the reference cannot be
 * taken, -ENOTTY for an ioctl number not handled here, and otherwise the
 * result of the selected handler.
 */
int scsi_cmd_ioctl(struct file *file, struct gendisk *bd_disk, unsigned int cmd, void __user *arg)
{
	request_queue_t *q = bd_disk->queue;
	int ret;

	if (!q || blk_get_queue(q))
		return -ENXIO;

	switch (cmd) {
	/*
	 * new sgv3 interface
	 */
	case SG_GET_VERSION_NUM:
		ret = sg_get_version(arg);
		break;
	case SCSI_IOCTL_GET_IDLUN:
		ret = scsi_get_idlun(q, arg);
		break;
	case SCSI_IOCTL_GET_BUS_NUMBER:
		ret = scsi_get_bus(q, arg);
		break;
	case SG_SET_TIMEOUT:
		ret = sg_set_timeout(q, arg);
		break;
	case SG_GET_TIMEOUT:
		ret = sg_get_timeout(q);
		break;
	case SG_GET_RESERVED_SIZE:
		ret = sg_get_reserved_size(q, arg);
		break;
	case SG_SET_RESERVED_SIZE:
		ret = sg_set_reserved_size(q, arg);
		break;
	case SG_EMULATED_HOST:
		ret = sg_emulated_host(q, arg);
		break;
	case SG_IO: {
		struct sg_io_hdr hdr;

		if (copy_from_user(&hdr, arg, sizeof(hdr))) {
			ret = -EFAULT;
			break;
		}

		ret = sg_io(file, q, bd_disk, &hdr);
		/* the header is written back for every result except -EFAULT */
		if (ret != -EFAULT && copy_to_user(arg, &hdr, sizeof(hdr)))
			ret = -EFAULT;
		break;
	}
	case CDROM_SEND_PACKET: {
		struct cdrom_generic_command __user *ucgc = arg;
		struct cdrom_generic_command cgc;
		struct sg_io_hdr hdr;

		if (copy_from_user(&cgc, arg, sizeof(cgc))) {
			ret = -EFAULT;
			break;
		}
		cgc.timeout = clock_t_to_jiffies(cgc.timeout);

		/* translate the cdrom packet into an sg_io_hdr */
		memset(&hdr, 0, sizeof(hdr));
		hdr.interface_id = 'S';
		hdr.dxfer_len = cgc.buflen;

		ret = 0;
		switch (cgc.data_direction) {
		case CGC_DATA_UNKNOWN:
			hdr.dxfer_direction = SG_DXFER_UNKNOWN;
			break;
		case CGC_DATA_WRITE:
			hdr.dxfer_direction = SG_DXFER_TO_DEV;
			break;
		case CGC_DATA_READ:
			hdr.dxfer_direction = SG_DXFER_FROM_DEV;
			break;
		case CGC_DATA_NONE:
			hdr.dxfer_direction = SG_DXFER_NONE;
			break;
		default:
			ret = -EINVAL;
			break;
		}
		if (ret)
			break;

		hdr.dxferp = cgc.buffer;
		hdr.sbp = cgc.sense;
		if (hdr.sbp)
			hdr.mx_sb_len = sizeof(struct request_sense);
		hdr.timeout = cgc.timeout;
		/* the command bytes are fetched from the user's copy of the struct */
		hdr.cmdp = ucgc->cmd;
		hdr.cmd_len = sizeof(cgc.cmd);

		ret = sg_io(file, q, bd_disk, &hdr);
		if (ret == -EFAULT)
			break;

		/* any non-zero status from the header is reported as -EIO */
		if (hdr.status)
			ret = -EIO;

		cgc.stat = ret;
		cgc.buflen = hdr.resid;
		if (copy_to_user(arg, &cgc, sizeof(cgc)))
			ret = -EFAULT;
		break;
	}

	/*
	 * old junk scsi send command ioctl
	 */
	case SCSI_IOCTL_SEND_COMMAND:
		printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
		if (arg)
			ret = sg_scsi_ioctl(file, q, bd_disk, arg);
		else
			ret = -EINVAL;
		break;
	case CDROMCLOSETRAY:
		ret = blk_send_start_stop(q, bd_disk, 0x03);
		break;
	case CDROMEJECT:
		ret = blk_send_start_stop(q, bd_disk, 0x02);
		break;
	default:
		ret = -ENOTTY;
		break;
	}

	blk_put_queue(q);
	return ret;
}
EXPORT_SYMBOL(scsi_cmd_ioctl);