linux/drivers/block/virtio_blk.c
Asias He a98755c559 virtio-blk: Add bio-based IO path for virtio-blk
This patch introduces bio-based IO path for virtio-blk.

Compared to the request-based IO path, the bio-based IO path uses a
driver-provided ->make_request_fn() method to bypass the IO scheduler. It
hands the bio to the device directly without allocating a request in the
block layer. This shortens the IO path in the guest kernel to achieve
higher IOPS and lower latency. The downside is that the guest cannot use
the IO scheduler to merge and sort requests. However, this is not a big
problem if the backend disk on the host side is a fast device.

When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, and no performance difference is observed.

With a slow device, e.g. a normal SATA disk, the bio-based IO path is
slower than the req-based IO path for sequential read and write due to the
lack of merging in the guest kernel. So we make the bio-based path optional.

Performance evaluation:
-----------------------------
1) Fio test is performed in an 8 vcpu guest with ramdisk based guest using
kvm tool.

Short version:
 With bio-based IO path, sequential read/write, random read/write
 IOPS boost         : 28%, 24%, 21%, 16%
 Latency improvement: 32%, 17%, 21%, 16%

Long version:
 With bio-based IO path:
  seq-read  : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
  seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
  rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
  rand-write: io=3095.7MB, bw=96198KB/s,  iops=192396 , runt= 32952msec
    clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
    clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
    clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
    clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
  cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
  cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
  cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
  cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
 With request-based IO path:
  seq-read  : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
  seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
  rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
  rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
    clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
    clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
    clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
    clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
  cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
  cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
  cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
  cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985

2) Fio test is performed in an 8 vcpu guest with Fusion-IO based guest using
kvm tool.

Short version:
 With bio-based IO path, sequential read/write, random read/write
 IOPS boost         : 11%, 11%, 13%, 10%
 Latency improvement: 10%, 10%, 12%, 10%
Long Version:
 With bio-based IO path:
  read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
  write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
  read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
  write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
    clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
    clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
    clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
    clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
  cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
  cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
  cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
  cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105
 With request-based IO path:
  read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
  write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
  read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
  write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
    clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
    clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
    clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
    clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
  cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
  cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
  cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
  cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409

3) Fio test is performed in an 8 vcpu guest with normal SATA based guest
using kvm tool.

Short version:
 With bio-based IO path, sequential read/write, random read/write
 IOPS boost         : -10%, -10%, 4.4%, 0.5%
 Latency improvement: -12%, -15%, 2.5%, 0.8%
Long Version:
 With bio-based IO path:
  read : io=124812KB, bw=36537KB/s, iops=9060 , runt=  3416msec
  write: io=169180KB, bw=24406KB/s, iops=6065 , runt=  6932msec
  read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec
  write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec
    clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54
    clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70
    clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28
    clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71
  cpu          : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7
  cpu          : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0
  cpu          : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16
  cpu          : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0

 With request-based IO path:
  read : io=150120KB, bw=40420KB/s, iops=10037 , runt=  3714msec
  write: io=194932KB, bw=27029KB/s, iops=6722 , runt=  7212msec
  read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec
  write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec
    clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45
    clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71
    clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97
    clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61
  cpu          : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23
  cpu          : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0
  cpu          : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16
  cpu          : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0

How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-09-28 15:05:13 +09:30

913 lines
22 KiB
C

//#define DEBUG
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
#include <linux/scatterlist.h>
#include <linux/string_helpers.h>
#include <scsi/scsi_cmnd.h>
#include <linux/idr.h>
/* Shift applied between a disk index and its first minor number. */
#define PART_BITS 4
/* Module parameter: when true, the bio-based ->make_request_fn() path is used. */
static bool use_bio;
module_param(use_bio, bool, S_IRUGO);
/* Block major number; set from register_blkdev() in init(). */
static int major;
/* Allocator for per-disk index values (taken in probe, released in remove). */
static DEFINE_IDA(vd_index_ida);
/* Workqueue on which config-change work items are queued. */
struct workqueue_struct *virtblk_wq;
/* Per-device driver state, stored in vdev->priv and in disk->private_data. */
struct virtio_blk
{
/* The virtio device this state belongs to. */
struct virtio_device *vdev;
/* The single "requests" virtqueue (set up by init_vq()). */
struct virtqueue *vq;
/* Bio-path submitters sleep here when the virtqueue is full. */
wait_queue_head_t queue_wait;
/* The disk structure for the kernel. */
struct gendisk *disk;
/* Pool from which struct virtblk_req objects are allocated. */
mempool_t *pool;
/* Process context for config space updates */
struct work_struct config_work;
/* Lock for config space updates */
struct mutex config_lock;
/* enable config space updates */
bool config_enable;
/* What host tells us, plus 2 for header & tailer. */
unsigned int sg_elems;
/* Ida index - used to track minor number allocations. */
int index;
/* Scatterlist: can be too big for stack. */
struct scatterlist sg[/*sg_elems*/];
};
/*
 * One in-flight virtio-blk command.  do_req() sets ->req and clears ->bio;
 * virtblk_make_request() sets ->bio and clears ->req.  virtblk_done() uses
 * ->bio to decide which completion helper to call.
 */
struct virtblk_req
{
/* Block-layer request (request-based path), otherwise NULL. */
struct request *req;
/* Bio being serviced (bio-based path), otherwise NULL. */
struct bio *bio;
/* Header placed first in the scatterlist handed to the virtqueue. */
struct virtio_blk_outhdr out_hdr;
/* Extra response header, only added to the scatterlist for REQ_TYPE_BLOCK_PC. */
struct virtio_scsi_inhdr in_hdr;
/* Completion status, placed last in the scatterlist; decoded by virtblk_result(). */
u8 status;
/* Per-request scatterlist; space is only reserved in the pool when use_bio is set. */
struct scatterlist sg[];
};
/*
 * Map the status byte stored in @vbr to an errno value:
 * VIRTIO_BLK_S_OK -> 0, VIRTIO_BLK_S_UNSUPP -> -ENOTTY, anything else -> -EIO.
 */
static inline int virtblk_result(struct virtblk_req *vbr)
{
	if (vbr->status == VIRTIO_BLK_S_OK)
		return 0;

	if (vbr->status == VIRTIO_BLK_S_UNSUPP)
		return -ENOTTY;

	return -EIO;
}
/*
 * Complete a request-based command: copy the result fields back into the
 * block-layer request, end the request, and return @vbr to the pool.
 */
static inline void virtblk_request_done(struct virtio_blk *vblk,
					struct virtblk_req *vbr)
{
	struct request *rq = vbr->req;
	int err = virtblk_result(vbr);

	switch (rq->cmd_type) {
	case REQ_TYPE_BLOCK_PC:
		/* SCSI passthrough: results come from the extra in header. */
		rq->resid_len = vbr->in_hdr.residual;
		rq->sense_len = vbr->in_hdr.sense_len;
		rq->errors = vbr->in_hdr.errors;
		break;
	case REQ_TYPE_SPECIAL:
		rq->errors = (err != 0);
		break;
	default:
		break;
	}

	__blk_end_request_all(rq, err);
	mempool_free(vbr, vblk->pool);
}
/*
 * Complete a bio-based command: end the bio with the decoded status and
 * return @vbr to the pool.
 */
static inline void virtblk_bio_done(struct virtio_blk *vblk,
				    struct virtblk_req *vbr)
{
	int err = virtblk_result(vbr);

	bio_endio(vbr->bio, err);
	mempool_free(vbr, vblk->pool);
}
/*
 * Virtqueue callback: drain every completed buffer under the queue lock and
 * complete it as a bio or as a request.  If any request completed, restart
 * the request queue; if any bio completed, wake up submitters waiting for
 * virtqueue space.
 */
static void virtblk_done(struct virtqueue *vq)
{
	struct virtio_blk *vblk = vq->vdev->priv;
	unsigned long nr_bio = 0, nr_req = 0;
	struct virtblk_req *done;
	unsigned long irqflags;
	unsigned int used_len;

	spin_lock_irqsave(vblk->disk->queue->queue_lock, irqflags);
	for (;;) {
		done = virtqueue_get_buf(vblk->vq, &used_len);
		if (done == NULL)
			break;

		if (done->bio == NULL) {
			virtblk_request_done(vblk, done);
			nr_req++;
		} else {
			virtblk_bio_done(vblk, done);
			nr_bio++;
		}
	}

	/* In case queue is stopped waiting for more buffers. */
	if (nr_req != 0)
		blk_start_queue(vblk->disk->queue);
	spin_unlock_irqrestore(vblk->disk->queue->queue_lock, irqflags);

	if (nr_bio != 0)
		wake_up(&vblk->queue_wait);
}
/*
 * Take a virtblk_req from the device pool.  On the bio path the request's
 * own scatterlist is initialised for sg_elems entries.
 * Returns NULL if the pool allocation fails.
 */
static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
						    gfp_t gfp_mask)
{
	struct virtblk_req *vbr = mempool_alloc(vblk->pool, gfp_mask);

	if (!vbr)
		return NULL;

	if (use_bio)
		sg_init_table(vbr->sg, vblk->sg_elems);

	return vbr;
}
/*
 * Queue one block-layer request on the virtqueue (request-based path).
 *
 * A virtblk_req is allocated with GFP_ATOMIC, its out header is filled in
 * according to the request type, and the device-wide scatterlist vblk->sg is
 * laid out as:
 *   [out_hdr] [scsi cmd, BLOCK_PC only] [data segments]
 *   [sense buffer + in_hdr, BLOCK_PC only] [status]
 *
 * Returns true if the buffer was added to the virtqueue, false if the
 * allocation or virtqueue_add_buf() failed (the virtblk_req is freed in the
 * latter case).  The virtqueue is not kicked here.
 */
static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
struct request *req)
{
/* out/in count the device-readable / device-writable sg entries. */
unsigned long num, out = 0, in = 0;
struct virtblk_req *vbr;
vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
if (!vbr)
/* When another request finishes we'll try again. */
return false;
vbr->req = req;
/* A NULL bio marks this as a request-based command for virtblk_done(). */
vbr->bio = NULL;
if (req->cmd_flags & REQ_FLUSH) {
vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
vbr->out_hdr.sector = 0;
vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
} else {
switch (req->cmd_type) {
case REQ_TYPE_FS:
/* Direction bit (IN/OUT) is OR-ed in below once data is mapped. */
vbr->out_hdr.type = 0;
vbr->out_hdr.sector = blk_rq_pos(vbr->req);
vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
break;
case REQ_TYPE_BLOCK_PC:
vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
vbr->out_hdr.sector = 0;
vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
break;
case REQ_TYPE_SPECIAL:
/* REQ_TYPE_SPECIAL is what virtblk_get_id() submits. */
vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID;
vbr->out_hdr.sector = 0;
vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
break;
default:
/* We don't put anything else in the queue. */
BUG();
}
}
sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
/*
 * A packet command needs a couple of additional segments: the SCSI
 * command block goes right after the normal out header, and the sense
 * data plus the extra in header go after the data, just before the
 * status byte.
 */
if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
/* Map the request's data segments right after the out-only entries. */
num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
sizeof(vbr->in_hdr));
}
/* The status byte is always the last entry. */
sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
sizeof(vbr->status));
/* Account the data segments to the out or in side depending on direction. */
if (num) {
if (rq_data_dir(vbr->req) == WRITE) {
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
out += num;
} else {
vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
in += num;
}
}
if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
GFP_ATOMIC) < 0) {
mempool_free(vbr, vblk->pool);
return false;
}
return true;
}
/*
 * Request function for the request-based path: feed queued requests to the
 * virtqueue until the queue is empty or do_req() fails, then kick the
 * virtqueue once if anything was issued.
 */
static void virtblk_request(struct request_queue *q)
{
	struct virtio_blk *vblk = q->queuedata;
	unsigned int nr_issued = 0;
	struct request *rq;

	for (rq = blk_peek_request(q); rq != NULL; rq = blk_peek_request(q)) {
		BUG_ON(rq->nr_phys_segments + 2 > vblk->sg_elems);

		/*
		 * If this request fails, stop the queue and wait for
		 * something to finish to restart it.
		 */
		if (!do_req(q, vblk, rq)) {
			blk_stop_queue(q);
			break;
		}

		blk_start_request(rq);
		nr_issued++;
	}

	if (nr_issued != 0)
		virtqueue_kick(vblk->vq);
}
/*
 * Slow path for the bio-based submitter: keep retrying virtqueue_add_buf()
 * for @vbr (with @out readable and @in writable sg entries), sleeping on
 * vblk->queue_wait between attempts, until the buffer has been added and the
 * virtqueue kicked.  virtblk_done() wakes this wait queue after completing
 * a bio.
 */
static void virtblk_add_buf_wait(struct virtio_blk *vblk,
struct virtblk_req *vbr,
unsigned long out,
unsigned long in)
{
DEFINE_WAIT(wait);
for (;;) {
/*
 * Register as a waiter before attempting the add - presumably so a
 * wake-up arriving between a failed add and io_schedule() is not lost.
 */
prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
TASK_UNINTERRUPTIBLE);
spin_lock_irq(vblk->disk->queue->queue_lock);
if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
GFP_ATOMIC) < 0) {
/* Still no room: drop the lock and sleep until woken. */
spin_unlock_irq(vblk->disk->queue->queue_lock);
io_schedule();
} else {
virtqueue_kick(vblk->vq);
spin_unlock_irq(vblk->disk->queue->queue_lock);
break;
}
}
finish_wait(&vblk->queue_wait, &wait);
}
/*
 * ->make_request_fn() for the bio-based path (installed in virtblk_probe()
 * when use_bio is set).  Builds a virtblk_req for @bio with the per-request
 * scatterlist laid out as [out_hdr] [data segments] [status], adds it to the
 * virtqueue under the queue lock and kicks the device.  If the virtqueue has
 * no room, falls back to virtblk_add_buf_wait(), which sleeps until it can
 * add the buffer.
 *
 * Bios carrying REQ_FLUSH or REQ_FUA hit a BUG_ON here; note that
 * virtblk_update_cache_mode() passes 0 to blk_queue_flush() when use_bio
 * is set.
 */
static void virtblk_make_request(struct request_queue *q, struct bio *bio)
{
struct virtio_blk *vblk = q->queuedata;
/* out/in count the device-readable / device-writable sg entries. */
unsigned int num, out = 0, in = 0;
struct virtblk_req *vbr;
BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
vbr = virtblk_alloc_req(vblk, GFP_NOIO);
if (!vbr) {
bio_endio(bio, -ENOMEM);
return;
}
vbr->bio = bio;
/* A non-NULL bio marks this as a bio-based command for virtblk_done(). */
vbr->req = NULL;
/* Direction bit (IN/OUT) is OR-ed in below once data is mapped. */
vbr->out_hdr.type = 0;
vbr->out_hdr.sector = bio->bi_sector;
vbr->out_hdr.ioprio = bio_prio(bio);
sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
num = blk_bio_map_sg(q, bio, vbr->sg + out);
/* The status byte is always the last entry. */
sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
sizeof(vbr->status));
if (num) {
if (bio->bi_rw & REQ_WRITE) {
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
out += num;
} else {
vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
in += num;
}
}
spin_lock_irq(vblk->disk->queue->queue_lock);
if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
GFP_ATOMIC) < 0)) {
/* Virtqueue full: drop the lock and retry from the sleeping slow path. */
spin_unlock_irq(vblk->disk->queue->queue_lock);
virtblk_add_buf_wait(vblk, vbr, out, in);
return;
}
virtqueue_kick(vblk->vq);
spin_unlock_irq(vblk->disk->queue->queue_lock);
}
/*
 * Fetch the id (serial number) string of @disk into @id_str.
 *
 * @id_str is mapped as a VIRTIO_BLK_ID_BYTES buffer and submitted as a
 * REQ_TYPE_SPECIAL request.  Returns the result of blk_execute_rq(), or a
 * negative errno if the bio or request could not be set up.
 */
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
	struct virtio_blk *vblk = disk->private_data;
	struct request_queue *q = vblk->disk->queue;
	struct request *rq;
	struct bio *id_bio;
	int ret;

	id_bio = bio_map_kern(q, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
	if (IS_ERR(id_bio))
		return PTR_ERR(id_bio);

	rq = blk_make_request(q, id_bio, GFP_KERNEL);
	if (IS_ERR(rq)) {
		bio_put(id_bio);
		return PTR_ERR(rq);
	}

	rq->cmd_type = REQ_TYPE_SPECIAL;
	ret = blk_execute_rq(q, vblk->disk, rq, false);
	blk_put_request(rq);

	return ret;
}
/*
 * ioctl handler: forward to the generic SCSI ioctl code, but only when the
 * host offers VIRTIO_BLK_F_SCSI; otherwise return -ENOTTY.
 */
static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
			 unsigned int cmd, unsigned long data)
{
	struct virtio_blk *vblk = bdev->bd_disk->private_data;

	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
		return scsi_cmd_blk_ioctl(bdev, mode, cmd,
					  (void __user *)data);

	return -ENOTTY;
}
/*
 * getgeo exists only to please some old bootloader/partitioning tools.
 * Uses the geometry from the host's config space when available, otherwise
 * falls back to fixed values similar to sd.  Always returns 0.
 */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
	struct virtio_blk *vblk = bd->bd_disk->private_data;
	struct virtio_blk_geometry vgeo;
	int err;

	/* see if the host passed in geometry config */
	err = virtio_config_val(vblk->vdev, VIRTIO_BLK_F_GEOMETRY,
				offsetof(struct virtio_blk_config, geometry),
				&vgeo);
	if (err) {
		/* some standard values, similar to sd */
		geo->heads = 1 << 6;
		geo->sectors = 1 << 5;
		geo->cylinders = get_capacity(bd->bd_disk) >> 11;
		return 0;
	}

	geo->heads = vgeo.heads;
	geo->sectors = vgeo.sectors;
	geo->cylinders = vgeo.cylinders;
	return 0;
}
/* Block device operations installed on every virtio-blk disk in virtblk_probe(). */
static const struct block_device_operations virtblk_fops = {
.ioctl = virtblk_ioctl,
.owner = THIS_MODULE,
.getgeo = virtblk_getgeo,
};
/* First minor number of the disk with the given index. */
static int index_to_minor(int index)
{
	int minor = index << PART_BITS;

	return minor;
}
/* Disk index that the given minor number belongs to. */
static int minor_to_index(int minor)
{
	int index = minor >> PART_BITS;

	return index;
}
/*
 * sysfs show handler for the "serial" attribute.  Returns the length of the
 * id string on success, 0 (empty) if fetching it failed with -EIO, or the
 * error from virtblk_get_id() otherwise.
 */
static ssize_t virtblk_serial_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	int err;

	/* sysfs gives us a PAGE_SIZE buffer */
	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);

	/* Terminate up front in case the id fills all VIRTIO_BLK_ID_BYTES. */
	buf[VIRTIO_BLK_ID_BYTES] = '\0';
	err = virtblk_get_id(disk, buf);

	switch (err) {
	case 0:
		return strlen(buf);
	case -EIO:
		/* Unsupported? Make it empty. */
		return 0;
	default:
		return err;
	}
}
DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
static void virtblk_config_changed_work(struct work_struct *work)
{
struct virtio_blk *vblk =
container_of(work, struct virtio_blk, config_work);
struct virtio_device *vdev = vblk->vdev;
struct request_queue *q = vblk->disk->queue;
char cap_str_2[10], cap_str_10[10];
u64 capacity, size;
mutex_lock(&vblk->config_lock);
if (!vblk->config_enable)
goto done;
/* Host must always specify the capacity. */
vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
&capacity, sizeof(capacity));
/* If capacity is too big, truncate with warning. */
if ((sector_t)capacity != capacity) {
dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
(unsigned long long)capacity);
capacity = (sector_t)-1;
}
size = capacity * queue_logical_block_size(q);
string_get_size(size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
string_get_size(size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
dev_notice(&vdev->dev,
"new size: %llu %d-byte logical blocks (%s/%s)\n",
(unsigned long long)capacity,
queue_logical_block_size(q),
cap_str_10, cap_str_2);
set_capacity(vblk->disk, capacity);
revalidate_disk(vblk->disk);
done:
mutex_unlock(&vblk->config_lock);
}
/* Config-change callback: defer the actual handling to the driver workqueue. */
static void virtblk_config_changed(struct virtio_device *vdev)
{
	struct virtio_blk *blk = vdev->priv;

	queue_work(virtblk_wq, &blk->config_work);
}
/*
 * Look up the device's single virtqueue ("requests") and store it in
 * vblk->vq.  Returns 0 on success or the error encoded in the returned
 * pointer.
 */
static int init_vq(struct virtio_blk *vblk)
{
	/* We expect one virtqueue, for output. */
	vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");

	if (IS_ERR(vblk->vq))
		return PTR_ERR(vblk->vq);

	return 0;
}
/*
 * Legacy naming scheme used for virtio devices.  We are stuck with it for
 * virtio blk but don't ever use it for any new driver.
 *
 * Writes "<prefix><letters>" into @buf, where <letters> encodes @index as
 * 0 -> "a", 25 -> "z", 26 -> "aa", 701 -> "zz", 702 -> "aaa", ...
 *
 * @prefix: NUL-terminated name prefix (e.g. "vd")
 * @index:  non-negative device index
 * @buf:    destination buffer
 * @buflen: size of @buf in bytes, including room for the terminating NUL
 *
 * Returns 0 on success, or -EINVAL if @index is negative or the name does
 * not fit in @buflen bytes.  On failure the contents of @buf are unspecified,
 * but nothing outside @buf[0..@buflen-1] is touched.
 */
static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
{
	const int base = 'z' - 'a' + 1;
	const size_t prefix_len = strlen(prefix);
	char *begin, *end, *p;

	/*
	 * Need room for the prefix, at least one letter and the NUL.  Without
	 * this check a too-small buffer makes 'p' start before 'begin', the
	 * loop guard below never triggers, and we write outside the buffer.
	 */
	if (index < 0 || buflen < 0 || (size_t)buflen < prefix_len + 2)
		return -EINVAL;

	begin = buf + prefix_len;
	end = buf + buflen;

	/* Build the letters right-to-left at the end of the buffer. */
	p = end - 1;
	*p = '\0';
	do {
		if (p == begin)
			return -EINVAL;
		*--p = 'a' + (index % base);
		index = (index / base) - 1;
	} while (index >= 0);

	/* Slide the letters (and NUL) down behind the prefix. */
	memmove(begin, p, end - p);
	memcpy(buf, prefix, prefix_len);

	return 0;
}
/*
 * Return the device's cache mode as an index into virtblk_cache_types[].
 * The value is read from the wce config field when VIRTIO_BLK_F_CONFIG_WCE
 * allows it; otherwise it is derived from the VIRTIO_BLK_F_WCE feature bit.
 */
static int virtblk_get_cache_mode(struct virtio_device *vdev)
{
	u8 wce;

	if (virtio_config_val(vdev, VIRTIO_BLK_F_CONFIG_WCE,
			      offsetof(struct virtio_blk_config, wce),
			      &wce))
		wce = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);

	return wce;
}
/*
 * Configure flush support on the queue from the current cache mode:
 * REQ_FLUSH is enabled only for a writeback cache on the request-based
 * path; in every other case flushing is disabled.  Then revalidate the disk.
 */
static void virtblk_update_cache_mode(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	u8 writeback = virtblk_get_cache_mode(vdev);
	unsigned int flush = (writeback && !use_bio) ? REQ_FLUSH : 0;

	blk_queue_flush(vblk->disk->queue, flush);
	revalidate_disk(vblk->disk);
}
/*
 * Names accepted/reported by the cache_type attribute, indexed by the value
 * returned from virtblk_get_cache_mode() (0 = write through, 1 = write back).
 */
static const char *const virtblk_cache_types[] = {
"write through", "write back"
};
/*
 * sysfs store handler for "cache_type": match the written string against
 * virtblk_cache_types[], write the matching index to the wce config field
 * and refresh the queue's flush setting.  Returns @count on success or
 * -EINVAL if the string is not a known cache type.
 */
static ssize_t
virtblk_cache_type_store(struct device *dev, struct device_attribute *attr,
			 const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct virtio_blk *vblk = disk->private_data;
	struct virtio_device *vdev = vblk->vdev;
	const int nr_types = ARRAY_SIZE(virtblk_cache_types);
	int mode;
	u8 writeback;

	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));

	/* The type names are distinct, so at most one entry can match. */
	for (mode = 0; mode < nr_types; mode++) {
		if (sysfs_streq(buf, virtblk_cache_types[mode]))
			break;
	}
	if (mode == nr_types)
		return -EINVAL;

	writeback = mode;
	vdev->config->set(vdev,
			  offsetof(struct virtio_blk_config, wce),
			  &writeback, sizeof(writeback));

	virtblk_update_cache_mode(vdev);
	return count;
}
static ssize_t
virtblk_cache_type_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
struct virtio_blk *vblk = disk->private_data;
u8 writeback = virtblk_get_cache_mode(vblk->vdev);
BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
}
/*
 * Two flavours of the "cache_type" attribute.  virtblk_probe() creates the
 * writable one when the device offers VIRTIO_BLK_F_CONFIG_WCE and the
 * read-only one otherwise.
 */
static const struct device_attribute dev_attr_cache_type_ro =
__ATTR(cache_type, S_IRUGO,
virtblk_cache_type_show, NULL);
static const struct device_attribute dev_attr_cache_type_rw =
__ATTR(cache_type, S_IRUGO|S_IWUSR,
virtblk_cache_type_show, virtblk_cache_type_store);
/*
 * Probe a virtio block device.
 *
 * Allocates a disk index, the per-device state (sized for the host's segment
 * limit plus two), the virtqueue, the request pool, the gendisk and its
 * queue; applies the limits and topology advertised in the config space;
 * registers the disk and creates its sysfs attributes.
 *
 * Returns 0 on success or a negative errno; on failure everything set up so
 * far is torn down through the goto ladder at the end.
 */
static int __devinit virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
struct request_queue *q;
int err, index;
int pool_size;
u64 cap;
u32 v, blk_size, sg_elems, opt_io_size;
u16 min_io_size;
u8 physical_block_exp, alignment_offset;
/* Reserve a disk index; it determines the disk name and first minor. */
err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
GFP_KERNEL);
if (err < 0)
goto out;
index = err;
/* We need to know how many segments before we allocate. */
err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX,
offsetof(struct virtio_blk_config, seg_max),
&sg_elems);
/* We need at least one SG element, whatever they say. */
if (err || !sg_elems)
sg_elems = 1;
/* We need an extra sg elements at head and tail. */
sg_elems += 2;
/* The device-wide scatterlist is allocated inline at the end of vblk. */
vdev->priv = vblk = kmalloc(sizeof(*vblk) +
sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL);
if (!vblk) {
err = -ENOMEM;
goto out_free_index;
}
init_waitqueue_head(&vblk->queue_wait);
vblk->vdev = vdev;
vblk->sg_elems = sg_elems;
sg_init_table(vblk->sg, vblk->sg_elems);
mutex_init(&vblk->config_lock);
INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
vblk->config_enable = true;
err = init_vq(vblk);
if (err)
goto out_free_vblk;
/* On the bio path each request carries its own scatterlist. */
pool_size = sizeof(struct virtblk_req);
if (use_bio)
pool_size += sizeof(struct scatterlist) * sg_elems;
vblk->pool = mempool_create_kmalloc_pool(1, pool_size);
if (!vblk->pool) {
err = -ENOMEM;
goto out_free_vq;
}
/* FIXME: How many partitions? How long is a piece of string? */
vblk->disk = alloc_disk(1 << PART_BITS);
if (!vblk->disk) {
err = -ENOMEM;
goto out_mempool;
}
q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL);
if (!q) {
err = -ENOMEM;
goto out_put_disk;
}
/* Optionally replace the request-based entry point with the bio-based one. */
if (use_bio)
blk_queue_make_request(q, virtblk_make_request);
q->queuedata = vblk;
/* NOTE(review): the return value of virtblk_name_format() is not checked here. */
virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
vblk->disk->major = major;
vblk->disk->first_minor = index_to_minor(index);
vblk->disk->private_data = vblk;
vblk->disk->fops = &virtblk_fops;
vblk->disk->driverfs_dev = &vdev->dev;
vblk->index = index;
/* configure queue flush support */
virtblk_update_cache_mode(vdev);
/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
set_disk_ro(vblk->disk, 1);
/* Host must always specify the capacity. */
vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
&cap, sizeof(cap));
/* If capacity is too big, truncate with warning. */
if ((sector_t)cap != cap) {
dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
(unsigned long long)cap);
cap = (sector_t)-1;
}
set_capacity(vblk->disk, cap);
/* We can handle whatever the host told us to handle. */
blk_queue_max_segments(q, vblk->sg_elems-2);
/* No need to bounce any requests */
blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
/* No real sector limit. */
blk_queue_max_hw_sectors(q, -1U);
/* Host can optionally specify maximum segment size and number of
 * segments. */
err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX,
offsetof(struct virtio_blk_config, size_max),
&v);
if (!err)
blk_queue_max_segment_size(q, v);
else
blk_queue_max_segment_size(q, -1U);
/* Host can optionally specify the block size of the device */
err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE,
offsetof(struct virtio_blk_config, blk_size),
&blk_size);
if (!err)
blk_queue_logical_block_size(q, blk_size);
else
blk_size = queue_logical_block_size(q);
/* Use topology information if available */
/* Each topology field is applied only when it reads back successfully and is non-zero. */
err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
offsetof(struct virtio_blk_config, physical_block_exp),
&physical_block_exp);
if (!err && physical_block_exp)
blk_queue_physical_block_size(q,
blk_size * (1 << physical_block_exp));
err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
offsetof(struct virtio_blk_config, alignment_offset),
&alignment_offset);
if (!err && alignment_offset)
blk_queue_alignment_offset(q, blk_size * alignment_offset);
err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
offsetof(struct virtio_blk_config, min_io_size),
&min_io_size);
if (!err && min_io_size)
blk_queue_io_min(q, blk_size * min_io_size);
err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
offsetof(struct virtio_blk_config, opt_io_size),
&opt_io_size);
if (!err && opt_io_size)
blk_queue_io_opt(q, blk_size * opt_io_size);
add_disk(vblk->disk);
err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
if (err)
goto out_del_disk;
/* cache_type is writable only if the host lets us change the wce field. */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
err = device_create_file(disk_to_dev(vblk->disk),
&dev_attr_cache_type_rw);
else
err = device_create_file(disk_to_dev(vblk->disk),
&dev_attr_cache_type_ro);
if (err)
goto out_del_disk;
return 0;
/* Error unwind: each label undoes one setup step, in reverse order. */
out_del_disk:
del_gendisk(vblk->disk);
blk_cleanup_queue(vblk->disk->queue);
out_put_disk:
put_disk(vblk->disk);
out_mempool:
mempool_destroy(vblk->pool);
out_free_vq:
vdev->config->del_vqs(vdev);
out_free_vblk:
kfree(vblk);
out_free_index:
ida_simple_remove(&vd_index_ida, index);
out:
return err;
}
/*
 * Remove a virtio block device: disable config updates, unregister the disk
 * and clean up its queue, reset the device, wait for pending config work,
 * then release the disk, pool, virtqueues, per-device state and disk index.
 */
static void __devexit virtblk_remove(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
/* Saved now because vblk is freed before the index is released. */
int index = vblk->index;
/* Prevent config work handler from accessing the device. */
mutex_lock(&vblk->config_lock);
vblk->config_enable = false;
mutex_unlock(&vblk->config_lock);
del_gendisk(vblk->disk);
blk_cleanup_queue(vblk->disk->queue);
/* Stop all the virtqueues. */
vdev->config->reset(vdev);
flush_work(&vblk->config_work);
put_disk(vblk->disk);
mempool_destroy(vblk->pool);
vdev->config->del_vqs(vdev);
kfree(vblk);
ida_simple_remove(&vd_index_ida, index);
}
#ifdef CONFIG_PM
/*
 * Power-management freeze: reset the device, disable and flush config work,
 * stop and sync the request queue, and delete the virtqueues.  Always
 * returns 0.  virtblk_restore() re-creates the virtqueue and restarts the
 * queue.
 */
static int virtblk_freeze(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
/* Ensure we don't receive any more interrupts */
vdev->config->reset(vdev);
/* Prevent config work handler from accessing the device. */
mutex_lock(&vblk->config_lock);
vblk->config_enable = false;
mutex_unlock(&vblk->config_lock);
flush_work(&vblk->config_work);
/* blk_stop_queue() is called with the queue lock held. */
spin_lock_irq(vblk->disk->queue->queue_lock);
blk_stop_queue(vblk->disk->queue);
spin_unlock_irq(vblk->disk->queue->queue_lock);
blk_sync_queue(vblk->disk->queue);
vdev->config->del_vqs(vdev);
return 0;
}
/*
 * Power-management restore: re-enable config updates, re-create the
 * virtqueue and, if that succeeded, restart the request queue.
 * Returns 0 on success or the error from init_vq().
 */
static int virtblk_restore(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int err;

	vblk->config_enable = true;

	err = init_vq(vblk);
	if (err)
		return err;

	spin_lock_irq(vblk->disk->queue->queue_lock);
	blk_start_queue(vblk->disk->queue);
	spin_unlock_irq(vblk->disk->queue->queue_lock);

	return 0;
}
#endif
/* Devices this driver binds to: any virtio block device; zero-terminated. */
static const struct virtio_device_id id_table[] = {
{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
{ 0 },
};
/* Feature bits listed in the driver's feature table (see virtio_blk below). */
static unsigned int features[] = {
VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE
};
/*
 * Driver descriptor registered with the virtio core in init().
 *
 * virtio_blk causes spurious section mismatch warning by
 * simultaneously referring to a __devinit and a __devexit function.
 * Use __refdata to avoid this warning.
 */
static struct virtio_driver __refdata virtio_blk = {
.feature_table = features,
.feature_table_size = ARRAY_SIZE(features),
.driver.name = KBUILD_MODNAME,
.driver.owner = THIS_MODULE,
.id_table = id_table,
.probe = virtblk_probe,
.remove = __devexit_p(virtblk_remove),
.config_changed = virtblk_config_changed,
/* Freeze/restore hooks are only built when power management is configured. */
#ifdef CONFIG_PM
.freeze = virtblk_freeze,
.restore = virtblk_restore,
#endif
};
/*
 * Module init: create the config-change workqueue, obtain a dynamic block
 * major, and register the virtio driver.  Returns 0 on success; on failure
 * undoes the steps already taken and returns a negative errno.
 */
static int __init init(void)
{
	int ret;

	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
	if (virtblk_wq == NULL)
		return -ENOMEM;

	major = register_blkdev(0, "virtblk");
	if (major < 0) {
		ret = major;
		goto err_destroy_wq;
	}

	ret = register_virtio_driver(&virtio_blk);
	if (ret == 0)
		return 0;

	unregister_blkdev(major, "virtblk");
err_destroy_wq:
	destroy_workqueue(virtblk_wq);
	return ret;
}
/*
 * Module exit: undo init() in reverse order.
 *
 * The virtio driver is unregistered first so that no probe callback can run
 * (and hand out the block major to a new disk) after the major has been
 * released.  Only then are the major and the workqueue given back.
 */
static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_blk);
	unregister_blkdev(major, "virtblk");
	destroy_workqueue(virtblk_wq);
}
/* Module entry/exit points and metadata. */
module_init(init);
module_exit(fini);
MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");