PageRenderTime 111ms CodeModel.GetById 23ms app.highlight 76ms RepoModel.GetById 1ms app.codeStats 1ms

/arch/um/drivers/ubd_kern.c

http://github.com/mirrors/linux
C | 1623 lines | 1286 code | 243 blank | 94 comment | 233 complexity | 7786ea95ed0e4c47b438f791f0b26877 MD5 | raw file
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2018 Cambridge Greys Ltd
   4 * Copyright (C) 2015-2016 Anton Ivanov (aivanov@brocade.com)
   5 * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
   6 */
   7
   8/* 2001-09-28...2002-04-17
   9 * Partition stuff by James_McMechan@hotmail.com
  10 * old style ubd by setting UBD_SHIFT to 0
  11 * 2002-09-27...2002-10-18 massive tinkering for 2.5
  12 * partitions have changed in 2.5
  13 * 2003-01-29 more tinkering for 2.5.59-1
  14 * This should now address the sysfs problems and has
  15 * the symlink for devfs to allow for booting with
  16 * the common /dev/ubd/discX/... names rather than
  17 * only /dev/ubdN/discN this version also has lots of
  18 * clean ups preparing for ubd-many.
  19 * James McMechan
  20 */
  21
  22#define UBD_SHIFT 4
  23
  24#include <linux/module.h>
  25#include <linux/init.h>
  26#include <linux/blkdev.h>
  27#include <linux/blk-mq.h>
  28#include <linux/ata.h>
  29#include <linux/hdreg.h>
  30#include <linux/cdrom.h>
  31#include <linux/proc_fs.h>
  32#include <linux/seq_file.h>
  33#include <linux/ctype.h>
  34#include <linux/slab.h>
  35#include <linux/vmalloc.h>
  36#include <linux/platform_device.h>
  37#include <linux/scatterlist.h>
  38#include <asm/tlbflush.h>
  39#include <kern_util.h>
  40#include "mconsole_kern.h"
  41#include <init.h>
  42#include <irq_kern.h>
  43#include "ubd.h"
  44#include <os.h>
  45#include "cow.h"
  46
/* Max request size in sectors: one sector per bit of the unsigned long
 * sector_mask, i.e. 64 sectors (32K) when long is 64 bits wide.
 */
  48#define UBD_MAX_REQUEST (8 * sizeof(long))
  49
  50struct io_thread_req {
  51	struct request *req;
  52	int fds[2];
  53	unsigned long offsets[2];
  54	unsigned long long offset;
  55	unsigned long length;
  56	char *buffer;
  57	int sectorsize;
  58	unsigned long sector_mask;
  59	unsigned long long cow_offset;
  60	unsigned long bitmap_words[2];
  61	int error;
  62};
  63
  64
  65static struct io_thread_req * (*irq_req_buffer)[];
  66static struct io_thread_req *irq_remainder;
  67static int irq_remainder_size;
  68
  69static struct io_thread_req * (*io_req_buffer)[];
  70static struct io_thread_req *io_remainder;
  71static int io_remainder_size;
  72
  73
  74
  75static inline int ubd_test_bit(__u64 bit, unsigned char *data)
  76{
  77	__u64 n;
  78	int bits, off;
  79
  80	bits = sizeof(data[0]) * 8;
  81	n = bit / bits;
  82	off = bit % bits;
  83	return (data[n] & (1 << off)) != 0;
  84}
  85
  86static inline void ubd_set_bit(__u64 bit, unsigned char *data)
  87{
  88	__u64 n;
  89	int bits, off;
  90
  91	bits = sizeof(data[0]) * 8;
  92	n = bit / bits;
  93	off = bit % bits;
  94	data[n] |= (1 << off);
  95}
  96/*End stuff from ubd_user.h*/
  97
  98#define DRIVER_NAME "uml-blkdev"
  99
 100static DEFINE_MUTEX(ubd_lock);
 101static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */
 102
 103static int ubd_open(struct block_device *bdev, fmode_t mode);
 104static void ubd_release(struct gendisk *disk, fmode_t mode);
 105static int ubd_ioctl(struct block_device *bdev, fmode_t mode,
 106		     unsigned int cmd, unsigned long arg);
 107static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 108
 109#define MAX_DEV (16)
 110
 111static const struct block_device_operations ubd_blops = {
 112        .owner		= THIS_MODULE,
 113        .open		= ubd_open,
 114        .release	= ubd_release,
 115        .ioctl		= ubd_ioctl,
 116        .compat_ioctl	= blkdev_compat_ptr_ioctl,
 117	.getgeo		= ubd_getgeo,
 118};
 119
 120/* Protected by ubd_lock */
 121static int fake_major = UBD_MAJOR;
 122static struct gendisk *ubd_gendisk[MAX_DEV];
 123static struct gendisk *fake_gendisk[MAX_DEV];
 124
 125#ifdef CONFIG_BLK_DEV_UBD_SYNC
 126#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \
 127					 .cl = 1 })
 128#else
 129#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 0, .c = 0, \
 130					 .cl = 1 })
 131#endif
 132static struct openflags global_openflags = OPEN_FLAGS;
 133
 134struct cow {
 135	/* backing file name */
 136	char *file;
 137	/* backing file fd */
 138	int fd;
 139	unsigned long *bitmap;
 140	unsigned long bitmap_len;
 141	int bitmap_offset;
 142	int data_offset;
 143};
 144
 145#define MAX_SG 64
 146
 147struct ubd {
 148	/* name (and fd, below) of the file opened for writing, either the
 149	 * backing or the cow file. */
 150	char *file;
 151	int count;
 152	int fd;
 153	__u64 size;
 154	struct openflags boot_openflags;
 155	struct openflags openflags;
 156	unsigned shared:1;
 157	unsigned no_cow:1;
 158	unsigned no_trim:1;
 159	struct cow cow;
 160	struct platform_device pdev;
 161	struct request_queue *queue;
 162	struct blk_mq_tag_set tag_set;
 163	spinlock_t lock;
 164};
 165
 166#define DEFAULT_COW { \
 167	.file =			NULL, \
 168	.fd =			-1,	\
 169	.bitmap =		NULL, \
 170	.bitmap_offset =	0, \
 171	.data_offset =		0, \
 172}
 173
 174#define DEFAULT_UBD { \
 175	.file = 		NULL, \
 176	.count =		0, \
 177	.fd =			-1, \
 178	.size =			-1, \
 179	.boot_openflags =	OPEN_FLAGS, \
 180	.openflags =		OPEN_FLAGS, \
 181	.no_cow =               0, \
 182	.no_trim =		0, \
 183	.shared =		0, \
 184	.cow =			DEFAULT_COW, \
 185	.lock =			__SPIN_LOCK_UNLOCKED(ubd_devs.lock), \
 186}
 187
 188/* Protected by ubd_lock */
 189static struct ubd ubd_devs[MAX_DEV] = { [0 ... MAX_DEV - 1] = DEFAULT_UBD };
 190
 191/* Only changed by fake_ide_setup which is a setup */
 192static int fake_ide = 0;
 193static struct proc_dir_entry *proc_ide_root = NULL;
 194static struct proc_dir_entry *proc_ide = NULL;
 195
 196static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
 197				 const struct blk_mq_queue_data *bd);
 198
 199static void make_proc_ide(void)
 200{
 201	proc_ide_root = proc_mkdir("ide", NULL);
 202	proc_ide = proc_mkdir("ide0", proc_ide_root);
 203}
 204
 205static int fake_ide_media_proc_show(struct seq_file *m, void *v)
 206{
 207	seq_puts(m, "disk\n");
 208	return 0;
 209}
 210
 211static void make_ide_entries(const char *dev_name)
 212{
 213	struct proc_dir_entry *dir, *ent;
 214	char name[64];
 215
 216	if(proc_ide_root == NULL) make_proc_ide();
 217
 218	dir = proc_mkdir(dev_name, proc_ide);
 219	if(!dir) return;
 220
 221	ent = proc_create_single("media", S_IRUGO, dir,
 222			fake_ide_media_proc_show);
 223	if(!ent) return;
 224	snprintf(name, sizeof(name), "ide0/%s", dev_name);
 225	proc_symlink(dev_name, proc_ide_root, name);
 226}
 227
 228static int fake_ide_setup(char *str)
 229{
 230	fake_ide = 1;
 231	return 1;
 232}
 233
 234__setup("fake_ide", fake_ide_setup);
 235
 236__uml_help(fake_ide_setup,
 237"fake_ide\n"
 238"    Create ide0 entries that map onto ubd devices.\n\n"
 239);
 240
 241static int parse_unit(char **ptr)
 242{
 243	char *str = *ptr, *end;
 244	int n = -1;
 245
 246	if(isdigit(*str)) {
 247		n = simple_strtoul(str, &end, 0);
 248		if(end == str)
 249			return -1;
 250		*ptr = end;
 251	}
 252	else if (('a' <= *str) && (*str <= 'z')) {
 253		n = *str - 'a';
 254		str++;
 255		*ptr = str;
 256	}
 257	return n;
 258}
 259
 260/* If *index_out == -1 at exit, the passed option was a general one;
 261 * otherwise, the str pointer is used (and owned) inside ubd_devs array, so it
 262 * should not be freed on exit.
 263 */
 264static int ubd_setup_common(char *str, int *index_out, char **error_out)
 265{
 266	struct ubd *ubd_dev;
 267	struct openflags flags = global_openflags;
 268	char *backing_file;
 269	int n, err = 0, i;
 270
 271	if(index_out) *index_out = -1;
 272	n = *str;
 273	if(n == '='){
 274		char *end;
 275		int major;
 276
 277		str++;
 278		if(!strcmp(str, "sync")){
 279			global_openflags = of_sync(global_openflags);
 280			return err;
 281		}
 282
 283		err = -EINVAL;
 284		major = simple_strtoul(str, &end, 0);
 285		if((*end != '\0') || (end == str)){
 286			*error_out = "Didn't parse major number";
 287			return err;
 288		}
 289
 290		mutex_lock(&ubd_lock);
 291		if (fake_major != UBD_MAJOR) {
 292			*error_out = "Can't assign a fake major twice";
 293			goto out1;
 294		}
 295
 296		fake_major = major;
 297
 298		printk(KERN_INFO "Setting extra ubd major number to %d\n",
 299		       major);
 300		err = 0;
 301	out1:
 302		mutex_unlock(&ubd_lock);
 303		return err;
 304	}
 305
 306	n = parse_unit(&str);
 307	if(n < 0){
 308		*error_out = "Couldn't parse device number";
 309		return -EINVAL;
 310	}
 311	if(n >= MAX_DEV){
 312		*error_out = "Device number out of range";
 313		return 1;
 314	}
 315
 316	err = -EBUSY;
 317	mutex_lock(&ubd_lock);
 318
 319	ubd_dev = &ubd_devs[n];
 320	if(ubd_dev->file != NULL){
 321		*error_out = "Device is already configured";
 322		goto out;
 323	}
 324
 325	if (index_out)
 326		*index_out = n;
 327
 328	err = -EINVAL;
 329	for (i = 0; i < sizeof("rscdt="); i++) {
 330		switch (*str) {
 331		case 'r':
 332			flags.w = 0;
 333			break;
 334		case 's':
 335			flags.s = 1;
 336			break;
 337		case 'd':
 338			ubd_dev->no_cow = 1;
 339			break;
 340		case 'c':
 341			ubd_dev->shared = 1;
 342			break;
 343		case 't':
 344			ubd_dev->no_trim = 1;
 345			break;
 346		case '=':
 347			str++;
 348			goto break_loop;
 349		default:
 350			*error_out = "Expected '=' or flag letter "
 351				"(r, s, c, t or d)";
 352			goto out;
 353		}
 354		str++;
 355	}
 356
 357	if (*str == '=')
 358		*error_out = "Too many flags specified";
 359	else
 360		*error_out = "Missing '='";
 361	goto out;
 362
 363break_loop:
 364	backing_file = strchr(str, ',');
 365
 366	if (backing_file == NULL)
 367		backing_file = strchr(str, ':');
 368
 369	if(backing_file != NULL){
 370		if(ubd_dev->no_cow){
 371			*error_out = "Can't specify both 'd' and a cow file";
 372			goto out;
 373		}
 374		else {
 375			*backing_file = '\0';
 376			backing_file++;
 377		}
 378	}
 379	err = 0;
 380	ubd_dev->file = str;
 381	ubd_dev->cow.file = backing_file;
 382	ubd_dev->boot_openflags = flags;
 383out:
 384	mutex_unlock(&ubd_lock);
 385	return err;
 386}
 387
 388static int ubd_setup(char *str)
 389{
 390	char *error;
 391	int err;
 392
 393	err = ubd_setup_common(str, NULL, &error);
 394	if(err)
 395		printk(KERN_ERR "Failed to initialize device with \"%s\" : "
 396		       "%s\n", str, error);
 397	return 1;
 398}
 399
 400__setup("ubd", ubd_setup);
 401__uml_help(ubd_setup,
 402"ubd<n><flags>=<filename>[(:|,)<filename2>]\n"
 403"    This is used to associate a device with a file in the underlying\n"
 404"    filesystem. When specifying two filenames, the first one is the\n"
 405"    COW name and the second is the backing file name. As separator you can\n"
 406"    use either a ':' or a ',': the first one allows writing things like;\n"
 407"	ubd0=~/Uml/root_cow:~/Uml/root_backing_file\n"
 408"    while with a ',' the shell would not expand the 2nd '~'.\n"
 409"    When using only one filename, UML will detect whether to treat it like\n"
 410"    a COW file or a backing file. To override this detection, add the 'd'\n"
 411"    flag:\n"
 412"	ubd0d=BackingFile\n"
 413"    Usually, there is a filesystem in the file, but \n"
 414"    that's not required. Swap devices containing swap files can be\n"
 415"    specified like this. Also, a file which doesn't contain a\n"
 416"    filesystem can have its contents read in the virtual \n"
 417"    machine by running 'dd' on the device. <n> must be in the range\n"
 418"    0 to 7. Appending an 'r' to the number will cause that device\n"
 419"    to be mounted read-only. For example ubd1r=./ext_fs. Appending\n"
 420"    an 's' will cause data to be written to disk on the host immediately.\n"
 421"    'c' will cause the device to be treated as being shared between multiple\n"
 422"    UMLs and file locking will be turned off - this is appropriate for a\n"
 423"    cluster filesystem and inappropriate at almost all other times.\n\n"
 424"    't' will disable trim/discard support on the device (enabled by default).\n\n"
 425);
 426
 427static int udb_setup(char *str)
 428{
 429	printk("udb%s specified on command line is almost certainly a ubd -> "
 430	       "udb TYPO\n", str);
 431	return 1;
 432}
 433
 434__setup("udb", udb_setup);
 435__uml_help(udb_setup,
 436"udb\n"
 437"    This option is here solely to catch ubd -> udb typos, which can be\n"
 438"    to impossible to catch visually unless you specifically look for\n"
 439"    them.  The only result of any option starting with 'udb' is an error\n"
 440"    in the boot output.\n\n"
 441);
 442
/* Only changed by ubd_driver_init, which is an initcall. */
 444static int thread_fd = -1;
 445
 446/* Function to read several request pointers at a time
 447* handling fractional reads if (and as) needed
 448*/
 449
 450static int bulk_req_safe_read(
 451	int fd,
 452	struct io_thread_req * (*request_buffer)[],
 453	struct io_thread_req **remainder,
 454	int *remainder_size,
 455	int max_recs
 456	)
 457{
 458	int n = 0;
 459	int res = 0;
 460
 461	if (*remainder_size > 0) {
 462		memmove(
 463			(char *) request_buffer,
 464			(char *) remainder, *remainder_size
 465		);
 466		n = *remainder_size;
 467	}
 468
 469	res = os_read_file(
 470			fd,
 471			((char *) request_buffer) + *remainder_size,
 472			sizeof(struct io_thread_req *)*max_recs
 473				- *remainder_size
 474		);
 475	if (res > 0) {
 476		n += res;
 477		if ((n % sizeof(struct io_thread_req *)) > 0) {
 478			/*
 479			* Read somehow returned not a multiple of dword
 480			* theoretically possible, but never observed in the
 481			* wild, so read routine must be able to handle it
 482			*/
 483			*remainder_size = n % sizeof(struct io_thread_req *);
 484			WARN(*remainder_size > 0, "UBD IPC read returned a partial result");
 485			memmove(
 486				remainder,
 487				((char *) request_buffer) +
 488					(n/sizeof(struct io_thread_req *))*sizeof(struct io_thread_req *),
 489				*remainder_size
 490			);
 491			n = n - *remainder_size;
 492		}
 493	} else {
 494		n = res;
 495	}
 496	return n;
 497}
 498
 499/* Called without dev->lock held, and only in interrupt context. */
 500static void ubd_handler(void)
 501{
 502	int n;
 503	int count;
 504
 505	while(1){
 506		n = bulk_req_safe_read(
 507			thread_fd,
 508			irq_req_buffer,
 509			&irq_remainder,
 510			&irq_remainder_size,
 511			UBD_REQ_BUFFER_SIZE
 512		);
 513		if (n < 0) {
 514			if(n == -EAGAIN)
 515				break;
 516			printk(KERN_ERR "spurious interrupt in ubd_handler, "
 517			       "err = %d\n", -n);
 518			return;
 519		}
 520		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
 521			struct io_thread_req *io_req = (*irq_req_buffer)[count];
 522
 523			if ((io_req->error == BLK_STS_NOTSUPP) && (req_op(io_req->req) == REQ_OP_DISCARD)) {
 524				blk_queue_max_discard_sectors(io_req->req->q, 0);
 525				blk_queue_max_write_zeroes_sectors(io_req->req->q, 0);
 526				blk_queue_flag_clear(QUEUE_FLAG_DISCARD, io_req->req->q);
 527			}
 528			if ((io_req->error) || (io_req->buffer == NULL))
 529				blk_mq_end_request(io_req->req, io_req->error);
 530			else {
 531				if (!blk_update_request(io_req->req, io_req->error, io_req->length))
 532					__blk_mq_end_request(io_req->req, io_req->error);
 533			}
 534			kfree(io_req);
 535		}
 536	}
 537}
 538
 539static irqreturn_t ubd_intr(int irq, void *dev)
 540{
 541	ubd_handler();
 542	return IRQ_HANDLED;
 543}
 544
/* Only changed by ubd_driver_init, which is an initcall. */
 546static int io_pid = -1;
 547
 548static void kill_io_thread(void)
 549{
 550	if(io_pid != -1)
 551		os_kill_process(io_pid, 1);
 552}
 553
 554__uml_exitcall(kill_io_thread);
 555
 556static inline int ubd_file_size(struct ubd *ubd_dev, __u64 *size_out)
 557{
 558	char *file;
 559	int fd;
 560	int err;
 561
 562	__u32 version;
 563	__u32 align;
 564	char *backing_file;
 565	time64_t mtime;
 566	unsigned long long size;
 567	int sector_size;
 568	int bitmap_offset;
 569
 570	if (ubd_dev->file && ubd_dev->cow.file) {
 571		file = ubd_dev->cow.file;
 572
 573		goto out;
 574	}
 575
 576	fd = os_open_file(ubd_dev->file, of_read(OPENFLAGS()), 0);
 577	if (fd < 0)
 578		return fd;
 579
 580	err = read_cow_header(file_reader, &fd, &version, &backing_file, \
 581		&mtime, &size, &sector_size, &align, &bitmap_offset);
 582	os_close_file(fd);
 583
 584	if(err == -EINVAL)
 585		file = ubd_dev->file;
 586	else
 587		file = backing_file;
 588
 589out:
 590	return os_file_size(file, size_out);
 591}
 592
 593static int read_cow_bitmap(int fd, void *buf, int offset, int len)
 594{
 595	int err;
 596
 597	err = os_pread_file(fd, buf, len, offset);
 598	if (err < 0)
 599		return err;
 600
 601	return 0;
 602}
 603
 604static int backing_file_mismatch(char *file, __u64 size, time64_t mtime)
 605{
 606	time64_t modtime;
 607	unsigned long long actual;
 608	int err;
 609
 610	err = os_file_modtime(file, &modtime);
 611	if (err < 0) {
 612		printk(KERN_ERR "Failed to get modification time of backing "
 613		       "file \"%s\", err = %d\n", file, -err);
 614		return err;
 615	}
 616
 617	err = os_file_size(file, &actual);
 618	if (err < 0) {
 619		printk(KERN_ERR "Failed to get size of backing file \"%s\", "
 620		       "err = %d\n", file, -err);
 621		return err;
 622	}
 623
 624	if (actual != size) {
 625		/*__u64 can be a long on AMD64 and with %lu GCC complains; so
 626		 * the typecast.*/
 627		printk(KERN_ERR "Size mismatch (%llu vs %llu) of COW header "
 628		       "vs backing file\n", (unsigned long long) size, actual);
 629		return -EINVAL;
 630	}
 631	if (modtime != mtime) {
 632		printk(KERN_ERR "mtime mismatch (%lld vs %lld) of COW header vs "
 633		       "backing file\n", mtime, modtime);
 634		return -EINVAL;
 635	}
 636	return 0;
 637}
 638
 639static int path_requires_switch(char *from_cmdline, char *from_cow, char *cow)
 640{
 641	struct uml_stat buf1, buf2;
 642	int err;
 643
 644	if (from_cmdline == NULL)
 645		return 0;
 646	if (!strcmp(from_cmdline, from_cow))
 647		return 0;
 648
 649	err = os_stat_file(from_cmdline, &buf1);
 650	if (err < 0) {
 651		printk(KERN_ERR "Couldn't stat '%s', err = %d\n", from_cmdline,
 652		       -err);
 653		return 0;
 654	}
 655	err = os_stat_file(from_cow, &buf2);
 656	if (err < 0) {
 657		printk(KERN_ERR "Couldn't stat '%s', err = %d\n", from_cow,
 658		       -err);
 659		return 1;
 660	}
 661	if ((buf1.ust_dev == buf2.ust_dev) && (buf1.ust_ino == buf2.ust_ino))
 662		return 0;
 663
 664	printk(KERN_ERR "Backing file mismatch - \"%s\" requested, "
 665	       "\"%s\" specified in COW header of \"%s\"\n",
 666	       from_cmdline, from_cow, cow);
 667	return 1;
 668}
 669
 670static int open_ubd_file(char *file, struct openflags *openflags, int shared,
 671		  char **backing_file_out, int *bitmap_offset_out,
 672		  unsigned long *bitmap_len_out, int *data_offset_out,
 673		  int *create_cow_out)
 674{
 675	time64_t mtime;
 676	unsigned long long size;
 677	__u32 version, align;
 678	char *backing_file;
 679	int fd, err, sectorsize, asked_switch, mode = 0644;
 680
 681	fd = os_open_file(file, *openflags, mode);
 682	if (fd < 0) {
 683		if ((fd == -ENOENT) && (create_cow_out != NULL))
 684			*create_cow_out = 1;
 685		if (!openflags->w ||
 686		    ((fd != -EROFS) && (fd != -EACCES)))
 687			return fd;
 688		openflags->w = 0;
 689		fd = os_open_file(file, *openflags, mode);
 690		if (fd < 0)
 691			return fd;
 692	}
 693
 694	if (shared)
 695		printk(KERN_INFO "Not locking \"%s\" on the host\n", file);
 696	else {
 697		err = os_lock_file(fd, openflags->w);
 698		if (err < 0) {
 699			printk(KERN_ERR "Failed to lock '%s', err = %d\n",
 700			       file, -err);
 701			goto out_close;
 702		}
 703	}
 704
 705	/* Successful return case! */
 706	if (backing_file_out == NULL)
 707		return fd;
 708
 709	err = read_cow_header(file_reader, &fd, &version, &backing_file, &mtime,
 710			      &size, &sectorsize, &align, bitmap_offset_out);
 711	if (err && (*backing_file_out != NULL)) {
 712		printk(KERN_ERR "Failed to read COW header from COW file "
 713		       "\"%s\", errno = %d\n", file, -err);
 714		goto out_close;
 715	}
 716	if (err)
 717		return fd;
 718
 719	asked_switch = path_requires_switch(*backing_file_out, backing_file,
 720					    file);
 721
 722	/* Allow switching only if no mismatch. */
 723	if (asked_switch && !backing_file_mismatch(*backing_file_out, size,
 724						   mtime)) {
 725		printk(KERN_ERR "Switching backing file to '%s'\n",
 726		       *backing_file_out);
 727		err = write_cow_header(file, fd, *backing_file_out,
 728				       sectorsize, align, &size);
 729		if (err) {
 730			printk(KERN_ERR "Switch failed, errno = %d\n", -err);
 731			goto out_close;
 732		}
 733	} else {
 734		*backing_file_out = backing_file;
 735		err = backing_file_mismatch(*backing_file_out, size, mtime);
 736		if (err)
 737			goto out_close;
 738	}
 739
 740	cow_sizes(version, size, sectorsize, align, *bitmap_offset_out,
 741		  bitmap_len_out, data_offset_out);
 742
 743	return fd;
 744 out_close:
 745	os_close_file(fd);
 746	return err;
 747}
 748
 749static int create_cow_file(char *cow_file, char *backing_file,
 750		    struct openflags flags,
 751		    int sectorsize, int alignment, int *bitmap_offset_out,
 752		    unsigned long *bitmap_len_out, int *data_offset_out)
 753{
 754	int err, fd;
 755
 756	flags.c = 1;
 757	fd = open_ubd_file(cow_file, &flags, 0, NULL, NULL, NULL, NULL, NULL);
 758	if (fd < 0) {
 759		err = fd;
 760		printk(KERN_ERR "Open of COW file '%s' failed, errno = %d\n",
 761		       cow_file, -err);
 762		goto out;
 763	}
 764
 765	err = init_cow_file(fd, cow_file, backing_file, sectorsize, alignment,
 766			    bitmap_offset_out, bitmap_len_out,
 767			    data_offset_out);
 768	if (!err)
 769		return fd;
 770	os_close_file(fd);
 771 out:
 772	return err;
 773}
 774
 775static void ubd_close_dev(struct ubd *ubd_dev)
 776{
 777	os_close_file(ubd_dev->fd);
 778	if(ubd_dev->cow.file == NULL)
 779		return;
 780
 781	os_close_file(ubd_dev->cow.fd);
 782	vfree(ubd_dev->cow.bitmap);
 783	ubd_dev->cow.bitmap = NULL;
 784}
 785
 786static int ubd_open_dev(struct ubd *ubd_dev)
 787{
 788	struct openflags flags;
 789	char **back_ptr;
 790	int err, create_cow, *create_ptr;
 791	int fd;
 792
 793	ubd_dev->openflags = ubd_dev->boot_openflags;
 794	create_cow = 0;
 795	create_ptr = (ubd_dev->cow.file != NULL) ? &create_cow : NULL;
 796	back_ptr = ubd_dev->no_cow ? NULL : &ubd_dev->cow.file;
 797
 798	fd = open_ubd_file(ubd_dev->file, &ubd_dev->openflags, ubd_dev->shared,
 799				back_ptr, &ubd_dev->cow.bitmap_offset,
 800				&ubd_dev->cow.bitmap_len, &ubd_dev->cow.data_offset,
 801				create_ptr);
 802
 803	if((fd == -ENOENT) && create_cow){
 804		fd = create_cow_file(ubd_dev->file, ubd_dev->cow.file,
 805					  ubd_dev->openflags, SECTOR_SIZE, PAGE_SIZE,
 806					  &ubd_dev->cow.bitmap_offset,
 807					  &ubd_dev->cow.bitmap_len,
 808					  &ubd_dev->cow.data_offset);
 809		if(fd >= 0){
 810			printk(KERN_INFO "Creating \"%s\" as COW file for "
 811			       "\"%s\"\n", ubd_dev->file, ubd_dev->cow.file);
 812		}
 813	}
 814
 815	if(fd < 0){
 816		printk("Failed to open '%s', errno = %d\n", ubd_dev->file,
 817		       -fd);
 818		return fd;
 819	}
 820	ubd_dev->fd = fd;
 821
 822	if(ubd_dev->cow.file != NULL){
 823		blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long));
 824
 825		err = -ENOMEM;
 826		ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len);
 827		if(ubd_dev->cow.bitmap == NULL){
 828			printk(KERN_ERR "Failed to vmalloc COW bitmap\n");
 829			goto error;
 830		}
 831		flush_tlb_kernel_vm();
 832
 833		err = read_cow_bitmap(ubd_dev->fd, ubd_dev->cow.bitmap,
 834				      ubd_dev->cow.bitmap_offset,
 835				      ubd_dev->cow.bitmap_len);
 836		if(err < 0)
 837			goto error;
 838
 839		flags = ubd_dev->openflags;
 840		flags.w = 0;
 841		err = open_ubd_file(ubd_dev->cow.file, &flags, ubd_dev->shared, NULL,
 842				    NULL, NULL, NULL, NULL);
 843		if(err < 0) goto error;
 844		ubd_dev->cow.fd = err;
 845	}
 846	if (ubd_dev->no_trim == 0) {
 847		ubd_dev->queue->limits.discard_granularity = SECTOR_SIZE;
 848		ubd_dev->queue->limits.discard_alignment = SECTOR_SIZE;
 849		blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
 850		blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
 851		blk_queue_flag_set(QUEUE_FLAG_DISCARD, ubd_dev->queue);
 852	}
 853	blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue);
 854	return 0;
 855 error:
 856	os_close_file(ubd_dev->fd);
 857	return err;
 858}
 859
 860static void ubd_device_release(struct device *dev)
 861{
 862	struct ubd *ubd_dev = dev_get_drvdata(dev);
 863
 864	blk_cleanup_queue(ubd_dev->queue);
 865	blk_mq_free_tag_set(&ubd_dev->tag_set);
 866	*ubd_dev = ((struct ubd) DEFAULT_UBD);
 867}
 868
 869static int ubd_disk_register(int major, u64 size, int unit,
 870			     struct gendisk **disk_out)
 871{
 872	struct device *parent = NULL;
 873	struct gendisk *disk;
 874
 875	disk = alloc_disk(1 << UBD_SHIFT);
 876	if(disk == NULL)
 877		return -ENOMEM;
 878
 879	disk->major = major;
 880	disk->first_minor = unit << UBD_SHIFT;
 881	disk->fops = &ubd_blops;
 882	set_capacity(disk, size / 512);
 883	if (major == UBD_MAJOR)
 884		sprintf(disk->disk_name, "ubd%c", 'a' + unit);
 885	else
 886		sprintf(disk->disk_name, "ubd_fake%d", unit);
 887
 888	/* sysfs register (not for ide fake devices) */
 889	if (major == UBD_MAJOR) {
 890		ubd_devs[unit].pdev.id   = unit;
 891		ubd_devs[unit].pdev.name = DRIVER_NAME;
 892		ubd_devs[unit].pdev.dev.release = ubd_device_release;
 893		dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]);
 894		platform_device_register(&ubd_devs[unit].pdev);
 895		parent = &ubd_devs[unit].pdev.dev;
 896	}
 897
 898	disk->private_data = &ubd_devs[unit];
 899	disk->queue = ubd_devs[unit].queue;
 900	device_add_disk(parent, disk, NULL);
 901
 902	*disk_out = disk;
 903	return 0;
 904}
 905
 906#define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE))
 907
 908static const struct blk_mq_ops ubd_mq_ops = {
 909	.queue_rq = ubd_queue_rq,
 910};
 911
 912static int ubd_add(int n, char **error_out)
 913{
 914	struct ubd *ubd_dev = &ubd_devs[n];
 915	int err = 0;
 916
 917	if(ubd_dev->file == NULL)
 918		goto out;
 919
 920	err = ubd_file_size(ubd_dev, &ubd_dev->size);
 921	if(err < 0){
 922		*error_out = "Couldn't determine size of device's file";
 923		goto out;
 924	}
 925
 926	ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
 927
 928	ubd_dev->tag_set.ops = &ubd_mq_ops;
 929	ubd_dev->tag_set.queue_depth = 64;
 930	ubd_dev->tag_set.numa_node = NUMA_NO_NODE;
 931	ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 932	ubd_dev->tag_set.driver_data = ubd_dev;
 933	ubd_dev->tag_set.nr_hw_queues = 1;
 934
 935	err = blk_mq_alloc_tag_set(&ubd_dev->tag_set);
 936	if (err)
 937		goto out;
 938
 939	ubd_dev->queue = blk_mq_init_queue(&ubd_dev->tag_set);
 940	if (IS_ERR(ubd_dev->queue)) {
 941		err = PTR_ERR(ubd_dev->queue);
 942		goto out_cleanup_tags;
 943	}
 944
 945	ubd_dev->queue->queuedata = ubd_dev;
 946	blk_queue_write_cache(ubd_dev->queue, true, false);
 947
 948	blk_queue_max_segments(ubd_dev->queue, MAX_SG);
 949	err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
 950	if(err){
 951		*error_out = "Failed to register device";
 952		goto out_cleanup_tags;
 953	}
 954
 955	if (fake_major != UBD_MAJOR)
 956		ubd_disk_register(fake_major, ubd_dev->size, n,
 957				  &fake_gendisk[n]);
 958
 959	/*
 960	 * Perhaps this should also be under the "if (fake_major)" above
 961	 * using the fake_disk->disk_name
 962	 */
 963	if (fake_ide)
 964		make_ide_entries(ubd_gendisk[n]->disk_name);
 965
 966	err = 0;
 967out:
 968	return err;
 969
 970out_cleanup_tags:
 971	blk_mq_free_tag_set(&ubd_dev->tag_set);
 972	if (!(IS_ERR(ubd_dev->queue)))
 973		blk_cleanup_queue(ubd_dev->queue);
 974	goto out;
 975}
 976
 977static int ubd_config(char *str, char **error_out)
 978{
 979	int n, ret;
 980
 981	/* This string is possibly broken up and stored, so it's only
 982	 * freed if ubd_setup_common fails, or if only general options
 983	 * were set.
 984	 */
 985	str = kstrdup(str, GFP_KERNEL);
 986	if (str == NULL) {
 987		*error_out = "Failed to allocate memory";
 988		return -ENOMEM;
 989	}
 990
 991	ret = ubd_setup_common(str, &n, error_out);
 992	if (ret)
 993		goto err_free;
 994
 995	if (n == -1) {
 996		ret = 0;
 997		goto err_free;
 998	}
 999
1000	mutex_lock(&ubd_lock);
1001	ret = ubd_add(n, error_out);
1002	if (ret)
1003		ubd_devs[n].file = NULL;
1004	mutex_unlock(&ubd_lock);
1005
1006out:
1007	return ret;
1008
1009err_free:
1010	kfree(str);
1011	goto out;
1012}
1013
1014static int ubd_get_config(char *name, char *str, int size, char **error_out)
1015{
1016	struct ubd *ubd_dev;
1017	int n, len = 0;
1018
1019	n = parse_unit(&name);
1020	if((n >= MAX_DEV) || (n < 0)){
1021		*error_out = "ubd_get_config : device number out of range";
1022		return -1;
1023	}
1024
1025	ubd_dev = &ubd_devs[n];
1026	mutex_lock(&ubd_lock);
1027
1028	if(ubd_dev->file == NULL){
1029		CONFIG_CHUNK(str, size, len, "", 1);
1030		goto out;
1031	}
1032
1033	CONFIG_CHUNK(str, size, len, ubd_dev->file, 0);
1034
1035	if(ubd_dev->cow.file != NULL){
1036		CONFIG_CHUNK(str, size, len, ",", 0);
1037		CONFIG_CHUNK(str, size, len, ubd_dev->cow.file, 1);
1038	}
1039	else CONFIG_CHUNK(str, size, len, "", 1);
1040
1041 out:
1042	mutex_unlock(&ubd_lock);
1043	return len;
1044}
1045
1046static int ubd_id(char **str, int *start_out, int *end_out)
1047{
1048	int n;
1049
1050	n = parse_unit(str);
1051	*start_out = 0;
1052	*end_out = MAX_DEV - 1;
1053	return n;
1054}
1055
1056static int ubd_remove(int n, char **error_out)
1057{
1058	struct gendisk *disk = ubd_gendisk[n];
1059	struct ubd *ubd_dev;
1060	int err = -ENODEV;
1061
1062	mutex_lock(&ubd_lock);
1063
1064	ubd_dev = &ubd_devs[n];
1065
1066	if(ubd_dev->file == NULL)
1067		goto out;
1068
1069	/* you cannot remove a open disk */
1070	err = -EBUSY;
1071	if(ubd_dev->count > 0)
1072		goto out;
1073
1074	ubd_gendisk[n] = NULL;
1075	if(disk != NULL){
1076		del_gendisk(disk);
1077		put_disk(disk);
1078	}
1079
1080	if(fake_gendisk[n] != NULL){
1081		del_gendisk(fake_gendisk[n]);
1082		put_disk(fake_gendisk[n]);
1083		fake_gendisk[n] = NULL;
1084	}
1085
1086	err = 0;
1087	platform_device_unregister(&ubd_dev->pdev);
1088out:
1089	mutex_unlock(&ubd_lock);
1090	return err;
1091}
1092
1093/* All these are called by mconsole in process context and without
1094 * ubd-specific locks.  The structure itself is const except for .list.
1095 */
1096static struct mc_device ubd_mc = {
1097	.list		= LIST_HEAD_INIT(ubd_mc.list),
1098	.name		= "ubd",
1099	.config		= ubd_config,
1100	.get_config	= ubd_get_config,
1101	.id		= ubd_id,
1102	.remove		= ubd_remove,
1103};
1104
1105static int __init ubd_mc_init(void)
1106{
1107	mconsole_register_dev(&ubd_mc);
1108	return 0;
1109}
1110
1111__initcall(ubd_mc_init);
1112
1113static int __init ubd0_init(void)
1114{
1115	struct ubd *ubd_dev = &ubd_devs[0];
1116
1117	mutex_lock(&ubd_lock);
1118	if(ubd_dev->file == NULL)
1119		ubd_dev->file = "root_fs";
1120	mutex_unlock(&ubd_lock);
1121
1122	return 0;
1123}
1124
1125__initcall(ubd0_init);
1126
1127/* Used in ubd_init, which is an initcall */
1128static struct platform_driver ubd_driver = {
1129	.driver = {
1130		.name  = DRIVER_NAME,
1131	},
1132};
1133
1134static int __init ubd_init(void)
1135{
1136	char *error;
1137	int i, err;
1138
1139	if (register_blkdev(UBD_MAJOR, "ubd"))
1140		return -1;
1141
1142	if (fake_major != UBD_MAJOR) {
1143		char name[sizeof("ubd_nnn\0")];
1144
1145		snprintf(name, sizeof(name), "ubd_%d", fake_major);
1146		if (register_blkdev(fake_major, "ubd"))
1147			return -1;
1148	}
1149
1150	irq_req_buffer = kmalloc_array(UBD_REQ_BUFFER_SIZE,
1151				       sizeof(struct io_thread_req *),
1152				       GFP_KERNEL
1153		);
1154	irq_remainder = 0;
1155
1156	if (irq_req_buffer == NULL) {
1157		printk(KERN_ERR "Failed to initialize ubd buffering\n");
1158		return -1;
1159	}
1160	io_req_buffer = kmalloc_array(UBD_REQ_BUFFER_SIZE,
1161				      sizeof(struct io_thread_req *),
1162				      GFP_KERNEL
1163		);
1164
1165	io_remainder = 0;
1166
1167	if (io_req_buffer == NULL) {
1168		printk(KERN_ERR "Failed to initialize ubd buffering\n");
1169		return -1;
1170	}
1171	platform_driver_register(&ubd_driver);
1172	mutex_lock(&ubd_lock);
1173	for (i = 0; i < MAX_DEV; i++){
1174		err = ubd_add(i, &error);
1175		if(err)
1176			printk(KERN_ERR "Failed to initialize ubd device %d :"
1177			       "%s\n", i, error);
1178	}
1179	mutex_unlock(&ubd_lock);
1180	return 0;
1181}
1182
1183late_initcall(ubd_init);
1184
1185static int __init ubd_driver_init(void){
1186	unsigned long stack;
1187	int err;
1188
1189	/* Set by CONFIG_BLK_DEV_UBD_SYNC or ubd=sync.*/
1190	if(global_openflags.s){
1191		printk(KERN_INFO "ubd: Synchronous mode\n");
1192		/* Letting ubd=sync be like using ubd#s= instead of ubd#= is
1193		 * enough. So use anyway the io thread. */
1194	}
1195	stack = alloc_stack(0, 0);
1196	io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *),
1197				 &thread_fd);
1198	if(io_pid < 0){
1199		printk(KERN_ERR
1200		       "ubd : Failed to start I/O thread (errno = %d) - "
1201		       "falling back to synchronous I/O\n", -io_pid);
1202		io_pid = -1;
1203		return 0;
1204	}
1205	err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr,
1206			     0, "ubd", ubd_devs);
1207	if(err != 0)
1208		printk(KERN_ERR "um_request_irq failed - errno = %d\n", -err);
1209	return 0;
1210}
1211
1212device_initcall(ubd_driver_init);
1213
1214static int ubd_open(struct block_device *bdev, fmode_t mode)
1215{
1216	struct gendisk *disk = bdev->bd_disk;
1217	struct ubd *ubd_dev = disk->private_data;
1218	int err = 0;
1219
1220	mutex_lock(&ubd_mutex);
1221	if(ubd_dev->count == 0){
1222		err = ubd_open_dev(ubd_dev);
1223		if(err){
1224			printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n",
1225			       disk->disk_name, ubd_dev->file, -err);
1226			goto out;
1227		}
1228	}
1229	ubd_dev->count++;
1230	set_disk_ro(disk, !ubd_dev->openflags.w);
1231
1232	/* This should no more be needed. And it didn't work anyway to exclude
1233	 * read-write remounting of filesystems.*/
1234	/*if((mode & FMODE_WRITE) && !ubd_dev->openflags.w){
1235	        if(--ubd_dev->count == 0) ubd_close_dev(ubd_dev);
1236	        err = -EROFS;
1237	}*/
1238out:
1239	mutex_unlock(&ubd_mutex);
1240	return err;
1241}
1242
1243static void ubd_release(struct gendisk *disk, fmode_t mode)
1244{
1245	struct ubd *ubd_dev = disk->private_data;
1246
1247	mutex_lock(&ubd_mutex);
1248	if(--ubd_dev->count == 0)
1249		ubd_close_dev(ubd_dev);
1250	mutex_unlock(&ubd_mutex);
1251}
1252
1253static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
1254			  __u64 *cow_offset, unsigned long *bitmap,
1255			  __u64 bitmap_offset, unsigned long *bitmap_words,
1256			  __u64 bitmap_len)
1257{
1258	__u64 sector = io_offset >> SECTOR_SHIFT;
1259	int i, update_bitmap = 0;
1260
1261	for (i = 0; i < length >> SECTOR_SHIFT; i++) {
1262		if(cow_mask != NULL)
1263			ubd_set_bit(i, (unsigned char *) cow_mask);
1264		if(ubd_test_bit(sector + i, (unsigned char *) bitmap))
1265			continue;
1266
1267		update_bitmap = 1;
1268		ubd_set_bit(sector + i, (unsigned char *) bitmap);
1269	}
1270
1271	if(!update_bitmap)
1272		return;
1273
1274	*cow_offset = sector / (sizeof(unsigned long) * 8);
1275
1276	/* This takes care of the case where we're exactly at the end of the
1277	 * device, and *cow_offset + 1 is off the end.  So, just back it up
1278	 * by one word.  Thanks to Lynn Kerby for the fix and James McMechan
1279	 * for the original diagnosis.
1280	 */
1281	if (*cow_offset == (DIV_ROUND_UP(bitmap_len,
1282					 sizeof(unsigned long)) - 1))
1283		(*cow_offset)--;
1284
1285	bitmap_words[0] = bitmap[*cow_offset];
1286	bitmap_words[1] = bitmap[*cow_offset + 1];
1287
1288	*cow_offset *= sizeof(unsigned long);
1289	*cow_offset += bitmap_offset;
1290}
1291
1292static void cowify_req(struct io_thread_req *req, unsigned long *bitmap,
1293		       __u64 bitmap_offset, __u64 bitmap_len)
1294{
1295	__u64 sector = req->offset >> SECTOR_SHIFT;
1296	int i;
1297
1298	if (req->length > (sizeof(req->sector_mask) * 8) << SECTOR_SHIFT)
1299		panic("Operation too long");
1300
1301	if (req_op(req->req) == REQ_OP_READ) {
1302		for (i = 0; i < req->length >> SECTOR_SHIFT; i++) {
1303			if(ubd_test_bit(sector + i, (unsigned char *) bitmap))
1304				ubd_set_bit(i, (unsigned char *)
1305					    &req->sector_mask);
1306		}
1307	}
1308	else cowify_bitmap(req->offset, req->length, &req->sector_mask,
1309			   &req->cow_offset, bitmap, bitmap_offset,
1310			   req->bitmap_words, bitmap_len);
1311}
1312
1313static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
1314		u64 off, struct bio_vec *bvec)
1315{
1316	struct ubd *dev = hctx->queue->queuedata;
1317	struct io_thread_req *io_req;
1318	int ret;
1319
1320	io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
1321	if (!io_req)
1322		return -ENOMEM;
1323
1324	io_req->req = req;
1325	if (dev->cow.file)
1326		io_req->fds[0] = dev->cow.fd;
1327	else
1328		io_req->fds[0] = dev->fd;
1329	io_req->error = 0;
1330
1331	if (bvec != NULL) {
1332		io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset;
1333		io_req->length = bvec->bv_len;
1334	} else {
1335		io_req->buffer = NULL;
1336		io_req->length = blk_rq_bytes(req);
1337	}
1338
1339	io_req->sectorsize = SECTOR_SIZE;
1340	io_req->fds[1] = dev->fd;
1341	io_req->cow_offset = -1;
1342	io_req->offset = off;
1343	io_req->sector_mask = 0;
1344	io_req->offsets[0] = 0;
1345	io_req->offsets[1] = dev->cow.data_offset;
1346
1347	if (dev->cow.file)
1348		cowify_req(io_req, dev->cow.bitmap,
1349			   dev->cow.bitmap_offset, dev->cow.bitmap_len);
1350
1351	ret = os_write_file(thread_fd, &io_req, sizeof(io_req));
1352	if (ret != sizeof(io_req)) {
1353		if (ret != -EAGAIN)
1354			pr_err("write to io thread failed: %d\n", -ret);
1355		kfree(io_req);
1356	}
1357	return ret;
1358}
1359
1360static int queue_rw_req(struct blk_mq_hw_ctx *hctx, struct request *req)
1361{
1362	struct req_iterator iter;
1363	struct bio_vec bvec;
1364	int ret;
1365	u64 off = (u64)blk_rq_pos(req) << SECTOR_SHIFT;
1366
1367	rq_for_each_segment(bvec, req, iter) {
1368		ret = ubd_queue_one_vec(hctx, req, off, &bvec);
1369		if (ret < 0)
1370			return ret;
1371		off += bvec.bv_len;
1372	}
1373	return 0;
1374}
1375
1376static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
1377				 const struct blk_mq_queue_data *bd)
1378{
1379	struct ubd *ubd_dev = hctx->queue->queuedata;
1380	struct request *req = bd->rq;
1381	int ret = 0, res = BLK_STS_OK;
1382
1383	blk_mq_start_request(req);
1384
1385	spin_lock_irq(&ubd_dev->lock);
1386
1387	switch (req_op(req)) {
1388	/* operations with no lentgth/offset arguments */
1389	case REQ_OP_FLUSH:
1390		ret = ubd_queue_one_vec(hctx, req, 0, NULL);
1391		break;
1392	case REQ_OP_READ:
1393	case REQ_OP_WRITE:
1394		ret = queue_rw_req(hctx, req);
1395		break;
1396	case REQ_OP_DISCARD:
1397	case REQ_OP_WRITE_ZEROES:
1398		ret = ubd_queue_one_vec(hctx, req, (u64)blk_rq_pos(req) << 9, NULL);
1399		break;
1400	default:
1401		WARN_ON_ONCE(1);
1402		res = BLK_STS_NOTSUPP;
1403	}
1404
1405	spin_unlock_irq(&ubd_dev->lock);
1406
1407	if (ret < 0) {
1408		if (ret == -ENOMEM)
1409			res = BLK_STS_RESOURCE;
1410		else
1411			res = BLK_STS_DEV_RESOURCE;
1412	}
1413
1414	return res;
1415}
1416
1417static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1418{
1419	struct ubd *ubd_dev = bdev->bd_disk->private_data;
1420
1421	geo->heads = 128;
1422	geo->sectors = 32;
1423	geo->cylinders = ubd_dev->size / (128 * 32 * 512);
1424	return 0;
1425}
1426
1427static int ubd_ioctl(struct block_device *bdev, fmode_t mode,
1428		     unsigned int cmd, unsigned long arg)
1429{
1430	struct ubd *ubd_dev = bdev->bd_disk->private_data;
1431	u16 ubd_id[ATA_ID_WORDS];
1432
1433	switch (cmd) {
1434		struct cdrom_volctrl volume;
1435	case HDIO_GET_IDENTITY:
1436		memset(&ubd_id, 0, ATA_ID_WORDS * 2);
1437		ubd_id[ATA_ID_CYLS]	= ubd_dev->size / (128 * 32 * 512);
1438		ubd_id[ATA_ID_HEADS]	= 128;
1439		ubd_id[ATA_ID_SECTORS]	= 32;
1440		if(copy_to_user((char __user *) arg, (char *) &ubd_id,
1441				 sizeof(ubd_id)))
1442			return -EFAULT;
1443		return 0;
1444
1445	case CDROMVOLREAD:
1446		if(copy_from_user(&volume, (char __user *) arg, sizeof(volume)))
1447			return -EFAULT;
1448		volume.channel0 = 255;
1449		volume.channel1 = 255;
1450		volume.channel2 = 255;
1451		volume.channel3 = 255;
1452		if(copy_to_user((char __user *) arg, &volume, sizeof(volume)))
1453			return -EFAULT;
1454		return 0;
1455	}
1456	return -EINVAL;
1457}
1458
1459static int map_error(int error_code)
1460{
1461	switch (error_code) {
1462	case 0:
1463		return BLK_STS_OK;
1464	case ENOSYS:
1465	case EOPNOTSUPP:
1466		return BLK_STS_NOTSUPP;
1467	case ENOSPC:
1468		return BLK_STS_NOSPC;
1469	}
1470	return BLK_STS_IOERR;
1471}
1472
1473/*
1474 * Everything from here onwards *IS NOT PART OF THE KERNEL*
1475 *
1476 * The following functions are part of UML hypervisor code.
1477 * All functions from here onwards are executed as a helper
1478 * thread and are not allowed to execute any kernel functions.
1479 *
1480 * Any communication must occur strictly via shared memory and IPC.
1481 *
1482 * Do not add printks, locks, kernel memory operations, etc - it
1483 * will result in unpredictable behaviour and/or crashes.
1484 */
1485
1486static int update_bitmap(struct io_thread_req *req)
1487{
1488	int n;
1489
1490	if(req->cow_offset == -1)
1491		return map_error(0);
1492
1493	n = os_pwrite_file(req->fds[1], &req->bitmap_words,
1494			  sizeof(req->bitmap_words), req->cow_offset);
1495	if (n != sizeof(req->bitmap_words))
1496		return map_error(-n);
1497
1498	return map_error(0);
1499}
1500
1501static void do_io(struct io_thread_req *req)
1502{
1503	char *buf = NULL;
1504	unsigned long len;
1505	int n, nsectors, start, end, bit;
1506	__u64 off;
1507
1508	/* FLUSH is really a special case, we cannot "case" it with others */
1509
1510	if (req_op(req->req) == REQ_OP_FLUSH) {
1511		/* fds[0] is always either the rw image or our cow file */
1512		req->error = map_error(-os_sync_file(req->fds[0]));
1513		return;
1514	}
1515
1516	nsectors = req->length / req->sectorsize;
1517	start = 0;
1518	do {
1519		bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask);
1520		end = start;
1521		while((end < nsectors) &&
1522		      (ubd_test_bit(end, (unsigned char *)
1523				    &req->sector_mask) == bit))
1524			end++;
1525
1526		off = req->offset + req->offsets[bit] +
1527			start * req->sectorsize;
1528		len = (end - start) * req->sectorsize;
1529		if (req->buffer != NULL)
1530			buf = &req->buffer[start * req->sectorsize];
1531
1532		switch (req_op(req->req)) {
1533		case REQ_OP_READ:
1534			n = 0;
1535			do {
1536				buf = &buf[n];
1537				len -= n;
1538				n = os_pread_file(req->fds[bit], buf, len, off);
1539				if (n < 0) {
1540					req->error = map_error(-n);
1541					return;
1542				}
1543			} while((n < len) && (n != 0));
1544			if (n < len) memset(&buf[n], 0, len - n);
1545			break;
1546		case REQ_OP_WRITE:
1547			n = os_pwrite_file(req->fds[bit], buf, len, off);
1548			if(n != len){
1549				req->error = map_error(-n);
1550				return;
1551			}
1552			break;
1553		case REQ_OP_DISCARD:
1554		case REQ_OP_WRITE_ZEROES:
1555			n = os_falloc_punch(req->fds[bit], off, len);
1556			if (n) {
1557				req->error = map_error(-n);
1558				return;
1559			}
1560			break;
1561		default:
1562			WARN_ON_ONCE(1);
1563			req->error = BLK_STS_NOTSUPP;
1564			return;
1565		}
1566
1567		start = end;
1568	} while(start < nsectors);
1569
1570	req->error = update_bitmap(req);
1571}
1572
/* Changed in start_io_thread, which is serialized by being called only
 * from ubd_driver_init, which is an initcall.
 */
int kernel_fd = -1;

/* Only changed by the io thread, which increments it once per request.
 * XXX: currently unused otherwise.
 */
static int io_count;
1580
1581int io_thread(void *arg)
1582{
1583	int n, count, written, res;
1584
1585	os_fix_helper_signals();
1586
1587	while(1){
1588		n = bulk_req_safe_read(
1589			kernel_fd,
1590			io_req_buffer,
1591			&io_remainder,
1592			&io_remainder_size,
1593			UBD_REQ_BUFFER_SIZE
1594		);
1595		if (n <= 0) {
1596			if (n == -EAGAIN)
1597				ubd_read_poll(-1);
1598
1599			continue;
1600		}
1601
1602		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
1603			io_count++;
1604			do_io((*io_req_buffer)[count]);
1605		}
1606
1607		written = 0;
1608
1609		do {
1610			res = os_write_file(kernel_fd,
1611					    ((char *) io_req_buffer) + written,
1612					    n - written);
1613			if (res >= 0) {
1614				written += res;
1615			}
1616			if (written < n) {
1617				ubd_write_poll(-1);
1618			}
1619		} while (written < n);
1620	}
1621
1622	return 0;
1623}