[PATCH] md: allow md intent bitmap to be stored near the superblock.

This provides an alternative to storing the bitmap in a separate file.  The
bitmap can be stored at a given offset from the superblock.  Obviously the
creator of the array must make sure this doesn't intersect with data.
Placing the bitmap immediately after the superblock works well for
version-0.90 superblocks.

Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 204564d..030d686 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -116,7 +116,7 @@
 	if (!page)
 		printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
 	else
-		printk("%s: bitmap_alloc_page: allocated page at %p\n",
+		PRINTK("%s: bitmap_alloc_page: allocated page at %p\n",
 			bmname(bitmap), page);
 	return page;
 }
@@ -258,13 +258,61 @@
  * basic page I/O operations
  */
 
+/* IO operations when bitmap is stored near all superblocks */
+static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index)
+{
+	/* Read page 'index' of the bitmap kept 'offset' sectors from each
+	 * superblock.  Every in-sync, non-faulty rdev is tried at most once;
+	 * if none can be read the page is freed and ERR_PTR(-EIO) returned.
+	 */
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+	struct page *page = alloc_page(GFP_KERNEL);
+	sector_t target;
+
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		if (!rdev->in_sync || rdev->faulty)
+			continue;
+		target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
+		if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
+			page->index = index;
+			return page;
+		}
+	}
+	__free_page(page);
+	return ERR_PTR(-EIO);
+}
+
+static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
+{
+	/* Hand 'page' to md_super_write() for every in-sync, non-faulty rdev;
+	 * when 'wait' is set, wait until mddev->pending_writes drops to 0 */
+	mdk_rdev_t *rdev;
+	struct list_head *tmp;
+
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		if (!rdev->in_sync || rdev->faulty)
+			continue;
+		md_super_write(mddev, rdev, (rdev->sb_offset<<1) + offset
+			       + page->index * (PAGE_SIZE/512), PAGE_SIZE, page);
+	}
+	if (wait)
+		wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	return 0;
+}
+
 /*
- * write out a page
+ * write out a page: to the bitmap file, or near each superblock if no file
  */
 static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	int ret = -ENOMEM;
 
+	if (bitmap->file == NULL)
+		return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
+
 	lock_page(page);
 
 	ret = page->mapping->a_ops->prepare_write(NULL, page, 0, PAGE_SIZE);
@@ -394,7 +442,12 @@
 	int err = -EINVAL;
 
 	/* page 0 is the superblock, read it... */
-	bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
+	if (bitmap->file)
+		bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
+	else {
+		bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0);
+		bytes_read = PAGE_SIZE;
+	}
 	if (IS_ERR(bitmap->sb_page)) {
 		err = PTR_ERR(bitmap->sb_page);
 		bitmap->sb_page = NULL;
@@ -625,14 +678,16 @@
 	bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET);
 	bitmap_update_sb(bitmap);
 
-	path = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (path)
-		ptr = file_path(bitmap->file, path, PAGE_SIZE);
+	if (bitmap->file) {
+		path = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (path)
+			ptr = file_path(bitmap->file, path, PAGE_SIZE);
 
-	printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n",
-		bmname(bitmap), ptr ? ptr : "");
+		printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n",
+		       bmname(bitmap), ptr ? ptr : "");
 
-	kfree(path);
+		kfree(path);
+	}
 
 	bitmap_file_put(bitmap);
 
@@ -676,7 +731,7 @@
 	void *kaddr;
 	unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
 
-	if (!bitmap->file || !bitmap->filemap) {
+	if (!bitmap->filemap) {
 		return;
 	}
 
@@ -715,7 +770,7 @@
 	 * flushed out to disk */
 	for (i = 0; i < bitmap->file_pages; i++) {
 		spin_lock_irqsave(&bitmap->lock, flags);
-		if (!bitmap->file || !bitmap->filemap) {
+		if (!bitmap->filemap) {
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 			return 0;
 		}
@@ -732,11 +787,15 @@
 				return 1;
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
-		spin_lock_irq(&bitmap->write_lock);
-		wait_event_lock_irq(bitmap->write_wait,
-			list_empty(&bitmap->complete_pages), bitmap->write_lock,
-			wake_up_process(bitmap->writeback_daemon->tsk));
-		spin_unlock_irq(&bitmap->write_lock);
+		if (bitmap->file) {
+			spin_lock_irq(&bitmap->write_lock);
+			wait_event_lock_irq(bitmap->write_wait,
+					    list_empty(&bitmap->complete_pages), bitmap->write_lock,
+					    wake_up_process(bitmap->writeback_daemon->tsk));
+			spin_unlock_irq(&bitmap->write_lock);
+		} else
+			wait_event(bitmap->mddev->sb_wait,
+				   atomic_read(&bitmap->mddev->pending_writes)==0);
 	}
 	return 0;
 }
@@ -764,7 +823,7 @@
 	chunks = bitmap->chunks;
 	file = bitmap->file;
 
-	BUG_ON(!file);
+	BUG_ON(!file && !bitmap->offset);
 
 #if INJECT_FAULTS_3
 	outofdate = 1;
@@ -779,7 +838,7 @@
 
 	num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE;
 
-	if (i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) {
+	if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) {
 		printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
 			bmname(bitmap),
 			(unsigned long) i_size_read(file->f_mapping->host),
@@ -816,14 +875,18 @@
 				 */
 				page = bitmap->sb_page;
 				offset = sizeof(bitmap_super_t);
-			} else {
+			} else if (file) {
 				page = read_page(file, index, &dummy);
-				if (IS_ERR(page)) { /* read error */
-					ret = PTR_ERR(page);
-					goto out;
-				}
+				offset = 0;
+			} else {
+				page = read_sb_page(bitmap->mddev, bitmap->offset, index);
 				offset = 0;
 			}
+			if (IS_ERR(page)) { /* read error */
+				ret = PTR_ERR(page);
+				goto out;
+			}
+
 			oldindex = index;
 			oldpage = page;
 			kmap(page);
@@ -874,6 +937,19 @@
 	return ret;
 }
 
+void bitmap_write_all(struct bitmap *bitmap)
+{
+	/* No I/O is issued here: each page covering the bitmap superblock
+	 * and the per-chunk bits is merely flagged BITMAP_PAGE_NEEDWRITE.
+	 */
+	unsigned long nbits = bitmap->chunks;
+	unsigned long nbytes = (nbits+7)/8 + sizeof(bitmap_super_t);
+	unsigned long pg = (nbytes + PAGE_SIZE-1) / PAGE_SIZE;
+
+	for (; pg > 0; pg--)
+		bitmap->filemap_attr[pg - 1] |= BITMAP_PAGE_NEEDWRITE;
+}
+
 
 static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
 {
@@ -913,7 +989,7 @@
 	for (j = 0; j < bitmap->chunks; j++) {
 		bitmap_counter_t *bmc;
 		spin_lock_irqsave(&bitmap->lock, flags);
-		if (!bitmap->file || !bitmap->filemap) {
+		if (!bitmap->filemap) {
 			/* error or shutdown */
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 			break;
@@ -1072,6 +1148,7 @@
 
 	spin_lock_irqsave(&bitmap->lock, flags);
 	*ptr = NULL;
+
 	if (!bitmap->file) /* no need for daemon if there's no backing file */
 		goto out_unlock;
 
@@ -1416,9 +1493,11 @@
 
 	BUG_ON(sizeof(bitmap_super_t) != 256);
 
-	if (!file) /* bitmap disabled, nothing to do */
+	if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */
 		return 0;
 
+	BUG_ON(file && mddev->bitmap_offset);
+
 	bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
 		return -ENOMEM;
@@ -1438,7 +1517,8 @@
 		return -ENOMEM;
 
 	bitmap->file = file;
-	get_file(file);
+	bitmap->offset = mddev->bitmap_offset;
+	if (file) get_file(file);
 	/* read superblock from bitmap file (this sets bitmap->chunksize) */
 	err = bitmap_read_sb(bitmap);
 	if (err)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7075beb..fde8acf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -337,7 +337,7 @@
 	return 0;
 }
 
-static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 		   struct page *page, int rw)
 {
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
@@ -609,6 +609,17 @@
 		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
 
 		mddev->max_disks = MD_SB_DISKS;
+
+		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
+		    mddev->bitmap_file == NULL) {
+			if (mddev->level != 1) {
+				/* FIXME use a better test */
+				printk(KERN_WARNING "md: bitmaps only support for raid1\n");
+				return -EINVAL;
+			}
+			mddev->bitmap_offset = (MD_SB_BYTES >> 9);
+		}
+
 	} else if (mddev->pers == NULL) {
 		/* Insist on good event counter while assembling */
 		__u64 ev1 = md_event(sb);
@@ -702,6 +713,9 @@
 	sb->layout = mddev->layout;
 	sb->chunk_size = mddev->chunk_size;
 
+	if (mddev->bitmap && mddev->bitmap_file == NULL)
+		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
+
 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
 	ITERATE_RDEV(mddev,rdev2,tmp) {
 		mdp_disk_t *d;
@@ -898,6 +912,15 @@
 		memcpy(mddev->uuid, sb->set_uuid, 16);
 
 		mddev->max_disks =  (4096-256)/2;
+
+		if ((le32_to_cpu(sb->feature_map) & 1) &&
+		    mddev->bitmap_file == NULL ) {
+			if (mddev->level != 1) {
+				printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
+				return -EINVAL;
+			}
+			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
+		}
 	} else if (mddev->pers == NULL) {
 		/* Insist of good event counter while assembling */
 		__u64 ev1 = le64_to_cpu(sb->events);
@@ -960,6 +983,11 @@
 	else
 		sb->resync_offset = cpu_to_le64(0);
 
+	if (mddev->bitmap && mddev->bitmap_file == NULL) {
+		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
+		sb->feature_map = cpu_to_le32(1);
+	}
+
 	max_dev = 0;
 	ITERATE_RDEV(mddev,rdev2,tmp)
 		if (rdev2->desc_nr+1 > max_dev)
@@ -2406,7 +2434,8 @@
 			mdname(mddev));
 		fput(mddev->bitmap_file);
 		mddev->bitmap_file = NULL;
-	}
+	} else
+		mddev->bitmap_offset = 0; /* file overrides offset */
 	return err;
 }
 
@@ -3774,6 +3803,13 @@
 			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 			if (!spares)
 				set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
+				/* We are adding a device or devices to an array
+				 * which has the bitmap stored on all devices.
+				 * So make sure all bitmap pages get written
+				 */
+				bitmap_write_all(mddev->bitmap);
+			}
 			mddev->sync_thread = md_register_thread(md_do_sync,
 								mddev,
 								"%s_resync");
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index cfe60cf..e24b74b 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -217,6 +217,7 @@
 	/* bitmap spinlock */
 	spinlock_t lock;
 
+	long offset; /* offset from superblock if file is NULL */
 	struct file *file; /* backing disk file */
 	struct page *sb_page; /* cached copy of the bitmap file superblock */
 	struct page **filemap; /* list of cache pages for the file */
@@ -255,6 +256,7 @@
 int bitmap_update_sb(struct bitmap *bitmap);
 
 int  bitmap_setallbits(struct bitmap *bitmap);
+void bitmap_write_all(struct bitmap *bitmap);
 
 /* these are exported */
 int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors);
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 75f41d8..ffa316c 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -60,7 +60,14 @@
  */
 #define MD_MAJOR_VERSION                0
 #define MD_MINOR_VERSION                90
-#define MD_PATCHLEVEL_VERSION           1
+/*
+ * MD_PATCHLEVEL_VERSION indicates kernel functionality.
+ * >=1 means different superblock formats are selectable using SET_ARRAY_INFO
+ *     and major_version/minor_version accordingly
+ * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT
+ *     in the super status byte
+ */
+#define MD_PATCHLEVEL_VERSION           2
 
 extern int register_md_personality (int p_num, mdk_personality_t *p);
 extern int unregister_md_personality (int p_num);
@@ -78,6 +85,12 @@
 
 extern void md_print_devices (void);
 
+extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+			   sector_t sector, int size, struct page *page);
+extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+			struct page *page, int rw);
+
+
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
 #endif 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 3e97702..a3725b5 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -273,6 +273,10 @@
 
 	struct bitmap                   *bitmap; /* the bitmap for the device */
 	struct file			*bitmap_file; /* the bitmap file */
+	long				bitmap_offset; /* offset from superblock of
+							* start of bitmap. May be
+							* negative, but not '0'
+							*/
 
 	struct list_head		all_mddevs;
 };
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 8ba95d6..8e592a2 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -96,6 +96,7 @@
 #define MD_SB_CLEAN		0
 #define MD_SB_ERRORS		1
 
+#define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */
 typedef struct mdp_superblock_s {
 	/*
 	 * Constant generic information
@@ -184,7 +185,7 @@
 	/* constant array information - 128 bytes */
 	__u32	magic;		/* MD_SB_MAGIC: 0xa92b4efc - little endian */
 	__u32	major_version;	/* 1 */
-	__u32	feature_map;	/* 0 for now */
+	__u32	feature_map;	/* bit 0 set if 'bitmap_offset' is meaningful */
 	__u32	pad0;		/* always set to 0 when writing */
 
 	__u8	set_uuid[16];	/* user-space generated. */
@@ -197,6 +198,10 @@
 
 	__u32	chunksize;	/* in 512byte sectors */
 	__u32	raid_disks;
+	__u32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
+				 * NOTE: signed, so bitmap can be before superblock
+				 * only meaningful if feature_map[0] is set.
+				 */
 	__u8	pad1[128-96];	/* set to 0 when written */
 
 	/* constant this-device information - 64 bytes */