/*
 * btnode.c - NILFS BT-Node implementation
 *
 * Copyright (C) 2005-2007 Nippon Telegraph and Telephone Corporation.
 *
 * This file is part of NILFS.
 *
 * NILFS is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * NILFS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with NILFS; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * btnode.c,v 1.139 2007-07-17 04:47:28 kihara Exp
 *
 * Written by Seiji Kihara <kihara@osrg.net>
 */

#include <linux/types.h>
#include <linux/list.h>
#include "nilfs.h"
#include "mdt.h"
#include "page.h"
#include "btnode.h"

/*
 * Number of page pointers fetched per radix-tree gang lookup in this file;
 * also the size of the on-stack page arrays used by those lookups.
 */
#define NILFS_BTNODE_GANG_SIZE	16

/*
 * applications: node pages for
 * - normal files and "ifile"s
 * - the checkpoint file
 * - the dat file (ppn based)
 */

/*
 * just utility
 */

/*
 * B2P - convert a block number into the index of the page containing it.
 * The block number is shifted right by the difference between the page
 * shift and the inode's block-size shift.
 */
static inline unsigned long
B2P(nilfs_sector_t blocknum, struct inode *inode)
{
	const unsigned int blocks_per_page_bits =
		PAGE_CACHE_SHIFT - inode->i_blkbits;

	return (unsigned long)(blocknum >> blocks_per_page_bits);
}

/*
 * B2O - convert a block number into the position of the block within
 * its page (0 .. blocks-per-page - 1).
 */
static inline unsigned int
B2O(nilfs_sector_t blocknum, struct inode *inode)
{
	const unsigned long blocks_per_page =
		1UL << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	return (unsigned int)(blocknum & (blocks_per_page - 1));
}

/*
 * NILFS_BTNODE_CACHE_I - get the nilfs_inode_info that embeds @btnode
 * as its i_btnode_cache member.
 */
static inline struct nilfs_inode_info *
NILFS_BTNODE_CACHE_I(const struct nilfs_btnode_cache *btnode)
{
	struct nilfs_inode_info *ii;

	ii = container_of(btnode, struct nilfs_inode_info, i_btnode_cache);
	return ii;
}

inline static struct inode *
BTNC_I(struct nilfs_btnode_cache *btnc)
{
	return &NILFS_BTNODE_CACHE_I(btnc)->vfs_inode;
}

/*
 * NILFS_AS_II - map an address_space to its nilfs_inode_info by going
 * through NILFS_AS_I() and then NILFS_I().
 */
static inline struct nilfs_inode_info *
NILFS_AS_II(struct address_space *mapping)
{
	struct inode *inode = NILFS_AS_I(mapping);

	return NILFS_I(inode);
}

/* borrowed from mm/filemap.c::find_get_page */
static struct page *
nilfs_btnode_page_find_get(struct nilfs_btnode_cache *btnc,
			   unsigned long index)
{
	struct page *page;
	unsigned long flags;

	/* page_debug(3, "btnc=%p idx=%lu\n", btnc, index); */
	read_lock_irqsave(&btnc->tree_lock, flags);
	page = radix_tree_lookup(&btnc->page_tree, index);
	if (page)
		page_cache_get(page);
	read_unlock_irqrestore(&btnc->tree_lock, flags);
	BUG_ON(page && !page_has_buffers(page));
	return page;
}

/* return locked page with buffers */
static int
nilfs_btnode_page_alloc(struct nilfs_btnode_cache *btnc, struct page **res)
{
	struct page *page;
	struct buffer_head *bh, *head;
	struct inode *inode = BTNC_I(btnc);
	struct block_device *bdev = NILFS_I_NILFS(inode)->ns_bdev;

	/* page_debug(3, "btnc=%p\n", btnc); */
	/* create new page/buffers, if fail return -ENOMEM */
	/* locked page */
	page = nilfs_alloc_buffer_page(bdev, 1UL << inode->i_blkbits);
	if (unlikely(!page))
		return -ENOMEM;
	BUG_ON(!page_has_buffers(page));
	bh = head = page_buffers(page);
	do {
		set_buffer_nilfs_node(bh);
		bh->b_blocknr = 0;
		bh = bh->b_this_page;
	} while (bh != head);
	*res = page;
	return 0;
}

/*
 * nilfs_btnode_page_add_cache - register a page in a btnode cache
 * @page: page to insert
 * @btnc: destination btnode cache
 * @index: page index to insert at
 *
 * On successful insertion the radix tree takes its own page reference and
 * the page's index and mapping are set, all under the write side of
 * tree_lock.
 *
 * Returns 0 on success, otherwise the error from radix_tree_insert().
 */
static int
nilfs_btnode_page_add_cache(struct page *page, struct nilfs_btnode_cache *btnc,
			    unsigned long index)
{
	unsigned long irqflags;
	int ret;

	/*
	 * radix_tree_preload() is not exported to modules, so it cannot
	 * be used here.  If it were, the author's intent was to call
	 *
	 *	err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	 *	if (err)
	 *		return err;
	 *
	 * before taking the lock, and radix_tree_preload_end() after
	 * releasing it.
	 */
	write_lock_irqsave(&btnc->tree_lock, irqflags);
	ret = radix_tree_insert(&btnc->page_tree, index, page);
	if (ret == 0) {
		page_cache_get(page);	/* reference owned by the radix tree */
		page->index = index;
		page->mapping = BTNC_I(btnc)->i_mapping;
	}
	write_unlock_irqrestore(&btnc->tree_lock, irqflags);

	return ret;
}

/*
 * nilfs_btnode_page_remove_lru - take a page off the LRU list
 * @page: page to remove
 *
 * Returns 0 when the page had the LRU flag and was removed through
 * nilfs_page_delete_from_lru(), -1 when the page is not on the LRU.
 */
static inline int
nilfs_btnode_page_remove_lru(struct page *page)
{
	if (unlikely(!PageLRU(page)))
		return -1;

	nilfs_page_delete_from_lru(page);
	return 0;
}

/* Forward declaration; the definition appears further down in this file. */
static void __nilfs_btnode_page_mark_dirty(struct page *, int, int);

/*
 * __nilfs_btnode_get_page - find or create the page at @index in @btnc
 * @btnc: btnode cache to search / populate
 * @index: page index
 * @res: on success, receives the page (with a reference for the caller)
 * @altbc: optional alternate cache; when non-NULL and a new page had to be
 *	created, the page found at @index in @altbc (if any) is copied
 *	into the new page with nilfs_gcdat_copy_buffer_page()
 *
 * Some code is borrowed from mm/filemap.c::read_cache_page().
 *
 * If the page is not cached, a page is allocated and inserted.  When the
 * insertion fails with -EEXIST the lookup is retried; the already
 * allocated page is kept in cached_page for reuse and is freed at the
 * end if it turned out to be unnecessary.
 *
 * Returns 0 on success, or the error from page allocation or cache
 * insertion.  BUGs if the resulting page fails the
 * nilfs_btnode_page_referenced(page, 0) check.
 */
static int
__nilfs_btnode_get_page(struct nilfs_btnode_cache *btnc, unsigned long index,
			struct page **res, struct nilfs_btnode_cache *altbc)
{
	struct page *page, *cached_page = NULL;
	int err = 0;

repeat:
	/* a hit takes a page reference which is passed on to the caller */
	page = nilfs_btnode_page_find_get(btnc, index);
#ifdef CONFIG_NILFS_DEBUG
	if (page && (BTNC_I(btnc)->i_ino == NILFS_DAT_INO))
		page_debug(3, "got page %p for index %lu\n", page, index);
#endif
	if (!page) {
		/* allocate only once; the spare page survives a retry */
		if (!cached_page) {
			err = nilfs_btnode_page_alloc(btnc, &cached_page);
			if (unlikely(err)) {	/* -ENOMEM */
				page_debug(3, "return %d (alloc).\n", err);
				return err;
			}
		}
		err = nilfs_btnode_page_add_cache(cached_page, btnc, index);
		if (unlikely(err)) {
			/* index got occupied meanwhile: look it up again */
			if (err == -EEXIST)
				goto repeat;
			printk(KERN_ERR
			       "%s: add_cache failed for page %p index %llu\n",
			       __FUNCTION__, cached_page,
			       (unsigned long long)index);
			goto out_free;
		}
		/* the new page is now owned by the cache, not a spare */
		page = cached_page;
		cached_page = NULL;
		nilfs_page_add_to_inactive(page);

#ifdef CONFIG_NILFS_DEBUG
		if (page && (BTNC_I(btnc)->i_ino == NILFS_DAT_INO))
			page_debug(3, "got new page %p for index %lu\n",
				   page, index);
#endif
		/* for GC DAT inode */
		if (unlikely(altbc)) {
			struct page *opage = nilfs_btnode_page_find_get(altbc,
									index);
			if (opage) {
				page_debug(3, "got orig dat page %p "
					   "for index %lu\n", opage, index);
				lock_page(opage);
				/* dirty or pdirty pages do not appear here */
				BUG_ON(PageDirty(opage));
				if (page_has_buffers(opage))
					nilfs_gcdat_copy_buffer_page(opage,
								     page, 0);
				unlock_page(opage);
				/* drop the reference taken by find_get above */
				page_cache_release(opage);
			}
		}
		/* pass page_count from btnode_page_alloc to caller */
		unlock_page(page);
	}
out_free:
	/*
	 * If cached_page remains, either radix_tree_insert() failed (maybe
	 * with -ENOMEM) or the retry after -EEXIST found an existing page.
	 * In both cases it was never put on the inactive list.
	 */
	if (cached_page)
		nilfs_free_buffer_page(cached_page);

	if (likely(page)) {
		nilfs_page_mark_accessed(page);
		*res = page;
		/* also resets a stale -EEXIST left over from a retry */
		err = 0;
		if (unlikely(!nilfs_btnode_page_referenced(page, 0))) {
			page_debug(1,
				   "count %d map %d buf %d lru %d page %p\n",
				   page_count(page), !!page->mapping,
				   page_has_buffers(page), PageLRU(page),
				   page);
			BUG();
		}
	}
	if (err)
		page_debug(3, "return %d.\n", err);
	return err;
}

/*
 * nilfs_btnode_get_page - get the page at @index, creating it if needed
 * @btnc: btnode cache
 * @index: page index
 * @res: receives the page on success
 *
 * Wrapper around __nilfs_btnode_get_page().  Only while GC is running
 * and the cache belongs to the inode numbered NILFS_DAT_INO is the btnode
 * cache of the ns_dat inode supplied as the alternate cache; otherwise no
 * alternate cache is used.
 */
static inline int
nilfs_btnode_get_page(struct nilfs_btnode_cache *btnc, unsigned long index,
		      struct page **res)
{
	struct inode *inode = BTNC_I(btnc);

	if (likely(!nilfs_doing_gc() || inode->i_ino != NILFS_DAT_INO))
		return __nilfs_btnode_get_page(btnc, index, res, NULL);

	/* GC DAT: fall back on the original DAT cache for page contents */
	return __nilfs_btnode_get_page(btnc, index, res,
		&NILFS_I(NILFS_I_NILFS(inode)->ns_dat)->i_btnode_cache);
}

/*
 * nilfs_btnode_page_delete_cache - unregister a page from its btnode cache
 * @page: page to remove; must have a mapping (BUG otherwise)
 *
 * Note: the page must be locked and a reference must be held by the caller.
 *
 * The page is left in the cache when it is referenced by anyone besides
 * the caller, or when it has buffers and they are busy.  Otherwise it is
 * deleted from the radix tree, the tree's reference is dropped and the
 * page's mapping and index are reset.  Everything happens under the write
 * side of tree_lock.
 *
 * Returns 0 when the page was removed, -1 when it was skipped.
 */
int
nilfs_btnode_page_delete_cache(struct page *page)
{
	struct nilfs_btnode_cache *btnc;
	unsigned long irqflags;
	int skipped;

	BUG_ON(!page->mapping);
	btnc = &NILFS_AS_II(page->mapping)->i_btnode_cache;

	write_lock_irqsave(&btnc->tree_lock, irqflags);
	/* references are checked inside the critical section; 1 = caller's */
	if (nilfs_btnode_page_referenced(page, 1)) {
		page_debug(2, "skip page %p cnt %d\n", page, page_count(page));
		skipped = 1;
	} else if (page_has_buffers(page) && nilfs_page_buffers_busy(page)) {
		page_debug(2, "skip page %p (bufs busy)\n", page);
		skipped = 1;
	} else {
		radix_tree_delete(&btnc->page_tree, page->index);
		page_cache_release(page);	/* reference of the radix tree */
		page->mapping = NULL;
		page->index = 0;
		skipped = 0;
	}
	write_unlock_irqrestore(&btnc->tree_lock, irqflags);

	return skipped ? -1 : 0;
}

/*
 * nilfs_btnode_page_delete - remove a page from the cache and free it
 * @page: page to delete
 *
 * Note: the page must be locked and a reference must be held by the caller.
 *
 * Returns -1 when the page is still used by others or is not on the LRU
 * (in the latter case the shrinker may handle it); otherwise the result of
 * nilfs_free_buffer_page().
 */
static int
nilfs_btnode_page_delete(struct page *page)
{
	int failed;

	failed = nilfs_btnode_page_delete_cache(page);
	if (!failed)
		failed = nilfs_btnode_page_remove_lru(page);
	if (failed)
		return -1;

	return nilfs_free_buffer_page(page);
}

/*
 * nilfs_btnode_buf_setpb - flag @bh as mapped and record @pbn as its
 * block number.  The page holding @bh must be locked by the caller.
 * (A clear_buffer_nilfs_vb() call was disabled by the original author.)
 */
static inline void
nilfs_btnode_buf_setpb(struct buffer_head *bh, sector_t pbn)
{
	set_buffer_mapped(bh);
	bh->b_blocknr = pbn;
}

/*
 * nilfs_btnode_buf_setvb - flag @bh as mapped and record @vbn as its
 * block number.  The page holding @bh must be locked by the caller.
 * (A set_buffer_nilfs_vb() call was disabled by the original author.)
 */
static inline void
nilfs_btnode_buf_setvb(struct buffer_head *bh, nilfs_sector_t vbn)
{
	set_buffer_mapped(bh);
	bh->b_blocknr = vbn;
}

/*
 * __nilfs_btnode_get - get the buffer for block @bn from a btnode cache
 * @btnc: btnode cache
 * @bn: block number; selects the page (B2P) and the buffer in it (B2O),
 *	and is stored in the buffer's b_blocknr on success
 * @pbn: block number to read from; when 0 it defaults to @bn, or to the
 *	result of btnc->ops.translate() if that hook is set
 * @result: receives the buffer head on success
 * @exist_pg: zero when a brand-new buffer is requested (the buffer must
 *	then be neither mapped, uptodate nor dirty, otherwise BUG);
 *	non-zero when the block is expected to exist and may need reading
 *
 * The page is locked while the buffer is examined and is unlocked and
 * released again before returning; only the buffer head obtained from
 * nilfs_page_get_nth_block() is handed to the caller.
 *
 * Returns 0 on success, the error from nilfs_btnode_get_page() or from
 * the translate hook, or -EIO when nilfs_bread_slow() returns NULL.
 * @result is left untouched on failure.
 */
int
__nilfs_btnode_get(struct nilfs_btnode_cache *btnc, nilfs_sector_t bn,
		   sector_t pbn, struct buffer_head **result, int exist_pg)
{
	struct page *page = NULL;
	struct buffer_head *bh = NULL;
	int err;
	struct inode *inode = BTNC_I(btnc);

	/* page_debug(3, "btnc=%p bn=%llu exist=%d\n", btnc, (unsigned long long)bn, exist_pg); */
	err = nilfs_btnode_get_page(btnc, B2P(bn, inode), &page);
	if (unlikely(err)) {	/* -ENOMEM */
		page_debug(3, "return %d (get_page).\n", err);
		goto out_nopage;
	}
	/* bh is still NULL at this point; it is printed only as a pointer */
	page_debug(3, "locking page %p for buf %p\n", page, bh);
	lock_page(page);

	bh = nilfs_page_get_nth_block(page, B2O(bn, inode));
	BUG_ON(!bh);

	if (!exist_pg) {
		/* a new buffer must not carry any state yet */
		if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
			     buffer_dirty(bh))) {
			brelse(bh);
			page_debug(1, "*** INVALID New BH ! ***\n");
			page_debug(2, "BH %p #blk %llu state %lx "
				   "PAGE %p idx %ld\n",
				   bh, (unsigned long long)bh->b_blocknr,
				   bh->b_state, page, page->index);
			BUG();
		}
		nilfs_btnode_buf_setvb(bh, bn);
		/* page_debug(3, "return 0 for new bh.\n"); */
		goto found;
	}

	/* contents already valid in memory: no read required */
	if (buffer_uptodate(bh) || buffer_dirty(bh)) {
		/* page_debug(3, "return 0 for valid bh.\n"); */
		goto found;
	}

	/* vbn = bh->b_blocknr; */
	if (pbn == 0) {
		pbn = bn;
		if (btnc->ops.translate) {
			struct inode *dat_inode;

			/* bn is a Virtual BN */
			dat_inode = nilfs_dat_inode(NILFS_I_NILFS(inode));
			page_debug(3, "xlate=%p\n", btnc->ops.translate);
			err = (*(btnc->ops.translate))(dat_inode, bn, &pbn);
			if (unlikely(err)) {
				brelse(bh);
				page_debug(3, "return %d (xlate).\n", err);
				goto out_locked;
			}
		}
	}
	/* point the buffer at pbn for the read ... */
	nilfs_btnode_buf_setpb(bh, pbn);

	bh = nilfs_bread_slow(bh);

	if (unlikely(bh == NULL)) {
		page_debug(3, "return -EIO.\n");
		err = -EIO;
		goto out_locked;
	}
	/* ... and set its block number back to bn once the data is in */
	nilfs_btnode_buf_setvb(bh, bn);
	page_debug(3, "return 0.\n");
found:
	*result = bh;
	err = 0;
out_locked:
	unlock_page(page);
	page_debug(3, "unlocked page %p for buf %p\n", page, bh);
	page_cache_release(page);	/* from nilfs_btnode_get_page() */
out_nopage:
	return err;
}

/*
 * nilfs_btnode_buffer_busy - test whether a btnode buffer is still in use
 * @bh: buffer head to test
 *
 * Returns non-zero when the buffer has a non-zero reference count or has
 * any of the Dirty, Lock or Mapped state bits set.
 */
static inline int nilfs_btnode_buffer_busy(struct buffer_head *bh)
{
	const unsigned long busy_bits =
		(1 << BH_Dirty) | (1 << BH_Lock) | (1 << BH_Mapped);

	return atomic_read(&bh->b_count) | (bh->b_state & busy_bits);
}

/* Note: caller must lock the page */
/* retval: -EINVAL for invalid buf, 1 for busy page, 0 for non-busy page */
int
nilfs_btnode_delete_bh(struct buffer_head *bh)
{
	int ret = 1;
	struct buffer_head *b;

	if (unlikely(!bh))
		goto out;
	nilfs_btnode_clear_dirties(bh);
	clear_buffer_uptodate(bh);
	bh->b_blocknr = 0;
	if (!buffer_mapped(bh))
		page_debug(1, "warning: deleting unused buf %p.\n", bh);
	clear_buffer_mapped(bh);
	brelse(bh);		/* hold by caller */
	b = bh;
	do {
		if (nilfs_btnode_buffer_busy(b))
			goto out;
		b = b->b_this_page;
	} while (b != bh);
	ret = 0;
out:
	return ret;
}

/*
 * nilfs_btnode_delete_bufs - invalidate every buffer of a page
 * @page: page whose buffers are to be deleted
 *
 * Each buffer gets an extra reference which nilfs_btnode_delete_bh()
 * consumes.  Only the result of the last nilfs_btnode_delete_bh() call is
 * reported: non-zero means that deletion failed.  A page without
 * buffers yields 0.
 */
static inline int
nilfs_btnode_delete_bufs(struct page *page)
{
	struct buffer_head *first, *cur;
	int last_ret = 0;

	if (unlikely(!page_has_buffers(page))) {
		page_debug(3, "page %p has no buffers.\n", page);
		return 0;
	}

	first = page_buffers(page);
	cur = first;
	do {
		get_bh(cur);	/* dropped again inside delete_bh() */
		last_ret = nilfs_btnode_delete_bh(cur);
		cur = cur->b_this_page;
	} while (cur != first);

	return last_ret;
}

/*
 * __nilfs_btnode_delete - delete a btnode buffer and, if possible, its page
 * @bh: buffer to delete; its reference is dropped by
 *	nilfs_btnode_delete_bh()
 * @lock: non-zero if this function should lock (and later unlock) the
 *	page; zero if the caller already holds the page lock (BUG if not)
 *
 * After waiting for writeback the buffer is invalidated.  When no buffer
 * of the page remains busy, deletion of the whole page is attempted.
 *
 * Returns a negative value when nilfs_btnode_delete_bh() reports an error,
 * 0 otherwise (including the cases where the page could not be deleted).
 */
int
__nilfs_btnode_delete(struct buffer_head *bh, int lock)
{
	int ret = 0;
	struct page *page = bh->b_page;

	page_cache_get(page);	/* for dealloc */
	if (lock) {
		page_debug(3, "locking page %p for buf %p\n", page, bh);
		lock_page(page);
	} else
		BUG_ON(!PageLocked(page));
	wait_on_page_writeback(page);		/* wait writing by others */
	ret = nilfs_btnode_delete_bh(bh);	/* bh ref freed */
	if (ret < 0)	/* error */
		goto out_unlock;
	if (ret) {	/* valid bh remains in page */
		ret = 0;
		goto out_unlock;
	}
	wait_on_page_writeback(page);
	/*
	 * On success the page is neither unlocked nor released here.
	 * NOTE(review): presumably nilfs_free_buffer_page() disposes of the
	 * locked page together with our reference - confirm.
	 */
	if (nilfs_btnode_page_delete(page) == 0)
		goto out;
	ret = 0;	/* someone looking. shrinker will remove. */
out_unlock:
	if (lock) {
		unlock_page(page);
		page_debug(3, "unlocked page %p for buf %p\n", page, bh);
	}
	page_cache_release(page);
out:
	return ret;
}

/*
 * nilfs_btnode_free_pages - drop pages from a btnode cache
 * @btnc: btnode cache to scan
 * @dirty: non-zero to visit only the pages tagged with @tag and to clear
 *	the dirty state of their buffers first; zero to visit every page
 * @tag: radix-tree tag to look up (used only when @dirty is non-zero)
 *
 * Pages are collected in batches of up to NILFS_BTNODE_GANG_SIZE under
 * tree_lock, each with an extra reference, and then processed one by one
 * with the page locked.  A page that can be removed from the cache has its
 * mapped buffers invalidated and the nilfs_node flag cleared on all its
 * buffers.  The page is not taken off the LRU here (see the XXX below).
 *
 * Returns the number of pages that could not be removed from the cache.
 */
static int
nilfs_btnode_free_pages(struct nilfs_btnode_cache *btnc, int dirty, int tag)
{
	struct page *pages[NILFS_BTNODE_GANG_SIZE], *page;
	unsigned long flags, index = 0;
	int i, n, ret = 0;

	for (;;) {
		write_lock_irqsave(&btnc->tree_lock, flags);
		if (dirty)
			n = radix_tree_gang_lookup_tag(&btnc->page_tree,
						       (void **)pages, index,
						       NILFS_BTNODE_GANG_SIZE,
						       tag);
		else
			n = radix_tree_gang_lookup(&btnc->page_tree,
						   (void **)pages, index,
						   NILFS_BTNODE_GANG_SIZE);
		/* pin the pages before the tree lock is dropped */
		for (i = 0; i < n; i++)
			page_cache_get(pages[i]);
		write_unlock_irqrestore(&btnc->tree_lock, flags);
		if (!n)
			break;
		/* next batch starts right after the last page of this one */
		index = page_index(pages[n - 1]) + 1;

		for (i = 0; i < n; i++) {
			struct buffer_head *bh, *head;

			page = pages[i];
			lock_page(page);
			page_debug(3, "locked page %p.\n", page);
			/* wait writing by others */
			wait_on_page_writeback(page);
			BUG_ON(PageWriteback(page));
			BUG_ON(!page_has_buffers(page));
			if (dirty) {
				/* drop dirty state so the page can be freed */
				bh = head = page_buffers(page);
				do {
					nilfs_btnode_clear_dirties(bh);
					bh = bh->b_this_page;
				} while (bh != head);
			}
			if (nilfs_btnode_page_delete_cache(page)) {
				page_debug(2, "Skipped(BufBusy) %p\n", page);
				goto continue_locked;
			}
			bh = head = page_buffers(page);
			do {
				if (buffer_mapped(bh)) {
					/* reference consumed by delete_bh() */
					get_bh(bh);
					nilfs_btnode_delete_bh(bh);
				}
				clear_buffer_nilfs_node(bh);
				bh = bh->b_this_page;
			} while (bh != head);
			/* XXX should remove from LRU? */
			unlock_page(page);
			page_cache_release(page);
			continue;
		continue_locked:
			/* page stays in the cache: count it as remaining */
			unlock_page(page);
			page_cache_release(page);
			ret++;
		}
	}
	return ret;
}

/*
 * nilfs_btnode_free_dirty_pages - drop prepare-dirty and dirty pages
 * @btnc: btnode cache to scan
 *
 * Runs nilfs_btnode_free_pages() first for the PDIRTY tag and then for
 * the DIRTY tag.  Returns the total number of pages that could not be
 * removed in the two passes.
 */
static inline int
nilfs_btnode_free_dirty_pages(struct nilfs_btnode_cache *btnc)
{
	int remaining = 0;

	remaining += nilfs_btnode_free_pages(btnc, 1,
					     NILFS_PAGECACHE_TAG_PDIRTY);
	remaining += nilfs_btnode_free_pages(btnc, 1, PAGECACHE_TAG_DIRTY);
	return remaining;
}

/*
 * nilfs_btnode_free_all - drop every page of a btnode cache
 * @btnc: btnode cache to scan
 *
 * Returns the number of pages nilfs_btnode_free_pages() could not remove.
 */
static inline int
nilfs_btnode_free_all(struct nilfs_btnode_cache *btnc)
{
	const int tagged_only = 0;	/* visit all pages, tag is ignored */

	return nilfs_btnode_free_pages(btnc, tagged_only, 0);
}

/*
 * __nilfs_btnode_delete_all - one pass of deleting all pages of a cache
 * @btnc: btnode cache to empty
 *
 * Pages are collected in batches of up to NILFS_BTNODE_GANG_SIZE under
 * tree_lock (with an extra reference each).  For every page the buffers
 * are invalidated first; only if none of them is busy is the page itself
 * deleted.
 *
 * Returns 0 when every visited page was deleted, 1 when at least one page
 * had to be left behind.
 */
static int
__nilfs_btnode_delete_all(struct nilfs_btnode_cache *btnc)
{
	struct page *pages[NILFS_BTNODE_GANG_SIZE], *page;
	unsigned long flags, index = 0;
	int i, n, ret = 0;

	for (;;) {
		write_lock_irqsave(&btnc->tree_lock, flags);
		n = radix_tree_gang_lookup(&btnc->page_tree, (void **)pages,
					   index, NILFS_BTNODE_GANG_SIZE);
		/* pin the pages before the tree lock is dropped */
		for (i = 0; i < n; i++)
			page_cache_get(pages[i]);
		write_unlock_irqrestore(&btnc->tree_lock, flags);
		if (!n)
			break;
		/* next batch starts right after the last page of this one */
		index = page_index(pages[n - 1]) + 1;

		for (i = 0; i < n; i++) {
			page = pages[i];
			lock_page(page);
			page_debug(3, "locked page %p.\n", page);
			/* wait writing by others */
			wait_on_page_writeback(page);
			BUG_ON(PageWriteback(page));
			/* check busy before removing from cache */
			if (nilfs_btnode_delete_bufs(page)) {
				page_debug(3, "page %p buf busy.\n", page);
				goto continue_locked;
			}
			/* a deleted page is not unlocked or released here */
			if (nilfs_btnode_page_delete(page) == 0)
				continue;
		continue_locked:
			/* page survived: drop our lock and reference */
			unlock_page(page);
			page_cache_release(page);
			ret = 1;
		}
	}
	return ret;
}

#define	TRY_DELETION_TIMES	4	/* XXX: adequate? */
/*
 * nilfs_btnode_delete_all - delete all pages of a btnode cache
 * @btnc: btnode cache to empty
 *
 * Repeats __nilfs_btnode_delete_all() until it reports that nothing
 * remained, but at most TRY_DELETION_TIMES times.
 *
 * Returns the result of the last pass: 0 when the cache was emptied,
 * non-zero when pages still remain.
 */
int
nilfs_btnode_delete_all(struct nilfs_btnode_cache *btnc)
{
	int attempt = 0;
	int ret;

	page_debug(3, "called for %p\n", btnc);
	do {
		ret = __nilfs_btnode_delete_all(btnc);
	} while (ret != 0 && ++attempt < TRY_DELETION_TIMES);
#ifdef CONFIG_NILFS_DEBUG
	if (ret != 0)
		page_debug(1, "page(s) remains for %p while tried %d times.\n",
			   btnc, TRY_DELETION_TIMES);
#endif /* CONFIG_NILFS_DEBUG */
	page_debug(3, "return for %p\n", btnc);
	return ret;
}

/*
 * __nilfs_btnode_page_mark_dirty - set the page dirty flag and tree tag
 * @page: page to mark; must belong to a btnode cache
 * @tag: radix-tree tag to set when the page was not dirty before
 * @p2d: non-zero if an already dirty page changes from prepare-dirty to
 *	dirty, in which case the PDIRTY tag is replaced by the DIRTY tag
 *
 * When the page was already dirty and @p2d is zero, nothing but the
 * (already set) page flag is touched.
 */
static void
__nilfs_btnode_page_mark_dirty(struct page *page, int tag, int p2d)
{
	struct nilfs_btnode_cache *btnc =
		&NILFS_AS_II(page->mapping)->i_btnode_cache;
	struct radix_tree_root *tree = &btnc->page_tree;
	unsigned long irqflags;
	int was_dirty = TestSetPageDirty(page);

	if (was_dirty && !p2d)
		return;

	write_lock_irqsave(&btnc->tree_lock, irqflags);
	if (!was_dirty) {
		/* clean -> dirty / prepare-dirty */
		radix_tree_tag_set(tree, page->index, tag);
	} else {
		/* changed from prepare-dirty to dirty */
		radix_tree_tag_clear(tree, page->index,
				     NILFS_PAGECACHE_TAG_PDIRTY);
		radix_tree_tag_set(tree, page->index, PAGECACHE_TAG_DIRTY);
	}
	write_unlock_irqrestore(&btnc->tree_lock, irqflags);
}
/*
 * __nilfs_btnode_mark_dirty() - mark dirty bh and set radix tree tag
 * @bh: btnode buffer to mark
 * @tag: PAGECACHE_TAG_DIRTY or NILFS_PAGECACHE_TAG_PDIRTY (BUG otherwise)
 *
 * The page containing @bh is locked for the duration of the call.
 * Nothing more is done if the buffer was already dirty, or - for the
 * PDIRTY tag - already prepare-dirty.
 *
 *  STATES:
 *    mark_dirty: BUF-dirty, PAGE-dirty
 *    mark_prepare_dirty: BUF-pdirty, PAGE-pdirty
 *    BUF-dirty: BH-dirty
 *    BUF-pdirty: BH-dirty&pdirty
 *    PAGE-dirty: PageFlag-dirty&TAG-dirty
 *    PAGE-pdirty: PageFlag-dirty&TAG-pdirty
 *    Pdirty-page has only pdirty-bufs or clean bufs
 *    Dirty-page must have at least one dirty bufs,
 *    and also can have pdirty or clean bufs.
 */
void
__nilfs_btnode_mark_dirty(struct buffer_head *bh, int tag)
{
	struct page *page;
	struct buffer_head *b;
	int p2d = 0;

	BUG_ON((tag != PAGECACHE_TAG_DIRTY) &&
	       (tag != NILFS_PAGECACHE_TAG_PDIRTY));
	page = bh->b_page;
	page_debug(3, "locking page %p for buf %p.\n", page, bh);
	lock_page(page);
	/* buffer was dirty before: page state is left as it is */
	if (test_set_buffer_dirty(bh))
		goto out_unlock;
	if ((tag == NILFS_PAGECACHE_TAG_PDIRTY) &&
	    (test_set_buffer_prepare_dirty(bh)))
		goto out_unlock;
	if ((tag == PAGECACHE_TAG_DIRTY) && PageDirty(page)) {
		/*
		 * The page is dirty already.  It changes from prepare-dirty
		 * to dirty only if none of the other buffers on the page is
		 * dirty without being prepare-dirty.
		 */
		p2d = 1;
		for (b = bh->b_this_page; b != bh; b = b->b_this_page)
			if (buffer_dirty(b) && !buffer_prepare_dirty(b)) {
				p2d = 0;
				break;
			}
	}
	__nilfs_btnode_page_mark_dirty(page, tag, p2d);
out_unlock:
	unlock_page(page);
	page_debug(3, "unlocked page %p for buf %p.\n", page, bh);
}


/*
 * __nilfs_btnode_page_clear_dirty - update page flag and tags for @bits
 * @page: page whose dirty state changes
 * @bits: bitmap built from (1 << PAGECACHE_TAG_DIRTY) and
 *	(1 << NILFS_PAGECACHE_TAG_PDIRTY)
 * @tree: radix tree the page is registered in
 *
 * Note: the lock protecting the radix tree must be held by the caller.
 */
static inline void
__nilfs_btnode_page_clear_dirty(struct page *page, int bits,
				struct radix_tree_root *tree)
{
	pgoff_t index = page_index(page);
	const int clear_dirty = bits & (1 << PAGECACHE_TAG_DIRTY);
	const int clear_pdirty = bits & (1 << NILFS_PAGECACHE_TAG_PDIRTY);

	if (!clear_dirty) {
		/* only the PDIRTY bit is set: prepare-dirty becomes dirty */
		radix_tree_tag_set(tree, index, PAGECACHE_TAG_DIRTY);
	} else if (TestClearPageDirty(page)) {
		/* may be called twice for the same page with the DIRTY bit */
		radix_tree_tag_clear(tree, index, PAGECACHE_TAG_DIRTY);
	}

	if (clear_pdirty)
		radix_tree_tag_clear(tree, index, NILFS_PAGECACHE_TAG_PDIRTY);
}

/**
 * nilfs_btnode_page_clear_dirty - clear dirty bits on page and tag on radix-tree
 * @page: page to be cleared
 * @bits: bitmap to specify which dirty flag should be cleared:
 *	00(b): page state unchanged (remains dirty or prepare-dirty);
 *	       not a valid argument here (BUG)
 *	01(b): page state will be changed from dirty to clean
 *	10(b): page state will be changed from prepare-dirty to dirty
 *	11(b): page state will be changed from prepare-dirty to clean
 *
 * A page without a mapping is not registered in any btnode cache, so
 * there is no radix-tree tag to maintain for it: only the page flag is
 * cleared (when the DIRTY bit is requested) and the tree is never touched.
 */
void
nilfs_btnode_page_clear_dirty(struct page *page, int bits)
{
	unsigned long flags = 0;
	struct nilfs_btnode_cache *btnc;

	BUG_ON(!bits);
	if (!page->mapping) {
		/*
		 * Never derive the cache from a NULL mapping, whatever
		 * @bits says; with only the PDIRTY bit set the page flag
		 * stays dirty and nothing is left to do.
		 */
		if (bits & (1 << PAGECACHE_TAG_DIRTY))
			ClearPageDirty(page);
		return;
	}
	btnc = &NILFS_AS_II(page->mapping)->i_btnode_cache;
	write_lock_irqsave(&btnc->tree_lock, flags);
	__nilfs_btnode_page_clear_dirty(page, bits, &btnc->page_tree);
	write_unlock_irqrestore(&btnc->tree_lock, flags);
}

/**
 * nilfs_btnode_clear_dirties - clear the dirty state of a btnode buffer
 * @bh: pointer of a btnode buffer
 *
 * Clears the dirty flag of @bh.  If nilfs_page_buffers_clean() then
 * reports a non-zero bitmap for the page containing the buffer, the page
 * state and the radix-tree tag of that page are updated accordingly.
 *
 * Note: page lock must be acquired by caller.
 */
void
nilfs_btnode_clear_dirties(struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	int bits;

	clear_buffer_dirty(bh);

	bits = nilfs_page_buffers_clean(page);
	if (bits == 0)
		return;

	nilfs_btnode_page_clear_dirty(page, bits);
}

/*
 * nilfs_btnode_prepare_change_key
 *  prepare to move contents of the block for old key to one of new key.
 *  the old buffer will not be removed, but might be reused for new buffer.
 *  it might return -ENOMEM because of memory allocation errors,
 *  and might return -EIO because of disk read errors.
 *
 *  @btnc: btnode cache holding the block
 *  @ctxt: change-key context; oldkey, newkey and bh are read, newbh is
 *         written
 *
 *  When the block size equals the page size, the page of the old buffer is
 *  additionally inserted into the radix tree at newkey and ctxt->newbh is
 *  set to NULL.  Otherwise a new buffer for newkey is obtained with
 *  nilfs_btnode_get_new() and stored in ctxt->newbh.
 *
 *  Returns 0 on success (also when oldkey == newkey, in which case
 *  nothing is done) or a negative error code.
 */
int
nilfs_btnode_prepare_change_key(struct nilfs_btnode_cache *btnc,
				struct nilfs_btnode_chkey_ctxt *ctxt)
{
	int err;
	unsigned long flags;
	struct buffer_head *obh, *nbh;
	struct inode *inode = BTNC_I(btnc);
	nilfs_sector_t oldkey = ctxt->oldkey;
	nilfs_sector_t newkey = ctxt->newkey;

	if (oldkey == newkey) {
		page_debug(3, "oldkey==newkey(%llu).\n", (unsigned long long)oldkey);
		return 0;
	}
	page_debug(3, "oldkey %llu newkey %llu\n",
		   (unsigned long long)oldkey, (unsigned long long)newkey);
	obh = ctxt->bh;
	if (likely(inode->i_blkbits == PAGE_CACHE_SHIFT)) {
		/*
		 * We cannot call radix_tree_preload, because it is
		 * not exported for modules...  I believe we'd better
		 * calling it if exported, as following:
		 *
		 * err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
		 * if (err)
		 *	goto out;
		 */
		/* one block per page: the key must equal the page index */
		/* BUG_ON(oldkey != obh->b_page->index); */
		if (unlikely(oldkey != obh->b_page->index)) {
			printk(KERN_ERR "%s: invalid key %lld and index %ld. "
			       "page %p newkey %lld\n",
			       __FUNCTION__, (unsigned long long)oldkey,
			       obh->b_page->index, obh->b_page,
			       (unsigned long long)newkey);
			BUG();
		}
	retry:
		/* register the same page a second time, under the new key */
		write_lock_irqsave(&btnc->tree_lock, flags);
		err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
		write_unlock_irqrestore(&btnc->tree_lock, flags);
		/*
		 * call radix_tree_preload_end() here if
		 * radix_tree_preload() can be called
		 */
		if (unlikely(err == -EEXIST)) { /* -ENOMEM remains */
			struct page *page;
		       
			/* an occupied slot is tolerated only for the DAT */
			if (unlikely(inode->i_ino != NILFS_DAT_INO)) {
				page_debug(1,
					   "insert failed, ino %lu key %lld\n",
					   inode->i_ino,
					   (unsigned long long)newkey);
				BUG();
			}
			page = nilfs_btnode_page_find_get(btnc, newkey);
			if (page) {
				page_debug(1, "page %p exist for key %lld\n",
					   page, (unsigned long long)newkey);
				lock_page(page);
				page_debug(3, "locked page %p.\n", page);
				BUG_ON(PageWriteback(page));
				BUG_ON(!page_has_buffers(page));
				/* evict the page occupying newkey, then retry */
				if (nilfs_btnode_page_delete(page)) {
					page_debug(1,
						   "busy page %p exist "
						   "for key %lld\n",
						   page,
						   (unsigned long long)newkey);
					BH_DEBUG(page_buffers(page), "");
					BUG();
				}
			}
			goto retry;
		}
		/* NULL newbh tells commit/abort that the page is reused */
		ctxt->newbh = NULL;
	} else {
		page_debug(2, "trying to copy a buffer in page "
			   "(key: %llu -> %llu)\n",
			   (unsigned long long)oldkey, (unsigned long long)newkey);
		err = nilfs_btnode_get_new(btnc, newkey, &nbh);
		if (unlikely(err)) {	/* -ENOMEM or -EIO */
			page_debug(1, "cannot btnode_get_new for key %lld "
				   "err %d\n",
				   (long long)newkey, err);
			goto out;
		}
		BUG_ON(nbh == obh);
		ctxt->newbh = nbh;
	}
out:
	return err;
}

/*
 * nilfs_btnode_commit_change_key
 *  commit the change_key operation prepared by prepare_change_key().
 */
void
nilfs_btnode_commit_change_key(struct nilfs_btnode_cache *btnc,
			       struct nilfs_btnode_chkey_ctxt *ctxt)
{
	unsigned long flags;
	struct buffer_head *obh, *nbh;
	nilfs_sector_t oldkey = ctxt->oldkey;
	nilfs_sector_t newkey = ctxt->newkey;

	if (oldkey == newkey) {
		page_debug(3, "oldkey==newkey(%llu).\n",
			   (unsigned long long)oldkey);
		return;
	}
	page_debug(3, "ino %lu oldkey %llu newkey %llu\n",
		   BTNC_I(btnc)->i_ino,
		   (unsigned long long)oldkey, (unsigned long long)newkey);
	obh = ctxt->bh;
	if ((nbh = ctxt->newbh) == NULL) {	/* blocksize == pagesize */
		/* BUG_ON(oldkey != obh->b_page->index); */
		if (unlikely(oldkey != obh->b_page->index)) {
			printk(KERN_ERR "%s: invalid key %lld and index %ld. "
			       "page %p newkey %lld\n",
			       __FUNCTION__, (unsigned long long)oldkey,
			       obh->b_page->index, obh->b_page,
			       (unsigned long long)newkey);
			BUG();
		}
		page_debug(3, "locking page %p for buf %p.\n",
			   obh->b_page, obh);
		lock_page(obh->b_page);
		if (!test_set_buffer_dirty(obh)) {
			/* virtual block, will be prepare-dirty */
			if (unlikely(test_set_buffer_prepare_dirty(obh)))
				BUG();
			if (unlikely(TestSetPageDirty(obh->b_page)))
				BUG();
		}
		write_lock_irqsave(&btnc->tree_lock, flags);
		radix_tree_delete(&btnc->page_tree, oldkey);
		radix_tree_tag_set(&btnc->page_tree, newkey,
				   buffer_prepare_dirty(obh) ?
				   NILFS_PAGECACHE_TAG_PDIRTY :
				   PAGECACHE_TAG_DIRTY);
		write_unlock_irqrestore(&btnc->tree_lock, flags);
		unlock_page(obh->b_page);
		page_debug(3, "unlocked page %p for buf %p.\n",
			   obh->b_page, obh);
		obh->b_page->index = newkey;
		obh->b_blocknr = newkey;
	} else {
		if (!buffer_dirty(obh) || buffer_prepare_dirty(obh))
			nilfs_btnode_mark_prepare_dirty(nbh);
		else
			nilfs_btnode_mark_dirty(nbh);	/* before copy */
		memcpy(nbh->b_data, obh->b_data, 1 << BTNC_I(btnc)->i_blkbits);
		nbh->b_state = obh->b_state |
			(nbh->b_state &
			 ((1 << BH_Dirty) | (1 << BH_Prepare_Dirty)));;
		brelse(nbh);	/* free ref got by btnode_get_new() */
		/* Cleaning up old page */
		get_bh(obh);	/* hold bh ref more to guard from deletion */
		nilfs_btnode_delete(obh);
	}
}

/*
 * nilfs_btnode_abort_change_key
 *  abort the change_key operation prepared by prepare_change_key().
 *
 *  @btnc: btnode cache holding the block
 *  @ctxt: change-key context filled in by prepare_change_key()
 *
 *  With a new buffer in the context, only the reference on that buffer is
 *  dropped.  Without one (block size == page size) the radix-tree entry
 *  that was inserted for newkey is deleted again.
 */
void
nilfs_btnode_abort_change_key(struct nilfs_btnode_cache *btnc,
			      struct nilfs_btnode_chkey_ctxt *ctxt)
{
	nilfs_sector_t oldkey = ctxt->oldkey;
	nilfs_sector_t newkey = ctxt->newkey;
	struct buffer_head *nbh;
	unsigned long irqflags;

	if (oldkey == newkey) {
		page_debug(3, "oldkey==newkey(%llu).\n", (unsigned long long)oldkey);
		return;
	}
	page_debug(3, "oldkey %llu newkey %llu\n",
		   (unsigned long long)oldkey, (unsigned long long)newkey);

	nbh = ctxt->newbh;
	if (nbh != NULL) {
		/* should be reclaimed by shrinker, or reused by others */
		brelse(nbh);
		return;
	}

	/* blocksize == pagesize: undo the extra insertion */
	write_lock_irqsave(&btnc->tree_lock, irqflags);
	radix_tree_delete(&btnc->page_tree, newkey);
	write_unlock_irqrestore(&btnc->tree_lock, irqflags);
}

/*
 * nilfs_btnode_do_copy_dirty_pages - copy tagged pages between caches
 * @src: btnode cache to copy from
 * @dst: btnode cache to copy to
 * @tag: radix-tree tag selecting the pages of @src to copy
 *
 * For every page of @src carrying @tag, the page with the same index is
 * looked up or created in @dst, marked dirty with @tag if the source page
 * is dirty and the destination page is not, and filled with
 * nilfs_gcdat_copy_buffer_page().  A failure to get the destination page
 * is fatal (BUG).
 */
void
nilfs_btnode_do_copy_dirty_pages(struct nilfs_btnode_cache *src,
				 struct nilfs_btnode_cache *dst,
				 int tag)
{
	struct page *pages[NILFS_BTNODE_GANG_SIZE];
	unsigned int nr_pages;
	pgoff_t index = 0;
	int i, err;

repeat:
	/*
	 * NOTE(review): the looked-up pages are released below, so the
	 * lookup presumably returns them with a reference held - confirm.
	 */
	nr_pages = nilfs_btnode_gang_lookup_tag(src, pages, index,
						NILFS_BTNODE_GANG_SIZE,
						tag);
	if (nr_pages == 0)
		return;
	/* next batch starts right after the last page of this one */
	index = pages[nr_pages - 1]->index + 1;

	for (i = 0; i < nr_pages; i++) {
		struct page *page = pages[i], *dpage;
		int locked = 1;

		/* lock_page(page); */
		/*
		 * The source page is only try-locked; if it is locked by
		 * someone else the copy proceeds without holding its lock.
		 */
		if (TestSetPageLocked(page)) {
			printk("%s: page %p already locked.\n",
			       __FUNCTION__, page);
			locked = 0;
		}
		/* do not search original dat cache */
		err = __nilfs_btnode_get_page(dst, page->index, &dpage, NULL);
		if (unlikely(err)) {
			/* XXX what can I do against ENOMEM? */
			printk(KERN_ERR
			       "%s: fatal error. err=%d, dst %p idx %lu.\n",
			       __FUNCTION__, err, dst, page->index);
			BUG();
		}
		lock_page(dpage);
		if (PageDirty(page) && !PageDirty(dpage))
			__nilfs_btnode_page_mark_dirty(dpage, tag, 0);
		page_debug(3, "cp: orig: page %p idx %lu, "
			   "gc: page %p idx %lu.\n",
			   page, page->index, dpage, dpage->index);
		nilfs_gcdat_copy_buffer_page(page, dpage, 1);
		unlock_page(dpage);
		page_cache_release(dpage);
		/* unlock only what was locked here */
		if (locked)
			unlock_page(page);
		page_cache_release(page);
	}
	goto repeat;
}

/*
 * nilfs_btnode_copy_gcdat_cache - bring GC DAT node pages into another cache
 * @gdbc: source btnode cache (GC DAT)
 * @odbc: destination btnode cache (original DAT)
 *
 * Every page of @gdbc is visited.  If @odbc already has a page at the same
 * index, the contents are copied into it with
 * nilfs_gcdat_copy_buffer_page().  Otherwise the page itself is removed
 * from @gdbc and inserted into @odbc at the same index.
 */
void
nilfs_btnode_copy_gcdat_cache(struct nilfs_btnode_cache *gdbc,
				struct nilfs_btnode_cache *odbc)
{
	struct page *pages[NILFS_BTNODE_GANG_SIZE];
	unsigned int nr_pages;
	pgoff_t index = 0;
	int i, err;

repeat:
	/*
	 * NOTE(review): the looked-up pages are released below, so the
	 * lookup presumably returns them with a reference held - confirm.
	 */
	nr_pages = nilfs_btnode_gang_lookup(gdbc, pages, index,
					    NILFS_BTNODE_GANG_SIZE);
	if (nr_pages == 0)
		return;
	/* note: mdt dirty flags should be cleared by segctor. */
	index = pages[nr_pages - 1]->index + 1;

	for (i = 0; i < nr_pages; i++) {
		struct page *page = pages[i], *dpage;
		/* remembered here so it can be used after the page is moved */
		pgoff_t offset = page->index;

		lock_page(page);
		if (unlikely(!page_has_buffers(page))) {
			page_debug(1, "page %p has not buffers. skipping\n",
				   page);
			goto skip;
		}
		dpage = nilfs_btnode_page_find_get(odbc, offset);
		if (dpage) {
			BUG_ON(!page_has_buffers(dpage));
			/* XXX skip if identical */
			lock_page(dpage);
			page_debug(3, "orig: page %p idx %lu, "
				   "gc: page %p idx %lu.\n",
				   dpage, dpage->index, page, page->index);
			/* dirty or pdirty pages do not appear here */
			BUG_ON(PageDirty(dpage));
			nilfs_gcdat_copy_buffer_page(page, dpage, 0);
			unlock_page(dpage);
			page_cache_release(dpage);
#if 1 /* 0 for debug, withdrawn pages only in gcdat cache */
		} else {
			struct page *p;

			/* no counterpart: move the page itself to odbc */
			write_lock_irq(&gdbc->tree_lock);
			p = radix_tree_delete(&gdbc->page_tree, offset);
			if (unlikely(!p))
				printk(KERN_ERR
				       "%s: radix_tree_delete failed "
				       "tree %p offset %lu\n",
				       __FUNCTION__, &gdbc->page_tree, offset);
			page->mapping = NULL;
			write_unlock_irq(&gdbc->tree_lock);
			page_cache_release(page); /* deletion from gc dat */
			page_debug(3, "adding page %p idx %lu as off %lu\n",
				   page, page->index, offset);
			/*
			 * NOTE(review): a failure here is only reported via
			 * page_debug(); the page is then registered in
			 * neither cache.
			 */
			err = nilfs_btnode_page_add_cache(page, odbc, offset);
			if (unlikely(err))
				page_debug(3,  "add cache failed %d. "
				       "page %p bc %p offset %lu.\n",
				       err, page, odbc, offset);
#endif
		}
	skip:
		unlock_page(page);
		page_cache_release(page);
	}
	goto repeat;
}

/*
 * nilfs_btnode_commit_gcdat_cache - commit the GC DAT node cache
 * @odbc: btnode cache of the original DAT
 * @gdbc: btnode cache of the GC DAT
 *
 * First drops the prepare-dirty and dirty pages of @odbc, then brings the
 * pages of @gdbc into @odbc with nilfs_btnode_copy_gcdat_cache().
 */
void
nilfs_btnode_commit_gcdat_cache(struct nilfs_btnode_cache *odbc,
				struct nilfs_btnode_cache *gdbc)
{
	int nr_left = nilfs_btnode_free_dirty_pages(odbc);

	page_debug(3, "%d dirty pages hold on orig dat node cache.\n", nr_left);
	nilfs_btnode_copy_gcdat_cache(gdbc, odbc);
#if 0 /* for debug */
	nr_left = nilfs_btnode_free_all(odbc);
	page_debug(3, "%d pages remains on orig dat node cache.\n", nr_left);
#endif
}

/*
 * nilfs_btnode_clear_gcdat_cache - drop all pages of the GC DAT node cache
 * @gdbc: btnode cache of the GC DAT
 *
 * The number of pages that could not be dropped is only reported through
 * page_debug().
 */
void
nilfs_btnode_clear_gcdat_cache(struct nilfs_btnode_cache *gdbc)
{
	int nr_left = nilfs_btnode_free_all(gdbc);

	page_debug(3, "%d pages remains on gc dat node cache.\n", nr_left);
}

/* Local Variables:		*/
/* eval: (c-set-style "linux")	*/
/* End:				*/
