mdb.c 82 KB
Newer Older
Howard Chu's avatar
Howard Chu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* mdb.c - memory-mapped database library */
/*
 * Copyright 2011 Howard Chu, Symas Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 *
 * This code is derived from btree.c written by Martin Hedenfalk.
 *
 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
Howard Chu's avatar
Howard Chu committed
30
31
32
33
34
35
36
37
38
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mman.h>
#ifdef HAVE_SYS_FILE_H
#include <sys/file.h>
#endif
Howard Chu's avatar
Howard Chu committed
39
#include <fcntl.h>
Howard Chu's avatar
Howard Chu committed
40
41
42
43
44
45
46
47
48
49

#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
Howard Chu's avatar
Howard Chu committed
50
#include <pthread.h>
Howard Chu's avatar
Howard Chu committed
51
52

#include "mdb.h"
Howard Chu's avatar
Howard Chu committed
53

Howard Chu's avatar
Howard Chu committed
54
55
#define ULONG		unsigned long
typedef ULONG		pgno_t;
Howard Chu's avatar
Howard Chu committed
56

Howard Chu's avatar
Howard Chu committed
57
#include "midl.h"
Howard Chu's avatar
Howard Chu committed
58

59
60
61
#ifndef DEBUG
#define DEBUG 1
#endif
Howard Chu's avatar
Howard Chu committed
62

Howard Chu's avatar
Howard Chu committed
63
#if DEBUG && defined(__GNUC__)
64
65
# define DPRINTF(fmt, ...) \
	fprintf(stderr, "%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
Howard Chu's avatar
Howard Chu committed
66
#else
67
# define DPRINTF(...)	((void) 0)
Howard Chu's avatar
Howard Chu committed
68
69
70
71
72
73
#endif

#define PAGESIZE	 4096
#define MDB_MINKEYS	 4
#define MDB_MAGIC	 0xBEEFC0DE
#define MDB_VERSION	 1
Howard Chu's avatar
Howard Chu committed
74
#define MAXKEYSIZE	 511
Howard Chu's avatar
Howard Chu committed
75

Hallvard Furuseth's avatar
Hallvard Furuseth committed
76
#define P_INVALID	 (~0UL)
Howard Chu's avatar
Howard Chu committed
77
78
79
80
81

#define F_ISSET(w, f)	 (((w) & (f)) == (f))

typedef uint16_t	 indx_t;

Howard Chu's avatar
Howard Chu committed
82
83
84
85
86
#define DEFAULT_READERS	126
#define DEFAULT_MAPSIZE	1048576

/* Lock descriptor stuff */
#ifndef CACHELINE
Howard Chu's avatar
Howard Chu committed
87
#define CACHELINE	64	/* most CPUs. Itanium uses 128 */
Howard Chu's avatar
Howard Chu committed
88
89
#endif

Howard Chu's avatar
Howard Chu committed
90
91
92
93
94
95
typedef struct MDB_rxbody {
	ULONG		mrb_txnid;
	pid_t		mrb_pid;
	pthread_t	mrb_tid;
} MDB_rxbody;

Howard Chu's avatar
Howard Chu committed
96
typedef struct MDB_reader {
Howard Chu's avatar
Howard Chu committed
97
98
99
100
101
102
103
104
	union {
		MDB_rxbody mrx;
#define	mr_txnid	mru.mrx.mrb_txnid
#define	mr_pid	mru.mrx.mrb_pid
#define	mr_tid	mru.mrx.mrb_tid
		/* cache line alignment */
		char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
	} mru;
Howard Chu's avatar
Howard Chu committed
105
106
107
} MDB_reader;

typedef struct MDB_txbody {
Howard Chu's avatar
Howard Chu committed
108
109
110
111
112
	uint32_t	mtb_magic;
	uint32_t	mtb_version;
	pthread_mutex_t	mtb_mutex;
	ULONG		mtb_txnid;
	uint32_t	mtb_numreaders;
Howard Chu's avatar
Howard Chu committed
113
	uint32_t	mtb_me_toggle;
Howard Chu's avatar
Howard Chu committed
114
115
116
} MDB_txbody;

typedef struct MDB_txninfo {
Howard Chu's avatar
Howard Chu committed
117
118
119
120
121
122
123
	union {
		MDB_txbody mtb;
#define mti_magic	mt1.mtb.mtb_magic
#define mti_version	mt1.mtb.mtb_version
#define mti_mutex	mt1.mtb.mtb_mutex
#define mti_txnid	mt1.mtb.mtb_txnid
#define mti_numreaders	mt1.mtb.mtb_numreaders
Howard Chu's avatar
Howard Chu committed
124
#define mti_me_toggle	mt1.mtb.mtb_me_toggle
Howard Chu's avatar
Howard Chu committed
125
126
127
128
129
130
131
132
		char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
	} mt1;
	union {
		pthread_mutex_t	mt2_wmutex;
#define mti_wmutex	mt2.mt2_wmutex
		char pad[(sizeof(pthread_mutex_t)+CACHELINE-1) & ~(CACHELINE-1)];
	} mt2;
	MDB_reader	mti_readers[1];
Howard Chu's avatar
Howard Chu committed
133
134
} MDB_txninfo;

Howard Chu's avatar
Howard Chu committed
135
136
137
138
139
/* Common header for all page types. Overflow pages
 * occupy a number of contiguous pages with no
 * headers on any page after the first.
 */
typedef struct MDB_page {		/* represents a page of storage */
Howard Chu's avatar
Howard Chu committed
140
141
142
143
144
#define	mp_pgno		mp_p.p_pgno
	union padded {
		pgno_t		p_pgno;		/* page number */
		void *		p_pad;
	} mp_p;
Howard Chu's avatar
Howard Chu committed
145
146
147
148
#define	P_BRANCH	 0x01		/* branch page */
#define	P_LEAF		 0x02		/* leaf page */
#define	P_OVERFLOW	 0x04		/* overflow page */
#define	P_META		 0x08		/* meta page */
Howard Chu's avatar
Howard Chu committed
149
#define	P_DIRTY		 0x10		/* dirty page */
Howard Chu's avatar
Howard Chu committed
150
#define	P_LEAF2		 0x20		/* DB with small, fixed size keys and no data */
Howard Chu's avatar
Howard Chu committed
151
152
153
154
155
156
157
158
159
160
	uint32_t	mp_flags;
#define mp_lower	mp_pb.pb.pb_lower
#define mp_upper	mp_pb.pb.pb_upper
#define mp_pages	mp_pb.pb_pages
	union page_bounds {
		struct {
			indx_t		pb_lower;		/* lower bound of free space */
			indx_t		pb_upper;		/* upper bound of free space */
		} pb;
		uint32_t	pb_pages;	/* number of overflow pages */
Howard Chu's avatar
Howard Chu committed
161
162
163
164
		struct {
			indx_t	pb_ksize;	/* on a LEAF2 page */
			indx_t	pb_numkeys;
		} pb2;
Howard Chu's avatar
Howard Chu committed
165
166
167
168
	} mp_pb;
	indx_t		mp_ptrs[1];		/* dynamic size */
} MDB_page;

169
#define PAGEHDRSZ	 ((unsigned) offsetof(MDB_page, mp_ptrs))
Howard Chu's avatar
Howard Chu committed
170
171
172

#define NUMKEYS(p)	 (((p)->mp_lower - PAGEHDRSZ) >> 1)
#define SIZELEFT(p)	 (indx_t)((p)->mp_upper - (p)->mp_lower)
Howard Chu's avatar
Howard Chu committed
173
174
#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
				((env)->me_psize - PAGEHDRSZ))
Howard Chu's avatar
Howard Chu committed
175
176
177
178
#define IS_LEAF(p)	 F_ISSET((p)->mp_flags, P_LEAF)
#define IS_BRANCH(p)	 F_ISSET((p)->mp_flags, P_BRANCH)
#define IS_OVERFLOW(p)	 F_ISSET((p)->mp_flags, P_OVERFLOW)

Howard Chu's avatar
Howard Chu committed
179
180
#define OVPAGES(size, psize)	(PAGEHDRSZ + size + psize - 1) / psize;

Howard Chu's avatar
Howard Chu committed
181
182
183
184
185
186
187
188
189
190
191
192
193
194
typedef struct MDB_db {
	uint32_t	md_pad;
	uint16_t	md_flags;
	uint16_t	md_depth;
	ULONG		md_branch_pages;
	ULONG		md_leaf_pages;
	ULONG		md_overflow_pages;
	ULONG		md_entries;
	pgno_t		md_root;
} MDB_db;

#define	FREE_DBI	0
#define	MAIN_DBI	1

Howard Chu's avatar
Howard Chu committed
195
typedef struct MDB_meta {			/* meta (footer) page content */
Howard Chu's avatar
Howard Chu committed
196
197
198
199
	uint32_t	mm_magic;
	uint32_t	mm_version;
	void		*mm_address;		/* address for fixed mapping */
	size_t		mm_mapsize;			/* size of mmap region */
Howard Chu's avatar
Howard Chu committed
200
201
202
	MDB_db		mm_dbs[2];			/* first is free space, 2nd is main db */
#define	mm_psize	mm_dbs[0].md_pad
#define	mm_flags	mm_dbs[0].md_flags
Howard Chu's avatar
Howard Chu committed
203
204
	pgno_t		mm_last_pg;			/* last used page in file */
	ULONG		mm_txnid;			/* txnid that committed this page */
Howard Chu's avatar
Howard Chu committed
205
206
207
208
} MDB_meta;

typedef struct MDB_dhead {					/* a dirty page */
	MDB_page	*md_parent;
209
	unsigned	md_pi;				/* parent index */
Howard Chu's avatar
Howard Chu committed
210
211
212
213
214
215
216
217
	int			md_num;
} MDB_dhead;

typedef struct MDB_dpage {
	MDB_dhead	h;
	MDB_page	p;
} MDB_dpage;

Howard Chu's avatar
Howard Chu committed
218
219
typedef struct MDB_oldpages {
	struct MDB_oldpages *mo_next;
Howard Chu's avatar
Howard Chu committed
220
	ULONG		mo_txnid;
Howard Chu's avatar
Howard Chu committed
221
222
223
	pgno_t		mo_pages[1];	/* dynamic */
} MDB_oldpages;

Howard Chu's avatar
Howard Chu committed
224
225
226
typedef struct MDB_pageparent {
	MDB_page *mp_page;
	MDB_page *mp_parent;
227
	unsigned mp_pi;
Howard Chu's avatar
Howard Chu committed
228
229
} MDB_pageparent;

Howard Chu's avatar
Howard Chu committed
230
static MDB_dpage *mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num);
Howard Chu's avatar
Howard Chu committed
231
232
233
234
235
236
237
238
239
static int 		mdb_touch(MDB_txn *txn, MDB_pageparent *mp);

typedef struct MDB_ppage {					/* ordered list of pages */
	SLIST_ENTRY(MDB_ppage)	 mp_entry;
	MDB_page		*mp_page;
	unsigned int	mp_ki;		/* cursor index on page */
} MDB_ppage;
SLIST_HEAD(page_stack, MDB_ppage);

Howard Chu's avatar
Howard Chu committed
240
241
242
/* FIXME: tree depth is mostly bounded, we should just
 * use a fixed array and avoid malloc/pointer chasing
 */
Howard Chu's avatar
Howard Chu committed
243
244
245
246
247
#define CURSOR_EMPTY(c)		 SLIST_EMPTY(&(c)->mc_stack)
#define CURSOR_TOP(c)		 SLIST_FIRST(&(c)->mc_stack)
#define CURSOR_POP(c)		 SLIST_REMOVE_HEAD(&(c)->mc_stack, mp_entry)
#define CURSOR_PUSH(c,p)	 SLIST_INSERT_HEAD(&(c)->mc_stack, p, mp_entry)

Howard Chu's avatar
Howard Chu committed
248
struct MDB_xcursor;
249

Howard Chu's avatar
Howard Chu committed
250
251
252
struct MDB_cursor {
	MDB_txn		*mc_txn;
	struct page_stack	 mc_stack;		/* stack of parent pages */
Howard Chu's avatar
Howard Chu committed
253
	MDB_dbi		mc_dbi;
Howard Chu's avatar
Howard Chu committed
254
255
	short		mc_initialized;	/* 1 if initialized */
	short		mc_eof;		/* 1 if end is reached */
Howard Chu's avatar
Howard Chu committed
256
	struct MDB_xcursor	*mc_xcursor;
Howard Chu's avatar
Howard Chu committed
257
258
259
260
261
262
263
264
265
266
267
};

#define METADATA(p)	 ((void *)((char *)p + PAGEHDRSZ))

typedef struct MDB_node {
#define mn_pgno		 mn_p.np_pgno
#define mn_dsize	 mn_p.np_dsize
	union {
		pgno_t		 np_pgno;	/* child page number */
		uint32_t	 np_dsize;	/* leaf data size */
	} mn_p;
Howard Chu's avatar
Howard Chu committed
268
269
	unsigned int	mn_flags:4;
	unsigned int	mn_ksize:12;			/* key size */
Howard Chu's avatar
Howard Chu committed
270
#define F_BIGDATA	 0x01			/* data put on overflow page */
Howard Chu's avatar
Howard Chu committed
271
#define F_SUBDATA	 0x02			/* data is a sub-database */
Howard Chu's avatar
Howard Chu committed
272
273
274
	char		mn_data[1];
} MDB_node;

Howard Chu's avatar
Howard Chu committed
275
typedef struct MDB_dbx {
276
	MDB_val		md_name;
Howard Chu's avatar
Howard Chu committed
277
278
279
	MDB_cmp_func	*md_cmp;		/* user compare function */
	MDB_cmp_func	*md_dcmp;		/* user dupsort function */
	MDB_rel_func	*md_rel;		/* user relocate function */
280
281
	MDB_dbi	md_parent;
	unsigned int	md_dirty;
Howard Chu's avatar
Howard Chu committed
282
283
} MDB_dbx;

Howard Chu's avatar
Howard Chu committed
284
285
struct MDB_txn {
	pgno_t		mt_next_pgno;	/* next unallocated page */
Howard Chu's avatar
Howard Chu committed
286
287
	ULONG		mt_txnid;
	ULONG		mt_oldest;
Howard Chu's avatar
Howard Chu committed
288
	MDB_env		*mt_env;	
Howard Chu's avatar
Howard Chu committed
289
	pgno_t		*mt_free_pgs;	/* this is an IDL */
Howard Chu's avatar
Howard Chu committed
290
	union {
Howard Chu's avatar
Howard Chu committed
291
		MIDL2	*dirty_list;	/* modified pages */
Howard Chu's avatar
Howard Chu committed
292
293
		MDB_reader	*reader;
	} mt_u;
Howard Chu's avatar
Howard Chu committed
294
	MDB_dbx		*mt_dbxs;		/* array */
Howard Chu's avatar
Howard Chu committed
295
	MDB_db		*mt_dbs;
Howard Chu's avatar
Howard Chu committed
296
297
	unsigned int	mt_numdbs;

Howard Chu's avatar
Howard Chu committed
298
299
#define MDB_TXN_RDONLY		0x01		/* read-only transaction */
#define MDB_TXN_ERROR		0x02		/* an error has occurred */
Howard Chu's avatar
Howard Chu committed
300
#define MDB_TXN_METOGGLE	0x04		/* used meta page 1 */
Howard Chu's avatar
Howard Chu committed
301
	unsigned int	mt_flags;
Howard Chu's avatar
Howard Chu committed
302
303
};

Howard Chu's avatar
Howard Chu committed
304
305
306
307
308
309
310
311
/* Context for sorted-dup records */
typedef struct MDB_xcursor {
	MDB_cursor mx_cursor;
	MDB_txn mx_txn;
	MDB_dbx	mx_dbxs[4];
	MDB_db	mx_dbs[4];
} MDB_xcursor;

Howard Chu's avatar
Howard Chu committed
312
313
struct MDB_env {
	int			me_fd;
Howard Chu's avatar
Howard Chu committed
314
	int			me_lfd;
Howard Chu's avatar
Howard Chu committed
315
	int			me_mfd;			/* just for writing the meta pages */
Howard Chu's avatar
Howard Chu committed
316
317
318
#define	MDB_FATAL_ERROR	0x80000000U
	uint32_t 	me_flags;
	uint32_t	me_extrapad;	/* unused for now */
Howard Chu's avatar
Howard Chu committed
319
320
321
	unsigned int	me_maxreaders;
	unsigned int	me_numdbs;
	unsigned int	me_maxdbs;
Howard Chu's avatar
Howard Chu committed
322
	char		*me_path;
Howard Chu's avatar
Howard Chu committed
323
	char		*me_map;
Howard Chu's avatar
Howard Chu committed
324
	MDB_txninfo	*me_txns;
Howard Chu's avatar
Howard Chu committed
325
326
	MDB_meta	*me_metas[2];
	MDB_meta	*me_meta;
Howard Chu's avatar
Howard Chu committed
327
	MDB_txn		*me_txn;		/* current write transaction */
Howard Chu's avatar
Howard Chu committed
328
329
	size_t		me_mapsize;
	off_t		me_size;		/* current file size */
Howard Chu's avatar
Howard Chu committed
330
331
332
	pgno_t		me_maxpg;		/* me_mapsize / me_psize */
	unsigned int	me_psize;
	unsigned int	me_db_toggle;
Howard Chu's avatar
Howard Chu committed
333
334
	MDB_dbx		*me_dbxs;		/* array */
	MDB_db		*me_dbs[2];
Howard Chu's avatar
Howard Chu committed
335
	MDB_oldpages *me_pghead;
Howard Chu's avatar
Howard Chu committed
336
	pthread_key_t	me_txkey;	/* thread-key for readers */
337
	pgno_t		me_free_pgs[MDB_IDL_UM_SIZE];
Howard Chu's avatar
Howard Chu committed
338
	MIDL2		me_dirty_list[MDB_IDL_DB_SIZE];
Howard Chu's avatar
Howard Chu committed
339
340
341
342
343
344
345
346
347
348
349
350
351
352
};

#define NODESIZE	 offsetof(MDB_node, mn_data)

#define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
#define LEAFSIZE(k, d)	 (NODESIZE + (k)->mv_size + (d)->mv_size)
#define NODEPTR(p, i)	 ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
#define NODEKEY(node)	 (void *)((node)->mn_data)
#define NODEDATA(node)	 (void *)((char *)(node)->mn_data + (node)->mn_ksize)
#define NODEPGNO(node)	 ((node)->mn_pgno)
#define NODEDSZ(node)	 ((node)->mn_dsize)

#define MDB_COMMIT_PAGES	 64	/* max number of pages to write in one commit */

Howard Chu's avatar
Howard Chu committed
353
354
static int  mdb_search_page_root(MDB_txn *txn,
			    MDB_dbi dbi, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
355
356
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);
Howard Chu's avatar
Howard Chu committed
357
358
static int  mdb_search_page(MDB_txn *txn,
			    MDB_dbi dbi, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
359
360
361
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);

Howard Chu's avatar
Howard Chu committed
362
363
364
static int  mdb_env_read_header(MDB_env *env, MDB_meta *meta);
static int  mdb_env_read_meta(MDB_env *env, int *which);
static int  mdb_env_write_meta(MDB_txn *txn);
Howard Chu's avatar
Howard Chu committed
365
static MDB_page *mdb_get_page(MDB_txn *txn, pgno_t pgno);
Howard Chu's avatar
Howard Chu committed
366

Howard Chu's avatar
Howard Chu committed
367
static MDB_node *mdb_search_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp,
Howard Chu's avatar
Howard Chu committed
368
			    MDB_val *key, int *exactp, unsigned int *kip);
Howard Chu's avatar
Howard Chu committed
369
static int  mdb_add_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp,
Howard Chu's avatar
Howard Chu committed
370
371
			    indx_t indx, MDB_val *key, MDB_val *data,
			    pgno_t pgno, uint8_t flags);
Howard Chu's avatar
Howard Chu committed
372
static void mdb_del_node(MDB_page *mp, indx_t indx);
Howard Chu's avatar
Howard Chu committed
373
374
static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, unsigned int ki,
    MDB_pageparent *mpp, MDB_node *leaf);
Howard Chu's avatar
Howard Chu committed
375
376
static int mdb_put0(MDB_txn *txn, MDB_dbi dbi,
    MDB_val *key, MDB_val *data, unsigned int flags);
Howard Chu's avatar
Howard Chu committed
377
static int  mdb_read_data(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
Howard Chu's avatar
Howard Chu committed
378
379
380
381

static int		 mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mp);
static int		 mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key);
static int		 mdb_move_node(MDB_txn *txn, MDB_dbi dbi, 
Howard Chu's avatar
Howard Chu committed
382
383
				MDB_pageparent *src, indx_t srcindx,
				MDB_pageparent *dst, indx_t dstindx);
Howard Chu's avatar
Howard Chu committed
384
static int		 mdb_merge(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *src,
Howard Chu's avatar
Howard Chu committed
385
			    MDB_pageparent *dst);
Howard Chu's avatar
Howard Chu committed
386
static int		 mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp,
Howard Chu's avatar
Howard Chu committed
387
388
			    unsigned int *newindxp, MDB_val *newkey,
			    MDB_val *newdata, pgno_t newpgno);
Howard Chu's avatar
Howard Chu committed
389
static MDB_dpage *mdb_new_page(MDB_txn *txn, MDB_dbi dbi, uint32_t flags, int num);
Howard Chu's avatar
Howard Chu committed
390
391
392
393
394

static void		 cursor_pop_page(MDB_cursor *cursor);
static MDB_ppage *cursor_push_page(MDB_cursor *cursor,
			    MDB_page *mp);

Howard Chu's avatar
Howard Chu committed
395
static int		 mdb_set_key(MDB_node *node, MDB_val *key);
Howard Chu's avatar
Howard Chu committed
396
397
static int		 mdb_sibling(MDB_cursor *cursor, int move_right);
static int		 mdb_cursor_next(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
398
			    MDB_val *key, MDB_val *data, MDB_cursor_op op);
Howard Chu's avatar
Howard Chu committed
399
static int		 mdb_cursor_prev(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
400
			    MDB_val *key, MDB_val *data, MDB_cursor_op op);
Howard Chu's avatar
Howard Chu committed
401
static int		 mdb_cursor_set(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
402
			    MDB_val *key, MDB_val *data, MDB_cursor_op op, int *exactp);
Howard Chu's avatar
Howard Chu committed
403
404
static int		 mdb_cursor_first(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);
405
406
static int		 mdb_cursor_last(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);
Howard Chu's avatar
Howard Chu committed
407

Howard Chu's avatar
Howard Chu committed
408
static void		mdb_xcursor_init0(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
Howard Chu's avatar
Howard Chu committed
409
static void		mdb_xcursor_init1(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx, MDB_node *node);
Howard Chu's avatar
Howard Chu committed
410
411
static void		mdb_xcursor_fini(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);

Howard Chu's avatar
Howard Chu committed
412
static size_t		 mdb_leaf_size(MDB_env *env, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
413
			    MDB_val *data);
Howard Chu's avatar
Howard Chu committed
414
static size_t		 mdb_branch_size(MDB_env *env, MDB_val *key);
Howard Chu's avatar
Howard Chu committed
415
416
417
418
419
420
421
422
423

static int		 memncmp(const void *s1, size_t n1,
				 const void *s2, size_t n2);
static int		 memnrcmp(const void *s1, size_t n1,
				  const void *s2, size_t n2);

static int
memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
424
425
426
427
428
	int diff, len_diff = -1;

	if (n1 >= n2) {
		len_diff = (n1 > n2);
		n1 = n2;
Howard Chu's avatar
Howard Chu committed
429
	}
430
431
	diff = memcmp(s1, s2, n1);
	return diff ? diff : len_diff;
Howard Chu's avatar
Howard Chu committed
432
433
434
435
436
}

static int
memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
437
	const unsigned char	*p1, *p2, *p1_lim;
Howard Chu's avatar
Howard Chu committed
438
439

	if (n2 == 0)
440
441
442
		return n1 != 0;
	if (n1 == 0)
		return -1;
Howard Chu's avatar
Howard Chu committed
443
444
445
446

	p1 = (const unsigned char *)s1 + n1 - 1;
	p2 = (const unsigned char *)s2 + n2 - 1;

447
448
449
	for (p1_lim = (n1 <= n2 ? s1 : s2);  *p1 == *p2;  p1--, p2--) {
		if (p1 == p1_lim)
			return (p1 != s1) ? (p1 != p2) : (p2 != s2) ? -1 : 0;
Howard Chu's avatar
Howard Chu committed
450
451
452
453
	}
	return *p1 - *p2;
}

Howard Chu's avatar
Howard Chu committed
454
455
456
457
458
459
460
461
462
char *
mdb_version(int *maj, int *min, int *pat)
{
	*maj = MDB_VERSION_MAJOR;
	*min = MDB_VERSION_MINOR;
	*pat = MDB_VERSION_PATCH;
	return MDB_VERSION_STRING;
}

Howard Chu's avatar
Howard Chu committed
463
int
Howard Chu's avatar
Howard Chu committed
464
mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
Howard Chu's avatar
Howard Chu committed
465
{
Howard Chu's avatar
Howard Chu committed
466
	return txn->mt_dbxs[dbi].md_cmp(a, b);
Howard Chu's avatar
Howard Chu committed
467
468
469
}

static int
Howard Chu's avatar
Howard Chu committed
470
_mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *key1, const MDB_val *key2)
Howard Chu's avatar
Howard Chu committed
471
{
Howard Chu's avatar
Howard Chu committed
472
473
474
475
476
	if (txn->mt_dbs[dbi].md_flags & (MDB_REVERSEKEY
#if __BYTE_ORDER == __LITTLE_ENDIAN
		|MDB_INTEGERKEY
#endif
	))
Howard Chu's avatar
Howard Chu committed
477
478
479
480
481
482
483
		return memnrcmp(key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size);
	else
		return memncmp((char *)key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size);
}

/* Allocate new page(s) for writing */
static MDB_dpage *
Howard Chu's avatar
Howard Chu committed
484
mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num)
Howard Chu's avatar
Howard Chu committed
485
486
{
	MDB_dpage *dp;
Howard Chu's avatar
Howard Chu committed
487
	pgno_t pgno = P_INVALID;
Howard Chu's avatar
Howard Chu committed
488
	ULONG oldest;
Howard Chu's avatar
Howard Chu committed
489
	MIDL2 mid;
Howard Chu's avatar
Howard Chu committed
490

Howard Chu's avatar
Howard Chu committed
491
492
493
	if (txn->mt_txnid > 2) {

	oldest = txn->mt_txnid - 2;
Howard Chu's avatar
Howard Chu committed
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
	if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
		/* See if there's anything in the free DB */
		MDB_pageparent mpp;
		MDB_node *leaf;
		ULONG *kptr;

		mpp.mp_parent = NULL;
		mpp.mp_pi = 0;
		mdb_search_page(txn, FREE_DBI, NULL, NULL, 0, &mpp);
		leaf = NODEPTR(mpp.mp_page, 0);
		kptr = (ULONG *)NODEKEY(leaf);

		/* It's potentially usable, unless there are still
		 * older readers outstanding. Grab it.
		 */
		if (oldest > *kptr) {
			MDB_oldpages *mop;
			MDB_val data;
			pgno_t *idl;

Howard Chu's avatar
Howard Chu committed
514
			mdb_read_data(txn, leaf, &data);
Howard Chu's avatar
Howard Chu committed
515
516
517
518
519
520
521
			idl = (ULONG *)data.mv_data;
			mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t));
			mop->mo_next = txn->mt_env->me_pghead;
			mop->mo_txnid = *kptr;
			txn->mt_env->me_pghead = mop;
			memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl));

Howard Chu's avatar
Howard Chu committed
522
523
524
525
526
527
528
529
530
531
#if DEBUG > 1
			{
				unsigned int i;
				DPRINTF("IDL read txn %lu root %lu num %lu",
					mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
				for (i=0; i<idl[0]; i++) {
					DPRINTF("IDL %lu", idl[i+1]);
				}
			}
#endif
Howard Chu's avatar
Howard Chu committed
532
533
534
535
536
537
538
539
			/* drop this IDL from the DB */
			mpp.mp_parent = NULL;
			mpp.mp_pi = 0;
			mdb_search_page(txn, FREE_DBI, NULL, NULL, 1, &mpp);
			leaf = NODEPTR(mpp.mp_page, 0);
			mdb_del0(txn, FREE_DBI, 0, &mpp, leaf);
		}
	}
Howard Chu's avatar
Howard Chu committed
540
	if (txn->mt_env->me_pghead) {
Howard Chu's avatar
Howard Chu committed
541
		unsigned int i;
Howard Chu's avatar
Howard Chu committed
542
543
		for (i=0; i<txn->mt_env->me_txns->mti_numreaders; i++) {
			ULONG mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid;
Howard Chu's avatar
Howard Chu committed
544
545
			if (!mr) continue;
			if (mr < oldest)
Howard Chu's avatar
Howard Chu committed
546
				oldest = txn->mt_env->me_txns->mti_readers[i].mr_txnid;
Howard Chu's avatar
Howard Chu committed
547
548
549
550
551
		}
		if (oldest > txn->mt_env->me_pghead->mo_txnid) {
			MDB_oldpages *mop = txn->mt_env->me_pghead;
			txn->mt_oldest = oldest;
			if (num > 1) {
552
553
554
555
				/* FIXME: For now, always use fresh pages. We
				 * really ought to search the free list for a
				 * contiguous range.
				 */
Howard Chu's avatar
Howard Chu committed
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
				;
			} else {
				/* peel pages off tail, so we only have to truncate the list */
				pgno = MDB_IDL_LAST(mop->mo_pages);
				if (MDB_IDL_IS_RANGE(mop->mo_pages)) {
					mop->mo_pages[2]++;
					if (mop->mo_pages[2] > mop->mo_pages[1])
						mop->mo_pages[0] = 0;
				} else {
					mop->mo_pages[0]--;
				}
				if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
					txn->mt_env->me_pghead = mop->mo_next;
					free(mop);
				}
			}
		}
	}
Howard Chu's avatar
Howard Chu committed
574
	}
Howard Chu's avatar
Howard Chu committed
575

Howard Chu's avatar
Howard Chu committed
576
577
578
579
580
	if (pgno == P_INVALID) {
		/* DB size is maxed out */
		if (txn->mt_next_pgno + num >= txn->mt_env->me_maxpg)
			return NULL;
	}
Howard Chu's avatar
Howard Chu committed
581
	if ((dp = malloc(txn->mt_env->me_psize * num + sizeof(MDB_dhead))) == NULL)
Howard Chu's avatar
Howard Chu committed
582
583
584
585
		return NULL;
	dp->h.md_num = num;
	dp->h.md_parent = parent;
	dp->h.md_pi = parent_idx;
Howard Chu's avatar
Howard Chu committed
586
587
588
589
590
591
	if (pgno == P_INVALID) {
		dp->p.mp_pgno = txn->mt_next_pgno;
		txn->mt_next_pgno += num;
	} else {
		dp->p.mp_pgno = pgno;
	}
Howard Chu's avatar
Howard Chu committed
592
593
594
	mid.mid = dp->p.mp_pgno;
	mid.mptr = dp;
	mdb_midl2_insert(txn->mt_u.dirty_list, &mid);
Howard Chu's avatar
Howard Chu committed
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610

	return dp;
}

/* Touch a page: make it dirty and re-insert into tree with updated pgno.
 */
static int
mdb_touch(MDB_txn *txn, MDB_pageparent *pp)
{
	MDB_page *mp = pp->mp_page;
	pgno_t	pgno;
	assert(txn != NULL);
	assert(pp != NULL);

	if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
		MDB_dpage *dp;
Howard Chu's avatar
Howard Chu committed
611
		if ((dp = mdb_alloc_page(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL)
Howard Chu's avatar
Howard Chu committed
612
			return ENOMEM;
Howard Chu's avatar
Howard Chu committed
613
		DPRINTF("touched page %lu -> %lu", mp->mp_pgno, dp->p.mp_pgno);
Howard Chu's avatar
Howard Chu committed
614
		mdb_midl_insert(txn->mt_free_pgs, mp->mp_pgno);
Howard Chu's avatar
Howard Chu committed
615
		pgno = dp->p.mp_pgno;
Howard Chu's avatar
Howard Chu committed
616
		memcpy(&dp->p, mp, txn->mt_env->me_psize);
Howard Chu's avatar
Howard Chu committed
617
618
619
620
621
622
623
624
625
626
627
628
629
		mp = &dp->p;
		mp->mp_pgno = pgno;
		mp->mp_flags |= P_DIRTY;

		/* Update the page number to new touched page. */
		if (pp->mp_parent != NULL)
			NODEPGNO(NODEPTR(pp->mp_parent, pp->mp_pi)) = mp->mp_pgno;
		pp->mp_page = mp;
	}
	return 0;
}

int
Howard Chu's avatar
Howard Chu committed
630
mdb_env_sync(MDB_env *env, int force)
Howard Chu's avatar
Howard Chu committed
631
632
{
	int rc = 0;
Howard Chu's avatar
Howard Chu committed
633
	if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
Howard Chu's avatar
Howard Chu committed
634
		if (fdatasync(env->me_fd))
Howard Chu's avatar
Howard Chu committed
635
636
637
638
639
640
641
642
643
			rc = errno;
	}
	return rc;
}

int
mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret)
{
	MDB_txn	*txn;
Howard Chu's avatar
Howard Chu committed
644
	int rc, toggle;
Howard Chu's avatar
Howard Chu committed
645

Howard Chu's avatar
Howard Chu committed
646
647
648
649
	if (env->me_flags & MDB_FATAL_ERROR) {
		DPRINTF("mdb_txn_begin: environment had fatal error, must shutdown!");
		return MDB_PANIC;
	}
Howard Chu's avatar
Howard Chu committed
650
	if ((txn = calloc(1, sizeof(MDB_txn))) == NULL) {
Howard Chu's avatar
Howard Chu committed
651
652
653
654
655
656
657
		DPRINTF("calloc: %s", strerror(errno));
		return ENOMEM;
	}

	if (rdonly) {
		txn->mt_flags |= MDB_TXN_RDONLY;
	} else {
Howard Chu's avatar
Howard Chu committed
658
659
660
661
		txn->mt_u.dirty_list = env->me_dirty_list;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_free_pgs = env->me_free_pgs;
		txn->mt_free_pgs[0] = 0;
Howard Chu's avatar
Howard Chu committed
662

Howard Chu's avatar
Howard Chu committed
663
664
		pthread_mutex_lock(&env->me_txns->mti_wmutex);
		env->me_txns->mti_txnid++;
Howard Chu's avatar
Howard Chu committed
665
	}
Howard Chu's avatar
Howard Chu committed
666

Howard Chu's avatar
Howard Chu committed
667
	txn->mt_txnid = env->me_txns->mti_txnid;
Howard Chu's avatar
Howard Chu committed
668
669
670
	if (rdonly) {
		MDB_reader *r = pthread_getspecific(env->me_txkey);
		if (!r) {
Howard Chu's avatar
Howard Chu committed
671
			unsigned int i;
Howard Chu's avatar
Howard Chu committed
672
673
674
			pthread_mutex_lock(&env->me_txns->mti_mutex);
			for (i=0; i<env->me_txns->mti_numreaders; i++)
				if (env->me_txns->mti_readers[i].mr_pid == 0)
Howard Chu's avatar
Howard Chu committed
675
676
					break;
			if (i == env->me_maxreaders) {
Howard Chu's avatar
Howard Chu committed
677
				pthread_mutex_unlock(&env->me_txns->mti_mutex);
Howard Chu's avatar
Howard Chu committed
678
679
				return ENOSPC;
			}
Howard Chu's avatar
Howard Chu committed
680
681
682
			env->me_txns->mti_readers[i].mr_pid = getpid();
			env->me_txns->mti_readers[i].mr_tid = pthread_self();
			r = &env->me_txns->mti_readers[i];
Howard Chu's avatar
Howard Chu committed
683
			pthread_setspecific(env->me_txkey, r);
Howard Chu's avatar
Howard Chu committed
684
685
686
			if (i >= env->me_txns->mti_numreaders)
				env->me_txns->mti_numreaders = i+1;
			pthread_mutex_unlock(&env->me_txns->mti_mutex);
Howard Chu's avatar
Howard Chu committed
687
		}
Howard Chu's avatar
Howard Chu committed
688
689
690
		r->mr_txnid = txn->mt_txnid;
		txn->mt_u.reader = r;
	} else {
Howard Chu's avatar
Howard Chu committed
691
692
693
694
695
		env->me_txn = txn;
	}

	txn->mt_env = env;

Howard Chu's avatar
Howard Chu committed
696
	toggle = env->me_txns->mti_me_toggle;
Howard Chu's avatar
Howard Chu committed
697
	if ((rc = mdb_env_read_meta(env, &toggle)) != MDB_SUCCESS) {
Howard Chu's avatar
Howard Chu committed
698
699
700
701
		mdb_txn_abort(txn);
		return rc;
	}

Howard Chu's avatar
Howard Chu committed
702
703
	/* Copy the DB arrays */
	txn->mt_numdbs = env->me_numdbs;
Howard Chu's avatar
Howard Chu committed
704
705
706
707
708
709
	txn->mt_dbxs = env->me_dbxs;	/* mostly static anyway */
	txn->mt_dbs = malloc(env->me_maxdbs * sizeof(MDB_db));
	memcpy(txn->mt_dbs, env->me_meta->mm_dbs, 2 * sizeof(MDB_db));
	if (txn->mt_numdbs > 2)
		memcpy(txn->mt_dbs+2, env->me_dbs[env->me_db_toggle]+2,
			(txn->mt_numdbs - 2) * sizeof(MDB_db));
Howard Chu's avatar
Howard Chu committed
710
711
712
713

	if (!rdonly) {
		if (toggle)
			txn->mt_flags |= MDB_TXN_METOGGLE;
Howard Chu's avatar
Howard Chu committed
714
		txn->mt_next_pgno = env->me_meta->mm_last_pg+1;
Howard Chu's avatar
Howard Chu committed
715
	}
Howard Chu's avatar
Howard Chu committed
716

717
	DPRINTF("begin transaction %lu on mdbenv %p, root page %lu",
Howard Chu's avatar
Howard Chu committed
718
		txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);
Howard Chu's avatar
Howard Chu committed
719
720
721
722
723
724
725
726
727
728
729
730
731
732

	*ret = txn;
	return MDB_SUCCESS;
}

void
mdb_txn_abort(MDB_txn *txn)
{
	MDB_env	*env;

	if (txn == NULL)
		return;

	env = txn->mt_env;
733
	DPRINTF("abort transaction %lu on mdbenv %p, root page %lu",
Howard Chu's avatar
Howard Chu committed
734
		txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);
Howard Chu's avatar
Howard Chu committed
735

Howard Chu's avatar
Howard Chu committed
736
	free(txn->mt_dbs);
Howard Chu's avatar
Howard Chu committed
737

Howard Chu's avatar
Howard Chu committed
738
739
740
	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		txn->mt_u.reader->mr_txnid = 0;
	} else {
Howard Chu's avatar
Howard Chu committed
741
		MDB_oldpages *mop;
742
743
		unsigned int i;

Howard Chu's avatar
Howard Chu committed
744
		/* Discard all dirty pages. */
Howard Chu's avatar
Howard Chu committed
745
746
		for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++)
			free(txn->mt_u.dirty_list[i].mptr);
Howard Chu's avatar
Howard Chu committed
747
748
749
750
751
752

		while ((mop = txn->mt_env->me_pghead)) {
			txn->mt_env->me_pghead = mop->mo_next;
			free(mop);
		}

Howard Chu's avatar
Howard Chu committed
753
		env->me_txn = NULL;
Howard Chu's avatar
Howard Chu committed
754
		env->me_txns->mti_txnid--;
755
756
		for (i=2; i<env->me_numdbs; i++)
			env->me_dbxs[i].md_dirty = 0;
Howard Chu's avatar
Howard Chu committed
757
		pthread_mutex_unlock(&env->me_txns->mti_wmutex);
Howard Chu's avatar
Howard Chu committed
758
759
760
761
762
763
764
765
766
	}

	free(txn);
}

int
mdb_txn_commit(MDB_txn *txn)
{
	int		 n, done;
Howard Chu's avatar
Howard Chu committed
767
	unsigned int i;
Howard Chu's avatar
Howard Chu committed
768
769
770
771
	ssize_t		 rc;
	off_t		 size;
	MDB_dpage	*dp;
	MDB_env	*env;
772
	pgno_t	next;
Howard Chu's avatar
Howard Chu committed
773
774
775
776
777
778
779
780
781
	struct iovec	 iov[MDB_COMMIT_PAGES];

	assert(txn != NULL);
	assert(txn->mt_env != NULL);

	env = txn->mt_env;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		mdb_txn_abort(txn);
782
		return MDB_SUCCESS;
Howard Chu's avatar
Howard Chu committed
783
784
785
786
787
788
789
790
791
792
793
794
795
796
	}

	if (txn != env->me_txn) {
		DPRINTF("attempt to commit unknown transaction");
		mdb_txn_abort(txn);
		return EINVAL;
	}

	if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
		DPRINTF("error flag is set, can't commit");
		mdb_txn_abort(txn);
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
797
	if (!txn->mt_u.dirty_list[0].mid)
Howard Chu's avatar
Howard Chu committed
798
799
		goto done;

Howard Chu's avatar
Howard Chu committed
800
	DPRINTF("committing transaction %lu on mdbenv %p, root page %lu",
Howard Chu's avatar
Howard Chu committed
801
802
	    txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);

Howard Chu's avatar
Howard Chu committed
803
804
805
806
807
808
809
810
811
812
	/* should only be one record now */
	if (env->me_pghead) {
		MDB_val key, data;
		MDB_oldpages *mop;

		mop = env->me_pghead;
		key.mv_size = sizeof(pgno_t);
		key.mv_data = (char *)&mop->mo_txnid;
		data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
		data.mv_data = mop->mo_pages;
Howard Chu's avatar
Howard Chu committed
813
		mdb_put0(txn, FREE_DBI, &key, &data, 0);
Howard Chu's avatar
Howard Chu committed
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
		free(env->me_pghead);
		env->me_pghead = NULL;
	}
	/* save to free list */
	if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
		MDB_val key, data;
		MDB_pageparent mpp;

		/* make sure last page of freeDB is touched and on freelist */
		key.mv_size = MAXKEYSIZE+1;
		key.mv_data = NULL;
		mpp.mp_parent = NULL;
		mpp.mp_pi = 0;
		mdb_search_page(txn, FREE_DBI, &key, NULL, 1, &mpp);

Howard Chu's avatar
Howard Chu committed
829
830
831
832
833
834
835
836
837
838
839
#if DEBUG > 1
		{
			unsigned int i;
			ULONG *idl = txn->mt_free_pgs;
			DPRINTF("IDL write txn %lu root %lu num %lu",
				txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
			for (i=0; i<idl[0]; i++) {
				DPRINTF("IDL %lu", idl[i+1]);
			}
		}
#endif
Howard Chu's avatar
Howard Chu committed
840
841
842
843
844
		/* write to last page of freeDB */
		key.mv_size = sizeof(pgno_t);
		key.mv_data = (char *)&txn->mt_txnid;
		data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs);
		data.mv_data = txn->mt_free_pgs;
Howard Chu's avatar
Howard Chu committed
845
		mdb_put0(txn, FREE_DBI, &key, &data, 0);
Howard Chu's avatar
Howard Chu committed
846
847
	}

Howard Chu's avatar
Howard Chu committed
848
849
850
851
852
853
854
	/* Update DB root pointers. Their pages have already been
	 * touched so this is all in-place and cannot fail.
	 */
	{
		MDB_val data;
		data.mv_size = sizeof(MDB_db);

855
856
		for (i = 2; i < txn->mt_numdbs; i++) {
			if (txn->mt_dbxs[i].md_dirty) {
Howard Chu's avatar
Howard Chu committed
857
				data.mv_data = &txn->mt_dbs[i];
Howard Chu's avatar
Howard Chu committed
858
				mdb_put0(txn, MAIN_DBI, &txn->mt_dbxs[i].md_name, &data, 0);
Howard Chu's avatar
Howard Chu committed
859
860
861
			}
		}
	}
Howard Chu's avatar
Howard Chu committed
862
863
864

	/* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
	 */
865
	next = 0;
Howard Chu's avatar
Howard Chu committed
866
867
868
	do {
		n = 0;
		done = 1;
869
		size = 0;
Howard Chu's avatar
Howard Chu committed
870
871
		for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
			dp = txn->mt_u.dirty_list[i].mptr;
872
			if (dp->p.mp_pgno != next) {
Howard Chu's avatar
Howard Chu committed
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
				if (n) {
					DPRINTF("committing %u dirty pages", n);
					rc = writev(env->me_fd, iov, n);
					if (rc != size) {
						n = errno;
						if (rc > 0)
							DPRINTF("short write, filesystem full?");
						else
							DPRINTF("writev: %s", strerror(errno));
						mdb_txn_abort(txn);
						return n;
					}
					n = 0;
					size = 0;
				}
Howard Chu's avatar
Howard Chu committed
888
				lseek(env->me_fd, dp->p.mp_pgno * env->me_psize, SEEK_SET);
889
890
				next = dp->p.mp_pgno;
			}
Howard Chu's avatar
Howard Chu committed
891
			DPRINTF("committing page %lu", dp->p.mp_pgno);
Howard Chu's avatar
Howard Chu committed
892
			iov[n].iov_len = env->me_psize * dp->h.md_num;
Howard Chu's avatar
Howard Chu committed
893
			iov[n].iov_base = &dp->p;
894
895
			size += iov[n].iov_len;
			next = dp->p.mp_pgno + dp->h.md_num;
Howard Chu's avatar
Howard Chu committed
896
897
898
899
900
901
902
903
904
905
906
907
908
			/* clear dirty flag */
			dp->p.mp_flags &= ~P_DIRTY;
			if (++n >= MDB_COMMIT_PAGES) {
				done = 0;
				break;
			}
		}

		if (n == 0)
			break;

		DPRINTF("committing %u dirty pages", n);
		rc = writev(env->me_fd, iov, n);
909
		if (rc != size) {
Howard Chu's avatar
Howard Chu committed
910
911
912
913
914
915
916
917
918
919
920
			n = errno;
			if (rc > 0)
				DPRINTF("short write, filesystem full?");
			else
				DPRINTF("writev: %s", strerror(errno));
			mdb_txn_abort(txn);
			return n;
		}

	} while (!done);

Howard Chu's avatar
Howard Chu committed
921
922
	/* Drop the dirty pages.
	 */
Howard Chu's avatar
Howard Chu committed
923
924
	for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++)
		free(txn->mt_u.dirty_list[i].mptr);
Howard Chu's avatar
Howard Chu committed
925

Howard Chu's avatar
Howard Chu committed
926
	if ((n = mdb_env_sync(env, 0)) != 0 ||
Howard Chu's avatar
Howard Chu committed
927
	    (n = mdb_env_write_meta(txn)) != MDB_SUCCESS) {
Howard Chu's avatar
Howard Chu committed
928
929
930
931
		mdb_txn_abort(txn);
		return n;
	}
	env->me_txn = NULL;
Howard Chu's avatar
Howard Chu committed
932

Howard Chu's avatar
Howard Chu committed
933
	/* update the DB tables */
Howard Chu's avatar
Howard Chu committed
934
	{
Howard Chu's avatar
Howard Chu committed
935
		int toggle = !env->me_db_toggle;
Howard Chu's avatar
Howard Chu committed
936

937
938
		for (i = 2; i < env->me_numdbs; i++) {
			if (txn->mt_dbxs[i].md_dirty) {
Howard Chu's avatar
Howard Chu committed
939
				env->me_dbs[toggle][i] = txn->mt_dbs[i];
940
941
				txn->mt_dbxs[i].md_dirty = 0;
			}
Howard Chu's avatar
Howard Chu committed
942
943
		}
		for (i = env->me_numdbs; i < txn->mt_numdbs; i++) {
944
			txn->mt_dbxs[i].md_dirty = 0;
Howard Chu's avatar
Howard Chu committed
945
946
947
948
			env->me_dbxs[i] = txn->mt_dbxs[i];
			env->me_dbs[toggle][i] = txn->mt_dbs[i];
		}
		env->me_db_toggle = toggle;
Howard Chu's avatar
Howard Chu committed
949
		env->me_numdbs = txn->mt_numdbs;
Howard Chu's avatar
Howard Chu committed
950

Howard Chu's avatar
Howard Chu committed
951
		free(txn->mt_dbs);
Howard Chu's avatar
Howard Chu committed
952
953
	}

Howard Chu's avatar
Howard Chu committed
954
	pthread_mutex_unlock(&env->me_txns->mti_wmutex);
955
	free(txn);
Howard Chu's avatar
Howard Chu committed
956
	txn = NULL;
Howard Chu's avatar
Howard Chu committed
957
958
959
960
961
962
963
964

done:
	mdb_txn_abort(txn);

	return MDB_SUCCESS;
}

static int
Howard Chu's avatar
Howard Chu committed
965
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
Howard Chu's avatar
Howard Chu committed
966
967
968
{
	char		 page[PAGESIZE];
	MDB_page	*p;
Howard Chu's avatar
Howard Chu committed
969
	MDB_meta	*m;
Howard Chu's avatar
Howard Chu committed
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
	int		 rc;

	assert(env != NULL);

	/* We don't know the page size yet, so use a minimum value.
	 */

	if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) {
		return ENOENT;
	} else if (rc != PAGESIZE) {
		if (rc > 0)
			errno = EINVAL;
		DPRINTF("read: %s", strerror(errno));
		return errno;
	}

	p = (MDB_page *)page;

Howard Chu's avatar
Howard Chu committed
988
989
	if (!F_ISSET(p->mp_flags, P_META)) {
		DPRINTF("page %lu not a meta page", p->mp_pgno);
Howard Chu's avatar
Howard Chu committed
990
991
992
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
993
994
995
	m = METADATA(p);
	if (m->mm_magic != MDB_MAGIC) {
		DPRINTF("meta has invalid magic");
Howard Chu's avatar
Howard Chu committed
996
997
998
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
999
	if (m->mm_version != MDB_VERSION) {
Howard Chu's avatar
Howard Chu committed
1000
		DPRINTF("database is version %u, expected version %u",