mdb.c 56.7 KB
Newer Older
Howard Chu's avatar
Howard Chu committed
1
2
3
4
5
6
7
8
9
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mman.h>
#ifdef HAVE_SYS_FILE_H
#include <sys/file.h>
#endif
10
11
#include <sys/ipc.h>
#include <sys/shm.h>
Howard Chu's avatar
Howard Chu committed
12
13
14
15
16
17
18
19
20
21
22
23

#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
Howard Chu's avatar
Howard Chu committed
24
#include <pthread.h>
Howard Chu's avatar
Howard Chu committed
25
26
27

#include "mdb.h"

28
29
30
#ifndef DEBUG
#define DEBUG 1
#endif
Howard Chu's avatar
Howard Chu committed
31

32
33
34
#if (DEBUG +0) && defined(__GNUC__)
# define DPRINTF(fmt, ...) \
	fprintf(stderr, "%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
Howard Chu's avatar
Howard Chu committed
35
#else
36
# define DPRINTF(...)	((void) 0)
Howard Chu's avatar
Howard Chu committed
37
38
39
40
41
42
43
44
#endif

#define PAGESIZE	 4096
#define MDB_MINKEYS	 4
#define MDB_MAGIC	 0xBEEFC0DE
#define MDB_VERSION	 1
#define MAXKEYSIZE	 255

Howard Chu's avatar
Howard Chu committed
45
#define P_INVALID	 (~0L)
Howard Chu's avatar
Howard Chu committed
46
47
48
49
50
51

#define F_ISSET(w, f)	 (((w) & (f)) == (f))

typedef ulong		 pgno_t;
typedef uint16_t	 indx_t;

Howard Chu's avatar
Howard Chu committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#define DEFAULT_READERS	126
#define DEFAULT_MAPSIZE	1048576

/* Lock descriptor stuff */
#define RXBODY	\
	ulong		mr_txnid; \
	pid_t		mr_pid; \
	pthread_t	mr_tid
typedef struct MDB_rxbody {
	RXBODY;
} MDB_rxbody;

#ifndef CACHELINE
#define CACHELINE	64	/* most CPUs. Itanium uses 128 */
#endif

typedef struct MDB_reader {
	RXBODY;
	/* cache line alignment */
	char pad[CACHELINE-sizeof(MDB_rxbody)];
} MDB_reader;

#define	TXBODY \
	uint32_t	mt_magic;	\
	uint32_t	mt_version;	\
	pthread_mutex_t	mt_mutex;	\
	ulong		mt_txnid;	\
	uint32_t	mt_numreaders
typedef struct MDB_txbody {
	TXBODY;
} MDB_txbody;

typedef struct MDB_txninfo {
	TXBODY;
	char pad[CACHELINE-sizeof(MDB_txbody)];
	pthread_mutex_t	mt_wmutex;
	char pad2[CACHELINE-sizeof(pthread_mutex_t)];
	MDB_reader	mt_readers[1];
} MDB_txninfo;

Howard Chu's avatar
Howard Chu committed
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/* Common header for all page types. Overflow pages
 * occupy a number of contiguous pages with no
 * headers on any page after the first.
 */
typedef struct MDB_page {		/* represents a page of storage */
	pgno_t		mp_pgno;		/* page number */
#define	P_BRANCH	 0x01		/* branch page */
#define	P_LEAF		 0x02		/* leaf page */
#define	P_OVERFLOW	 0x04		/* overflow page */
#define	P_META		 0x08		/* meta page */
#define	P_HEAD		 0x10		/* header page */
#define	P_DIRTY		 0x20		/* dirty page */
	uint32_t	mp_flags;
#define mp_lower	mp_pb.pb.pb_lower
#define mp_upper	mp_pb.pb.pb_upper
#define mp_pages	mp_pb.pb_pages
	union page_bounds {
		struct {
			indx_t		pb_lower;		/* lower bound of free space */
			indx_t		pb_upper;		/* upper bound of free space */
		} pb;
		uint32_t	pb_pages;	/* number of overflow pages */
	} mp_pb;
	indx_t		mp_ptrs[1];		/* dynamic size */
} MDB_page;

118
#define PAGEHDRSZ	 ((unsigned) offsetof(MDB_page, mp_ptrs))
Howard Chu's avatar
Howard Chu committed
119
120
121

#define NUMKEYS(p)	 (((p)->mp_lower - PAGEHDRSZ) >> 1)
#define SIZELEFT(p)	 (indx_t)((p)->mp_upper - (p)->mp_lower)
122
#define PAGEFILL(env, p) (1000L * ((env)->me_head.mh_psize - PAGEHDRSZ - SIZELEFT(p)) / \
Howard Chu's avatar
Howard Chu committed
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
				((env)->me_head.mh_psize - PAGEHDRSZ))
#define IS_LEAF(p)	 F_ISSET((p)->mp_flags, P_LEAF)
#define IS_BRANCH(p)	 F_ISSET((p)->mp_flags, P_BRANCH)
#define IS_OVERFLOW(p)	 F_ISSET((p)->mp_flags, P_OVERFLOW)

typedef struct MDB_head {			/* header page content */
	uint32_t	mh_magic;
	uint32_t	mh_version;
	uint32_t	mh_flags;
	uint32_t	mh_psize;			/* page size */
	void		*mh_address;		/* address for fixed mapping */
	size_t		mh_mapsize;			/* size of mmap region */
} MDB_head;

typedef struct MDB_meta {			/* meta (footer) page content */
	MDB_stat	mm_stat;
139
140
141
	pgno_t		mm_root;			/* page number of root page */
	pgno_t		mm_last_pg;			/* last used page in file */
	ulong		mm_txnid;			/* txnid that committed this page */
Howard Chu's avatar
Howard Chu committed
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
} MDB_meta;

typedef struct MDB_dhead {					/* a dirty page */
	SIMPLEQ_ENTRY(MDB_dpage)	 md_next;	/* queue of dirty pages */
	MDB_page	*md_parent;
	int			md_pi;				/* parent index */
	int			md_num;
} MDB_dhead;

typedef struct MDB_dpage {
	MDB_dhead	h;
	MDB_page	p;
} MDB_dpage;

SIMPLEQ_HEAD(dirty_queue, MDB_dpage);

typedef struct MDB_pageparent {
	MDB_page *mp_page;
	MDB_page *mp_parent;
	int		mp_pi;
} MDB_pageparent;

static MDB_dpage *mdb_newpage(MDB_txn *txn, MDB_page *parent, int parent_idx, int num);
static int 		mdb_touch(MDB_txn *txn, MDB_pageparent *mp);

typedef struct MDB_ppage {					/* ordered list of pages */
	SLIST_ENTRY(MDB_ppage)	 mp_entry;
	MDB_page		*mp_page;
	unsigned int	mp_ki;		/* cursor index on page */
} MDB_ppage;
SLIST_HEAD(page_stack, MDB_ppage);

#define CURSOR_EMPTY(c)		 SLIST_EMPTY(&(c)->mc_stack)
#define CURSOR_TOP(c)		 SLIST_FIRST(&(c)->mc_stack)
#define CURSOR_POP(c)		 SLIST_REMOVE_HEAD(&(c)->mc_stack, mp_entry)
#define CURSOR_PUSH(c,p)	 SLIST_INSERT_HEAD(&(c)->mc_stack, p, mp_entry)

struct MDB_cursor {
	MDB_db		*mc_db;
	MDB_txn		*mc_txn;
	struct page_stack	 mc_stack;		/* stack of parent pages */
	short		mc_initialized;	/* 1 if initialized */
	short		mc_eof;		/* 1 if end is reached */
};

#define METAHASHLEN	 offsetof(MDB_meta, mm_hash)
#define METADATA(p)	 ((void *)((char *)p + PAGEHDRSZ))

typedef struct MDB_node {
#define mn_pgno		 mn_p.np_pgno
#define mn_dsize	 mn_p.np_dsize
	union {
		pgno_t		 np_pgno;	/* child page number */
		uint32_t	 np_dsize;	/* leaf data size */
	} mn_p;
Howard Chu's avatar
Howard Chu committed
197
198
	unsigned int	mn_flags:4;
	unsigned int	mn_ksize:12;			/* key size */
Howard Chu's avatar
Howard Chu committed
199
200
201
202
203
204
205
206
#define F_BIGDATA	 0x01			/* data put on overflow page */
	char		mn_data[1];
} MDB_node;

struct MDB_txn {
	pgno_t		mt_root;		/* current / new root page */
	pgno_t		mt_next_pgno;	/* next unallocated page */
	pgno_t		mt_first_pgno;
Howard Chu's avatar
Howard Chu committed
207
	ulong		mt_txnid;
Howard Chu's avatar
Howard Chu committed
208
	MDB_env		*mt_env;	
Howard Chu's avatar
Howard Chu committed
209
210
211
212
	union {
		struct dirty_queue	*dirty_queue;	/* modified pages */
		MDB_reader	*reader;
	} mt_u;
Howard Chu's avatar
Howard Chu committed
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#define MDB_TXN_RDONLY		 0x01		/* read-only transaction */
#define MDB_TXN_ERROR		 0x02		/* an error has occurred */
	unsigned int		 mt_flags;
};

/* Must be same as MDB_db, minus md_root/md_stat */
typedef struct MDB_db0 {
	unsigned int	md_flags;
	MDB_cmp_func	*md_cmp;		/* user compare function */
	MDB_rel_func	*md_rel;		/* user relocate function */
	MDB_db			*md_parent;		/* parent tree */
	MDB_env 		*md_env;
} MDB_db0;

struct MDB_db {
	unsigned int	md_flags;
	MDB_cmp_func	*md_cmp;		/* user compare function */
	MDB_rel_func	*md_rel;		/* user relocate function */
	MDB_db			*md_parent;		/* parent tree */
	MDB_env 		*md_env;
	MDB_stat		md_stat;
234
	pgno_t			md_root;		/* page number of root page */
Howard Chu's avatar
Howard Chu committed
235
236
237
238
};

struct MDB_env {
	int			me_fd;
239
	key_t		me_shmkey;
Howard Chu's avatar
Howard Chu committed
240
	uint32_t	me_flags;
Howard Chu's avatar
Howard Chu committed
241
	int			me_maxreaders;
242
	int			me_metatoggle;
Howard Chu's avatar
Howard Chu committed
243
244
	char		*me_path;
	char *me_map;
Howard Chu's avatar
Howard Chu committed
245
	MDB_txninfo	*me_txns;
Howard Chu's avatar
Howard Chu committed
246
247
248
249
250
251
	MDB_head	me_head;
	MDB_db0		me_db;		/* first DB, overlaps with meta */
	MDB_meta	me_meta;
	MDB_txn	*me_txn;		/* current write transaction */
	size_t		me_mapsize;
	off_t		me_size;		/* current file size */
Howard Chu's avatar
Howard Chu committed
252
	pthread_key_t	me_txkey;	/* thread-key for readers */
Howard Chu's avatar
Howard Chu committed
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
};

#define NODESIZE	 offsetof(MDB_node, mn_data)

#define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
#define LEAFSIZE(k, d)	 (NODESIZE + (k)->mv_size + (d)->mv_size)
#define NODEPTR(p, i)	 ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
#define NODEKEY(node)	 (void *)((node)->mn_data)
#define NODEDATA(node)	 (void *)((char *)(node)->mn_data + (node)->mn_ksize)
#define NODEPGNO(node)	 ((node)->mn_pgno)
#define NODEDSZ(node)	 ((node)->mn_dsize)

#define MDB_COMMIT_PAGES	 64	/* max number of pages to write in one commit */
#define MDB_MAXCACHE_DEF	 1024	/* max number of pages to keep in cache  */

static int  mdb_search_page_root(MDB_db *db,
			    MDB_val *key,
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);
static int  mdb_search_page(MDB_db *db,
			    MDB_txn *txn, MDB_val *key,
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);

static int  mdbenv_write_header(MDB_env *env);
static int  mdbenv_read_header(MDB_env *env);
static int  mdb_check_meta_page(MDB_page *p);
280
281
static int  mdbenv_read_meta(MDB_env *env);
static int  mdbenv_write_meta(MDB_txn *txn);
Howard Chu's avatar
Howard Chu committed
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
static MDB_page *mdbenv_get_page(MDB_env *env, pgno_t pgno);

static MDB_node *mdb_search_node(MDB_db *db, MDB_page *mp,
			    MDB_val *key, int *exactp, unsigned int *kip);
static int  mdb_add_node(MDB_db *bt, MDB_page *mp,
			    indx_t indx, MDB_val *key, MDB_val *data,
			    pgno_t pgno, uint8_t flags);
static void mdb_del_node(MDB_db *bt, MDB_page *mp,
			    indx_t indx);
static int  mdb_read_data(MDB_db *bt, MDB_page *mp,
			    MDB_node *leaf, MDB_val *data);

static int		 mdb_rebalance(MDB_db *bt, MDB_pageparent *mp);
static int		 mdb_update_key(MDB_db *bt, MDB_page *mp,
			    indx_t indx, MDB_val *key);
static int		 mdb_move_node(MDB_db *bt, 
				MDB_pageparent *src, indx_t srcindx,
				MDB_pageparent *dst, indx_t dstindx);
static int		 mdb_merge(MDB_db *bt, MDB_pageparent *src,
			    MDB_pageparent *dst);
static int		 mdb_split(MDB_db *bt, MDB_page **mpp,
			    unsigned int *newindxp, MDB_val *newkey,
			    MDB_val *newdata, pgno_t newpgno);
static MDB_dpage *mdbenv_new_page(MDB_env *env, uint32_t flags, int num);

static void		 cursor_pop_page(MDB_cursor *cursor);
static MDB_ppage *cursor_push_page(MDB_cursor *cursor,
			    MDB_page *mp);

static int		 mdb_set_key(MDB_db *bt, MDB_page *mp,
			    MDB_node *node, MDB_val *key);
static int		 mdb_sibling(MDB_cursor *cursor, int move_right);
static int		 mdb_cursor_next(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);
static int		 mdb_cursor_set(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data, int *exactp);
static int		 mdb_cursor_first(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);

static size_t		 mdb_leaf_size(MDB_db *bt, MDB_val *key,
			    MDB_val *data);
static size_t		 mdb_branch_size(MDB_db *bt, MDB_val *key);

static pgno_t		 mdbenv_compact_tree(MDB_env *env, pgno_t pgno,
			    MDB_env *envc);

static int		 memncmp(const void *s1, size_t n1,
				 const void *s2, size_t n2);
static int		 memnrcmp(const void *s1, size_t n1,
				  const void *s2, size_t n2);

static int
memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
336
337
338
339
340
	int diff, len_diff = -1;

	if (n1 >= n2) {
		len_diff = (n1 > n2);
		n1 = n2;
Howard Chu's avatar
Howard Chu committed
341
	}
342
343
	diff = memcmp(s1, s2, n1);
	return diff ? diff : len_diff;
Howard Chu's avatar
Howard Chu committed
344
345
346
347
348
}

static int
memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
349
	const unsigned char	*p1, *p2, *p1_lim;
Howard Chu's avatar
Howard Chu committed
350
351

	if (n2 == 0)
352
353
354
		return n1 != 0;
	if (n1 == 0)
		return -1;
Howard Chu's avatar
Howard Chu committed
355
356
357
358

	p1 = (const unsigned char *)s1 + n1 - 1;
	p2 = (const unsigned char *)s2 + n2 - 1;

359
360
361
	for (p1_lim = (n1 <= n2 ? s1 : s2);  *p1 == *p2;  p1--, p2--) {
		if (p1 == p1_lim)
			return (p1 != s1) ? (p1 != p2) : (p2 != s2) ? -1 : 0;
Howard Chu's avatar
Howard Chu committed
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
	}
	return *p1 - *p2;
}

int
mdb_cmp(MDB_db *db, const MDB_val *a, const MDB_val *b)
{
	return db->md_cmp(a, b);
}

static int
_mdb_cmp(MDB_db *db, const MDB_val *key1, const MDB_val *key2)
{
	if (F_ISSET(db->md_flags, MDB_REVERSEKEY))
		return memnrcmp(key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size);
	else
		return memncmp((char *)key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size);
}

/* Allocate new page(s) for writing */
static MDB_dpage *
mdb_newpage(MDB_txn *txn, MDB_page *parent, int parent_idx, int num)
{
	MDB_dpage *dp;

	if ((dp = malloc(txn->mt_env->me_head.mh_psize * num + sizeof(MDB_dhead))) == NULL)
		return NULL;
	dp->h.md_num = num;
	dp->h.md_parent = parent;
	dp->h.md_pi = parent_idx;
Howard Chu's avatar
Howard Chu committed
392
	SIMPLEQ_INSERT_TAIL(txn->mt_u.dirty_queue, dp, h.md_next);
Howard Chu's avatar
Howard Chu committed
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
	dp->p.mp_pgno = txn->mt_next_pgno;
	txn->mt_next_pgno += num;

	return dp;
}

/* Touch a page: make it dirty and re-insert into tree with updated pgno.
 */
static int
mdb_touch(MDB_txn *txn, MDB_pageparent *pp)
{
	int rc;
	MDB_page *mp = pp->mp_page;
	pgno_t	pgno;
	assert(txn != NULL);
	assert(pp != NULL);

	if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
		MDB_dpage *dp;
		DPRINTF("touching page %lu -> %lu", mp->mp_pgno, txn->mt_next_pgno);
		if ((dp = mdb_newpage(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL)
			return ENOMEM;
		pgno = dp->p.mp_pgno;
		bcopy(mp, &dp->p, txn->mt_env->me_head.mh_psize);
		mp = &dp->p;
		mp->mp_pgno = pgno;
		mp->mp_flags |= P_DIRTY;

		/* Update the page number to new touched page. */
		if (pp->mp_parent != NULL)
			NODEPGNO(NODEPTR(pp->mp_parent, pp->mp_pi)) = mp->mp_pgno;
		pp->mp_page = mp;
	}
	return 0;
}

int
mdbenv_sync(MDB_env *env)
{
	int rc = 0;
	if (!F_ISSET(env->me_flags, MDB_NOSYNC)) {
		if (fsync(env->me_fd))
			rc = errno;
	}
	return rc;
}

int
mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret)
{
	MDB_txn	*txn;
	int rc;

	if ((txn = calloc(1, sizeof(*txn))) == NULL) {
		DPRINTF("calloc: %s", strerror(errno));
		return ENOMEM;
	}

	if (rdonly) {
		txn->mt_flags |= MDB_TXN_RDONLY;
	} else {
Howard Chu's avatar
Howard Chu committed
454
455
		txn->mt_u.dirty_queue = calloc(1, sizeof(*txn->mt_u.dirty_queue));
		if (txn->mt_u.dirty_queue == NULL) {
Howard Chu's avatar
Howard Chu committed
456
457
458
			free(txn);
			return ENOMEM;
		}
Howard Chu's avatar
Howard Chu committed
459
		SIMPLEQ_INIT(txn->mt_u.dirty_queue);
Howard Chu's avatar
Howard Chu committed
460

Howard Chu's avatar
Howard Chu committed
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
		pthread_mutex_lock(&env->me_txns->mt_wmutex);
		env->me_txns->mt_txnid++;
	}
	txn->mt_txnid = env->me_txns->mt_txnid;
	if (rdonly) {
		MDB_reader *r = pthread_getspecific(env->me_txkey);
		if (!r) {
			int i;
			pthread_mutex_lock(&env->me_txns->mt_mutex);
			for (i=0; i<env->me_maxreaders; i++) {
				if (env->me_txns->mt_readers[i].mr_pid == 0) {
					env->me_txns->mt_readers[i].mr_pid = getpid();
					env->me_txns->mt_readers[i].mr_tid = pthread_self();
					pthread_setspecific(env->me_txkey, &env->me_txns->mt_readers[i]);
					if (i >= env->me_txns->mt_numreaders)
						env->me_txns->mt_numreaders = i+1;
					break;
				}
			}
			pthread_mutex_unlock(&env->me_txns->mt_mutex);
			if (i == env->me_maxreaders) {
				return ENOSPC;
			}
Howard Chu's avatar
Howard Chu committed
484
		}
Howard Chu's avatar
Howard Chu committed
485
486
487
		r->mr_txnid = txn->mt_txnid;
		txn->mt_u.reader = r;
	} else {
Howard Chu's avatar
Howard Chu committed
488
489
490
491
492
		env->me_txn = txn;
	}

	txn->mt_env = env;

493
	if ((rc = mdbenv_read_meta(env)) != MDB_SUCCESS) {
Howard Chu's avatar
Howard Chu committed
494
495
496
497
		mdb_txn_abort(txn);
		return rc;
	}

498
	txn->mt_next_pgno = env->me_meta.mm_last_pg+1;
Howard Chu's avatar
Howard Chu committed
499
500
	txn->mt_first_pgno = txn->mt_next_pgno;
	txn->mt_root = env->me_meta.mm_root;
Howard Chu's avatar
Howard Chu committed
501
	DPRINTF("begin transaction %lu on mdbenv %p, root page %lu", txn->mt_txnid, env, txn->mt_root);
Howard Chu's avatar
Howard Chu committed
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516

	*ret = txn;
	return MDB_SUCCESS;
}

void
mdb_txn_abort(MDB_txn *txn)
{
	MDB_dpage *dp;
	MDB_env	*env;

	if (txn == NULL)
		return;

	env = txn->mt_env;
Howard Chu's avatar
Howard Chu committed
517
	DPRINTF("abort transaction %lu on mdbenv %p, root page %lu", txn->mt_txnid, env, txn->mt_root);
Howard Chu's avatar
Howard Chu committed
518

Howard Chu's avatar
Howard Chu committed
519
520
521
	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		txn->mt_u.reader->mr_txnid = 0;
	} else {
Howard Chu's avatar
Howard Chu committed
522
523
		/* Discard all dirty pages.
		 */
Howard Chu's avatar
Howard Chu committed
524
525
526
		while (!SIMPLEQ_EMPTY(txn->mt_u.dirty_queue)) {
			dp = SIMPLEQ_FIRST(txn->mt_u.dirty_queue);
			SIMPLEQ_REMOVE_HEAD(txn->mt_u.dirty_queue, h.md_next);
Howard Chu's avatar
Howard Chu committed
527
528
529
530
531
532
533
534
535
536
537
			free(dp);
		}

#if 0
		DPRINTF("releasing write lock on txn %p", txn);
		txn->bt->txn = NULL;
		if (flock(txn->bt->fd, LOCK_UN) != 0) {
			DPRINTF("failed to unlock fd %d: %s",
			    txn->bt->fd, strerror(errno));
		}
#endif
Howard Chu's avatar
Howard Chu committed
538
539
540
		free(txn->mt_u.dirty_queue);
		env->me_txn = NULL;
		pthread_mutex_unlock(&env->me_txns->mt_wmutex);
Howard Chu's avatar
Howard Chu committed
541
542
543
544
545
546
547
548
549
550
551
552
553
	}

	free(txn);
}

int
mdb_txn_commit(MDB_txn *txn)
{
	int		 n, done;
	ssize_t		 rc;
	off_t		 size;
	MDB_dpage	*dp;
	MDB_env	*env;
554
	pgno_t	next;
Howard Chu's avatar
Howard Chu committed
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
	struct iovec	 iov[MDB_COMMIT_PAGES];

	assert(txn != NULL);
	assert(txn->mt_env != NULL);

	env = txn->mt_env;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		DPRINTF("attempt to commit read-only transaction");
		mdb_txn_abort(txn);
		return EPERM;
	}

	if (txn != env->me_txn) {
		DPRINTF("attempt to commit unknown transaction");
		mdb_txn_abort(txn);
		return EINVAL;
	}

	if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
		DPRINTF("error flag is set, can't commit");
		mdb_txn_abort(txn);
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
580
	if (SIMPLEQ_EMPTY(txn->mt_u.dirty_queue))
Howard Chu's avatar
Howard Chu committed
581
582
		goto done;

Howard Chu's avatar
Howard Chu committed
583
584
	DPRINTF("committing transaction %lu on mdbenv %p, root page %lu",
	    txn->mt_txnid, env, txn->mt_root);
Howard Chu's avatar
Howard Chu committed
585
586
587

	/* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
	 */
588
	next = 0;
Howard Chu's avatar
Howard Chu committed
589
590
591
	do {
		n = 0;
		done = 1;
592
		size = 0;
Howard Chu's avatar
Howard Chu committed
593
		SIMPLEQ_FOREACH(dp, txn->mt_u.dirty_queue, h.md_next) {
594
595
596
597
598
599
			if (dp->p.mp_pgno != next) {
				lseek(env->me_fd, dp->p.mp_pgno * env->me_head.mh_psize, SEEK_SET);
				next = dp->p.mp_pgno;
				if (n)
					break;
			}
Howard Chu's avatar
Howard Chu committed
600
601
602
			DPRINTF("committing page %lu", dp->p.mp_pgno);
			iov[n].iov_len = env->me_head.mh_psize * dp->h.md_num;
			iov[n].iov_base = &dp->p;
603
604
			size += iov[n].iov_len;
			next = dp->p.mp_pgno + dp->h.md_num;
Howard Chu's avatar
Howard Chu committed
605
606
607
608
609
610
611
612
613
614
615
616
617
			/* clear dirty flag */
			dp->p.mp_flags &= ~P_DIRTY;
			if (++n >= MDB_COMMIT_PAGES) {
				done = 0;
				break;
			}
		}

		if (n == 0)
			break;

		DPRINTF("committing %u dirty pages", n);
		rc = writev(env->me_fd, iov, n);
618
		if (rc != size) {
Howard Chu's avatar
Howard Chu committed
619
620
621
622
623
624
625
626
627
628
629
			n = errno;
			if (rc > 0)
				DPRINTF("short write, filesystem full?");
			else
				DPRINTF("writev: %s", strerror(errno));
			mdb_txn_abort(txn);
			return n;
		}

		/* Drop the dirty pages.
		 */
Howard Chu's avatar
Howard Chu committed
630
631
632
		while (!SIMPLEQ_EMPTY(txn->mt_u.dirty_queue)) {
			dp = SIMPLEQ_FIRST(txn->mt_u.dirty_queue);
			SIMPLEQ_REMOVE_HEAD(txn->mt_u.dirty_queue, h.md_next);
Howard Chu's avatar
Howard Chu committed
633
634
635
636
637
638
639
			free(dp);
			if (--n == 0)
				break;
		}
	} while (!done);

	if ((n = mdbenv_sync(env)) != 0 ||
640
	    (n = mdbenv_write_meta(txn)) != MDB_SUCCESS ||
Howard Chu's avatar
Howard Chu committed
641
642
643
644
645
	    (n = mdbenv_sync(env)) != 0) {
		mdb_txn_abort(txn);
		return n;
	}
	env->me_txn = NULL;
Howard Chu's avatar
Howard Chu committed
646
	pthread_mutex_unlock(&env->me_txns->mt_wmutex);
647
648
	free(txn->mt_u.dirty_queue);
	free(txn);
Howard Chu's avatar
Howard Chu committed
649
	txn = NULL;
Howard Chu's avatar
Howard Chu committed
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738

done:
	mdb_txn_abort(txn);

	return MDB_SUCCESS;
}

static int
mdbenv_write_header(MDB_env *env)
{
	struct stat	 sb;
	MDB_head	*h;
	MDB_page	*p;
	ssize_t		 rc;
	unsigned int	 psize;

	DPRINTF("writing header page");
	assert(env != NULL);

	psize = sysconf(_SC_PAGE_SIZE);

	if ((p = calloc(1, psize)) == NULL)
		return ENOMEM;
	p->mp_flags = P_HEAD;

	env->me_head.mh_psize = psize;
	env->me_head.mh_flags = env->me_flags & 0xffff;

	h = METADATA(p);
	bcopy(&env->me_head, h, sizeof(*h));

	rc = write(env->me_fd, p, env->me_head.mh_psize);
	free(p);
	if (rc != (ssize_t)env->me_head.mh_psize) {
		int err = errno;
		if (rc > 0)
			DPRINTF("short write, filesystem full?");
		return err;
	}

	return MDB_SUCCESS;
}

static int
mdbenv_read_header(MDB_env *env)
{
	char		 page[PAGESIZE];
	MDB_page	*p;
	MDB_head	*h;
	int		 rc;

	assert(env != NULL);

	/* We don't know the page size yet, so use a minimum value.
	 */

	if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) {
		return ENOENT;
	} else if (rc != PAGESIZE) {
		if (rc > 0)
			errno = EINVAL;
		DPRINTF("read: %s", strerror(errno));
		return errno;
	}

	p = (MDB_page *)page;

	if (!F_ISSET(p->mp_flags, P_HEAD)) {
		DPRINTF("page %lu not a header page", p->mp_pgno);
		return EINVAL;
	}

	h = METADATA(p);
	if (h->mh_magic != MDB_MAGIC) {
		DPRINTF("header has invalid magic");
		return EINVAL;
	}

	if (h->mh_version != MDB_VERSION) {
		DPRINTF("database is version %u, expected version %u",
		    h->mh_version, MDB_VERSION);
		return EINVAL;
	}

	bcopy(h, &env->me_head, sizeof(*h));
	return 0;
}

static int
739
mdbenv_init_meta(MDB_env *env)
Howard Chu's avatar
Howard Chu committed
740
{
741
742
743
	MDB_page *p, *q;
	MDB_meta *meta;
	int rc;
Howard Chu's avatar
Howard Chu committed
744

745
746
747
	p = calloc(2, env->me_head.mh_psize);
	p->mp_pgno = 1;
	p->mp_flags = P_META;
Howard Chu's avatar
Howard Chu committed
748

749
750
751
	meta = METADATA(p);
	meta->mm_root = P_INVALID;
	meta->mm_last_pg = 2;
Howard Chu's avatar
Howard Chu committed
752

753
	q = (MDB_page *)((char *)p + env->me_head.mh_psize);
Howard Chu's avatar
Howard Chu committed
754

755
756
	q->mp_pgno = 2;
	q->mp_flags = P_META;
Howard Chu's avatar
Howard Chu committed
757

758
759
760
	meta = METADATA(q);
	meta->mm_root = P_INVALID;
	meta->mm_last_pg = 2;
Howard Chu's avatar
Howard Chu committed
761

762
763
764
765
766
767
768
769
770
771
772
773
	rc = write(env->me_fd, p, env->me_head.mh_psize * 2);
	free(p);
	return (rc == env->me_head.mh_psize * 2) ? MDB_SUCCESS : errno;
}

static int
mdbenv_write_meta(MDB_txn *txn)
{
	MDB_env *env;
	MDB_meta	meta;
	off_t off;
	int rc;
Howard Chu's avatar
Howard Chu committed
774

775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
	assert(txn != NULL);
	assert(txn->mt_env != NULL);

	DPRINTF("writing meta page for root page %lu", txn->mt_root);

	env = txn->mt_env;

	bcopy(&env->me_meta, &meta, sizeof(meta));
	meta.mm_root = txn->mt_root;
	meta.mm_last_pg = txn->mt_next_pgno - 1;
	meta.mm_txnid = txn->mt_txnid;

	off = env->me_head.mh_psize;
	if (!env->me_metatoggle)
		off *= 2;
	off += PAGEHDRSZ;

	lseek(env->me_fd, off, SEEK_SET);
	rc = write(env->me_fd, &meta, sizeof(meta));
	if (rc != sizeof(meta)) {
		DPRINTF("write failed, disk error?");
		return errno;
Howard Chu's avatar
Howard Chu committed
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
	}

	return MDB_SUCCESS;
}

/* Returns true if page p is a valid meta page, false otherwise.
 */
static int
mdb_check_meta_page(MDB_page *p)
{
	if (!F_ISSET(p->mp_flags, P_META)) {
		DPRINTF("page %lu not a meta page", p->mp_pgno);
		return EINVAL;
	}

	return 0;
}

static int
816
mdbenv_read_meta(MDB_env *env)
Howard Chu's avatar
Howard Chu committed
817
{
818
819
820
	MDB_page	*mp0, *mp1;
	MDB_meta	*meta[2];
	int toggle = 0, rc;
Howard Chu's avatar
Howard Chu committed
821
822
823

	assert(env != NULL);

824
825
826
	if ((mp0 = mdbenv_get_page(env, 1)) == NULL ||
		(mp1 = mdbenv_get_page(env, 2)) == NULL)
		return EIO;
Howard Chu's avatar
Howard Chu committed
827

828
829
	rc = mdb_check_meta_page(mp0);
	if (rc) return rc;
Howard Chu's avatar
Howard Chu committed
830

831
832
	rc = mdb_check_meta_page(mp1);
	if (rc) return rc;
Howard Chu's avatar
Howard Chu committed
833

834
835
	meta[0] = METADATA(mp0);
	meta[1] = METADATA(mp1);
Howard Chu's avatar
Howard Chu committed
836

837
838
	if (meta[0]->mm_txnid < meta[1]->mm_txnid)
		toggle = 1;
Howard Chu's avatar
Howard Chu committed
839

Howard Chu's avatar
Howard Chu committed
840
841
842
843
	if (meta[toggle]->mm_txnid > env->me_meta.mm_txnid) {
		bcopy(meta[toggle], &env->me_meta, sizeof(env->me_meta));
		env->me_metatoggle = toggle;
	}
Howard Chu's avatar
Howard Chu committed
844

845
	DPRINTF("Using meta page %d", toggle);
Howard Chu's avatar
Howard Chu committed
846

847
	return MDB_SUCCESS;
Howard Chu's avatar
Howard Chu committed
848
849
850
}

int
851
mdbenv_create(MDB_env **env)
Howard Chu's avatar
Howard Chu committed
852
853
854
855
856
857
858
859
{
	MDB_env *e;

	e = calloc(1, sizeof(*e));
	if (!e) return ENOMEM;

	e->me_head.mh_magic = MDB_MAGIC;
	e->me_head.mh_version = MDB_VERSION;
Howard Chu's avatar
Howard Chu committed
860
861
	e->me_head.mh_mapsize = DEFAULT_MAPSIZE;
	e->me_maxreaders = DEFAULT_READERS;
Howard Chu's avatar
Howard Chu committed
862
863
864
865
866
	e->me_db.md_env = e;
	*env = e;
	return MDB_SUCCESS;
}

Howard Chu's avatar
Howard Chu committed
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
int
mdbenv_set_mapsize(MDB_env *env, size_t size)
{
	if (env->me_map)
		return EINVAL;
	env->me_mapsize = env->me_head.mh_mapsize = size;
	return MDB_SUCCESS;
}

int
mdbenv_set_maxreaders(MDB_env *env, int readers)
{
	env->me_maxreaders = readers;
	return MDB_SUCCESS;
}

int
mdbenv_get_maxreaders(MDB_env *env, int *readers)
{
	if (!env || !readers)
		return EINVAL;
	*readers = env->me_maxreaders;
	return MDB_SUCCESS;
}

Howard Chu's avatar
Howard Chu committed
892
893
894
895
896
897
898
int
mdbenv_open2(MDB_env *env, unsigned int flags)
{
	int i, newenv = 0;

	env->me_flags = flags;
	env->me_meta.mm_root = P_INVALID;
Howard Chu's avatar
Howard Chu committed
899
	env->me_meta.mm_last_pg = 2;
Howard Chu's avatar
Howard Chu committed
900
901
902
903
904
905
906
907

	if ((i = mdbenv_read_header(env)) != 0) {
		if (i != ENOENT)
			return i;
		DPRINTF("new mdbenv");
		newenv = 1;
	}

908
909
910
	if (!env->me_mapsize)
		env->me_mapsize = env->me_head.mh_mapsize;

Howard Chu's avatar
Howard Chu committed
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
	i = MAP_SHARED;
	if (env->me_head.mh_address && (flags & MDB_FIXEDMAP))
		i |= MAP_FIXED;
	env->me_map = mmap(env->me_head.mh_address, env->me_mapsize, PROT_READ, i,
		env->me_fd, 0);
	if (env->me_map == MAP_FAILED)
		return errno;

	if (newenv) {
		env->me_head.mh_mapsize = env->me_mapsize;
		if (flags & MDB_FIXEDMAP)
			env->me_head.mh_address = env->me_map;
		i = mdbenv_write_header(env);
		if (i != MDB_SUCCESS) {
			munmap(env->me_map, env->me_mapsize);
			return i;
		}
928
929
930
931
932
		i = mdbenv_init_meta(env);
		if (i != MDB_SUCCESS) {
			munmap(env->me_map, env->me_mapsize);
			return i;
		}
Howard Chu's avatar
Howard Chu committed
933
934
	}

935
	if ((i = mdbenv_read_meta(env)) != 0)
Howard Chu's avatar
Howard Chu committed
936
937
938
939
940
941
942
943
944
945
946
947
948
949
		return i;

	DPRINTF("opened database version %u, pagesize %u",
	    env->me_head.mh_version, env->me_head.mh_psize);
	DPRINTF("depth: %u", env->me_meta.mm_stat.ms_depth);
	DPRINTF("entries: %lu", env->me_meta.mm_stat.ms_entries);
	DPRINTF("branch pages: %lu", env->me_meta.mm_stat.ms_branch_pages);
	DPRINTF("leaf pages: %lu", env->me_meta.mm_stat.ms_leaf_pages);
	DPRINTF("overflow pages: %lu", env->me_meta.mm_stat.ms_overflow_pages);
	DPRINTF("root: %lu", env->me_meta.mm_root);

	return MDB_SUCCESS;
}

Howard Chu's avatar
Howard Chu committed
950
951
952
953
954
955
956
957
958
959
static void
mdbenv_reader_dest(void *ptr)
{
	MDB_reader *reader = ptr;

	reader->mr_txnid = 0;
	reader->mr_pid = 0;
	reader->mr_tid = 0;
}

Howard Chu's avatar
Howard Chu committed
960
961
962
int
mdbenv_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
{
963
964
965
966
967
968
969
	int		oflags, rc, shmid;
	off_t	size;


	if (F_ISSET(flags, MDB_RDONLY))
		oflags = O_RDONLY;
	else
970
		oflags = O_RDWR | O_CREAT;
971
972

	if ((env->me_fd = open(path, oflags, mode)) == -1)
Howard Chu's avatar
Howard Chu committed
973
974
		return errno;

975
976
977
978
979
980
981
982
983
	env->me_shmkey = ftok(path, 'm');
	size = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
	shmid = shmget(env->me_shmkey, size, IPC_CREAT|IPC_EXCL|mode);
	if (shmid == -1) {
		if (errno == EEXIST) {
			shmid = shmget(env->me_shmkey, size, IPC_CREAT|mode);
			if (shmid == -1)
				return errno;
			env->me_txns = shmat(shmid, NULL, 0);
984
985
986
987
988
989
990
			if (env->me_txns->mt_magic != MDB_MAGIC ||
				env->me_txns->mt_version != MDB_VERSION) {
					DPRINTF("invalid lock region %d", shmid);
					shmdt(env->me_txns);
					env->me_txns = NULL;
					return EIO;
				}
991
992
		} else {
			return errno;
Howard Chu's avatar
Howard Chu committed
993
994
995
996
		}
	} else {
		pthread_mutexattr_t mattr;

997
		env->me_txns = shmat(shmid, NULL, 0);
Howard Chu's avatar
Howard Chu committed
998
999
1000
		pthread_mutexattr_init(&mattr);
		pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
		pthread_mutex_init(&env->me_txns->mt_mutex, &mattr);