mdb.c 80.2 KB
Newer Older
Howard Chu's avatar
Howard Chu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* mdb.c - memory-mapped database library */
/*
 * Copyright 2011 Howard Chu, Symas Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 *
 * This code is derived from btree.c written by Martin Hedenfalk.
 *
 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
Howard Chu's avatar
Howard Chu committed
30
31
32
33
34
35
36
37
38
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/queue.h>
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mman.h>
#ifdef HAVE_SYS_FILE_H
#include <sys/file.h>
#endif
Howard Chu's avatar
Howard Chu committed
39
#include <fcntl.h>
Howard Chu's avatar
Howard Chu committed
40
41
42
43
44
45
46
47
48
49

#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
Howard Chu's avatar
Howard Chu committed
50
#include <pthread.h>
Howard Chu's avatar
Howard Chu committed
51
52

#include "mdb.h"
Howard Chu's avatar
Howard Chu committed
53

Howard Chu's avatar
Howard Chu committed
54
55
#define ULONG		unsigned long
typedef ULONG		pgno_t;
Howard Chu's avatar
Howard Chu committed
56

Howard Chu's avatar
Howard Chu committed
57
#include "midl.h"
Howard Chu's avatar
Howard Chu committed
58

59
60
61
#ifndef DEBUG
#define DEBUG 1
#endif
Howard Chu's avatar
Howard Chu committed
62

Howard Chu's avatar
Howard Chu committed
63
#if DEBUG && defined(__GNUC__)
64
65
# define DPRINTF(fmt, ...) \
	fprintf(stderr, "%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
Howard Chu's avatar
Howard Chu committed
66
#else
67
# define DPRINTF(...)	((void) 0)
Howard Chu's avatar
Howard Chu committed
68
69
70
71
72
73
#endif

#define PAGESIZE	 4096
#define MDB_MINKEYS	 4
#define MDB_MAGIC	 0xBEEFC0DE
#define MDB_VERSION	 1
Howard Chu's avatar
Howard Chu committed
74
#define MAXKEYSIZE	 511
Howard Chu's avatar
Howard Chu committed
75

Hallvard Furuseth's avatar
Hallvard Furuseth committed
76
#define P_INVALID	 (~0UL)
Howard Chu's avatar
Howard Chu committed
77
78
79
80
81

#define F_ISSET(w, f)	 (((w) & (f)) == (f))

typedef uint16_t	 indx_t;

Howard Chu's avatar
Howard Chu committed
82
83
84
85
86
#define DEFAULT_READERS	126
#define DEFAULT_MAPSIZE	1048576

/* Lock descriptor stuff */
#define RXBODY	\
Howard Chu's avatar
Howard Chu committed
87
	ULONG		mr_txnid; \
Howard Chu's avatar
Howard Chu committed
88
89
90
91
92
93
94
	pid_t		mr_pid; \
	pthread_t	mr_tid
typedef struct MDB_rxbody {
	RXBODY;
} MDB_rxbody;

#ifndef CACHELINE
Howard Chu's avatar
Howard Chu committed
95
96
97
98
99
# ifdef __APPLE__
#  define CACHELINE	128	/* 64 is too small to contain a mutex */
# else
#  define CACHELINE	64	/* most CPUs. Itanium uses 128 */
# endif
Howard Chu's avatar
Howard Chu committed
100
101
102
103
104
105
106
107
108
109
110
111
#endif

typedef struct MDB_reader {
	RXBODY;
	/* cache line alignment */
	char pad[CACHELINE-sizeof(MDB_rxbody)];
} MDB_reader;

#define	TXBODY \
	uint32_t	mt_magic;	\
	uint32_t	mt_version;	\
	pthread_mutex_t	mt_mutex;	\
Howard Chu's avatar
Howard Chu committed
112
	ULONG		mt_txnid;	\
Howard Chu's avatar
Howard Chu committed
113
114
115
116
117
118
119
120
121
122
123
124
125
	uint32_t	mt_numreaders
typedef struct MDB_txbody {
	TXBODY;
} MDB_txbody;

typedef struct MDB_txninfo {
	TXBODY;
	char pad[CACHELINE-sizeof(MDB_txbody)];
	pthread_mutex_t	mt_wmutex;
	char pad2[CACHELINE-sizeof(pthread_mutex_t)];
	MDB_reader	mt_readers[1];
} MDB_txninfo;

Howard Chu's avatar
Howard Chu committed
126
127
128
129
130
/* Common header for all page types. Overflow pages
 * occupy a number of contiguous pages with no
 * headers on any page after the first.
 */
typedef struct MDB_page {		/* represents a page of storage */
Howard Chu's avatar
Howard Chu committed
131
132
133
134
135
#define	mp_pgno		mp_p.p_pgno
	union padded {
		pgno_t		p_pgno;		/* page number */
		void *		p_pad;
	} mp_p;
Howard Chu's avatar
Howard Chu committed
136
137
138
139
#define	P_BRANCH	 0x01		/* branch page */
#define	P_LEAF		 0x02		/* leaf page */
#define	P_OVERFLOW	 0x04		/* overflow page */
#define	P_META		 0x08		/* meta page */
Howard Chu's avatar
Howard Chu committed
140
#define	P_DIRTY		 0x10		/* dirty page */
Howard Chu's avatar
Howard Chu committed
141
142
143
144
145
146
147
148
149
150
151
152
153
154
	uint32_t	mp_flags;
#define mp_lower	mp_pb.pb.pb_lower
#define mp_upper	mp_pb.pb.pb_upper
#define mp_pages	mp_pb.pb_pages
	union page_bounds {
		struct {
			indx_t		pb_lower;		/* lower bound of free space */
			indx_t		pb_upper;		/* upper bound of free space */
		} pb;
		uint32_t	pb_pages;	/* number of overflow pages */
	} mp_pb;
	indx_t		mp_ptrs[1];		/* dynamic size */
} MDB_page;

155
#define PAGEHDRSZ	 ((unsigned) offsetof(MDB_page, mp_ptrs))
Howard Chu's avatar
Howard Chu committed
156
157
158

#define NUMKEYS(p)	 (((p)->mp_lower - PAGEHDRSZ) >> 1)
#define SIZELEFT(p)	 (indx_t)((p)->mp_upper - (p)->mp_lower)
Howard Chu's avatar
Howard Chu committed
159
160
#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
				((env)->me_psize - PAGEHDRSZ))
Howard Chu's avatar
Howard Chu committed
161
162
163
164
#define IS_LEAF(p)	 F_ISSET((p)->mp_flags, P_LEAF)
#define IS_BRANCH(p)	 F_ISSET((p)->mp_flags, P_BRANCH)
#define IS_OVERFLOW(p)	 F_ISSET((p)->mp_flags, P_OVERFLOW)

Howard Chu's avatar
Howard Chu committed
165
166
/* Number of psize-byte pages needed to hold `size` bytes of data plus one
 * page header (PAGEHDRSZ), rounded up to a whole page.
 * Every argument and the whole expansion are parenthesized, and there is
 * no trailing semicolon, so the macro is usable inside larger expressions.
 */
#define OVPAGES(size, psize)	((PAGEHDRSZ + (size) + (psize) - 1) / (psize))

Howard Chu's avatar
Howard Chu committed
167
168
169
170
171
172
173
174
175
176
177
178
179
180
/* Per-database record. Two of these (free DB and main DB) are embedded in
 * MDB_meta as mm_dbs[], and each transaction works on its own copy in
 * mt_dbs[].
 */
typedef struct MDB_db {
	uint32_t	md_pad;		/* reused as mm_psize in mm_dbs[0] */
	uint16_t	md_flags;	/* tested for MDB_REVERSEKEY/MDB_INTEGERKEY in _mdb_cmp; aliased as mm_flags in mm_dbs[0] */
	uint16_t	md_depth;	/* presumably the tree depth -- TODO confirm */
	ULONG		md_branch_pages;	/* presumably page counters, by page type -- TODO confirm */
	ULONG		md_leaf_pages;
	ULONG		md_overflow_pages;
	ULONG		md_entries;	/* presumably the number of stored items -- TODO confirm */
	pgno_t		md_root;	/* root page number; compared against P_INVALID before the tree is searched */
} MDB_db;

#define	FREE_DBI	0
#define	MAIN_DBI	1

Howard Chu's avatar
Howard Chu committed
181
typedef struct MDB_meta {			/* meta (footer) page content */
Howard Chu's avatar
Howard Chu committed
182
183
184
185
	uint32_t	mm_magic;
	uint32_t	mm_version;
	void		*mm_address;		/* address for fixed mapping */
	size_t		mm_mapsize;			/* size of mmap region */
Howard Chu's avatar
Howard Chu committed
186
187
188
	MDB_db		mm_dbs[2];			/* first is free space, 2nd is main db */
#define	mm_psize	mm_dbs[0].md_pad
#define	mm_flags	mm_dbs[0].md_flags
Howard Chu's avatar
Howard Chu committed
189
190
	pgno_t		mm_last_pg;			/* last used page in file */
	ULONG		mm_txnid;			/* txnid that committed this page */
Howard Chu's avatar
Howard Chu committed
191
192
193
} MDB_meta;

typedef struct MDB_dhead {					/* a dirty page */
Howard Chu's avatar
Howard Chu committed
194
	STAILQ_ENTRY(MDB_dpage)	 md_next;	/* queue of dirty pages */
Howard Chu's avatar
Howard Chu committed
195
	MDB_page	*md_parent;
196
	unsigned	md_pi;				/* parent index */
Howard Chu's avatar
Howard Chu committed
197
198
199
200
201
202
203
204
	int			md_num;
} MDB_dhead;

/* A dirty page as held in memory: the bookkeeping header followed
 * directly by the page itself. mdb_alloc_page() allocates
 * sizeof(MDB_dhead) plus `num` pages of me_psize bytes for one of these,
 * so `p` may extend past the declared size of MDB_page.
 */
typedef struct MDB_dpage {
	MDB_dhead	h;	/* queue linkage, parent page/index, page count */
	MDB_page	p;	/* the page contents */
} MDB_dpage;

Howard Chu's avatar
Howard Chu committed
205
STAILQ_HEAD(dirty_queue, MDB_dpage);	/* FIXME: use a sorted data structure */
Howard Chu's avatar
Howard Chu committed
206

Howard Chu's avatar
Howard Chu committed
207
208
typedef struct MDB_oldpages {
	struct MDB_oldpages *mo_next;
Howard Chu's avatar
Howard Chu committed
209
	ULONG		mo_txnid;
Howard Chu's avatar
Howard Chu committed
210
211
212
	pgno_t		mo_pages[1];	/* dynamic */
} MDB_oldpages;

Howard Chu's avatar
Howard Chu committed
213
214
215
typedef struct MDB_pageparent {
	MDB_page *mp_page;
	MDB_page *mp_parent;
216
	unsigned mp_pi;
Howard Chu's avatar
Howard Chu committed
217
218
} MDB_pageparent;

Howard Chu's avatar
Howard Chu committed
219
static MDB_dpage *mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num);
Howard Chu's avatar
Howard Chu committed
220
221
222
223
224
225
226
227
228
static int 		mdb_touch(MDB_txn *txn, MDB_pageparent *mp);

/* One entry of a cursor's page stack: a page together with the cursor's
 * current index on that page. Entries are linked through a singly-linked
 * list whose head type is struct page_stack.
 */
typedef struct MDB_ppage {					/* ordered list of pages */
	SLIST_ENTRY(MDB_ppage)	 mp_entry;	/* list linkage */
	MDB_page		*mp_page;	/* the page this entry refers to */
	unsigned int	mp_ki;		/* cursor index on page */
} MDB_ppage;
/* Head type of the list above; used as MDB_cursor.mc_stack. */
SLIST_HEAD(page_stack, MDB_ppage);

Howard Chu's avatar
Howard Chu committed
229
230
231
/* FIXME: tree depth is mostly bounded, we should just
 * use a fixed array and avoid malloc/pointer chasing
 */
Howard Chu's avatar
Howard Chu committed
232
233
234
235
236
/* Stack operations on a cursor's mc_stack (an SLIST of MDB_ppage linked
 * through mp_entry): emptiness test, peek at the head, remove the head,
 * and insert entry p at the head. CURSOR_POP only unlinks the entry.
 */
#define CURSOR_EMPTY(c)		 SLIST_EMPTY(&(c)->mc_stack)
#define CURSOR_TOP(c)		 SLIST_FIRST(&(c)->mc_stack)
#define CURSOR_POP(c)		 SLIST_REMOVE_HEAD(&(c)->mc_stack, mp_entry)
#define CURSOR_PUSH(c,p)	 SLIST_INSERT_HEAD(&(c)->mc_stack, p, mp_entry)

Howard Chu's avatar
Howard Chu committed
237
struct MDB_xcursor;
238

Howard Chu's avatar
Howard Chu committed
239
240
241
struct MDB_cursor {
	MDB_txn		*mc_txn;
	struct page_stack	 mc_stack;		/* stack of parent pages */
Howard Chu's avatar
Howard Chu committed
242
	MDB_dbi		mc_dbi;
Howard Chu's avatar
Howard Chu committed
243
244
	short		mc_initialized;	/* 1 if initialized */
	short		mc_eof;		/* 1 if end is reached */
Howard Chu's avatar
Howard Chu committed
245
	struct MDB_xcursor	*mc_xcursor;
Howard Chu's avatar
Howard Chu committed
246
247
248
249
250
251
252
253
254
255
256
257
};

/* NOTE(review): MDB_meta as declared in this file has no mm_hash member,
 * so METAHASHLEN cannot be expanded successfully. It looks like an unused
 * leftover -- confirm before relying on it.
 */
#define METAHASHLEN	 offsetof(MDB_meta, mm_hash)
/* Address of the data that follows the common page header of page p.
 * The argument is parenthesized so that expressions such as
 * METADATA(base + 1) are cast as a whole.
 */
#define METADATA(p)	 ((void *)((char *)(p) + PAGEHDRSZ))

typedef struct MDB_node {
#define mn_pgno		 mn_p.np_pgno
#define mn_dsize	 mn_p.np_dsize
	union {
		pgno_t		 np_pgno;	/* child page number */
		uint32_t	 np_dsize;	/* leaf data size */
	} mn_p;
Howard Chu's avatar
Howard Chu committed
258
259
	unsigned int	mn_flags:4;
	unsigned int	mn_ksize:12;			/* key size */
Howard Chu's avatar
Howard Chu committed
260
#define F_BIGDATA	 0x01			/* data put on overflow page */
Howard Chu's avatar
Howard Chu committed
261
#define F_SUBDATA	 0x02			/* data is a sub-database */
Howard Chu's avatar
Howard Chu committed
262
263
264
	char		mn_data[1];
} MDB_node;

Howard Chu's avatar
Howard Chu committed
265
typedef struct MDB_dbx {
266
	MDB_val		md_name;
Howard Chu's avatar
Howard Chu committed
267
268
269
	MDB_cmp_func	*md_cmp;		/* user compare function */
	MDB_cmp_func	*md_dcmp;		/* user dupsort function */
	MDB_rel_func	*md_rel;		/* user relocate function */
270
271
	MDB_dbi	md_parent;
	unsigned int	md_dirty;
Howard Chu's avatar
Howard Chu committed
272
273
} MDB_dbx;

Howard Chu's avatar
Howard Chu committed
274
275
struct MDB_txn {
	pgno_t		mt_next_pgno;	/* next unallocated page */
Howard Chu's avatar
Howard Chu committed
276
277
	ULONG		mt_txnid;
	ULONG		mt_oldest;
Howard Chu's avatar
Howard Chu committed
278
	MDB_env		*mt_env;	
Howard Chu's avatar
Howard Chu committed
279
	pgno_t		*mt_free_pgs;	/* this is an IDL */
Howard Chu's avatar
Howard Chu committed
280
281
282
283
	union {
		struct dirty_queue	*dirty_queue;	/* modified pages */
		MDB_reader	*reader;
	} mt_u;
Howard Chu's avatar
Howard Chu committed
284
	MDB_dbx		*mt_dbxs;		/* array */
Howard Chu's avatar
Howard Chu committed
285
	MDB_db		*mt_dbs;
Howard Chu's avatar
Howard Chu committed
286
287
	unsigned int	mt_numdbs;

Howard Chu's avatar
Howard Chu committed
288
289
#define MDB_TXN_RDONLY		 0x01		/* read-only transaction */
#define MDB_TXN_ERROR		 0x02		/* an error has occurred */
Howard Chu's avatar
Howard Chu committed
290
#define MDB_TXN_METOGGLE	0x04		/* used meta page 1 */
Howard Chu's avatar
Howard Chu committed
291
	unsigned int	mt_flags;
Howard Chu's avatar
Howard Chu committed
292
293
};

Howard Chu's avatar
Howard Chu committed
294
295
296
297
298
299
300
301
/* Context for sorted-dup records.
 * Bundles a complete cursor with its own transaction object and
 * four-entry dbx/db arrays; MDB_cursor.mc_xcursor points at one of these.
 * NOTE(review): presumably mx_txn.mt_dbxs/mt_dbs are pointed at the
 * arrays below by mdb_xcursor_init0() -- confirm against that function.
 */
typedef struct MDB_xcursor {
	MDB_cursor mx_cursor;
	MDB_txn mx_txn;
	MDB_dbx	mx_dbxs[4];
	MDB_db	mx_dbs[4];
} MDB_xcursor;

Howard Chu's avatar
Howard Chu committed
302
303
struct MDB_env {
	int			me_fd;
Howard Chu's avatar
Howard Chu committed
304
	int			me_lfd;
Howard Chu's avatar
Howard Chu committed
305
	uint32_t	me_flags;
Howard Chu's avatar
Howard Chu committed
306
307
308
	unsigned int	me_maxreaders;
	unsigned int	me_numdbs;
	unsigned int	me_maxdbs;
Howard Chu's avatar
Howard Chu committed
309
	char		*me_path;
Howard Chu's avatar
Howard Chu committed
310
	char		*me_map;
Howard Chu's avatar
Howard Chu committed
311
	MDB_txninfo	*me_txns;
Howard Chu's avatar
Howard Chu committed
312
313
	MDB_meta	*me_metas[2];
	MDB_meta	*me_meta;
Howard Chu's avatar
Howard Chu committed
314
	MDB_txn		*me_txn;		/* current write transaction */
Howard Chu's avatar
Howard Chu committed
315
316
	size_t		me_mapsize;
	off_t		me_size;		/* current file size */
Howard Chu's avatar
Howard Chu committed
317
318
319
320
	unsigned int	me_psize;
	int			me_db_toggle;
	MDB_dbx		*me_dbxs;		/* array */
	MDB_db		*me_dbs[2];
Howard Chu's avatar
Howard Chu committed
321
	MDB_oldpages *me_pghead;
Howard Chu's avatar
Howard Chu committed
322
	pthread_key_t	me_txkey;	/* thread-key for readers */
323
	pgno_t		me_free_pgs[MDB_IDL_UM_SIZE];
Howard Chu's avatar
Howard Chu committed
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
};

/* Size of the fixed part of a node: everything before mn_data. */
#define NODESIZE	 offsetof(MDB_node, mn_data)

/* Node header plus key size; a NULL key contributes no bytes. */
#define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
/* Node header plus key size plus data size; k and d must be non-NULL. */
#define LEAFSIZE(k, d)	 (NODESIZE + (k)->mv_size + (d)->mv_size)
/* Node i of page p: mp_ptrs[i] is a byte offset from the start of the page. */
#define NODEPTR(p, i)	 ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
/* Start of the key bytes, stored at the beginning of mn_data. */
#define NODEKEY(node)	 (void *)((node)->mn_data)
/* Start of the data bytes, stored mn_ksize bytes into mn_data. */
#define NODEDATA(node)	 (void *)((char *)(node)->mn_data + (node)->mn_ksize)
/* Child page number and leaf data size; both live in the same union (mn_p). */
#define NODEPGNO(node)	 ((node)->mn_pgno)
#define NODEDSZ(node)	 ((node)->mn_dsize)

#define MDB_COMMIT_PAGES	 64	/* max number of pages to write in one commit */
#define MDB_MAXCACHE_DEF	 1024	/* max number of pages to keep in cache  */

Howard Chu's avatar
Howard Chu committed
339
340
static int  mdb_search_page_root(MDB_txn *txn,
			    MDB_dbi dbi, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
341
342
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);
Howard Chu's avatar
Howard Chu committed
343
344
static int  mdb_search_page(MDB_txn *txn,
			    MDB_dbi dbi, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
345
346
347
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);

Howard Chu's avatar
Howard Chu committed
348
349
350
static int  mdb_env_read_header(MDB_env *env, MDB_meta *meta);
static int  mdb_env_read_meta(MDB_env *env, int *which);
static int  mdb_env_write_meta(MDB_txn *txn);
Howard Chu's avatar
Howard Chu committed
351
static MDB_page *mdb_get_page(MDB_txn *txn, pgno_t pgno);
Howard Chu's avatar
Howard Chu committed
352

Howard Chu's avatar
Howard Chu committed
353
static MDB_node *mdb_search_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp,
Howard Chu's avatar
Howard Chu committed
354
			    MDB_val *key, int *exactp, unsigned int *kip);
Howard Chu's avatar
Howard Chu committed
355
static int  mdb_add_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp,
Howard Chu's avatar
Howard Chu committed
356
357
			    indx_t indx, MDB_val *key, MDB_val *data,
			    pgno_t pgno, uint8_t flags);
Howard Chu's avatar
Howard Chu committed
358
static void mdb_del_node(MDB_page *mp, indx_t indx);
Howard Chu's avatar
Howard Chu committed
359
360
static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, unsigned int ki,
    MDB_pageparent *mpp, MDB_node *leaf);
Howard Chu's avatar
Howard Chu committed
361
362
static int mdb_put0(MDB_txn *txn, MDB_dbi dbi,
    MDB_val *key, MDB_val *data, unsigned int flags);
Howard Chu's avatar
Howard Chu committed
363
static int  mdb_read_data(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
Howard Chu's avatar
Howard Chu committed
364
365
366
367

static int		 mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mp);
static int		 mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key);
static int		 mdb_move_node(MDB_txn *txn, MDB_dbi dbi, 
Howard Chu's avatar
Howard Chu committed
368
369
				MDB_pageparent *src, indx_t srcindx,
				MDB_pageparent *dst, indx_t dstindx);
Howard Chu's avatar
Howard Chu committed
370
static int		 mdb_merge(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *src,
Howard Chu's avatar
Howard Chu committed
371
			    MDB_pageparent *dst);
Howard Chu's avatar
Howard Chu committed
372
static int		 mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp,
Howard Chu's avatar
Howard Chu committed
373
374
			    unsigned int *newindxp, MDB_val *newkey,
			    MDB_val *newdata, pgno_t newpgno);
Howard Chu's avatar
Howard Chu committed
375
static MDB_dpage *mdb_new_page(MDB_txn *txn, MDB_dbi dbi, uint32_t flags, int num);
Howard Chu's avatar
Howard Chu committed
376
377
378
379
380

static void		 cursor_pop_page(MDB_cursor *cursor);
static MDB_ppage *cursor_push_page(MDB_cursor *cursor,
			    MDB_page *mp);

Howard Chu's avatar
Howard Chu committed
381
static int		 mdb_set_key(MDB_node *node, MDB_val *key);
Howard Chu's avatar
Howard Chu committed
382
383
static int		 mdb_sibling(MDB_cursor *cursor, int move_right);
static int		 mdb_cursor_next(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
384
			    MDB_val *key, MDB_val *data, MDB_cursor_op op);
Howard Chu's avatar
Howard Chu committed
385
static int		 mdb_cursor_prev(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
386
			    MDB_val *key, MDB_val *data, MDB_cursor_op op);
Howard Chu's avatar
Howard Chu committed
387
static int		 mdb_cursor_set(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
388
			    MDB_val *key, MDB_val *data, MDB_cursor_op op, int *exactp);
Howard Chu's avatar
Howard Chu committed
389
390
static int		 mdb_cursor_first(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);
391
392
static int		 mdb_cursor_last(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);
Howard Chu's avatar
Howard Chu committed
393

Howard Chu's avatar
Howard Chu committed
394
static void		mdb_xcursor_init0(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
Howard Chu's avatar
Howard Chu committed
395
static void		mdb_xcursor_init1(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx, MDB_node *node);
Howard Chu's avatar
Howard Chu committed
396
397
static void		mdb_xcursor_fini(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);

Howard Chu's avatar
Howard Chu committed
398
static size_t		 mdb_leaf_size(MDB_env *env, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
399
			    MDB_val *data);
Howard Chu's avatar
Howard Chu committed
400
static size_t		 mdb_branch_size(MDB_env *env, MDB_val *key);
Howard Chu's avatar
Howard Chu committed
401
402
403
404
405
406
407
408
409

static int		 memncmp(const void *s1, size_t n1,
				 const void *s2, size_t n2);
static int		 memnrcmp(const void *s1, size_t n1,
				  const void *s2, size_t n2);

/* Compare two byte strings of possibly different lengths, front to back.
 *
 * The common prefix (the shorter of n1 and n2 bytes) is compared with
 * memcmp(). If it differs, that result is returned. If it is identical,
 * the shorter string sorts first: the result is -1 when n1 < n2, 0 when
 * the lengths are equal and 1 when n1 > n2.
 *
 * memcmp() is skipped when the common length is zero, because the
 * pointers need not be valid in that case.
 */
static int
memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
	int diff, len_diff = -1;

	if (n1 >= n2) {
		len_diff = (n1 > n2);
		n1 = n2;
	}
	/* n1 now holds the length of the common prefix */
	diff = n1 ? memcmp(s1, s2, n1) : 0;
	return diff ? diff : len_diff;
}

/* Compare two byte strings of possibly different lengths, back to front
 * (the last byte is the most significant).
 *
 * Bytes are compared as unsigned char starting from the end of each
 * string; the first mismatch decides and the byte difference is returned.
 * If the common suffix is identical, the shorter string sorts first:
 * -1 when n1 < n2, 0 when equal, 1 when n1 > n2.
 *
 * The loop is bounded by the shorter length, so neither buffer is read
 * before its first byte, whichever of the two is shorter.
 */
static int
memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
	const unsigned char	*p1, *p2;
	size_t	common;

	/* Handle empty inputs first: no pointer arithmetic on them. */
	if (n2 == 0)
		return n1 != 0;
	if (n1 == 0)
		return -1;

	/* Start one past the end and step back before each comparison. */
	p1 = (const unsigned char *)s1 + n1;
	p2 = (const unsigned char *)s2 + n2;

	for (common = (n1 < n2) ? n1 : n2; common > 0; common--) {
		--p1;
		--p2;
		if (*p1 != *p2)
			return *p1 - *p2;
	}
	return (n1 > n2) - (n1 < n2);
}

int
Howard Chu's avatar
Howard Chu committed
441
mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
Howard Chu's avatar
Howard Chu committed
442
{
Howard Chu's avatar
Howard Chu committed
443
	return txn->mt_dbxs[dbi].md_cmp(a, b);
Howard Chu's avatar
Howard Chu committed
444
445
446
}

static int
Howard Chu's avatar
Howard Chu committed
447
_mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *key1, const MDB_val *key2)
Howard Chu's avatar
Howard Chu committed
448
{
Howard Chu's avatar
Howard Chu committed
449
450
451
452
453
	if (txn->mt_dbs[dbi].md_flags & (MDB_REVERSEKEY
#if __BYTE_ORDER == __LITTLE_ENDIAN
		|MDB_INTEGERKEY
#endif
	))
Howard Chu's avatar
Howard Chu committed
454
455
456
457
458
459
460
		return memnrcmp(key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size);
	else
		return memncmp((char *)key1->mv_data, key1->mv_size, key2->mv_data, key2->mv_size);
}

/* Allocate new page(s) for writing */
static MDB_dpage *
Howard Chu's avatar
Howard Chu committed
461
mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num)
Howard Chu's avatar
Howard Chu committed
462
463
{
	MDB_dpage *dp;
Howard Chu's avatar
Howard Chu committed
464
	pgno_t pgno = P_INVALID;
Howard Chu's avatar
Howard Chu committed
465
	ULONG oldest;
Howard Chu's avatar
Howard Chu committed
466

Howard Chu's avatar
Howard Chu committed
467
468
469
	if (txn->mt_txnid > 2) {

	oldest = txn->mt_txnid - 2;
Howard Chu's avatar
Howard Chu committed
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
	if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
		/* See if there's anything in the free DB */
		MDB_pageparent mpp;
		MDB_node *leaf;
		ULONG *kptr;

		mpp.mp_parent = NULL;
		mpp.mp_pi = 0;
		mdb_search_page(txn, FREE_DBI, NULL, NULL, 0, &mpp);
		leaf = NODEPTR(mpp.mp_page, 0);
		kptr = (ULONG *)NODEKEY(leaf);

		/* It's potentially usable, unless there are still
		 * older readers outstanding. Grab it.
		 */
		if (oldest > *kptr) {
			MDB_oldpages *mop;
			MDB_val data;
			pgno_t *idl;

Howard Chu's avatar
Howard Chu committed
490
			mdb_read_data(txn, leaf, &data);
Howard Chu's avatar
Howard Chu committed
491
492
493
494
495
496
497
			idl = (ULONG *)data.mv_data;
			mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t));
			mop->mo_next = txn->mt_env->me_pghead;
			mop->mo_txnid = *kptr;
			txn->mt_env->me_pghead = mop;
			memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl));

Howard Chu's avatar
Howard Chu committed
498
499
500
501
502
503
504
505
506
507
#if DEBUG > 1
			{
				unsigned int i;
				DPRINTF("IDL read txn %lu root %lu num %lu",
					mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
				for (i=0; i<idl[0]; i++) {
					DPRINTF("IDL %lu", idl[i+1]);
				}
			}
#endif
Howard Chu's avatar
Howard Chu committed
508
509
510
511
512
513
514
515
			/* drop this IDL from the DB */
			mpp.mp_parent = NULL;
			mpp.mp_pi = 0;
			mdb_search_page(txn, FREE_DBI, NULL, NULL, 1, &mpp);
			leaf = NODEPTR(mpp.mp_page, 0);
			mdb_del0(txn, FREE_DBI, 0, &mpp, leaf);
		}
	}
Howard Chu's avatar
Howard Chu committed
516
	if (txn->mt_env->me_pghead) {
Howard Chu's avatar
Howard Chu committed
517
		unsigned int i;
Howard Chu's avatar
Howard Chu committed
518
		for (i=0; i<txn->mt_env->me_txns->mt_numreaders; i++) {
Howard Chu's avatar
Howard Chu committed
519
520
521
			ULONG mr = txn->mt_env->me_txns->mt_readers[i].mr_txnid;
			if (!mr) continue;
			if (mr < oldest)
Howard Chu's avatar
Howard Chu committed
522
523
524
525
526
527
				oldest = txn->mt_env->me_txns->mt_readers[i].mr_txnid;
		}
		if (oldest > txn->mt_env->me_pghead->mo_txnid) {
			MDB_oldpages *mop = txn->mt_env->me_pghead;
			txn->mt_oldest = oldest;
			if (num > 1) {
528
529
530
531
				/* FIXME: For now, always use fresh pages. We
				 * really ought to search the free list for a
				 * contiguous range.
				 */
Howard Chu's avatar
Howard Chu committed
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
				;
			} else {
				/* peel pages off tail, so we only have to truncate the list */
				pgno = MDB_IDL_LAST(mop->mo_pages);
				if (MDB_IDL_IS_RANGE(mop->mo_pages)) {
					mop->mo_pages[2]++;
					if (mop->mo_pages[2] > mop->mo_pages[1])
						mop->mo_pages[0] = 0;
				} else {
					mop->mo_pages[0]--;
				}
				if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
					txn->mt_env->me_pghead = mop->mo_next;
					free(mop);
				}
			}
		}
	}
Howard Chu's avatar
Howard Chu committed
550
	}
Howard Chu's avatar
Howard Chu committed
551

Howard Chu's avatar
Howard Chu committed
552
	if ((dp = malloc(txn->mt_env->me_psize * num + sizeof(MDB_dhead))) == NULL)
Howard Chu's avatar
Howard Chu committed
553
554
555
556
		return NULL;
	dp->h.md_num = num;
	dp->h.md_parent = parent;
	dp->h.md_pi = parent_idx;
Howard Chu's avatar
Howard Chu committed
557
	STAILQ_INSERT_TAIL(txn->mt_u.dirty_queue, dp, h.md_next);
Howard Chu's avatar
Howard Chu committed
558
559
560
561
562
563
	if (pgno == P_INVALID) {
		dp->p.mp_pgno = txn->mt_next_pgno;
		txn->mt_next_pgno += num;
	} else {
		dp->p.mp_pgno = pgno;
	}
Howard Chu's avatar
Howard Chu committed
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579

	return dp;
}

/* Touch a page: make it dirty and re-insert into tree with updated pgno.
 *
 * If pp->mp_page is not yet flagged P_DIRTY, a new dirty page is
 * allocated, the old page number is added to the transaction's free-page
 * list, the contents are copied over, and the copy gets the new page
 * number and the P_DIRTY flag. The parent's node (if there is a parent)
 * and pp->mp_page are updated to refer to the copy.
 *
 * Returns 0 on success (including when the page was already dirty) and
 * ENOMEM if no page could be allocated.
 */
static int
mdb_touch(MDB_txn *txn, MDB_pageparent *pp)
{
	MDB_page *mp;
	MDB_dpage *dp;
	pgno_t	pgno;

	/* validate the arguments before the first dereference */
	assert(txn != NULL);
	assert(pp != NULL);

	mp = pp->mp_page;
	if (F_ISSET(mp->mp_flags, P_DIRTY))
		return 0;

	if ((dp = mdb_alloc_page(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL)
		return ENOMEM;
	DPRINTF("touched page %lu -> %lu", mp->mp_pgno, dp->p.mp_pgno);
	mdb_midl_insert(txn->mt_free_pgs, mp->mp_pgno);

	/* the copy below overwrites the header, so save the new number first */
	pgno = dp->p.mp_pgno;
	memcpy(&dp->p, mp, txn->mt_env->me_psize);
	mp = &dp->p;
	mp->mp_pgno = pgno;
	mp->mp_flags |= P_DIRTY;

	/* Update the page number to new touched page. */
	if (pp->mp_parent != NULL)
		NODEPGNO(NODEPTR(pp->mp_parent, pp->mp_pi)) = mp->mp_pgno;
	pp->mp_page = mp;
	return 0;
}

int
Howard Chu's avatar
Howard Chu committed
599
mdb_env_sync(MDB_env *env)
Howard Chu's avatar
Howard Chu committed
600
601
602
603
604
605
606
607
608
609
610
611
612
{
	int rc = 0;
	if (!F_ISSET(env->me_flags, MDB_NOSYNC)) {
		if (fsync(env->me_fd))
			rc = errno;
	}
	return rc;
}

/* Begin a transaction on env.
 *
 * A read-only transaction (rdonly != 0) uses the calling thread's reader
 * slot, claiming a free one under mt_mutex the first time the thread is
 * seen, and publishes the transaction id in it. A write transaction takes
 * mt_wmutex, increments the shared transaction id and becomes
 * env->me_txn; the mutex stays locked when this function returns
 * successfully.
 *
 * On success *ret receives the new transaction and MDB_SUCCESS is
 * returned. Returns ENOMEM if memory cannot be allocated, ENOSPC if no
 * reader slot is free, or the error from mdb_env_read_meta(). On failure
 * everything allocated here is released and *ret is left untouched.
 */
int
mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret)
{
	MDB_txn	*txn;
	int rc, toggle;

	if ((txn = calloc(1, sizeof(MDB_txn))) == NULL) {
		DPRINTF("calloc: %s", strerror(errno));
		return ENOMEM;
	}

	/* Allocate the per-transaction DB array before taking any lock, so
	 * that failure needs no unwinding. It is zero-filled so that an early
	 * mdb_txn_abort() can read it safely.
	 */
	txn->mt_dbs = calloc(env->me_maxdbs, sizeof(MDB_db));
	if (txn->mt_dbs == NULL) {
		free(txn);
		return ENOMEM;
	}

	if (rdonly) {
		txn->mt_flags |= MDB_TXN_RDONLY;
	} else {
		txn->mt_u.dirty_queue = calloc(1, sizeof(*txn->mt_u.dirty_queue));
		if (txn->mt_u.dirty_queue == NULL) {
			free(txn->mt_dbs);
			free(txn);
			return ENOMEM;
		}
		STAILQ_INIT(txn->mt_u.dirty_queue);

		pthread_mutex_lock(&env->me_txns->mt_wmutex);
		env->me_txns->mt_txnid++;
		txn->mt_free_pgs = env->me_free_pgs;
		txn->mt_free_pgs[0] = 0;
	}

	txn->mt_txnid = env->me_txns->mt_txnid;
	if (rdonly) {
		MDB_reader *r = pthread_getspecific(env->me_txkey);
		if (!r) {
			unsigned int i;

			/* first read txn of this thread: claim a slot whose pid is 0 */
			pthread_mutex_lock(&env->me_txns->mt_mutex);
			for (i=0; i<env->me_txns->mt_numreaders; i++)
				if (env->me_txns->mt_readers[i].mr_pid == 0)
					break;
			if (i == env->me_maxreaders) {
				pthread_mutex_unlock(&env->me_txns->mt_mutex);
				free(txn->mt_dbs);
				free(txn);
				return ENOSPC;
			}
			env->me_txns->mt_readers[i].mr_pid = getpid();
			env->me_txns->mt_readers[i].mr_tid = pthread_self();
			r = &env->me_txns->mt_readers[i];
			pthread_setspecific(env->me_txkey, r);
			if (i >= env->me_txns->mt_numreaders)
				env->me_txns->mt_numreaders = i+1;
			pthread_mutex_unlock(&env->me_txns->mt_mutex);
		}
		r->mr_txnid = txn->mt_txnid;
		txn->mt_u.reader = r;
	} else {
		env->me_txn = txn;
	}

	txn->mt_env = env;

	if ((rc = mdb_env_read_meta(env, &toggle)) != MDB_SUCCESS) {
		/* abort releases the reader slot or the write lock, and frees txn */
		mdb_txn_abort(txn);
		return rc;
	}

	/* Copy the DB arrays */
	txn->mt_numdbs = env->me_numdbs;
	txn->mt_dbxs = env->me_dbxs;	/* mostly static anyway */
	memcpy(txn->mt_dbs, env->me_meta->mm_dbs, 2 * sizeof(MDB_db));
	if (txn->mt_numdbs > 2)
		memcpy(txn->mt_dbs+2, env->me_dbs[env->me_db_toggle]+2,
			(txn->mt_numdbs - 2) * sizeof(MDB_db));

	if (!rdonly) {
		if (toggle)
			txn->mt_flags |= MDB_TXN_METOGGLE;
		txn->mt_next_pgno = env->me_meta->mm_last_pg+1;
	}

	DPRINTF("begin transaction %lu on mdbenv %p, root page %lu",
		txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);

	*ret = txn;
	return MDB_SUCCESS;
}

/* Abandon a transaction and free it. A NULL txn is ignored.
 *
 * Read-only: the reader slot's transaction id is cleared.
 * Write: every queued dirty page and the dirty queue itself are freed,
 * the environment's in-memory old-page lists are freed, env->me_txn is
 * cleared, the shared transaction id is decremented, the md_dirty marks
 * of databases 2 and up are reset, and mt_wmutex is unlocked.
 */
void
mdb_txn_abort(MDB_txn *txn)
{
	MDB_dpage *dp;
	MDB_env	*env;

	if (txn == NULL)
		return;

	env = txn->mt_env;
	/* mt_dbs is still NULL when mdb_txn_begin() aborts before copying the
	 * DB arrays, so it must not be dereferenced unconditionally here.
	 */
	DPRINTF("abort transaction %lu on mdbenv %p, root page %lu",
		txn->mt_txnid, (void *) env,
		txn->mt_dbs ? txn->mt_dbs[MAIN_DBI].md_root : P_INVALID);

	free(txn->mt_dbs);

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		txn->mt_u.reader->mr_txnid = 0;
	} else {
		MDB_oldpages *mop;
		unsigned int i;

		/* Discard all dirty pages. */
		while (!STAILQ_EMPTY(txn->mt_u.dirty_queue)) {
			dp = STAILQ_FIRST(txn->mt_u.dirty_queue);
			STAILQ_REMOVE_HEAD(txn->mt_u.dirty_queue, h.md_next);
			free(dp);
		}
		free(txn->mt_u.dirty_queue);

		/* Drop the old-page lists that were pulled into memory. */
		while ((mop = env->me_pghead)) {
			env->me_pghead = mop->mo_next;
			free(mop);
		}

		env->me_txn = NULL;
		env->me_txns->mt_txnid--;
		for (i=2; i<env->me_numdbs; i++)
			env->me_dbxs[i].md_dirty = 0;
		pthread_mutex_unlock(&env->me_txns->mt_wmutex);
	}

	free(txn);
}

int
mdb_txn_commit(MDB_txn *txn)
{
	int		 n, done;
Howard Chu's avatar
Howard Chu committed
740
	unsigned int i;
Howard Chu's avatar
Howard Chu committed
741
742
743
744
	ssize_t		 rc;
	off_t		 size;
	MDB_dpage	*dp;
	MDB_env	*env;
745
	pgno_t	next;
Howard Chu's avatar
Howard Chu committed
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
	struct iovec	 iov[MDB_COMMIT_PAGES];

	assert(txn != NULL);
	assert(txn->mt_env != NULL);

	env = txn->mt_env;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		DPRINTF("attempt to commit read-only transaction");
		mdb_txn_abort(txn);
		return EPERM;
	}

	if (txn != env->me_txn) {
		DPRINTF("attempt to commit unknown transaction");
		mdb_txn_abort(txn);
		return EINVAL;
	}

	if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
		DPRINTF("error flag is set, can't commit");
		mdb_txn_abort(txn);
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
771
	if (STAILQ_EMPTY(txn->mt_u.dirty_queue))
Howard Chu's avatar
Howard Chu committed
772
773
		goto done;

Howard Chu's avatar
Howard Chu committed
774
	DPRINTF("committing transaction %lu on mdbenv %p, root page %lu",
Howard Chu's avatar
Howard Chu committed
775
776
	    txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);

Howard Chu's avatar
Howard Chu committed
777
778
779
780
781
782
783
784
785
786
	/* should only be one record now */
	if (env->me_pghead) {
		MDB_val key, data;
		MDB_oldpages *mop;

		mop = env->me_pghead;
		key.mv_size = sizeof(pgno_t);
		key.mv_data = (char *)&mop->mo_txnid;
		data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
		data.mv_data = mop->mo_pages;
Howard Chu's avatar
Howard Chu committed
787
		mdb_put0(txn, FREE_DBI, &key, &data, 0);
Howard Chu's avatar
Howard Chu committed
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
		free(env->me_pghead);
		env->me_pghead = NULL;
	}
	/* save to free list */
	if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
		MDB_val key, data;
		MDB_pageparent mpp;

		/* make sure last page of freeDB is touched and on freelist */
		key.mv_size = MAXKEYSIZE+1;
		key.mv_data = NULL;
		mpp.mp_parent = NULL;
		mpp.mp_pi = 0;
		mdb_search_page(txn, FREE_DBI, &key, NULL, 1, &mpp);

Howard Chu's avatar
Howard Chu committed
803
804
805
806
807
808
809
810
811
812
813
#if DEBUG > 1
		{
			unsigned int i;
			ULONG *idl = txn->mt_free_pgs;
			DPRINTF("IDL write txn %lu root %lu num %lu",
				txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
			for (i=0; i<idl[0]; i++) {
				DPRINTF("IDL %lu", idl[i+1]);
			}
		}
#endif
Howard Chu's avatar
Howard Chu committed
814
815
816
817
818
		/* write to last page of freeDB */
		key.mv_size = sizeof(pgno_t);
		key.mv_data = (char *)&txn->mt_txnid;
		data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs);
		data.mv_data = txn->mt_free_pgs;
Howard Chu's avatar
Howard Chu committed
819
		mdb_put0(txn, FREE_DBI, &key, &data, 0);
Howard Chu's avatar
Howard Chu committed
820
821
	}

Howard Chu's avatar
Howard Chu committed
822
823
824
825
826
827
828
	/* Update DB root pointers. Their pages have already been
	 * touched so this is all in-place and cannot fail.
	 */
	{
		MDB_val data;
		data.mv_size = sizeof(MDB_db);

829
830
		for (i = 2; i < txn->mt_numdbs; i++) {
			if (txn->mt_dbxs[i].md_dirty) {
Howard Chu's avatar
Howard Chu committed
831
				data.mv_data = &txn->mt_dbs[i];
Howard Chu's avatar
Howard Chu committed
832
				mdb_put0(txn, MAIN_DBI, &txn->mt_dbxs[i].md_name, &data, 0);
Howard Chu's avatar
Howard Chu committed
833
834
835
			}
		}
	}
Howard Chu's avatar
Howard Chu committed
836
837
838

	/* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
	 */
839
	next = 0;
Howard Chu's avatar
Howard Chu committed
840
841
842
	do {
		n = 0;
		done = 1;
843
		size = 0;
Howard Chu's avatar
Howard Chu committed
844
		STAILQ_FOREACH(dp, txn->mt_u.dirty_queue, h.md_next) {
845
			if (dp->p.mp_pgno != next) {
Howard Chu's avatar
Howard Chu committed
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
				if (n) {
					DPRINTF("committing %u dirty pages", n);
					rc = writev(env->me_fd, iov, n);
					if (rc != size) {
						n = errno;
						if (rc > 0)
							DPRINTF("short write, filesystem full?");
						else
							DPRINTF("writev: %s", strerror(errno));
						mdb_txn_abort(txn);
						return n;
					}
					n = 0;
					size = 0;
				}
Howard Chu's avatar
Howard Chu committed
861
				lseek(env->me_fd, dp->p.mp_pgno * env->me_psize, SEEK_SET);
862
863
				next = dp->p.mp_pgno;
			}
Howard Chu's avatar
Howard Chu committed
864
			DPRINTF("committing page %lu", dp->p.mp_pgno);
Howard Chu's avatar
Howard Chu committed
865
			iov[n].iov_len = env->me_psize * dp->h.md_num;
Howard Chu's avatar
Howard Chu committed
866
			iov[n].iov_base = &dp->p;
867
868
			size += iov[n].iov_len;
			next = dp->p.mp_pgno + dp->h.md_num;
Howard Chu's avatar
Howard Chu committed
869
870
871
872
873
874
875
876
877
878
879
880
881
			/* clear dirty flag */
			dp->p.mp_flags &= ~P_DIRTY;
			if (++n >= MDB_COMMIT_PAGES) {
				done = 0;
				break;
			}
		}

		if (n == 0)
			break;

		DPRINTF("committing %u dirty pages", n);
		rc = writev(env->me_fd, iov, n);
882
		if (rc != size) {
Howard Chu's avatar
Howard Chu committed
883
884
885
886
887
888
889
890
891
892
893
			n = errno;
			if (rc > 0)
				DPRINTF("short write, filesystem full?");
			else
				DPRINTF("writev: %s", strerror(errno));
			mdb_txn_abort(txn);
			return n;
		}

	} while (!done);

Howard Chu's avatar
Howard Chu committed
894
895
	/* Drop the dirty pages.
	 */
Howard Chu's avatar
Howard Chu committed
896
897
898
	while (!STAILQ_EMPTY(txn->mt_u.dirty_queue)) {
		dp = STAILQ_FIRST(txn->mt_u.dirty_queue);
		STAILQ_REMOVE_HEAD(txn->mt_u.dirty_queue, h.md_next);
Howard Chu's avatar
Howard Chu committed
899
900
901
		free(dp);
	}

Howard Chu's avatar
Howard Chu committed
902
903
904
	if ((n = mdb_env_sync(env)) != 0 ||
	    (n = mdb_env_write_meta(txn)) != MDB_SUCCESS ||
	    (n = mdb_env_sync(env)) != 0) {
Howard Chu's avatar
Howard Chu committed
905
906
907
908
		mdb_txn_abort(txn);
		return n;
	}
	env->me_txn = NULL;
Howard Chu's avatar
Howard Chu committed
909

Howard Chu's avatar
Howard Chu committed
910
	/* update the DB tables */
Howard Chu's avatar
Howard Chu committed
911
	{
Howard Chu's avatar
Howard Chu committed
912
		int toggle = !env->me_db_toggle;
Howard Chu's avatar
Howard Chu committed
913

914
915
		for (i = 2; i < env->me_numdbs; i++) {
			if (txn->mt_dbxs[i].md_dirty) {
Howard Chu's avatar
Howard Chu committed
916
				env->me_dbs[toggle][i] = txn->mt_dbs[i];
917
918
				txn->mt_dbxs[i].md_dirty = 0;
			}
Howard Chu's avatar
Howard Chu committed
919
920
		}
		for (i = env->me_numdbs; i < txn->mt_numdbs; i++) {
921
			txn->mt_dbxs[i].md_dirty = 0;
Howard Chu's avatar
Howard Chu committed
922
923
924
925
			env->me_dbxs[i] = txn->mt_dbxs[i];
			env->me_dbs[toggle][i] = txn->mt_dbs[i];
		}
		env->me_db_toggle = toggle;
Howard Chu's avatar
Howard Chu committed
926
		env->me_numdbs = txn->mt_numdbs;
Howard Chu's avatar
Howard Chu committed
927

Howard Chu's avatar
Howard Chu committed
928
		free(txn->mt_dbs);
Howard Chu's avatar
Howard Chu committed
929
930
	}

Howard Chu's avatar
Howard Chu committed
931
	pthread_mutex_unlock(&env->me_txns->mt_wmutex);
932
933
	free(txn->mt_u.dirty_queue);
	free(txn);
Howard Chu's avatar
Howard Chu committed
934
	txn = NULL;
Howard Chu's avatar
Howard Chu committed
935
936
937
938
939
940
941
942

done:
	mdb_txn_abort(txn);

	return MDB_SUCCESS;
}

static int
Howard Chu's avatar
Howard Chu committed
943
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
Howard Chu's avatar
Howard Chu committed
944
945
946
{
	char		 page[PAGESIZE];
	MDB_page	*p;
Howard Chu's avatar
Howard Chu committed
947
	MDB_meta	*m;
Howard Chu's avatar
Howard Chu committed
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
	int		 rc;

	assert(env != NULL);

	/* We don't know the page size yet, so use a minimum value.
	 */

	if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) {
		return ENOENT;
	} else if (rc != PAGESIZE) {
		if (rc > 0)
			errno = EINVAL;
		DPRINTF("read: %s", strerror(errno));
		return errno;
	}

	p = (MDB_page *)page;

Howard Chu's avatar
Howard Chu committed
966
967
	if (!F_ISSET(p->mp_flags, P_META)) {
		DPRINTF("page %lu not a meta page", p->mp_pgno);
Howard Chu's avatar
Howard Chu committed
968
969
970
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
971
972
973
	m = METADATA(p);
	if (m->mm_magic != MDB_MAGIC) {
		DPRINTF("meta has invalid magic");
Howard Chu's avatar
Howard Chu committed
974
975
976
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
977
	if (m->mm_version != MDB_VERSION) {
Howard Chu's avatar
Howard Chu committed
978
		DPRINTF("database is version %u, expected version %u",
Howard Chu's avatar
Howard Chu committed
979
		    m->mm_version, MDB_VERSION);
Howard Chu's avatar
Howard Chu committed
980
		return MDB_VERSION_MISMATCH;
Howard Chu's avatar
Howard Chu committed
981
982
	}

Howard Chu's avatar
Howard Chu committed
983
	memcpy(meta, m, sizeof(*m));
Howard Chu's avatar
Howard Chu committed
984
985
986
987
	return 0;
}

static int
Howard Chu's avatar
Howard Chu committed
988
mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
Howard Chu's avatar
Howard Chu committed
989
{
990
	MDB_page *p, *q;
Howard Chu's avatar
Howard Chu committed
991
	MDB_meta *m;
992
	int rc;
Howard Chu's avatar
Howard Chu committed
993
	unsigned int	 psize;
Howard Chu's avatar
Howard Chu committed
994

Howard Chu's avatar
Howard Chu committed
995
996
997
	DPRINTF("writing new meta page");
	psize = sysconf(_SC_PAGE_SIZE);

Howard Chu's avatar
Howard Chu committed
998
999
1000
	meta->mm_magic = MDB_MAGIC;
	meta->mm_version = MDB_VERSION;
	meta->mm_psize = psize;