mdb.c 83.4 KB
Newer Older
Howard Chu's avatar
Howard Chu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* mdb.c - memory-mapped database library */
/*
 * Copyright 2011 Howard Chu, Symas Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 *
 * This code is derived from btree.c written by Martin Hedenfalk.
 *
 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
Howard Chu's avatar
Howard Chu committed
30
31
32
33
34
35
36
37
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mman.h>
#ifdef HAVE_SYS_FILE_H
#include <sys/file.h>
#endif
Howard Chu's avatar
Howard Chu committed
38
#include <fcntl.h>
Howard Chu's avatar
Howard Chu committed
39
40
41
42
43
44
45
46
47
48

#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
Howard Chu's avatar
Howard Chu committed
49
#include <pthread.h>
Howard Chu's avatar
Howard Chu committed
50
51

#include "mdb.h"
Howard Chu's avatar
Howard Chu committed
52

Howard Chu's avatar
Howard Chu committed
53
54
#define ULONG		unsigned long
typedef ULONG		pgno_t;
Howard Chu's avatar
Howard Chu committed
55

Howard Chu's avatar
Howard Chu committed
56
#include "midl.h"
Howard Chu's avatar
Howard Chu committed
57

58
59
60
#ifndef DEBUG
#define DEBUG 1
#endif
Howard Chu's avatar
Howard Chu committed
61

62
63
64
65
66
#if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__))
# define DPRINTF	(void)	/* Vararg macros may be unsupported */
#elif DEBUG
# define DPRINTF(fmt, ...)	/* Requires 2 or more args */ \
	fprintf(stderr, "%s:%d: " fmt "\n", __func__, __LINE__, __VA_ARGS__)
Howard Chu's avatar
Howard Chu committed
67
#else
68
# define DPRINTF(fmt, ...)	((void) 0)
Howard Chu's avatar
Howard Chu committed
69
#endif
70
#define DPUTS(arg)	DPRINTF("%s", arg)
Howard Chu's avatar
Howard Chu committed
71
72
73
74
75

#define PAGESIZE	 4096
#define MDB_MINKEYS	 4
#define MDB_MAGIC	 0xBEEFC0DE
#define MDB_VERSION	 1
Howard Chu's avatar
Howard Chu committed
76
#define MAXKEYSIZE	 511
Howard Chu's avatar
Howard Chu committed
77

Hallvard Furuseth's avatar
Hallvard Furuseth committed
78
#define P_INVALID	 (~0UL)
Howard Chu's avatar
Howard Chu committed
79
80
81
82
83

#define F_ISSET(w, f)	 (((w) & (f)) == (f))

typedef uint16_t	 indx_t;

Howard Chu's avatar
Howard Chu committed
84
85
86
87
88
#define DEFAULT_READERS	126
#define DEFAULT_MAPSIZE	1048576

/* Lock descriptor stuff */
#ifndef CACHELINE
Howard Chu's avatar
Howard Chu committed
89
#define CACHELINE	64	/* most CPUs. Itanium uses 128 */
Howard Chu's avatar
Howard Chu committed
90
91
#endif

Howard Chu's avatar
Howard Chu committed
92
93
94
95
96
97
typedef struct MDB_rxbody {
	ULONG		mrb_txnid;
	pid_t		mrb_pid;
	pthread_t	mrb_tid;
} MDB_rxbody;

Howard Chu's avatar
Howard Chu committed
98
typedef struct MDB_reader {
Howard Chu's avatar
Howard Chu committed
99
100
101
102
103
104
105
106
	union {
		MDB_rxbody mrx;
#define	mr_txnid	mru.mrx.mrb_txnid
#define	mr_pid	mru.mrx.mrb_pid
#define	mr_tid	mru.mrx.mrb_tid
		/* cache line alignment */
		char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
	} mru;
Howard Chu's avatar
Howard Chu committed
107
108
109
} MDB_reader;

typedef struct MDB_txbody {
Howard Chu's avatar
Howard Chu committed
110
111
112
113
114
	uint32_t	mtb_magic;
	uint32_t	mtb_version;
	pthread_mutex_t	mtb_mutex;
	ULONG		mtb_txnid;
	uint32_t	mtb_numreaders;
Howard Chu's avatar
Howard Chu committed
115
	uint32_t	mtb_me_toggle;
Howard Chu's avatar
Howard Chu committed
116
117
118
} MDB_txbody;

typedef struct MDB_txninfo {
Howard Chu's avatar
Howard Chu committed
119
120
121
122
123
124
125
	union {
		MDB_txbody mtb;
#define mti_magic	mt1.mtb.mtb_magic
#define mti_version	mt1.mtb.mtb_version
#define mti_mutex	mt1.mtb.mtb_mutex
#define mti_txnid	mt1.mtb.mtb_txnid
#define mti_numreaders	mt1.mtb.mtb_numreaders
Howard Chu's avatar
Howard Chu committed
126
#define mti_me_toggle	mt1.mtb.mtb_me_toggle
Howard Chu's avatar
Howard Chu committed
127
128
129
130
131
132
133
134
		char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
	} mt1;
	union {
		pthread_mutex_t	mt2_wmutex;
#define mti_wmutex	mt2.mt2_wmutex
		char pad[(sizeof(pthread_mutex_t)+CACHELINE-1) & ~(CACHELINE-1)];
	} mt2;
	MDB_reader	mti_readers[1];
Howard Chu's avatar
Howard Chu committed
135
136
} MDB_txninfo;

Howard Chu's avatar
Howard Chu committed
137
138
139
140
141
/* Common header for all page types. Overflow pages
 * occupy a number of contiguous pages with no
 * headers on any page after the first.
 */
typedef struct MDB_page {		/* represents a page of storage */
Howard Chu's avatar
Howard Chu committed
142
143
144
#define	mp_pgno		mp_p.p_pgno
	union padded {
		pgno_t		p_pgno;		/* page number */
145
		void *		p_align;	/* for IL32P64 */
Howard Chu's avatar
Howard Chu committed
146
	} mp_p;
Howard Chu's avatar
Howard Chu committed
147
148
149
150
#define	P_BRANCH	 0x01		/* branch page */
#define	P_LEAF		 0x02		/* leaf page */
#define	P_OVERFLOW	 0x04		/* overflow page */
#define	P_META		 0x08		/* meta page */
Howard Chu's avatar
Howard Chu committed
151
#define	P_DIRTY		 0x10		/* dirty page */
Howard Chu's avatar
Howard Chu committed
152
#define	P_LEAF2		 0x20		/* DB with small, fixed size keys and no data */
Howard Chu's avatar
Howard Chu committed
153
154
155
156
157
158
159
160
161
162
	uint32_t	mp_flags;
#define mp_lower	mp_pb.pb.pb_lower
#define mp_upper	mp_pb.pb.pb_upper
#define mp_pages	mp_pb.pb_pages
	union page_bounds {
		struct {
			indx_t		pb_lower;		/* lower bound of free space */
			indx_t		pb_upper;		/* upper bound of free space */
		} pb;
		uint32_t	pb_pages;	/* number of overflow pages */
Howard Chu's avatar
Howard Chu committed
163
164
165
166
		struct {
			indx_t	pb_ksize;	/* on a LEAF2 page */
			indx_t	pb_numkeys;
		} pb2;
Howard Chu's avatar
Howard Chu committed
167
168
169
170
	} mp_pb;
	indx_t		mp_ptrs[1];		/* dynamic size */
} MDB_page;

171
#define PAGEHDRSZ	 ((unsigned) offsetof(MDB_page, mp_ptrs))
Howard Chu's avatar
Howard Chu committed
172
173
174

#define NUMKEYS(p)	 (((p)->mp_lower - PAGEHDRSZ) >> 1)
#define SIZELEFT(p)	 (indx_t)((p)->mp_upper - (p)->mp_lower)
Howard Chu's avatar
Howard Chu committed
175
176
#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
				((env)->me_psize - PAGEHDRSZ))
Howard Chu's avatar
Howard Chu committed
177
178
179
180
#define IS_LEAF(p)	 F_ISSET((p)->mp_flags, P_LEAF)
#define IS_BRANCH(p)	 F_ISSET((p)->mp_flags, P_BRANCH)
#define IS_OVERFLOW(p)	 F_ISSET((p)->mp_flags, P_OVERFLOW)

Howard Chu's avatar
Howard Chu committed
181
182
#define OVPAGES(size, psize)	(PAGEHDRSZ + size + psize - 1) / psize;

Howard Chu's avatar
Howard Chu committed
183
184
185
186
187
188
189
190
191
192
193
194
195
196
typedef struct MDB_db {
	uint32_t	md_pad;
	uint16_t	md_flags;
	uint16_t	md_depth;
	ULONG		md_branch_pages;
	ULONG		md_leaf_pages;
	ULONG		md_overflow_pages;
	ULONG		md_entries;
	pgno_t		md_root;
} MDB_db;

#define	FREE_DBI	0
#define	MAIN_DBI	1

Howard Chu's avatar
Howard Chu committed
197
typedef struct MDB_meta {			/* meta (footer) page content */
Howard Chu's avatar
Howard Chu committed
198
199
200
201
	uint32_t	mm_magic;
	uint32_t	mm_version;
	void		*mm_address;		/* address for fixed mapping */
	size_t		mm_mapsize;			/* size of mmap region */
Howard Chu's avatar
Howard Chu committed
202
203
204
	MDB_db		mm_dbs[2];			/* first is free space, 2nd is main db */
#define	mm_psize	mm_dbs[0].md_pad
#define	mm_flags	mm_dbs[0].md_flags
Howard Chu's avatar
Howard Chu committed
205
206
	pgno_t		mm_last_pg;			/* last used page in file */
	ULONG		mm_txnid;			/* txnid that committed this page */
Howard Chu's avatar
Howard Chu committed
207
208
209
210
} MDB_meta;

typedef struct MDB_dhead {					/* a dirty page */
	MDB_page	*md_parent;
211
	unsigned	md_pi;				/* parent index */
Howard Chu's avatar
Howard Chu committed
212
213
214
215
216
217
218
219
	int			md_num;
} MDB_dhead;

typedef struct MDB_dpage {
	MDB_dhead	h;
	MDB_page	p;
} MDB_dpage;

Howard Chu's avatar
Howard Chu committed
220
221
typedef struct MDB_oldpages {
	struct MDB_oldpages *mo_next;
Howard Chu's avatar
Howard Chu committed
222
	ULONG		mo_txnid;
Howard Chu's avatar
Howard Chu committed
223
224
225
	pgno_t		mo_pages[1];	/* dynamic */
} MDB_oldpages;

Howard Chu's avatar
Howard Chu committed
226
227
228
typedef struct MDB_pageparent {
	MDB_page *mp_page;
	MDB_page *mp_parent;
229
	unsigned mp_pi;
Howard Chu's avatar
Howard Chu committed
230
231
} MDB_pageparent;

Howard Chu's avatar
Howard Chu committed
232
static MDB_dpage *mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num);
Howard Chu's avatar
Howard Chu committed
233
234
235
236
237
238
239
static int 		mdb_touch(MDB_txn *txn, MDB_pageparent *mp);

typedef struct MDB_ppage {					/* ordered list of pages */
	MDB_page		*mp_page;
	unsigned int	mp_ki;		/* cursor index on page */
} MDB_ppage;

Howard Chu's avatar
Howard Chu committed
240
241
#define CURSOR_TOP(c)		 (&(c)->mc_stack[(c)->mc_snum-1])
#define CURSOR_PARENT(c)	 (&(c)->mc_stack[(c)->mc_snum-2])
Howard Chu's avatar
Howard Chu committed
242

Howard Chu's avatar
Howard Chu committed
243
struct MDB_xcursor;
244

Howard Chu's avatar
Howard Chu committed
245
246
struct MDB_cursor {
	MDB_txn		*mc_txn;
Howard Chu's avatar
Howard Chu committed
247
248
	MDB_ppage	mc_stack[32];		/* stack of parent pages */
	unsigned int	mc_snum;		/* number of pushed pages */
Howard Chu's avatar
Howard Chu committed
249
	MDB_dbi		mc_dbi;
Howard Chu's avatar
Howard Chu committed
250
251
	short		mc_initialized;	/* 1 if initialized */
	short		mc_eof;		/* 1 if end is reached */
Howard Chu's avatar
Howard Chu committed
252
	struct MDB_xcursor	*mc_xcursor;
Howard Chu's avatar
Howard Chu committed
253
254
255
256
257
258
259
260
261
262
263
};

#define METADATA(p)	 ((void *)((char *)p + PAGEHDRSZ))

typedef struct MDB_node {
#define mn_pgno		 mn_p.np_pgno
#define mn_dsize	 mn_p.np_dsize
	union {
		pgno_t		 np_pgno;	/* child page number */
		uint32_t	 np_dsize;	/* leaf data size */
	} mn_p;
Howard Chu's avatar
Howard Chu committed
264
265
	unsigned int	mn_flags:4;
	unsigned int	mn_ksize:12;			/* key size */
Howard Chu's avatar
Howard Chu committed
266
#define F_BIGDATA	 0x01			/* data put on overflow page */
Howard Chu's avatar
Howard Chu committed
267
#define F_SUBDATA	 0x02			/* data is a sub-database */
Howard Chu's avatar
Howard Chu committed
268
#define F_DUPDATA	 0x04			/* data has duplicates */
Howard Chu's avatar
Howard Chu committed
269
270
271
	char		mn_data[1];
} MDB_node;

Howard Chu's avatar
Howard Chu committed
272
typedef struct MDB_dbx {
273
	MDB_val		md_name;
Howard Chu's avatar
Howard Chu committed
274
275
276
	MDB_cmp_func	*md_cmp;		/* user compare function */
	MDB_cmp_func	*md_dcmp;		/* user dupsort function */
	MDB_rel_func	*md_rel;		/* user relocate function */
277
278
	MDB_dbi	md_parent;
	unsigned int	md_dirty;
Howard Chu's avatar
Howard Chu committed
279
280
} MDB_dbx;

Howard Chu's avatar
Howard Chu committed
281
282
struct MDB_txn {
	pgno_t		mt_next_pgno;	/* next unallocated page */
Howard Chu's avatar
Howard Chu committed
283
284
	ULONG		mt_txnid;
	ULONG		mt_oldest;
Howard Chu's avatar
Howard Chu committed
285
	MDB_env		*mt_env;	
Howard Chu's avatar
Howard Chu committed
286
	pgno_t		*mt_free_pgs;	/* this is an IDL */
Howard Chu's avatar
Howard Chu committed
287
	union {
Howard Chu's avatar
Howard Chu committed
288
		MIDL2	*dirty_list;	/* modified pages */
Howard Chu's avatar
Howard Chu committed
289
290
		MDB_reader	*reader;
	} mt_u;
Howard Chu's avatar
Howard Chu committed
291
	MDB_dbx		*mt_dbxs;		/* array */
Howard Chu's avatar
Howard Chu committed
292
	MDB_db		*mt_dbs;
Howard Chu's avatar
Howard Chu committed
293
294
	unsigned int	mt_numdbs;

Howard Chu's avatar
Howard Chu committed
295
296
#define MDB_TXN_RDONLY		0x01		/* read-only transaction */
#define MDB_TXN_ERROR		0x02		/* an error has occurred */
Howard Chu's avatar
Howard Chu committed
297
#define MDB_TXN_METOGGLE	0x04		/* used meta page 1 */
Howard Chu's avatar
Howard Chu committed
298
	unsigned int	mt_flags;
Howard Chu's avatar
Howard Chu committed
299
300
};

Howard Chu's avatar
Howard Chu committed
301
302
303
304
305
306
307
308
/* Context for sorted-dup records */
typedef struct MDB_xcursor {
	MDB_cursor mx_cursor;
	MDB_txn mx_txn;
	MDB_dbx	mx_dbxs[4];
	MDB_db	mx_dbs[4];
} MDB_xcursor;

Howard Chu's avatar
Howard Chu committed
309
310
struct MDB_env {
	int			me_fd;
Howard Chu's avatar
Howard Chu committed
311
	int			me_lfd;
Howard Chu's avatar
Howard Chu committed
312
	int			me_mfd;			/* just for writing the meta pages */
Howard Chu's avatar
Howard Chu committed
313
314
315
#define	MDB_FATAL_ERROR	0x80000000U
	uint32_t 	me_flags;
	uint32_t	me_extrapad;	/* unused for now */
Howard Chu's avatar
Howard Chu committed
316
317
318
	unsigned int	me_maxreaders;
	unsigned int	me_numdbs;
	unsigned int	me_maxdbs;
Howard Chu's avatar
Howard Chu committed
319
	char		*me_path;
Howard Chu's avatar
Howard Chu committed
320
	char		*me_map;
Howard Chu's avatar
Howard Chu committed
321
	MDB_txninfo	*me_txns;
Howard Chu's avatar
Howard Chu committed
322
323
	MDB_meta	*me_metas[2];
	MDB_meta	*me_meta;
Howard Chu's avatar
Howard Chu committed
324
	MDB_txn		*me_txn;		/* current write transaction */
Howard Chu's avatar
Howard Chu committed
325
326
	size_t		me_mapsize;
	off_t		me_size;		/* current file size */
Howard Chu's avatar
Howard Chu committed
327
328
329
	pgno_t		me_maxpg;		/* me_mapsize / me_psize */
	unsigned int	me_psize;
	unsigned int	me_db_toggle;
Howard Chu's avatar
Howard Chu committed
330
331
	MDB_dbx		*me_dbxs;		/* array */
	MDB_db		*me_dbs[2];
Howard Chu's avatar
Howard Chu committed
332
	MDB_oldpages *me_pghead;
Howard Chu's avatar
Howard Chu committed
333
	pthread_key_t	me_txkey;	/* thread-key for readers */
334
	pgno_t		me_free_pgs[MDB_IDL_UM_SIZE];
Howard Chu's avatar
Howard Chu committed
335
	MIDL2		me_dirty_list[MDB_IDL_DB_SIZE];
Howard Chu's avatar
Howard Chu committed
336
337
338
339
340
341
342
343
344
345
346
347
348
349
};

#define NODESIZE	 offsetof(MDB_node, mn_data)

#define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
#define LEAFSIZE(k, d)	 (NODESIZE + (k)->mv_size + (d)->mv_size)
#define NODEPTR(p, i)	 ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
#define NODEKEY(node)	 (void *)((node)->mn_data)
#define NODEDATA(node)	 (void *)((char *)(node)->mn_data + (node)->mn_ksize)
#define NODEPGNO(node)	 ((node)->mn_pgno)
#define NODEDSZ(node)	 ((node)->mn_dsize)

#define MDB_COMMIT_PAGES	 64	/* max number of pages to write in one commit */

Howard Chu's avatar
Howard Chu committed
350
351
static int  mdb_search_page_root(MDB_txn *txn,
			    MDB_dbi dbi, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
352
353
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);
Howard Chu's avatar
Howard Chu committed
354
355
static int  mdb_search_page(MDB_txn *txn,
			    MDB_dbi dbi, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
356
357
358
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);

Howard Chu's avatar
Howard Chu committed
359
360
361
static int  mdb_env_read_header(MDB_env *env, MDB_meta *meta);
static int  mdb_env_read_meta(MDB_env *env, int *which);
static int  mdb_env_write_meta(MDB_txn *txn);
Howard Chu's avatar
Howard Chu committed
362
static MDB_page *mdb_get_page(MDB_txn *txn, pgno_t pgno);
Howard Chu's avatar
Howard Chu committed
363

Howard Chu's avatar
Howard Chu committed
364
static MDB_node *mdb_search_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp,
Howard Chu's avatar
Howard Chu committed
365
			    MDB_val *key, int *exactp, unsigned int *kip);
Howard Chu's avatar
Howard Chu committed
366
static int  mdb_add_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp,
Howard Chu's avatar
Howard Chu committed
367
368
			    indx_t indx, MDB_val *key, MDB_val *data,
			    pgno_t pgno, uint8_t flags);
Howard Chu's avatar
Howard Chu committed
369
static void mdb_del_node(MDB_page *mp, indx_t indx);
Howard Chu's avatar
Howard Chu committed
370
371
static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, unsigned int ki,
    MDB_pageparent *mpp, MDB_node *leaf);
Howard Chu's avatar
Howard Chu committed
372
373
static int mdb_put0(MDB_txn *txn, MDB_dbi dbi,
    MDB_val *key, MDB_val *data, unsigned int flags);
Howard Chu's avatar
Howard Chu committed
374
static int  mdb_read_data(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
Howard Chu's avatar
Howard Chu committed
375
376
377
378

static int		 mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mp);
static int		 mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key);
static int		 mdb_move_node(MDB_txn *txn, MDB_dbi dbi, 
Howard Chu's avatar
Howard Chu committed
379
380
				MDB_pageparent *src, indx_t srcindx,
				MDB_pageparent *dst, indx_t dstindx);
Howard Chu's avatar
Howard Chu committed
381
static int		 mdb_merge(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *src,
Howard Chu's avatar
Howard Chu committed
382
			    MDB_pageparent *dst);
Howard Chu's avatar
Howard Chu committed
383
static int		 mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp,
Howard Chu's avatar
Howard Chu committed
384
385
			    unsigned int *newindxp, MDB_val *newkey,
			    MDB_val *newdata, pgno_t newpgno);
Howard Chu's avatar
Howard Chu committed
386
static MDB_dpage *mdb_new_page(MDB_txn *txn, MDB_dbi dbi, uint32_t flags, int num);
Howard Chu's avatar
Howard Chu committed
387
388
389
390
391

static void		 cursor_pop_page(MDB_cursor *cursor);
static MDB_ppage *cursor_push_page(MDB_cursor *cursor,
			    MDB_page *mp);

Howard Chu's avatar
Howard Chu committed
392
static int		 mdb_set_key(MDB_node *node, MDB_val *key);
Howard Chu's avatar
Howard Chu committed
393
394
static int		 mdb_sibling(MDB_cursor *cursor, int move_right);
static int		 mdb_cursor_next(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
395
			    MDB_val *key, MDB_val *data, MDB_cursor_op op);
Howard Chu's avatar
Howard Chu committed
396
static int		 mdb_cursor_prev(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
397
			    MDB_val *key, MDB_val *data, MDB_cursor_op op);
Howard Chu's avatar
Howard Chu committed
398
static int		 mdb_cursor_set(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
399
			    MDB_val *key, MDB_val *data, MDB_cursor_op op, int *exactp);
Howard Chu's avatar
Howard Chu committed
400
401
static int		 mdb_cursor_first(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);
402
403
static int		 mdb_cursor_last(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);
Howard Chu's avatar
Howard Chu committed
404

Howard Chu's avatar
Howard Chu committed
405
static void		mdb_xcursor_init0(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
Howard Chu's avatar
Howard Chu committed
406
static void		mdb_xcursor_init1(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx, MDB_node *node);
Howard Chu's avatar
Howard Chu committed
407
408
static void		mdb_xcursor_fini(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);

Howard Chu's avatar
Howard Chu committed
409
static size_t		 mdb_leaf_size(MDB_env *env, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
410
			    MDB_val *data);
Howard Chu's avatar
Howard Chu committed
411
static size_t		 mdb_branch_size(MDB_env *env, MDB_val *key);
Howard Chu's avatar
Howard Chu committed
412
413
414
415
416
417
418
419
420

static int		 memncmp(const void *s1, size_t n1,
				 const void *s2, size_t n2);
static int		 memnrcmp(const void *s1, size_t n1,
				  const void *s2, size_t n2);

static int
memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
421
422
423
424
425
	int diff, len_diff = -1;

	if (n1 >= n2) {
		len_diff = (n1 > n2);
		n1 = n2;
Howard Chu's avatar
Howard Chu committed
426
	}
427
428
	diff = memcmp(s1, s2, n1);
	return diff ? diff : len_diff;
Howard Chu's avatar
Howard Chu committed
429
430
431
432
433
}

static int
memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
434
	const unsigned char	*p1, *p2, *p1_lim;
Howard Chu's avatar
Howard Chu committed
435
436

	if (n2 == 0)
437
438
439
		return n1 != 0;
	if (n1 == 0)
		return -1;
Howard Chu's avatar
Howard Chu committed
440
441
442
443

	p1 = (const unsigned char *)s1 + n1 - 1;
	p2 = (const unsigned char *)s2 + n2 - 1;

444
445
446
	for (p1_lim = (n1 <= n2 ? s1 : s2);  *p1 == *p2;  p1--, p2--) {
		if (p1 == p1_lim)
			return (p1 != s1) ? (p1 != p2) : (p2 != s2) ? -1 : 0;
Howard Chu's avatar
Howard Chu committed
447
448
449
450
	}
	return *p1 - *p2;
}

Howard Chu's avatar
Howard Chu committed
451
452
453
454
455
456
457
458
459
char *
mdb_version(int *maj, int *min, int *pat)
{
	*maj = MDB_VERSION_MAJOR;
	*min = MDB_VERSION_MINOR;
	*pat = MDB_VERSION_PATCH;
	return MDB_VERSION_STRING;
}

460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
static const char *errstr[] = {
	"MDB_KEYEXIST: Key/data pair already exists",
	"MDB_NOTFOUND: No matching key/data pair found",
	"MDB_PAGE_NOTFOUND: Requested page not found",
	"MDB_CORRUPTED: Located page was wrong type",
	"MDB_PANIC: Update of meta page failed",
	"MDB_VERSION_MISMATCH: Database environment version mismatch"
};

char *
mdb_strerror(int err)
{
	if (!err)
		return ("Successful return: 0");

	if (err >= MDB_KEYEXIST && err <= MDB_VERSION_MISMATCH)
		return (char *)errstr[err - MDB_KEYEXIST];

	return strerror(err);
}

Howard Chu's avatar
Howard Chu committed
481
int
Howard Chu's avatar
Howard Chu committed
482
mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
Howard Chu's avatar
Howard Chu committed
483
{
Howard Chu's avatar
Howard Chu committed
484
485
	if (txn->mt_dbxs[dbi].md_cmp)
		return txn->mt_dbxs[dbi].md_cmp(a, b);
Howard Chu's avatar
Howard Chu committed
486

Howard Chu's avatar
Howard Chu committed
487
488
489
490
491
	if (txn->mt_dbs[dbi].md_flags & (MDB_REVERSEKEY
#if __BYTE_ORDER == __LITTLE_ENDIAN
		|MDB_INTEGERKEY
#endif
	))
Howard Chu's avatar
Howard Chu committed
492
		return memnrcmp(a->mv_data, a->mv_size, b->mv_data, b->mv_size);
Howard Chu's avatar
Howard Chu committed
493
	else
Howard Chu's avatar
Howard Chu committed
494
495
496
497
498
499
500
501
502
503
		return memncmp((char *)a->mv_data, a->mv_size, b->mv_data, b->mv_size);
}

int
mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
{
	if (txn->mt_dbxs[dbi].md_dcmp)
		return txn->mt_dbxs[dbi].md_dcmp(a, b);

	return memncmp((char *)a->mv_data, a->mv_size, b->mv_data, b->mv_size);
Howard Chu's avatar
Howard Chu committed
504
505
506
507
}

/* Allocate new page(s) for writing */
static MDB_dpage *
Howard Chu's avatar
Howard Chu committed
508
mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num)
Howard Chu's avatar
Howard Chu committed
509
510
{
	MDB_dpage *dp;
Howard Chu's avatar
Howard Chu committed
511
	pgno_t pgno = P_INVALID;
Howard Chu's avatar
Howard Chu committed
512
	ULONG oldest;
Howard Chu's avatar
Howard Chu committed
513
	MIDL2 mid;
Howard Chu's avatar
Howard Chu committed
514

Howard Chu's avatar
Howard Chu committed
515
516
517
	if (txn->mt_txnid > 2) {

	oldest = txn->mt_txnid - 2;
Howard Chu's avatar
Howard Chu committed
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
	if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
		/* See if there's anything in the free DB */
		MDB_pageparent mpp;
		MDB_node *leaf;
		ULONG *kptr;

		mpp.mp_parent = NULL;
		mpp.mp_pi = 0;
		mdb_search_page(txn, FREE_DBI, NULL, NULL, 0, &mpp);
		leaf = NODEPTR(mpp.mp_page, 0);
		kptr = (ULONG *)NODEKEY(leaf);

		/* It's potentially usable, unless there are still
		 * older readers outstanding. Grab it.
		 */
		if (oldest > *kptr) {
			MDB_oldpages *mop;
			MDB_val data;
			pgno_t *idl;

Howard Chu's avatar
Howard Chu committed
538
			mdb_read_data(txn, leaf, &data);
Howard Chu's avatar
Howard Chu committed
539
540
541
542
543
544
545
			idl = (ULONG *)data.mv_data;
			mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t));
			mop->mo_next = txn->mt_env->me_pghead;
			mop->mo_txnid = *kptr;
			txn->mt_env->me_pghead = mop;
			memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl));

Howard Chu's avatar
Howard Chu committed
546
547
548
549
550
551
552
553
554
555
#if DEBUG > 1
			{
				unsigned int i;
				DPRINTF("IDL read txn %lu root %lu num %lu",
					mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
				for (i=0; i<idl[0]; i++) {
					DPRINTF("IDL %lu", idl[i+1]);
				}
			}
#endif
Howard Chu's avatar
Howard Chu committed
556
557
558
559
560
561
562
563
			/* drop this IDL from the DB */
			mpp.mp_parent = NULL;
			mpp.mp_pi = 0;
			mdb_search_page(txn, FREE_DBI, NULL, NULL, 1, &mpp);
			leaf = NODEPTR(mpp.mp_page, 0);
			mdb_del0(txn, FREE_DBI, 0, &mpp, leaf);
		}
	}
Howard Chu's avatar
Howard Chu committed
564
	if (txn->mt_env->me_pghead) {
Howard Chu's avatar
Howard Chu committed
565
		unsigned int i;
Howard Chu's avatar
Howard Chu committed
566
567
		for (i=0; i<txn->mt_env->me_txns->mti_numreaders; i++) {
			ULONG mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid;
Howard Chu's avatar
Howard Chu committed
568
569
			if (!mr) continue;
			if (mr < oldest)
Howard Chu's avatar
Howard Chu committed
570
				oldest = txn->mt_env->me_txns->mti_readers[i].mr_txnid;
Howard Chu's avatar
Howard Chu committed
571
572
573
574
575
		}
		if (oldest > txn->mt_env->me_pghead->mo_txnid) {
			MDB_oldpages *mop = txn->mt_env->me_pghead;
			txn->mt_oldest = oldest;
			if (num > 1) {
576
577
578
579
				/* FIXME: For now, always use fresh pages. We
				 * really ought to search the free list for a
				 * contiguous range.
				 */
Howard Chu's avatar
Howard Chu committed
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
				;
			} else {
				/* peel pages off tail, so we only have to truncate the list */
				pgno = MDB_IDL_LAST(mop->mo_pages);
				if (MDB_IDL_IS_RANGE(mop->mo_pages)) {
					mop->mo_pages[2]++;
					if (mop->mo_pages[2] > mop->mo_pages[1])
						mop->mo_pages[0] = 0;
				} else {
					mop->mo_pages[0]--;
				}
				if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
					txn->mt_env->me_pghead = mop->mo_next;
					free(mop);
				}
			}
		}
	}
Howard Chu's avatar
Howard Chu committed
598
	}
Howard Chu's avatar
Howard Chu committed
599

Howard Chu's avatar
Howard Chu committed
600
601
602
603
604
	if (pgno == P_INVALID) {
		/* DB size is maxed out */
		if (txn->mt_next_pgno + num >= txn->mt_env->me_maxpg)
			return NULL;
	}
Howard Chu's avatar
Howard Chu committed
605
	if ((dp = malloc(txn->mt_env->me_psize * num + sizeof(MDB_dhead))) == NULL)
Howard Chu's avatar
Howard Chu committed
606
607
608
609
		return NULL;
	dp->h.md_num = num;
	dp->h.md_parent = parent;
	dp->h.md_pi = parent_idx;
Howard Chu's avatar
Howard Chu committed
610
611
612
613
614
615
	if (pgno == P_INVALID) {
		dp->p.mp_pgno = txn->mt_next_pgno;
		txn->mt_next_pgno += num;
	} else {
		dp->p.mp_pgno = pgno;
	}
Howard Chu's avatar
Howard Chu committed
616
617
618
	mid.mid = dp->p.mp_pgno;
	mid.mptr = dp;
	mdb_midl2_insert(txn->mt_u.dirty_list, &mid);
Howard Chu's avatar
Howard Chu committed
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634

	return dp;
}

/* Touch a page: make it dirty and re-insert into tree with updated pgno.
 */
static int
mdb_touch(MDB_txn *txn, MDB_pageparent *pp)
{
	MDB_page *mp = pp->mp_page;
	pgno_t	pgno;
	assert(txn != NULL);
	assert(pp != NULL);

	if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
		MDB_dpage *dp;
Howard Chu's avatar
Howard Chu committed
635
		if ((dp = mdb_alloc_page(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL)
Howard Chu's avatar
Howard Chu committed
636
			return ENOMEM;
Howard Chu's avatar
Howard Chu committed
637
		DPRINTF("touched page %lu -> %lu", mp->mp_pgno, dp->p.mp_pgno);
Howard Chu's avatar
Howard Chu committed
638
		mdb_midl_insert(txn->mt_free_pgs, mp->mp_pgno);
Howard Chu's avatar
Howard Chu committed
639
		pgno = dp->p.mp_pgno;
Howard Chu's avatar
Howard Chu committed
640
		memcpy(&dp->p, mp, txn->mt_env->me_psize);
Howard Chu's avatar
Howard Chu committed
641
642
643
644
645
646
647
648
649
650
651
652
653
		mp = &dp->p;
		mp->mp_pgno = pgno;
		mp->mp_flags |= P_DIRTY;

		/* Update the page number to new touched page. */
		if (pp->mp_parent != NULL)
			NODEPGNO(NODEPTR(pp->mp_parent, pp->mp_pi)) = mp->mp_pgno;
		pp->mp_page = mp;
	}
	return 0;
}

int
Howard Chu's avatar
Howard Chu committed
654
mdb_env_sync(MDB_env *env, int force)
Howard Chu's avatar
Howard Chu committed
655
656
{
	int rc = 0;
Howard Chu's avatar
Howard Chu committed
657
	if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
Howard Chu's avatar
Howard Chu committed
658
		if (fdatasync(env->me_fd))
Howard Chu's avatar
Howard Chu committed
659
660
661
662
663
664
665
666
667
			rc = errno;
	}
	return rc;
}

int
mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret)
{
	MDB_txn	*txn;
Howard Chu's avatar
Howard Chu committed
668
	int rc, toggle;
Howard Chu's avatar
Howard Chu committed
669

Howard Chu's avatar
Howard Chu committed
670
	if (env->me_flags & MDB_FATAL_ERROR) {
671
		DPUTS("mdb_txn_begin: environment had fatal error, must shutdown!");
Howard Chu's avatar
Howard Chu committed
672
673
		return MDB_PANIC;
	}
Howard Chu's avatar
Howard Chu committed
674
	if ((txn = calloc(1, sizeof(MDB_txn))) == NULL) {
Howard Chu's avatar
Howard Chu committed
675
676
677
678
679
680
681
		DPRINTF("calloc: %s", strerror(errno));
		return ENOMEM;
	}

	if (rdonly) {
		txn->mt_flags |= MDB_TXN_RDONLY;
	} else {
Howard Chu's avatar
Howard Chu committed
682
683
684
685
		txn->mt_u.dirty_list = env->me_dirty_list;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_free_pgs = env->me_free_pgs;
		txn->mt_free_pgs[0] = 0;
Howard Chu's avatar
Howard Chu committed
686

Howard Chu's avatar
Howard Chu committed
687
688
		pthread_mutex_lock(&env->me_txns->mti_wmutex);
		env->me_txns->mti_txnid++;
Howard Chu's avatar
Howard Chu committed
689
	}
Howard Chu's avatar
Howard Chu committed
690

Howard Chu's avatar
Howard Chu committed
691
	txn->mt_txnid = env->me_txns->mti_txnid;
Howard Chu's avatar
Howard Chu committed
692
693
694
	if (rdonly) {
		MDB_reader *r = pthread_getspecific(env->me_txkey);
		if (!r) {
Howard Chu's avatar
Howard Chu committed
695
			unsigned int i;
Howard Chu's avatar
Howard Chu committed
696
697
698
			pthread_mutex_lock(&env->me_txns->mti_mutex);
			for (i=0; i<env->me_txns->mti_numreaders; i++)
				if (env->me_txns->mti_readers[i].mr_pid == 0)
Howard Chu's avatar
Howard Chu committed
699
700
					break;
			if (i == env->me_maxreaders) {
Howard Chu's avatar
Howard Chu committed
701
				pthread_mutex_unlock(&env->me_txns->mti_mutex);
Howard Chu's avatar
Howard Chu committed
702
703
				return ENOSPC;
			}
Howard Chu's avatar
Howard Chu committed
704
705
706
			env->me_txns->mti_readers[i].mr_pid = getpid();
			env->me_txns->mti_readers[i].mr_tid = pthread_self();
			r = &env->me_txns->mti_readers[i];
Howard Chu's avatar
Howard Chu committed
707
			pthread_setspecific(env->me_txkey, r);
Howard Chu's avatar
Howard Chu committed
708
709
710
			if (i >= env->me_txns->mti_numreaders)
				env->me_txns->mti_numreaders = i+1;
			pthread_mutex_unlock(&env->me_txns->mti_mutex);
Howard Chu's avatar
Howard Chu committed
711
		}
Howard Chu's avatar
Howard Chu committed
712
713
714
		r->mr_txnid = txn->mt_txnid;
		txn->mt_u.reader = r;
	} else {
Howard Chu's avatar
Howard Chu committed
715
716
717
718
719
		env->me_txn = txn;
	}

	txn->mt_env = env;

Howard Chu's avatar
Howard Chu committed
720
	toggle = env->me_txns->mti_me_toggle;
Howard Chu's avatar
Howard Chu committed
721
	if ((rc = mdb_env_read_meta(env, &toggle)) != MDB_SUCCESS) {
Howard Chu's avatar
Howard Chu committed
722
723
724
725
		mdb_txn_abort(txn);
		return rc;
	}

Howard Chu's avatar
Howard Chu committed
726
727
	/* Copy the DB arrays */
	txn->mt_numdbs = env->me_numdbs;
Howard Chu's avatar
Howard Chu committed
728
729
730
731
732
733
	txn->mt_dbxs = env->me_dbxs;	/* mostly static anyway */
	txn->mt_dbs = malloc(env->me_maxdbs * sizeof(MDB_db));
	memcpy(txn->mt_dbs, env->me_meta->mm_dbs, 2 * sizeof(MDB_db));
	if (txn->mt_numdbs > 2)
		memcpy(txn->mt_dbs+2, env->me_dbs[env->me_db_toggle]+2,
			(txn->mt_numdbs - 2) * sizeof(MDB_db));
Howard Chu's avatar
Howard Chu committed
734
735
736
737

	if (!rdonly) {
		if (toggle)
			txn->mt_flags |= MDB_TXN_METOGGLE;
Howard Chu's avatar
Howard Chu committed
738
		txn->mt_next_pgno = env->me_meta->mm_last_pg+1;
Howard Chu's avatar
Howard Chu committed
739
	}
Howard Chu's avatar
Howard Chu committed
740

741
	DPRINTF("begin transaction %lu on mdbenv %p, root page %lu",
Howard Chu's avatar
Howard Chu committed
742
		txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);
Howard Chu's avatar
Howard Chu committed
743
744
745
746
747
748
749
750
751
752
753
754
755
756

	*ret = txn;
	return MDB_SUCCESS;
}

void
mdb_txn_abort(MDB_txn *txn)
{
	MDB_env	*env;

	if (txn == NULL)
		return;

	env = txn->mt_env;
757
	DPRINTF("abort transaction %lu on mdbenv %p, root page %lu",
Howard Chu's avatar
Howard Chu committed
758
		txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);
Howard Chu's avatar
Howard Chu committed
759

Howard Chu's avatar
Howard Chu committed
760
	free(txn->mt_dbs);
Howard Chu's avatar
Howard Chu committed
761

Howard Chu's avatar
Howard Chu committed
762
763
764
	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		txn->mt_u.reader->mr_txnid = 0;
	} else {
Howard Chu's avatar
Howard Chu committed
765
		MDB_oldpages *mop;
766
767
		unsigned int i;

Howard Chu's avatar
Howard Chu committed
768
		/* Discard all dirty pages. */
Howard Chu's avatar
Howard Chu committed
769
770
		for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++)
			free(txn->mt_u.dirty_list[i].mptr);
Howard Chu's avatar
Howard Chu committed
771
772
773
774
775
776

		while ((mop = txn->mt_env->me_pghead)) {
			txn->mt_env->me_pghead = mop->mo_next;
			free(mop);
		}

Howard Chu's avatar
Howard Chu committed
777
		env->me_txn = NULL;
Howard Chu's avatar
Howard Chu committed
778
		env->me_txns->mti_txnid--;
779
780
		for (i=2; i<env->me_numdbs; i++)
			env->me_dbxs[i].md_dirty = 0;
Howard Chu's avatar
Howard Chu committed
781
		pthread_mutex_unlock(&env->me_txns->mti_wmutex);
Howard Chu's avatar
Howard Chu committed
782
783
784
785
786
787
788
789
790
	}

	free(txn);
}

int
mdb_txn_commit(MDB_txn *txn)
{
	int		 n, done;
Howard Chu's avatar
Howard Chu committed
791
	unsigned int i;
Howard Chu's avatar
Howard Chu committed
792
793
794
795
	ssize_t		 rc;
	off_t		 size;
	MDB_dpage	*dp;
	MDB_env	*env;
796
	pgno_t	next;
Howard Chu's avatar
Howard Chu committed
797
798
799
800
801
802
803
804
805
	struct iovec	 iov[MDB_COMMIT_PAGES];

	assert(txn != NULL);
	assert(txn->mt_env != NULL);

	env = txn->mt_env;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		mdb_txn_abort(txn);
806
		return MDB_SUCCESS;
Howard Chu's avatar
Howard Chu committed
807
808
809
	}

	if (txn != env->me_txn) {
810
		DPUTS("attempt to commit unknown transaction");
Howard Chu's avatar
Howard Chu committed
811
812
813
814
815
		mdb_txn_abort(txn);
		return EINVAL;
	}

	if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
816
		DPUTS("error flag is set, can't commit");
Howard Chu's avatar
Howard Chu committed
817
818
819
820
		mdb_txn_abort(txn);
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
821
	if (!txn->mt_u.dirty_list[0].mid)
Howard Chu's avatar
Howard Chu committed
822
823
		goto done;

Howard Chu's avatar
Howard Chu committed
824
	DPRINTF("committing transaction %lu on mdbenv %p, root page %lu",
Howard Chu's avatar
Howard Chu committed
825
826
	    txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);

Howard Chu's avatar
Howard Chu committed
827
828
829
830
831
832
833
834
835
836
	/* should only be one record now */
	if (env->me_pghead) {
		MDB_val key, data;
		MDB_oldpages *mop;

		mop = env->me_pghead;
		key.mv_size = sizeof(pgno_t);
		key.mv_data = (char *)&mop->mo_txnid;
		data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
		data.mv_data = mop->mo_pages;
Howard Chu's avatar
Howard Chu committed
837
		mdb_put0(txn, FREE_DBI, &key, &data, 0);
Howard Chu's avatar
Howard Chu committed
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
		free(env->me_pghead);
		env->me_pghead = NULL;
	}
	/* save to free list */
	if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
		MDB_val key, data;
		MDB_pageparent mpp;

		/* make sure last page of freeDB is touched and on freelist */
		key.mv_size = MAXKEYSIZE+1;
		key.mv_data = NULL;
		mpp.mp_parent = NULL;
		mpp.mp_pi = 0;
		mdb_search_page(txn, FREE_DBI, &key, NULL, 1, &mpp);

Howard Chu's avatar
Howard Chu committed
853
854
855
856
857
858
859
860
861
862
863
#if DEBUG > 1
		{
			unsigned int i;
			ULONG *idl = txn->mt_free_pgs;
			DPRINTF("IDL write txn %lu root %lu num %lu",
				txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
			for (i=0; i<idl[0]; i++) {
				DPRINTF("IDL %lu", idl[i+1]);
			}
		}
#endif
Howard Chu's avatar
Howard Chu committed
864
865
866
867
868
		/* write to last page of freeDB */
		key.mv_size = sizeof(pgno_t);
		key.mv_data = (char *)&txn->mt_txnid;
		data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs);
		data.mv_data = txn->mt_free_pgs;
Howard Chu's avatar
Howard Chu committed
869
		mdb_put0(txn, FREE_DBI, &key, &data, 0);
Howard Chu's avatar
Howard Chu committed
870
871
	}

Howard Chu's avatar
Howard Chu committed
872
873
874
875
876
877
878
	/* Update DB root pointers. Their pages have already been
	 * touched so this is all in-place and cannot fail.
	 */
	{
		MDB_val data;
		data.mv_size = sizeof(MDB_db);

879
880
		for (i = 2; i < txn->mt_numdbs; i++) {
			if (txn->mt_dbxs[i].md_dirty) {
Howard Chu's avatar
Howard Chu committed
881
				data.mv_data = &txn->mt_dbs[i];
Howard Chu's avatar
Howard Chu committed
882
				mdb_put0(txn, MAIN_DBI, &txn->mt_dbxs[i].md_name, &data, 0);
Howard Chu's avatar
Howard Chu committed
883
884
885
			}
		}
	}
Howard Chu's avatar
Howard Chu committed
886
887
888

	/* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
	 */
889
	next = 0;
Howard Chu's avatar
Howard Chu committed
890
	i = 1;
Howard Chu's avatar
Howard Chu committed
891
892
893
	do {
		n = 0;
		done = 1;
894
		size = 0;
Howard Chu's avatar
Howard Chu committed
895
		for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
Howard Chu's avatar
Howard Chu committed
896
			dp = txn->mt_u.dirty_list[i].mptr;
897
			if (dp->p.mp_pgno != next) {
Howard Chu's avatar
Howard Chu committed
898
899
900
901
902
903
				if (n) {
					DPRINTF("committing %u dirty pages", n);
					rc = writev(env->me_fd, iov, n);
					if (rc != size) {
						n = errno;
						if (rc > 0)
904
							DPUTS("short write, filesystem full?");
Howard Chu's avatar
Howard Chu committed
905
906
907
908
909
910
911
912
						else
							DPRINTF("writev: %s", strerror(errno));
						mdb_txn_abort(txn);
						return n;
					}
					n = 0;
					size = 0;
				}
Howard Chu's avatar
Howard Chu committed
913
				lseek(env->me_fd, dp->p.mp_pgno * env->me_psize, SEEK_SET);
914
915
				next = dp->p.mp_pgno;
			}
Howard Chu's avatar
Howard Chu committed
916
			DPRINTF("committing page %lu", dp->p.mp_pgno);
Howard Chu's avatar
Howard Chu committed
917
			iov[n].iov_len = env->me_psize * dp->h.md_num;
Howard Chu's avatar
Howard Chu committed
918
			iov[n].iov_base = &dp->p;
919
920
			size += iov[n].iov_len;
			next = dp->p.mp_pgno + dp->h.md_num;
Howard Chu's avatar
Howard Chu committed
921
922
923
924
925
926
927
928
929
930
931
932
933
			/* clear dirty flag */
			dp->p.mp_flags &= ~P_DIRTY;
			if (++n >= MDB_COMMIT_PAGES) {
				done = 0;
				break;
			}
		}

		if (n == 0)
			break;

		DPRINTF("committing %u dirty pages", n);
		rc = writev(env->me_fd, iov, n);
934
		if (rc != size) {
Howard Chu's avatar
Howard Chu committed
935
936
			n = errno;
			if (rc > 0)
937
				DPUTS("short write, filesystem full?");
Howard Chu's avatar
Howard Chu committed
938
939
940
941
942
943
944
945
			else
				DPRINTF("writev: %s", strerror(errno));
			mdb_txn_abort(txn);
			return n;
		}

	} while (!done);

Howard Chu's avatar
Howard Chu committed
946
947
	/* Drop the dirty pages.
	 */
Howard Chu's avatar
Howard Chu committed
948
949
	for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++)
		free(txn->mt_u.dirty_list[i].mptr);
Howard Chu's avatar
Howard Chu committed
950

Howard Chu's avatar
Howard Chu committed
951
	if ((n = mdb_env_sync(env, 0)) != 0 ||
Howard Chu's avatar
Howard Chu committed
952
	    (n = mdb_env_write_meta(txn)) != MDB_SUCCESS) {
Howard Chu's avatar
Howard Chu committed
953
954
955
		mdb_txn_abort(txn);
		return n;
	}
Howard Chu's avatar
Howard Chu committed
956

957
958
done:
	env->me_txn = NULL;
Howard Chu's avatar
Howard Chu committed
959
	/* update the DB tables */
Howard Chu's avatar
Howard Chu committed
960
	{
Howard Chu's avatar
Howard Chu committed
961
		int toggle = !env->me_db_toggle;
Howard Chu's avatar
Howard Chu committed
962

963
964
		for (i = 2; i < env->me_numdbs; i++) {
			if (txn->mt_dbxs[i].md_dirty) {
Howard Chu's avatar
Howard Chu committed
965
				env->me_dbs[toggle][i] = txn->mt_dbs[i];
966
967
				txn->mt_dbxs[i].md_dirty = 0;
			}
Howard Chu's avatar
Howard Chu committed
968
969
		}
		for (i = env->me_numdbs; i < txn->mt_numdbs; i++) {
970
			txn->mt_dbxs[i].md_dirty = 0;
Howard Chu's avatar
Howard Chu committed
971
972
973
974
			env->me_dbxs[i] = txn->mt_dbxs[i];
			env->me_dbs[toggle][i] = txn->mt_dbs[i];
		}
		env->me_db_toggle = toggle;
Howard Chu's avatar
Howard Chu committed
975
		env->me_numdbs = txn->mt_numdbs;
Howard Chu's avatar
Howard Chu committed
976

Howard Chu's avatar
Howard Chu committed
977
		free(txn->mt_dbs);
Howard Chu's avatar
Howard Chu committed
978
979
	}

Howard Chu's avatar
Howard Chu committed
980
	pthread_mutex_unlock(&env->me_txns->mti_wmutex);
981
	free(txn);
Howard Chu's avatar
Howard Chu committed
982
983
984
985
986

	return MDB_SUCCESS;
}

static int
Howard Chu's avatar
Howard Chu committed
987
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
Howard Chu's avatar
Howard Chu committed
988
989
990
{
	char		 page[PAGESIZE];
	MDB_page	*p;
Howard Chu's avatar
Howard Chu committed
991
	MDB_meta	*m;
Howard Chu's avatar
Howard Chu committed
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
	int		 rc;

	assert(env != NULL);

	/* We don't know the page size yet, so use a minimum value.
	 */

	if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) {
		return ENOENT;
	} else if (rc != PAGESIZE) {
		if (rc > 0)
			errno = EINVAL;
		DPRINTF("read: %s", strerror(errno));
		return errno;
	}

	p = (MDB_page *)page;

Howard Chu's avatar
Howard Chu committed
1010
1011
	if (!F_ISSET(p->mp_flags, P_META)) {
		DPRINTF("page %lu not a meta page", p->mp_pgno);
Howard Chu's avatar
Howard Chu committed
1012
1013
1014
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
1015
1016
	m = METADATA(p);
	if (m->mm_magic != MDB_MAGIC) {
1017
		DPUTS("meta has invalid magic");
Howard Chu's avatar
Howard Chu committed
1018
1019
1020
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
1021
	if (m->mm_version != MDB_VERSION) {
Howard Chu's avatar
Howard Chu committed
1022
		DPRINTF("database is version %u, expected version %u",
Howard Chu's avatar
Howard Chu committed
1023
		    m->mm_version, MDB_VERSION);
Howard Chu's avatar
Howard Chu committed
1024
		return MDB_VERSION_MISMATCH;
Howard Chu's avatar
Howard Chu committed
1025
1026
	}

Howard Chu's avatar
Howard Chu committed
1027
	memcpy(meta, m, sizeof(*m));
Howard Chu's avatar
Howard Chu committed
1028
1029
1030
1031
	return 0;
}

static int
Howard Chu's avatar
Howard Chu committed
1032
mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
Howard Chu's avatar
Howard Chu committed
1033
{
1034
	MDB_page *p, *q;
Howard Chu's avatar
Howard Chu committed
1035
	MDB_meta *m;
1036
	int rc;
Howard Chu's avatar
Howard Chu committed
1037
	unsigned int	 psize;
Howard Chu's avatar
Howard Chu committed
1038

1039
	DPUTS("writing new meta page");
Howard Chu's avatar
Howard Chu committed
1040
1041
	psize = sysconf(_SC_PAGE_SIZE);

Howard Chu's avatar
Howard Chu committed
1042
1043
1044
1045
1046
	meta->mm_magic = MDB_MAGIC;
	meta->mm_version = MDB_VERSION;
	meta->mm_psize = psize;
	meta->mm_last_pg = 1;
	meta->mm_flags = env->me_flags & 0xffff;
Howard Chu's avatar
Howard Chu committed
1047
	meta->mm_flags |= MDB_INTEGERKEY;
Howard Chu's avatar
Howard Chu committed
1048
1049
	meta->mm_dbs[0].md_root = P_INVALID;
	meta->mm_dbs[1].md_root = P_INVALID;
Howard Chu's avatar
Howard Chu committed
1050
1051
1052

	p = calloc(2, psize);
	p->mp_pgno = 0;
1053
	p->mp_flags = P_META;
Howard Chu's avatar
Howard Chu committed
1054

Howard Chu's avatar
Howard Chu committed
1055
1056
	m = METADATA(p);
	memcpy(m, meta, sizeof(*meta));
Howard Chu's avatar
Howard Chu committed
1057

Howard Chu's avatar
Howard Chu committed
1058
	q = (MDB_page *)((char *)p + psize);
Howard Chu's avatar
Howard Chu committed
1059

Howard Chu's avatar
Howard Chu committed
1060
	q->mp_pgno = 1;
1061
	q->mp_flags = P_META;
Howard Chu's avatar
Howard Chu committed
1062

Howard Chu's avatar
Howard Chu committed
1063
1064
	m = METADATA(q);
	memcpy(m, meta, sizeof(*meta));
Howard Chu's avatar
Howard Chu committed
1065

Howard Chu's avatar
Howard Chu committed
1066
	rc = write(env->me_fd, p, psize * 2);
1067
	free(p);
Howard Chu's avatar
Howard Chu committed
1068
	return (rc == (int)psize * 2) ? MDB_SUCCESS : errno;
1069
1070
1071
}

static int
Howard Chu's avatar
Howard Chu committed
1072
mdb_env_write_meta(MDB_txn *txn)
1073
1074
{
	MDB_env *env;
Howard Chu's avatar
Howard Chu committed
1075
	MDB_meta	meta, metab;
1076
	off_t off;
Howard Chu's avatar
Howard Chu committed
1077
	int rc, len, toggle;
Howard Chu's avatar
Howard Chu committed
1078
	char *ptr;
Howard Chu's avatar
Howard Chu committed
1079

1080
1081
1082
	assert(txn != NULL);
	assert(txn->mt_env != NULL);

Howard Chu's avatar
Howard Chu committed
1083
	toggle = !F_ISSET(txn->mt_flags, MDB_TXN_METOGGLE);
Howard Chu's avatar
Howard Chu committed
1084
	DPRINTF("writing meta page %d for root page %lu",
Howard Chu's avatar
Howard Chu committed
1085
		toggle, txn->mt_dbs[MAIN_DBI].md_root);
1086
1087
1088

	env = txn->mt_env;

Howard Chu's avatar
Howard Chu committed
1089
1090
1091
	metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
	metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;

Howard Chu's avatar
Howard Chu committed
1092
	ptr = (char *)&meta;
Howard Chu's avatar
Howard Chu committed
1093
	off = offsetof(MDB_meta, mm_dbs[0].md_depth);
Howard Chu's avatar
Howard Chu committed
1094
1095
1096
	len = sizeof(MDB_meta) - off;

	ptr += off;
Howard Chu's avatar
Howard Chu committed
1097
1098
	meta.mm_dbs[0] = txn->mt_dbs[0];
	meta.mm_dbs[1] = txn->mt_dbs[1];
Howard Chu's avatar
Howard Chu committed
1099
1100
	meta.mm_last_pg = txn->mt_next_pgno - 1;
	meta.mm_txnid = txn->mt_txnid;
1101

Howard Chu's avatar
Howard Chu committed
1102
	if (toggle)
Howard Chu's avatar
Howard Chu committed
1103
		off += env->me_psize;
1104
1105
	off += PAGEHDRSZ;

Howard Chu's avatar
Howard Chu committed
1106
1107
	/* Write to the SYNC fd */
	rc = pwrite(env->me_mfd, ptr, len, off);
Howard Chu's avatar
Howard Chu committed
1108
	if (rc != len) {
1109
		DPUTS("write failed, disk error?");
Howard Chu's avatar
Howard Chu committed
1110
1111
1112
1113
1114
1115
1116
1117
		/* On a failure, the pagecache still contains the new data.
		 * Write some old data back, to prevent it from being used.
		 * Use the non-SYNC fd; we know it will fail anyway.
		 */
		meta.mm_last_pg = metab.mm_last_pg;
		meta.mm_txnid = metab.mm_txnid;
		rc = pwrite(env->me_fd, ptr, len, off);
		env->me_flags |= MDB_FATAL_ERROR;
1118
		return errno;
Howard Chu's avatar
Howard Chu committed
1119
	}
Howard Chu's avatar
Howard Chu committed
1120
	txn->mt_env->me_txns->mti_me_toggle = toggle;
Howard Chu's avatar
Howard Chu committed
1121
1122
1123
1124
1125

	return MDB_SUCCESS;
}

static int
Howard Chu's avatar
Howard Chu committed
1126
mdb_env_read_meta(MDB_env *env, int *which)
Howard Chu's avatar
Howard Chu committed
1127
{
Howard Chu's avatar
Howard Chu committed
1128
	int toggle = 0;
Howard Chu's avatar
Howard Chu committed
1129
1130
1131

	assert(env != NULL);

Howard Chu's avatar
Howard Chu committed
1132
1133
1134
	if (which)
		toggle = *which;
	else if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid)
1135
		toggle = 1;
Howard Chu's avatar
Howard Chu committed
1136

Howard Chu's avatar
Howard Chu committed
1137
1138
	if (env->me_meta != env->me_metas[toggle])
		env->me_meta = env->me_metas[toggle];
Howard Chu's avatar
Howard Chu committed
1139