mdb.c 84.3 KB
Newer Older
Howard Chu's avatar
Howard Chu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* mdb.c - memory-mapped database library */
/*
 * Copyright 2011 Howard Chu, Symas Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 *
 * This code is derived from btree.c written by Martin Hedenfalk.
 *
 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
Howard Chu's avatar
Howard Chu committed
30
31
32
33
34
35
36
37
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mman.h>
#ifdef HAVE_SYS_FILE_H
#include <sys/file.h>
#endif
Howard Chu's avatar
Howard Chu committed
38
#include <fcntl.h>
Howard Chu's avatar
Howard Chu committed
39
40
41
42
43
44
45
46
47
48

#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
Howard Chu's avatar
Howard Chu committed
49
#include <pthread.h>
Howard Chu's avatar
Howard Chu committed
50
51

#include "mdb.h"
Howard Chu's avatar
Howard Chu committed
52

Howard Chu's avatar
Howard Chu committed
53
54
/* Unsigned word type used in this file for page numbers, transaction IDs
 * and page/entry counters.
 */
#define ULONG		unsigned long
/* A page number. */
typedef ULONG		pgno_t;
Howard Chu's avatar
Howard Chu committed
55

Howard Chu's avatar
Howard Chu committed
56
#include "midl.h"
Howard Chu's avatar
Howard Chu committed
57

Hallvard Furuseth's avatar
Hallvard Furuseth committed
58
59
60
61
62
63
64
65
/* Note: If O_DSYNC is undefined but exists in /usr/include,
 * preferably set some compiler flag to get the definition.
 * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC.
 */
#ifndef MDB_DSYNC
# define MDB_DSYNC	O_DSYNC
#endif

/* Debug tracing is enabled unless the build overrides DEBUG
 * (for example with -DDEBUG=0).
 */
#ifndef DEBUG
#define DEBUG 1
#endif

/* DPRINTF(fmt, ...) writes "<function>:<line>: <message>\n" to stderr when
 * DEBUG is non-zero and variadic macros are available; otherwise it
 * evaluates to a void expression.  DPUTS(arg) traces a single string.
 */
#if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__))
# define DPRINTF	(void)	/* Vararg macros may be unsupported */
#elif DEBUG
# define DPRINTF(fmt, ...)	/* Requires 2 or more args */ \
	fprintf(stderr, "%s:%d: " fmt "\n", __func__, __LINE__, __VA_ARGS__)
#else
# define DPRINTF(fmt, ...)	((void) 0)
#endif
#define DPUTS(arg)	DPRINTF("%s", arg)

#define PAGESIZE	 4096
#define MDB_MINKEYS	 4
#define MDB_MAGIC	 0xBEEFC0DE
#define MDB_VERSION	 1
#define MAXKEYSIZE	 511

/* Sentinel page number meaning "no page". */
#define P_INVALID	 (~0UL)

/* True when every bit of f is set in w. */
#define F_ISSET(w, f)	 (((w) & (f)) == (f))

/* Byte offset / slot index within a page. */
typedef uint16_t	 indx_t;

#define DEFAULT_READERS	126
#define DEFAULT_MAPSIZE	1048576

/* Lock descriptor stuff */
#ifndef CACHELINE
#define CACHELINE	64	/* most CPUs. Itanium uses 128 */
#endif

Howard Chu's avatar
Howard Chu committed
100
101
102
103
104
105
/* Payload of one reader-table slot. */
typedef struct MDB_rxbody {
	ULONG		mrb_txnid;	/* reader's txn ID; slots holding 0 are skipped when looking for the oldest reader */
	pid_t		mrb_pid;	/* process ID stored for the slot */
	pthread_t	mrb_tid;	/* thread ID stored for the slot */
} MDB_rxbody;

Howard Chu's avatar
Howard Chu committed
106
typedef struct MDB_reader {
Howard Chu's avatar
Howard Chu committed
107
108
109
110
111
112
113
114
	union {
		MDB_rxbody mrx;
#define	mr_txnid	mru.mrx.mrb_txnid
#define	mr_pid	mru.mrx.mrb_pid
#define	mr_tid	mru.mrx.mrb_tid
		/* cache line alignment */
		char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
	} mru;
Howard Chu's avatar
Howard Chu committed
115
116
117
} MDB_reader;

typedef struct MDB_txbody {
Howard Chu's avatar
Howard Chu committed
118
119
120
121
122
	uint32_t	mtb_magic;
	uint32_t	mtb_version;
	pthread_mutex_t	mtb_mutex;
	ULONG		mtb_txnid;
	uint32_t	mtb_numreaders;
Howard Chu's avatar
Howard Chu committed
123
	uint32_t	mtb_me_toggle;
Howard Chu's avatar
Howard Chu committed
124
125
126
} MDB_txbody;

typedef struct MDB_txninfo {
Howard Chu's avatar
Howard Chu committed
127
128
129
130
131
132
133
	union {
		MDB_txbody mtb;
#define mti_magic	mt1.mtb.mtb_magic
#define mti_version	mt1.mtb.mtb_version
#define mti_mutex	mt1.mtb.mtb_mutex
#define mti_txnid	mt1.mtb.mtb_txnid
#define mti_numreaders	mt1.mtb.mtb_numreaders
Howard Chu's avatar
Howard Chu committed
134
#define mti_me_toggle	mt1.mtb.mtb_me_toggle
Howard Chu's avatar
Howard Chu committed
135
136
137
138
139
140
141
142
		char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
	} mt1;
	union {
		pthread_mutex_t	mt2_wmutex;
#define mti_wmutex	mt2.mt2_wmutex
		char pad[(sizeof(pthread_mutex_t)+CACHELINE-1) & ~(CACHELINE-1)];
	} mt2;
	MDB_reader	mti_readers[1];
Howard Chu's avatar
Howard Chu committed
143
144
} MDB_txninfo;

Howard Chu's avatar
Howard Chu committed
145
146
147
148
149
/* Common header for all page types. Overflow pages
 * occupy a number of contiguous pages with no
 * headers on any page after the first.
 */
typedef struct MDB_page {		/* represents a page of storage */
Howard Chu's avatar
Howard Chu committed
150
151
152
#define	mp_pgno		mp_p.p_pgno
	union padded {
		pgno_t		p_pgno;		/* page number */
153
		void *		p_align;	/* for IL32P64 */
Howard Chu's avatar
Howard Chu committed
154
	} mp_p;
Howard Chu's avatar
Howard Chu committed
155
156
157
158
#define	P_BRANCH	 0x01		/* branch page */
#define	P_LEAF		 0x02		/* leaf page */
#define	P_OVERFLOW	 0x04		/* overflow page */
#define	P_META		 0x08		/* meta page */
Howard Chu's avatar
Howard Chu committed
159
#define	P_DIRTY		 0x10		/* dirty page */
Howard Chu's avatar
Howard Chu committed
160
#define	P_LEAF2		 0x20		/* DB with small, fixed size keys and no data */
Howard Chu's avatar
Howard Chu committed
161
162
163
164
165
166
167
168
169
170
	uint32_t	mp_flags;
#define mp_lower	mp_pb.pb.pb_lower
#define mp_upper	mp_pb.pb.pb_upper
#define mp_pages	mp_pb.pb_pages
	union page_bounds {
		struct {
			indx_t		pb_lower;		/* lower bound of free space */
			indx_t		pb_upper;		/* upper bound of free space */
		} pb;
		uint32_t	pb_pages;	/* number of overflow pages */
Howard Chu's avatar
Howard Chu committed
171
172
173
174
		struct {
			indx_t	pb_ksize;	/* on a LEAF2 page */
			indx_t	pb_numkeys;
		} pb2;
Howard Chu's avatar
Howard Chu committed
175
176
177
178
	} mp_pb;
	indx_t		mp_ptrs[1];		/* dynamic size */
} MDB_page;

179
/* Size of the fixed page header, i.e. the offset of the mp_ptrs[] array. */
#define PAGEHDRSZ	 ((unsigned) offsetof(MDB_page, mp_ptrs))

/* Number of 2-byte mp_ptrs[] slots between the header and mp_lower. */
#define NUMKEYS(p)	 (((p)->mp_lower - PAGEHDRSZ) >> 1)
/* Bytes between mp_lower and mp_upper. */
#define SIZELEFT(p)	 (indx_t)((p)->mp_upper - (p)->mp_lower)
/* Used share of the page body, in units of 1/1000. */
#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
				((env)->me_psize - PAGEHDRSZ))
#define IS_LEAF(p)	 F_ISSET((p)->mp_flags, P_LEAF)
#define IS_BRANCH(p)	 F_ISSET((p)->mp_flags, P_BRANCH)
#define IS_OVERFLOW(p)	 F_ISSET((p)->mp_flags, P_OVERFLOW)

/* Pages needed to hold one page header plus size bytes (rounded up). */
#define OVPAGES(size, psize)	((PAGEHDRSZ-1 + (size)) / (psize) + 1)
Howard Chu's avatar
Howard Chu committed
190

Howard Chu's avatar
Howard Chu committed
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/* Per-database record: flags, tree depth, page/entry counters and root page. */
typedef struct MDB_db {
	uint32_t	md_pad;		/* aliased as mm_psize for mm_dbs[0] */
	uint16_t	md_flags;	/* aliased as mm_flags for mm_dbs[0]; tested against MDB_REVERSEKEY in mdb_cmp */
	uint16_t	md_depth;
	ULONG		md_branch_pages;
	ULONG		md_leaf_pages;
	ULONG		md_overflow_pages;
	ULONG		md_entries;
	pgno_t		md_root;	/* root page number; compared against P_INVALID before the tree is searched */
} MDB_db;

/* Indexes of the two built-in databases: free-space DB first, main DB second. */
#define	FREE_DBI	0
#define	MAIN_DBI	1

Howard Chu's avatar
Howard Chu committed
205
typedef struct MDB_meta {			/* meta (footer) page content */
Howard Chu's avatar
Howard Chu committed
206
207
208
209
	uint32_t	mm_magic;
	uint32_t	mm_version;
	void		*mm_address;		/* address for fixed mapping */
	size_t		mm_mapsize;			/* size of mmap region */
Howard Chu's avatar
Howard Chu committed
210
211
212
	MDB_db		mm_dbs[2];			/* first is free space, 2nd is main db */
#define	mm_psize	mm_dbs[0].md_pad
#define	mm_flags	mm_dbs[0].md_flags
Howard Chu's avatar
Howard Chu committed
213
214
	pgno_t		mm_last_pg;			/* last used page in file */
	ULONG		mm_txnid;			/* txnid that committed this page */
Howard Chu's avatar
Howard Chu committed
215
216
217
218
} MDB_meta;

/* Bookkeeping header placed in front of each dirty page buffer. */
typedef struct MDB_dhead {					/* a dirty page */
	MDB_page	*md_parent;			/* parent page; also reused as the "next" link on env->me_dpages */
	unsigned	md_pi;				/* parent index */
	int			md_num;				/* number of pages in this buffer */
} MDB_dhead;

/* A dirty page buffer: bookkeeping header followed by the page itself.
 * Allocated as me_psize * num + sizeof(MDB_dhead) bytes in mdb_alloc_page.
 */
typedef struct MDB_dpage {
	MDB_dhead	h;
	MDB_page	p;
} MDB_dpage;

Howard Chu's avatar
Howard Chu committed
228
229
typedef struct MDB_oldpages {
	struct MDB_oldpages *mo_next;
Howard Chu's avatar
Howard Chu committed
230
	ULONG		mo_txnid;
Howard Chu's avatar
Howard Chu committed
231
232
233
	pgno_t		mo_pages[1];	/* dynamic */
} MDB_oldpages;

Howard Chu's avatar
Howard Chu committed
234
235
236
/* A page together with its parent page and its slot index in that parent. */
typedef struct MDB_pageparent {
	MDB_page *mp_page;
	MDB_page *mp_parent;	/* NULL when there is no parent to update */
	unsigned mp_pi;			/* index of mp_page's node within mp_parent */
} MDB_pageparent;

Howard Chu's avatar
Howard Chu committed
240
static MDB_dpage *mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num);
Howard Chu's avatar
Howard Chu committed
241
242
243
244
245
246
247
static int 		mdb_touch(MDB_txn *txn, MDB_pageparent *mp);

/* One entry of a cursor's page stack. */
typedef struct MDB_ppage {					/* ordered list of pages */
	MDB_page		*mp_page;
	unsigned int	mp_ki;		/* cursor index on page */
} MDB_ppage;

Howard Chu's avatar
Howard Chu committed
248
249
/* Last pushed stack entry, and the entry below it.  Neither macro checks
 * mc_snum, so the stack must hold at least 1 (resp. 2) entries.
 */
#define CURSOR_TOP(c)		 (&(c)->mc_stack[(c)->mc_snum-1])
#define CURSOR_PARENT(c)	 (&(c)->mc_stack[(c)->mc_snum-2])
Howard Chu's avatar
Howard Chu committed
250

Howard Chu's avatar
Howard Chu committed
251
/* Forward declaration; struct MDB_cursor holds a pointer to it. */
struct MDB_xcursor;
252

Howard Chu's avatar
Howard Chu committed
253
254
struct MDB_cursor {
	MDB_txn		*mc_txn;
Howard Chu's avatar
Howard Chu committed
255
256
	MDB_ppage	mc_stack[32];		/* stack of parent pages */
	unsigned int	mc_snum;		/* number of pushed pages */
Howard Chu's avatar
Howard Chu committed
257
	MDB_dbi		mc_dbi;
Howard Chu's avatar
Howard Chu committed
258
259
	short		mc_initialized;	/* 1 if initialized */
	short		mc_eof;		/* 1 if end is reached */
Howard Chu's avatar
Howard Chu committed
260
	struct MDB_xcursor	*mc_xcursor;
Howard Chu's avatar
Howard Chu committed
261
262
};

263
/* Address of the first byte after the page header of p. */
#define METADATA(p)	 ((void *)((char *)(p) + PAGEHDRSZ))
Howard Chu's avatar
Howard Chu committed
264
265
266
267
268
269
270
271

/* Header of a node stored in a page.  mn_p holds either a child page
 * number or the leaf data size.  The key starts at mn_data and NODEDATA()
 * locates the data immediately after the mn_ksize key bytes.
 */
typedef struct MDB_node {
#define mn_pgno		 mn_p.np_pgno
#define mn_dsize	 mn_p.np_dsize
	union {
		pgno_t		 np_pgno;	/* child page number */
		uint32_t	 np_dsize;	/* leaf data size */
	} mn_p;
	unsigned int	mn_flags:4;
	unsigned int	mn_ksize:12;			/* key size */
#define F_BIGDATA	 0x01			/* data put on overflow page */
#define F_SUBDATA	 0x02			/* data is a sub-database */
#define F_DUPDATA	 0x04			/* data has duplicates */
	char		mn_data[1];
} MDB_node;

Howard Chu's avatar
Howard Chu committed
280
typedef struct MDB_dbx {
281
	MDB_val		md_name;
Howard Chu's avatar
Howard Chu committed
282
283
284
	MDB_cmp_func	*md_cmp;		/* user compare function */
	MDB_cmp_func	*md_dcmp;		/* user dupsort function */
	MDB_rel_func	*md_rel;		/* user relocate function */
285
286
	MDB_dbi	md_parent;
	unsigned int	md_dirty;
Howard Chu's avatar
Howard Chu committed
287
288
} MDB_dbx;

Howard Chu's avatar
Howard Chu committed
289
290
struct MDB_txn {
	pgno_t		mt_next_pgno;	/* next unallocated page */
Howard Chu's avatar
Howard Chu committed
291
292
	ULONG		mt_txnid;
	ULONG		mt_oldest;
Howard Chu's avatar
Howard Chu committed
293
	MDB_env		*mt_env;	
Howard Chu's avatar
Howard Chu committed
294
	pgno_t		*mt_free_pgs;	/* this is an IDL */
Howard Chu's avatar
Howard Chu committed
295
	union {
Howard Chu's avatar
Howard Chu committed
296
		MIDL2	*dirty_list;	/* modified pages */
Howard Chu's avatar
Howard Chu committed
297
298
		MDB_reader	*reader;
	} mt_u;
Howard Chu's avatar
Howard Chu committed
299
	MDB_dbx		*mt_dbxs;		/* array */
Howard Chu's avatar
Howard Chu committed
300
	MDB_db		*mt_dbs;
Howard Chu's avatar
Howard Chu committed
301
302
	unsigned int	mt_numdbs;

Howard Chu's avatar
Howard Chu committed
303
304
#define MDB_TXN_RDONLY		0x01		/* read-only transaction */
#define MDB_TXN_ERROR		0x02		/* an error has occurred */
Howard Chu's avatar
Howard Chu committed
305
#define MDB_TXN_METOGGLE	0x04		/* used meta page 1 */
Howard Chu's avatar
Howard Chu committed
306
	unsigned int	mt_flags;
Howard Chu's avatar
Howard Chu committed
307
308
};

Howard Chu's avatar
Howard Chu committed
309
310
311
312
313
314
315
316
/* Context for sorted-dup records: an embedded cursor and transaction
 * together with their own small MDB_dbx / MDB_db arrays.
 */
typedef struct MDB_xcursor {
	MDB_cursor mx_cursor;
	MDB_txn mx_txn;
	MDB_dbx	mx_dbxs[4];
	MDB_db	mx_dbs[4];
} MDB_xcursor;

Howard Chu's avatar
Howard Chu committed
317
318
struct MDB_env {
	int			me_fd;
Howard Chu's avatar
Howard Chu committed
319
	int			me_lfd;
Howard Chu's avatar
Howard Chu committed
320
	int			me_mfd;			/* just for writing the meta pages */
Howard Chu's avatar
Howard Chu committed
321
322
323
#define	MDB_FATAL_ERROR	0x80000000U
	uint32_t 	me_flags;
	uint32_t	me_extrapad;	/* unused for now */
Howard Chu's avatar
Howard Chu committed
324
325
326
	unsigned int	me_maxreaders;
	unsigned int	me_numdbs;
	unsigned int	me_maxdbs;
Howard Chu's avatar
Howard Chu committed
327
	char		*me_path;
Howard Chu's avatar
Howard Chu committed
328
	char		*me_map;
Howard Chu's avatar
Howard Chu committed
329
	MDB_txninfo	*me_txns;
Howard Chu's avatar
Howard Chu committed
330
331
	MDB_meta	*me_metas[2];
	MDB_meta	*me_meta;
Howard Chu's avatar
Howard Chu committed
332
	MDB_txn		*me_txn;		/* current write transaction */
Howard Chu's avatar
Howard Chu committed
333
334
	size_t		me_mapsize;
	off_t		me_size;		/* current file size */
Howard Chu's avatar
Howard Chu committed
335
336
337
	pgno_t		me_maxpg;		/* me_mapsize / me_psize */
	unsigned int	me_psize;
	unsigned int	me_db_toggle;
Howard Chu's avatar
Howard Chu committed
338
339
	MDB_dbx		*me_dbxs;		/* array */
	MDB_db		*me_dbs[2];
Howard Chu's avatar
Howard Chu committed
340
	MDB_oldpages *me_pghead;
Howard Chu's avatar
Howard Chu committed
341
	pthread_key_t	me_txkey;	/* thread-key for readers */
342
	MDB_dpage	*me_dpages;
343
	pgno_t		me_free_pgs[MDB_IDL_UM_SIZE];
Howard Chu's avatar
Howard Chu committed
344
	MIDL2		me_dirty_list[MDB_IDL_DB_SIZE];
Howard Chu's avatar
Howard Chu committed
345
346
347
348
349
350
351
352
353
354
355
356
357
358
};

/* Size of the fixed node header, i.e. the offset of mn_data. */
#define NODESIZE	 offsetof(MDB_node, mn_data)

/* Node sizes: header plus key (INDXSIZE, key may be NULL) or header plus key and data (LEAFSIZE). */
#define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
#define LEAFSIZE(k, d)	 (NODESIZE + (k)->mv_size + (d)->mv_size)
/* Node whose byte offset within page p is stored in slot i of mp_ptrs[]. */
#define NODEPTR(p, i)	 ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
/* Key bytes start at mn_data; data bytes follow the mn_ksize key bytes. */
#define NODEKEY(node)	 (void *)((node)->mn_data)
#define NODEDATA(node)	 (void *)((char *)(node)->mn_data + (node)->mn_ksize)
#define NODEPGNO(node)	 ((node)->mn_pgno)
#define NODEDSZ(node)	 ((node)->mn_dsize)

#define MDB_COMMIT_PAGES	 64	/* max number of pages to write in one commit */

Howard Chu's avatar
Howard Chu committed
359
360
static int  mdb_search_page_root(MDB_txn *txn,
			    MDB_dbi dbi, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
361
362
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);
Howard Chu's avatar
Howard Chu committed
363
364
static int  mdb_search_page(MDB_txn *txn,
			    MDB_dbi dbi, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
365
366
367
			    MDB_cursor *cursor, int modify,
			    MDB_pageparent *mpp);

Howard Chu's avatar
Howard Chu committed
368
369
370
static int  mdb_env_read_header(MDB_env *env, MDB_meta *meta);
static int  mdb_env_read_meta(MDB_env *env, int *which);
static int  mdb_env_write_meta(MDB_txn *txn);
Howard Chu's avatar
Howard Chu committed
371
static MDB_page *mdb_get_page(MDB_txn *txn, pgno_t pgno);
Howard Chu's avatar
Howard Chu committed
372

Howard Chu's avatar
Howard Chu committed
373
static MDB_node *mdb_search_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp,
Howard Chu's avatar
Howard Chu committed
374
			    MDB_val *key, int *exactp, unsigned int *kip);
Howard Chu's avatar
Howard Chu committed
375
static int  mdb_add_node(MDB_txn *txn, MDB_dbi dbi, MDB_page *mp,
Howard Chu's avatar
Howard Chu committed
376
377
			    indx_t indx, MDB_val *key, MDB_val *data,
			    pgno_t pgno, uint8_t flags);
Howard Chu's avatar
Howard Chu committed
378
static void mdb_del_node(MDB_page *mp, indx_t indx);
Howard Chu's avatar
Howard Chu committed
379
380
static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, unsigned int ki,
    MDB_pageparent *mpp, MDB_node *leaf);
Howard Chu's avatar
Howard Chu committed
381
382
static int mdb_put0(MDB_txn *txn, MDB_dbi dbi,
    MDB_val *key, MDB_val *data, unsigned int flags);
Howard Chu's avatar
Howard Chu committed
383
static int  mdb_read_data(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
Howard Chu's avatar
Howard Chu committed
384
385
386
387

static int		 mdb_rebalance(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *mp);
static int		 mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key);
static int		 mdb_move_node(MDB_txn *txn, MDB_dbi dbi, 
Howard Chu's avatar
Howard Chu committed
388
389
				MDB_pageparent *src, indx_t srcindx,
				MDB_pageparent *dst, indx_t dstindx);
Howard Chu's avatar
Howard Chu committed
390
static int		 mdb_merge(MDB_txn *txn, MDB_dbi dbi, MDB_pageparent *src,
Howard Chu's avatar
Howard Chu committed
391
			    MDB_pageparent *dst);
Howard Chu's avatar
Howard Chu committed
392
static int		 mdb_split(MDB_txn *txn, MDB_dbi dbi, MDB_page **mpp,
Howard Chu's avatar
Howard Chu committed
393
394
			    unsigned int *newindxp, MDB_val *newkey,
			    MDB_val *newdata, pgno_t newpgno);
Howard Chu's avatar
Howard Chu committed
395
static MDB_dpage *mdb_new_page(MDB_txn *txn, MDB_dbi dbi, uint32_t flags, int num);
Howard Chu's avatar
Howard Chu committed
396
397
398
399
400

static void		 cursor_pop_page(MDB_cursor *cursor);
static MDB_ppage *cursor_push_page(MDB_cursor *cursor,
			    MDB_page *mp);

Howard Chu's avatar
Howard Chu committed
401
static int		 mdb_set_key(MDB_node *node, MDB_val *key);
Howard Chu's avatar
Howard Chu committed
402
403
static int		 mdb_sibling(MDB_cursor *cursor, int move_right);
static int		 mdb_cursor_next(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
404
			    MDB_val *key, MDB_val *data, MDB_cursor_op op);
Howard Chu's avatar
Howard Chu committed
405
static int		 mdb_cursor_prev(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
406
			    MDB_val *key, MDB_val *data, MDB_cursor_op op);
Howard Chu's avatar
Howard Chu committed
407
static int		 mdb_cursor_set(MDB_cursor *cursor,
Howard Chu's avatar
Howard Chu committed
408
			    MDB_val *key, MDB_val *data, MDB_cursor_op op, int *exactp);
Howard Chu's avatar
Howard Chu committed
409
410
static int		 mdb_cursor_first(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);
411
412
static int		 mdb_cursor_last(MDB_cursor *cursor,
			    MDB_val *key, MDB_val *data);
Howard Chu's avatar
Howard Chu committed
413

Howard Chu's avatar
Howard Chu committed
414
static void		mdb_xcursor_init0(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
Howard Chu's avatar
Howard Chu committed
415
static void		mdb_xcursor_init1(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx, MDB_node *node);
Howard Chu's avatar
Howard Chu committed
416
417
static void		mdb_xcursor_fini(MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);

Howard Chu's avatar
Howard Chu committed
418
static size_t		 mdb_leaf_size(MDB_env *env, MDB_val *key,
Howard Chu's avatar
Howard Chu committed
419
			    MDB_val *data);
Howard Chu's avatar
Howard Chu committed
420
static size_t		 mdb_branch_size(MDB_env *env, MDB_val *key);
Howard Chu's avatar
Howard Chu committed
421
422
423
424
425
426
427
428
429

static int		 memncmp(const void *s1, size_t n1,
				 const void *s2, size_t n2);
static int		 memnrcmp(const void *s1, size_t n1,
				  const void *s2, size_t n2);

/* Compare two byte strings front to back.
 *
 * The first min(n1, n2) bytes are compared with memcmp(); if they differ,
 * memcmp()'s result is returned.  Otherwise the shorter string sorts
 * first: returns -1 if n1 < n2, 1 if n1 > n2 and 0 if the lengths match.
 */
static int
memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
	size_t common = (n1 < n2) ? n1 : n2;
	int len_diff = (n1 < n2) ? -1 : (n1 > n2);
	int diff;

	/* Skip the call for an empty prefix so that a NULL pointer paired
	 * with a zero length is never handed to memcmp().
	 */
	diff = common ? memcmp(s1, s2, common) : 0;
	return diff ? diff : len_diff;
}

/* Compare two byte strings back to front (last byte is most significant).
 *
 * The trailing min(n1, n2) bytes are compared starting from the end; the
 * first differing pair decides and the byte difference is returned.  If
 * one string is a suffix of the other, the shorter one sorts first:
 * returns -1 if n1 < n2, 1 if n1 > n2 and 0 if the lengths match.
 *
 * The walk is bounded by the shorter length, so neither buffer is ever
 * read before its first byte.
 */
static int
memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
	const unsigned char *p1, *p2;
	size_t common;

	/* An empty string sorts before any non-empty one. */
	if (n1 == 0 || n2 == 0)
		return (n1 != 0) - (n2 != 0);

	p1 = (const unsigned char *)s1 + n1;
	p2 = (const unsigned char *)s2 + n2;

	for (common = (n1 < n2) ? n1 : n2; common > 0; common--) {
		--p1;
		--p2;
		if (*p1 != *p2)
			return *p1 - *p2;
	}

	if (n1 == n2)
		return 0;
	return (n1 < n2) ? -1 : 1;
}

Howard Chu's avatar
Howard Chu committed
460
461
462
463
464
465
466
467
468
/* Report the library version.
 *
 * maj, min, pat: if non-NULL, receive MDB_VERSION_MAJOR, MDB_VERSION_MINOR
 * and MDB_VERSION_PATCH respectively.  Callers that only want the string
 * may pass NULL.
 * Returns MDB_VERSION_STRING.
 */
char *
mdb_version(int *maj, int *min, int *pat)
{
	if (maj != NULL)
		*maj = MDB_VERSION_MAJOR;
	if (min != NULL)
		*min = MDB_VERSION_MINOR;
	if (pat != NULL)
		*pat = MDB_VERSION_PATCH;
	return MDB_VERSION_STRING;
}

469
/* Messages for the library's own error codes, indexed by
 * (code - MDB_KEYEXIST) in mdb_strerror().
 */
static char *const errstr[] = {
	"MDB_KEYEXIST: Key/data pair already exists",
	"MDB_NOTFOUND: No matching key/data pair found",
	"MDB_PAGE_NOTFOUND: Requested page not found",
	"MDB_CORRUPTED: Located page was wrong type",
	"MDB_PANIC: Update of meta page failed",
	"MDB_VERSION_MISMATCH: Database environment version mismatch"
};

char *
mdb_strerror(int err)
{
	if (!err)
		return ("Successful return: 0");

	if (err >= MDB_KEYEXIST && err <= MDB_VERSION_MISMATCH)
485
		return errstr[err - MDB_KEYEXIST];
486
487
488
489

	return strerror(err);
}

Howard Chu's avatar
Howard Chu committed
490
int
Howard Chu's avatar
Howard Chu committed
491
mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
Howard Chu's avatar
Howard Chu committed
492
{
Howard Chu's avatar
Howard Chu committed
493
494
	if (txn->mt_dbxs[dbi].md_cmp)
		return txn->mt_dbxs[dbi].md_cmp(a, b);
Howard Chu's avatar
Howard Chu committed
495

Howard Chu's avatar
Howard Chu committed
496
497
498
499
500
	if (txn->mt_dbs[dbi].md_flags & (MDB_REVERSEKEY
#if __BYTE_ORDER == __LITTLE_ENDIAN
		|MDB_INTEGERKEY
#endif
	))
Howard Chu's avatar
Howard Chu committed
501
		return memnrcmp(a->mv_data, a->mv_size, b->mv_data, b->mv_size);
Howard Chu's avatar
Howard Chu committed
502
	else
Howard Chu's avatar
Howard Chu committed
503
504
505
506
507
508
509
510
511
512
		return memncmp((char *)a->mv_data, a->mv_size, b->mv_data, b->mv_size);
}

/* Compare two data items of database dbi: the md_dcmp callback when one is
 * set, otherwise a front-to-back byte comparison (memncmp).
 */
int
mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
{
	if (txn->mt_dbxs[dbi].md_dcmp)
		return txn->mt_dbxs[dbi].md_dcmp(a, b);

	return memncmp(a->mv_data, a->mv_size, b->mv_data, b->mv_size);
}

/* Allocate new page(s) for writing */
static MDB_dpage *
Howard Chu's avatar
Howard Chu committed
517
mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num)
Howard Chu's avatar
Howard Chu committed
518
519
{
	MDB_dpage *dp;
Howard Chu's avatar
Howard Chu committed
520
	pgno_t pgno = P_INVALID;
Howard Chu's avatar
Howard Chu committed
521
	ULONG oldest;
Howard Chu's avatar
Howard Chu committed
522
	MIDL2 mid;
Howard Chu's avatar
Howard Chu committed
523

Howard Chu's avatar
Howard Chu committed
524
525
526
	if (txn->mt_txnid > 2) {

	oldest = txn->mt_txnid - 2;
Howard Chu's avatar
Howard Chu committed
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
	if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
		/* See if there's anything in the free DB */
		MDB_pageparent mpp;
		MDB_node *leaf;
		ULONG *kptr;

		mpp.mp_parent = NULL;
		mpp.mp_pi = 0;
		mdb_search_page(txn, FREE_DBI, NULL, NULL, 0, &mpp);
		leaf = NODEPTR(mpp.mp_page, 0);
		kptr = (ULONG *)NODEKEY(leaf);

		/* It's potentially usable, unless there are still
		 * older readers outstanding. Grab it.
		 */
		if (oldest > *kptr) {
			MDB_oldpages *mop;
			MDB_val data;
			pgno_t *idl;

Howard Chu's avatar
Howard Chu committed
547
			mdb_read_data(txn, leaf, &data);
Howard Chu's avatar
Howard Chu committed
548
549
550
551
552
553
554
			idl = (ULONG *)data.mv_data;
			mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t));
			mop->mo_next = txn->mt_env->me_pghead;
			mop->mo_txnid = *kptr;
			txn->mt_env->me_pghead = mop;
			memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl));

Howard Chu's avatar
Howard Chu committed
555
556
557
558
559
560
561
562
563
564
#if DEBUG > 1
			{
				unsigned int i;
				DPRINTF("IDL read txn %lu root %lu num %lu",
					mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
				for (i=0; i<idl[0]; i++) {
					DPRINTF("IDL %lu", idl[i+1]);
				}
			}
#endif
Howard Chu's avatar
Howard Chu committed
565
566
567
568
569
570
571
572
			/* drop this IDL from the DB */
			mpp.mp_parent = NULL;
			mpp.mp_pi = 0;
			mdb_search_page(txn, FREE_DBI, NULL, NULL, 1, &mpp);
			leaf = NODEPTR(mpp.mp_page, 0);
			mdb_del0(txn, FREE_DBI, 0, &mpp, leaf);
		}
	}
Howard Chu's avatar
Howard Chu committed
573
	if (txn->mt_env->me_pghead) {
Howard Chu's avatar
Howard Chu committed
574
		unsigned int i;
Howard Chu's avatar
Howard Chu committed
575
576
		for (i=0; i<txn->mt_env->me_txns->mti_numreaders; i++) {
			ULONG mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid;
Howard Chu's avatar
Howard Chu committed
577
578
			if (!mr) continue;
			if (mr < oldest)
Howard Chu's avatar
Howard Chu committed
579
				oldest = txn->mt_env->me_txns->mti_readers[i].mr_txnid;
Howard Chu's avatar
Howard Chu committed
580
581
582
583
584
		}
		if (oldest > txn->mt_env->me_pghead->mo_txnid) {
			MDB_oldpages *mop = txn->mt_env->me_pghead;
			txn->mt_oldest = oldest;
			if (num > 1) {
585
586
587
588
				/* FIXME: For now, always use fresh pages. We
				 * really ought to search the free list for a
				 * contiguous range.
				 */
Howard Chu's avatar
Howard Chu committed
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
				;
			} else {
				/* peel pages off tail, so we only have to truncate the list */
				pgno = MDB_IDL_LAST(mop->mo_pages);
				if (MDB_IDL_IS_RANGE(mop->mo_pages)) {
					mop->mo_pages[2]++;
					if (mop->mo_pages[2] > mop->mo_pages[1])
						mop->mo_pages[0] = 0;
				} else {
					mop->mo_pages[0]--;
				}
				if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
					txn->mt_env->me_pghead = mop->mo_next;
					free(mop);
				}
			}
		}
	}
Howard Chu's avatar
Howard Chu committed
607
	}
Howard Chu's avatar
Howard Chu committed
608

Howard Chu's avatar
Howard Chu committed
609
610
611
612
613
	if (pgno == P_INVALID) {
		/* DB size is maxed out */
		if (txn->mt_next_pgno + num >= txn->mt_env->me_maxpg)
			return NULL;
	}
614
615
616
617
618
619
620
	if (txn->mt_env->me_dpages && num == 1) {
		dp = txn->mt_env->me_dpages;
		txn->mt_env->me_dpages = (MDB_dpage *)dp->h.md_parent;
	} else {
		if ((dp = malloc(txn->mt_env->me_psize * num + sizeof(MDB_dhead))) == NULL)
			return NULL;
	}
Howard Chu's avatar
Howard Chu committed
621
622
623
	dp->h.md_num = num;
	dp->h.md_parent = parent;
	dp->h.md_pi = parent_idx;
Howard Chu's avatar
Howard Chu committed
624
625
626
627
628
629
	if (pgno == P_INVALID) {
		dp->p.mp_pgno = txn->mt_next_pgno;
		txn->mt_next_pgno += num;
	} else {
		dp->p.mp_pgno = pgno;
	}
Howard Chu's avatar
Howard Chu committed
630
631
632
	mid.mid = dp->p.mp_pgno;
	mid.mptr = dp;
	mdb_midl2_insert(txn->mt_u.dirty_list, &mid);
Howard Chu's avatar
Howard Chu committed
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648

	return dp;
}

/* Touch a page: make it dirty and re-insert into tree with updated pgno.
 *
 * If pp->mp_page is not already dirty, a new dirty page is allocated, the
 * old contents are copied into it, the old page number is added to the
 * transaction's free-page list, and the parent's node (if any) is pointed
 * at the new page number.  pp->mp_page is updated to the writable copy.
 *
 * Returns 0 on success, or ENOMEM if no page could be allocated.
 */
static int
mdb_touch(MDB_txn *txn, MDB_pageparent *pp)
{
	MDB_page *mp;
	pgno_t	pgno;

	assert(txn != NULL);
	assert(pp != NULL);

	/* Only read through pp once it has been checked. */
	mp = pp->mp_page;

	if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
		MDB_dpage *dp;
		if ((dp = mdb_alloc_page(txn, pp->mp_parent, pp->mp_pi, 1)) == NULL)
			return ENOMEM;
		DPRINTF("touched page %lu -> %lu", mp->mp_pgno, dp->p.mp_pgno);
		mdb_midl_insert(txn->mt_free_pgs, mp->mp_pgno);
		/* The copy below overwrites the new page's header, so keep
		 * its page number and restore it afterwards.
		 */
		pgno = dp->p.mp_pgno;
		memcpy(&dp->p, mp, txn->mt_env->me_psize);
		mp = &dp->p;
		mp->mp_pgno = pgno;
		mp->mp_flags |= P_DIRTY;

		/* Update the page number to new touched page. */
		if (pp->mp_parent != NULL)
			NODEPGNO(NODEPTR(pp->mp_parent, pp->mp_pi)) = mp->mp_pgno;
		pp->mp_page = mp;
	}
	return 0;
}

int
Howard Chu's avatar
Howard Chu committed
668
mdb_env_sync(MDB_env *env, int force)
Howard Chu's avatar
Howard Chu committed
669
670
{
	int rc = 0;
Howard Chu's avatar
Howard Chu committed
671
	if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
Howard Chu's avatar
Howard Chu committed
672
		if (fdatasync(env->me_fd))
Howard Chu's avatar
Howard Chu committed
673
674
675
676
677
678
679
680
681
			rc = errno;
	}
	return rc;
}

/* Begin a new transaction on env.
 *
 * env    - the environment to operate in.
 * rdonly - non-zero for a read-only transaction; zero for a write
 *          transaction, which takes the writer mutex (mti_wmutex) and
 *          holds it until the transaction is committed or aborted.
 * ret    - receives the new transaction handle on success.
 *
 * Returns MDB_SUCCESS, or:
 *   MDB_PANIC - the environment has MDB_FATAL_ERROR set;
 *   ENOMEM    - the transaction or its DB array could not be allocated;
 *   ENOSPC    - read-only transaction and every reader slot is in use;
 *   otherwise the error returned by mdb_env_read_meta().
 */
int
mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret)
{
	MDB_txn	*txn;
	int rc, toggle;

	if (env->me_flags & MDB_FATAL_ERROR) {
		DPUTS("mdb_txn_begin: environment had fatal error, must shutdown!");
		return MDB_PANIC;
	}
	if ((txn = calloc(1, sizeof(MDB_txn))) == NULL) {
		DPRINTF("calloc: %s", strerror(errno));
		return ENOMEM;
	}

	/* Allocate the per-transaction DB array before any lock is taken or
	 * shared state is modified, so an allocation failure needs no unwinding
	 * and mdb_txn_abort() never sees a NULL mt_dbs.
	 */
	if ((txn->mt_dbs = calloc(env->me_maxdbs, sizeof(MDB_db))) == NULL) {
		DPRINTF("calloc: %s", strerror(errno));
		free(txn);
		return ENOMEM;
	}

	if (rdonly) {
		txn->mt_flags |= MDB_TXN_RDONLY;
	} else {
		/* env->me_dirty_list and env->me_free_pgs are shared by every
		 * write transaction on this environment, so they may only be
		 * reset once the writer mutex is held.
		 */
		pthread_mutex_lock(&env->me_txns->mti_wmutex);
		txn->mt_u.dirty_list = env->me_dirty_list;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_free_pgs = env->me_free_pgs;
		txn->mt_free_pgs[0] = 0;
		env->me_txns->mti_txnid++;
	}

	txn->mt_txnid = env->me_txns->mti_txnid;
	if (rdonly) {
		MDB_reader *r = pthread_getspecific(env->me_txkey);
		if (!r) {
			/* First read transaction on this thread: claim a free
			 * slot (mr_pid == 0) in the reader table.
			 */
			unsigned int i;
			pthread_mutex_lock(&env->me_txns->mti_mutex);
			for (i=0; i<env->me_txns->mti_numreaders; i++)
				if (env->me_txns->mti_readers[i].mr_pid == 0)
					break;
			if (i == env->me_maxreaders) {
				pthread_mutex_unlock(&env->me_txns->mti_mutex);
				/* Nothing else refers to txn yet; release it. */
				free(txn->mt_dbs);
				free(txn);
				return ENOSPC;
			}
			env->me_txns->mti_readers[i].mr_pid = getpid();
			env->me_txns->mti_readers[i].mr_tid = pthread_self();
			r = &env->me_txns->mti_readers[i];
			pthread_setspecific(env->me_txkey, r);
			if (i >= env->me_txns->mti_numreaders)
				env->me_txns->mti_numreaders = i+1;
			pthread_mutex_unlock(&env->me_txns->mti_mutex);
		}
		r->mr_txnid = txn->mt_txnid;
		txn->mt_u.reader = r;
	} else {
		env->me_txn = txn;
	}

	txn->mt_env = env;

	toggle = env->me_txns->mti_me_toggle;
	if ((rc = mdb_env_read_meta(env, &toggle)) != MDB_SUCCESS) {
		/* mdb_txn_abort() undoes the reader/writer bookkeeping above
		 * and frees txn together with its DB array.
		 */
		mdb_txn_abort(txn);
		return rc;
	}

	/* Copy the DB arrays */
	txn->mt_numdbs = env->me_numdbs;
	txn->mt_dbxs = env->me_dbxs;	/* mostly static anyway */
	memcpy(txn->mt_dbs, env->me_meta->mm_dbs, 2 * sizeof(MDB_db));
	if (txn->mt_numdbs > 2)
		memcpy(txn->mt_dbs+2, env->me_dbs[env->me_db_toggle]+2,
			(txn->mt_numdbs - 2) * sizeof(MDB_db));

	if (!rdonly) {
		if (toggle)
			txn->mt_flags |= MDB_TXN_METOGGLE;
		txn->mt_next_pgno = env->me_meta->mm_last_pg+1;
	}

	DPRINTF("begin transaction %lu on mdbenv %p, root page %lu",
		txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);

	*ret = txn;
	return MDB_SUCCESS;
}

/* Abandon a transaction and free its handle. A NULL txn is ignored.
 *
 * Read-only transactions clear the txnid in their reader slot. Write
 * transactions hand their dirty pages back (single pages go onto the
 * environment's me_dpages list, multi-page allocations are freed), free
 * the environment's me_pghead list, undo the txnid increment, clear the
 * md_dirty flags of the named databases, and release the writer mutex.
 */
void
mdb_txn_abort(MDB_txn *txn)
{
	MDB_env	*env;

	if (txn == NULL)
		return;

	env = txn->mt_env;
	/* mt_dbs can still be NULL if the transaction is aborted before its
	 * DB array was set up, so do not dereference it blindly here.
	 */
	DPRINTF("abort transaction %lu on mdbenv %p, root page %lu",
		txn->mt_txnid, (void *) env,
		txn->mt_dbs ? txn->mt_dbs[MAIN_DBI].md_root : P_INVALID);

	free(txn->mt_dbs);

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		txn->mt_u.reader->mr_txnid = 0;
	} else {
		MDB_oldpages *mop;
		MDB_dpage *dp;
		unsigned int i;

		/* return all dirty pages to dpage list */
		for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
			dp = txn->mt_u.dirty_list[i].mptr;
			if (dp->h.md_num == 1) {
				/* md_parent doubles as the free-list link */
				dp->h.md_parent = (MDB_page *)env->me_dpages;
				env->me_dpages = dp;
			} else {
				/* large pages just get freed directly */
				free(dp);
			}
		}

		while ((mop = env->me_pghead)) {
			env->me_pghead = mop->mo_next;
			free(mop);
		}

		env->me_txn = NULL;
		/* undo the increment done when the write txn began */
		env->me_txns->mti_txnid--;
		for (i=2; i<env->me_numdbs; i++)
			env->me_dbxs[i].md_dirty = 0;
		pthread_mutex_unlock(&env->me_txns->mti_wmutex);
	}

	free(txn);
}

int
mdb_txn_commit(MDB_txn *txn)
{
	int		 n, done;
Howard Chu's avatar
Howard Chu committed
814
	unsigned int i;
Howard Chu's avatar
Howard Chu committed
815
816
817
818
	ssize_t		 rc;
	off_t		 size;
	MDB_dpage	*dp;
	MDB_env	*env;
819
	pgno_t	next;
Howard Chu's avatar
Howard Chu committed
820
821
822
823
824
825
826
827
828
	struct iovec	 iov[MDB_COMMIT_PAGES];

	assert(txn != NULL);
	assert(txn->mt_env != NULL);

	env = txn->mt_env;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		mdb_txn_abort(txn);
829
		return MDB_SUCCESS;
Howard Chu's avatar
Howard Chu committed
830
831
832
	}

	if (txn != env->me_txn) {
833
		DPUTS("attempt to commit unknown transaction");
Howard Chu's avatar
Howard Chu committed
834
835
836
837
838
		mdb_txn_abort(txn);
		return EINVAL;
	}

	if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
839
		DPUTS("error flag is set, can't commit");
Howard Chu's avatar
Howard Chu committed
840
841
842
843
		mdb_txn_abort(txn);
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
844
	if (!txn->mt_u.dirty_list[0].mid)
Howard Chu's avatar
Howard Chu committed
845
846
		goto done;

Howard Chu's avatar
Howard Chu committed
847
	DPRINTF("committing transaction %lu on mdbenv %p, root page %lu",
Howard Chu's avatar
Howard Chu committed
848
849
	    txn->mt_txnid, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);

Howard Chu's avatar
Howard Chu committed
850
851
852
853
854
855
856
857
858
859
	/* should only be one record now */
	if (env->me_pghead) {
		MDB_val key, data;
		MDB_oldpages *mop;

		mop = env->me_pghead;
		key.mv_size = sizeof(pgno_t);
		key.mv_data = (char *)&mop->mo_txnid;
		data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
		data.mv_data = mop->mo_pages;
Howard Chu's avatar
Howard Chu committed
860
		mdb_put0(txn, FREE_DBI, &key, &data, 0);
Howard Chu's avatar
Howard Chu committed
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
		free(env->me_pghead);
		env->me_pghead = NULL;
	}
	/* save to free list */
	if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
		MDB_val key, data;
		MDB_pageparent mpp;

		/* make sure last page of freeDB is touched and on freelist */
		key.mv_size = MAXKEYSIZE+1;
		key.mv_data = NULL;
		mpp.mp_parent = NULL;
		mpp.mp_pi = 0;
		mdb_search_page(txn, FREE_DBI, &key, NULL, 1, &mpp);

Howard Chu's avatar
Howard Chu committed
876
877
878
879
880
881
882
883
884
885
886
#if DEBUG > 1
		{
			unsigned int i;
			ULONG *idl = txn->mt_free_pgs;
			DPRINTF("IDL write txn %lu root %lu num %lu",
				txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
			for (i=0; i<idl[0]; i++) {
				DPRINTF("IDL %lu", idl[i+1]);
			}
		}
#endif
Howard Chu's avatar
Howard Chu committed
887
888
889
890
891
		/* write to last page of freeDB */
		key.mv_size = sizeof(pgno_t);
		key.mv_data = (char *)&txn->mt_txnid;
		data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs);
		data.mv_data = txn->mt_free_pgs;
Howard Chu's avatar
Howard Chu committed
892
		mdb_put0(txn, FREE_DBI, &key, &data, 0);
Howard Chu's avatar
Howard Chu committed
893
894
	}

Howard Chu's avatar
Howard Chu committed
895
896
897
898
899
900
901
	/* Update DB root pointers. Their pages have already been
	 * touched so this is all in-place and cannot fail.
	 */
	{
		MDB_val data;
		data.mv_size = sizeof(MDB_db);

902
903
		for (i = 2; i < txn->mt_numdbs; i++) {
			if (txn->mt_dbxs[i].md_dirty) {
Howard Chu's avatar
Howard Chu committed
904
				data.mv_data = &txn->mt_dbs[i];
Howard Chu's avatar
Howard Chu committed
905
				mdb_put0(txn, MAIN_DBI, &txn->mt_dbxs[i].md_name, &data, 0);
Howard Chu's avatar
Howard Chu committed
906
907
908
			}
		}
	}
Howard Chu's avatar
Howard Chu committed
909
910
911

	/* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
	 */
912
	next = 0;
Howard Chu's avatar
Howard Chu committed
913
	i = 1;
Howard Chu's avatar
Howard Chu committed
914
915
916
	do {
		n = 0;
		done = 1;
917
		size = 0;
Howard Chu's avatar
Howard Chu committed
918
		for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
Howard Chu's avatar
Howard Chu committed
919
			dp = txn->mt_u.dirty_list[i].mptr;
920
			if (dp->p.mp_pgno != next) {
Howard Chu's avatar
Howard Chu committed
921
922
923
924
925
926
				if (n) {
					DPRINTF("committing %u dirty pages", n);
					rc = writev(env->me_fd, iov, n);
					if (rc != size) {
						n = errno;
						if (rc > 0)
927
							DPUTS("short write, filesystem full?");
Howard Chu's avatar
Howard Chu committed
928
929
930
931
932
933
934
935
						else
							DPRINTF("writev: %s", strerror(errno));
						mdb_txn_abort(txn);
						return n;
					}
					n = 0;
					size = 0;
				}
Howard Chu's avatar
Howard Chu committed
936
				lseek(env->me_fd, dp->p.mp_pgno * env->me_psize, SEEK_SET);
937
938
				next = dp->p.mp_pgno;
			}
Howard Chu's avatar
Howard Chu committed
939
			DPRINTF("committing page %lu", dp->p.mp_pgno);
Howard Chu's avatar
Howard Chu committed
940
			iov[n].iov_len = env->me_psize * dp->h.md_num;
Howard Chu's avatar
Howard Chu committed
941
			iov[n].iov_base = &dp->p;
942
943
			size += iov[n].iov_len;
			next = dp->p.mp_pgno + dp->h.md_num;
Howard Chu's avatar
Howard Chu committed
944
945
946
947
948
949
950
951
952
953
954
955
956
			/* clear dirty flag */
			dp->p.mp_flags &= ~P_DIRTY;
			if (++n >= MDB_COMMIT_PAGES) {
				done = 0;
				break;
			}
		}

		if (n == 0)
			break;

		DPRINTF("committing %u dirty pages", n);
		rc = writev(env->me_fd, iov, n);
957
		if (rc != size) {
Howard Chu's avatar
Howard Chu committed
958
959
			n = errno;
			if (rc > 0)
960
				DPUTS("short write, filesystem full?");
Howard Chu's avatar
Howard Chu committed
961
962
963
964
965
966
967
968
			else
				DPRINTF("writev: %s", strerror(errno));
			mdb_txn_abort(txn);
			return n;
		}

	} while (!done);

Howard Chu's avatar
Howard Chu committed
969
970
	/* Drop the dirty pages.
	 */
971
972
973
974
975
976
977
978
979
980
	for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
		dp = txn->mt_u.dirty_list[i].mptr;
		if (dp->h.md_num == 1) {
			dp->h.md_parent = (MDB_page *)txn->mt_env->me_dpages;
			txn->mt_env->me_dpages = dp;
		} else {
			free(dp);
		}
		txn->mt_u.dirty_list[i].mid = 0;
	}
Howard Chu's avatar
Howard Chu committed
981

Howard Chu's avatar
Howard Chu committed
982
	if ((n = mdb_env_sync(env, 0)) != 0 ||
Howard Chu's avatar
Howard Chu committed
983
	    (n = mdb_env_write_meta(txn)) != MDB_SUCCESS) {
Howard Chu's avatar
Howard Chu committed
984
985
986
		mdb_txn_abort(txn);
		return n;
	}
Howard Chu's avatar
Howard Chu committed
987

988
989
done:
	env->me_txn = NULL;
Howard Chu's avatar
Howard Chu committed
990
	/* update the DB tables */
Howard Chu's avatar
Howard Chu committed
991
	{
Howard Chu's avatar
Howard Chu committed
992
		int toggle = !env->me_db_toggle;
Howard Chu's avatar
Howard Chu committed
993

994
995
		for (i = 2; i < env->me_numdbs; i++) {
			if (txn->mt_dbxs[i].md_dirty) {
Howard Chu's avatar
Howard Chu committed
996
				env->me_dbs[toggle][i] = txn->mt_dbs[i];
997
998
				txn->mt_dbxs[i].md_dirty = 0;
			}
Howard Chu's avatar
Howard Chu committed
999
1000
		}
		for (i = env->me_numdbs; i < txn->mt_numdbs; i++) {
1001
			txn->mt_dbxs[i].md_dirty = 0;
Howard Chu's avatar
Howard Chu committed
1002
1003
1004
1005
			env->me_dbxs[i] = txn->mt_dbxs[i];
			env->me_dbs[toggle][i] = txn->mt_dbs[i];
		}
		env->me_db_toggle = toggle;
Howard Chu's avatar
Howard Chu committed
1006
		env->me_numdbs = txn->mt_numdbs;
Howard Chu's avatar
Howard Chu committed
1007

Howard Chu's avatar
Howard Chu committed
1008
		free(txn->mt_dbs);
Howard Chu's avatar
Howard Chu committed
1009
1010
	}

Howard Chu's avatar
Howard Chu committed
1011
	pthread_mutex_unlock(&env->me_txns->mti_wmutex);
1012
	free(txn);
Howard Chu's avatar
Howard Chu committed
1013
1014
1015
1016
1017

	return MDB_SUCCESS;
}

static int
Howard Chu's avatar
Howard Chu committed
1018
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
Howard Chu's avatar
Howard Chu committed
1019
1020
1021
{
	char		 page[PAGESIZE];
	MDB_page	*p;
Howard Chu's avatar
Howard Chu committed
1022
	MDB_meta	*m;
Howard Chu's avatar
Howard Chu committed
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
	int		 rc;

	assert(env != NULL);

	/* We don't know the page size yet, so use a minimum value.
	 */

	if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) {
		return ENOENT;
	} else if (rc != PAGESIZE) {
		if (rc > 0)
			errno = EINVAL;
		DPRINTF("read: %s", strerror(errno));
		return errno;
	}

	p = (MDB_page *)page;

Howard Chu's avatar
Howard Chu committed
1041
1042
	if (!F_ISSET(p->mp_flags, P_META)) {
		DPRINTF("page %lu not a meta page", p->mp_pgno);
Howard Chu's avatar
Howard Chu committed
1043
1044
1045
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
1046
1047
	m = METADATA(p);
	if (m->mm_magic != MDB_MAGIC) {
1048
		DPUTS("meta has invalid magic");
Howard Chu's avatar
Howard Chu committed
1049
1050
1051
		return EINVAL;
	}

Howard Chu's avatar
Howard Chu committed
1052
	if (m->mm_version != MDB_VERSION) {
Howard Chu's avatar
Howard Chu committed
1053
		DPRINTF("database is version %u, expected version %u",
Howard Chu's avatar
Howard Chu committed
1054
		    m->mm_version, MDB_VERSION);
Howard Chu's avatar
Howard Chu committed
1055
		return MDB_VERSION_MISMATCH;
Howard Chu's avatar
Howard Chu committed
1056
1057
	}

Howard Chu's avatar
Howard Chu committed
1058
	memcpy(meta, m, sizeof(*m));
Howard Chu's avatar
Howard Chu committed
1059
1060
1061
1062
	return 0;
}

static int
Howard Chu's avatar
Howard Chu committed
1063
mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
Howard Chu's avatar
Howard Chu committed
1064
{
1065
	MDB_page *p, *q;
Howard Chu's avatar
Howard Chu committed
1066
	MDB_meta *m;
1067
	int rc;
Howard Chu's avatar
Howard Chu committed
1068
	unsigned int	 psize;
Howard Chu's avatar
Howard Chu committed
1069

1070
	DPUTS("writing new meta page");
Howard Chu's avatar
Howard Chu committed
1071
1072
	psize = sysconf(_SC_PAGE_SIZE);

Howard Chu's avatar
Howard Chu committed
1073
1074
1075
1076
1077
	meta->mm_magic = MDB_MAGIC;
	meta->mm_version = MDB_VERSION;
	meta->mm_psize = psize;
	meta->mm_last_pg = 1;
	meta->mm_flags = env->me_flags & 0xffff;
Howard Chu's avatar
Howard Chu committed
1078
	meta->mm_flags |= MDB_INTEGERKEY;
Howard Chu's avatar
Howard Chu committed
1079
1080
	meta->mm_dbs[0].md_root = P_INVALID;
	meta->mm_dbs[1].md_root = P_INVALID;
Howard Chu's avatar
Howard Chu committed
1081
1082
1083

	p = calloc(2, psize);
	p->mp_pgno = 0;
1084
	p->mp_flags = P_META;
Howard Chu's avatar
Howard Chu committed
1085

Howard Chu's avatar
Howard Chu committed
1086
1087
	m = METADATA(p);
	memcpy(m, meta, sizeof(*meta));
Howard Chu's avatar
Howard Chu committed
1088

Howard Chu's avatar
Howard Chu committed
1089
	q = (MDB_page *)((char *)p + psize);
Howard Chu's avatar
Howard Chu committed
1090

Howard Chu's avatar
Howard Chu committed
1091
	q->mp_pgno = 1;
1092
	q->mp_flags = P_META;
Howard Chu's avatar
Howard Chu committed
1093

Howard Chu's avatar
Howard Chu committed
1094
1095
	m = METADATA(q);
	memcpy(m, meta, sizeof(*meta));
Howard Chu's avatar
Howard Chu committed
1096

Howard Chu's avatar
Howard Chu committed
1097
	rc = write(env->me_fd, p, psize * 2);
1098
	free(p);
Howard Chu's avatar
Howard Chu committed
1099
	return (rc == (int)psize * 2) ? MDB_SUCCESS : errno;
1100
1101
1102
}

static int
Howard Chu's avatar
Howard Chu committed
1103
mdb_env_write_meta(MDB_txn *txn)
1104
1105
{
	MDB_env *env;
Howard Chu's avatar
Howard Chu committed
1106
	MDB_meta	meta, metab;
1107
	off_t off;
Howard Chu's avatar
Howard Chu committed
1108
	int rc, len, toggle;
Howard Chu's avatar
Howard Chu committed
1109
	char *ptr;
Howard Chu's avatar
Howard Chu committed
1110

1111
1112
1113
	assert(txn != NULL);
	assert(txn->mt_env != NULL);

Howard Chu's avatar
Howard Chu committed
1114
	toggle = !F_ISSET(txn->mt_flags, MDB_TXN_METOGGLE);
Howard Chu's avatar
Howard Chu committed
1115
	DPRINTF("writing meta page %d for root page %lu",
Howard Chu's avatar
Howard Chu committed
1116
		toggle, txn->mt_dbs[MAIN_DBI].md_root);
1117
1118
1119

	env = txn->mt_env;

Howard Chu's avatar
Howard Chu committed
1120
1121
1122
	metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
	metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;

Howard Chu's avatar
Howard Chu committed
1123
	ptr = (char *)&meta;
Howard Chu's avatar
Howard Chu committed
1124
	off = offsetof(MDB_meta, mm_dbs[0].md_depth);
Howard Chu's avatar
Howard Chu committed
1125
1126
1127
	len = sizeof(MDB_meta) - off;

	ptr += off;
Howard Chu's avatar
Howard Chu committed
1128
1129
	meta.mm_dbs[0] = txn->mt_dbs[0];
	meta.mm_dbs[1] = txn->mt_dbs[1];
Howard Chu's avatar
Howard Chu committed
1130
1131
	meta.mm_last_pg = txn->mt_next_pgno - 1;
	meta.mm_txnid = txn->mt_txnid;
1132

Howard Chu's avatar
Howard Chu committed
1133
	if (toggle)
Howard Chu's avatar
Howard Chu committed
1134
		off += env->me_psize;
1135
1136
	off += PAGEHDRSZ;

Howard Chu's avatar
Howard Chu committed
1137
1138
	/* Write to the SYNC fd */
	rc = pwrite(env->me_mfd, ptr, len, off);
Howard Chu's avatar
Howard Chu committed
1139
	if (rc != len) {
1140
		DPUTS("write failed, disk error?");
Howard Chu's avatar
Howard Chu committed
1141
1142
1143
1144
1145
1146
1147
1148
		/* On a failure, the pagecache still contains the new data.
		 * Write some old data back, to prevent it from being used.
		 * Use the non-SYNC fd; we know it will fail anyway.
		 */
		meta.mm_last_pg = metab.mm_last_pg;
		meta.mm_txnid = metab.mm_txnid;
		rc = pwrite(env->me_fd, ptr, len, off);
		env->me_flags |= MDB_FATAL_ERROR;
1149
		return errno;
Howard Chu's avatar
Howard Chu committed