diff --git a/libraries/liblmdb/Doxyfile b/libraries/liblmdb/Doxyfile index aa2c3058357d2f1dcae99bf31cf10ecf7dae5078..3fd0365c7daf616f96d9dada732e1039ddc0db53 100644 --- a/libraries/liblmdb/Doxyfile +++ b/libraries/liblmdb/Doxyfile @@ -582,7 +582,7 @@ WARN_LOGFILE = # directories like "/usr/src/myproject". Separate the files or directories # with spaces. -INPUT = mdb.h midl.h mdb.c midl.c +INPUT = lmdb.h midl.h mdb.c midl.c # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index bd10bb6bacf2e0fed0f63f7482a0dc8d8234b9f8..7719e3294aae3201374ded0659d568da8d952793 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -78,10 +78,11 @@ * database can grow quickly. Write transactions prevent * other write transactions, since writes are serialized. * - * ...when several processes can use a database concurrently: - * * - Avoid suspending a process with active transactions. These - * would then be "long-lived" as above. + * would then be "long-lived" as above. Also read transactions + * suspended when writers commit could sometimes see wrong data. + * + * ...when several processes can use a database concurrently: * * - Avoid aborting a process with an active transaction. * The transaction becomes "long-lived" as above until the lockfile @@ -221,7 +222,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel * Values do not overlap Database Flags. 
* @{ */ - /** mmap at a fixed address */ + /** mmap at a fixed address (experimental) */ #define MDB_FIXEDMAP 0x01 /** no environment directory */ #define MDB_NOSUBDIR 0x4000 @@ -233,7 +234,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_NOMETASYNC 0x40000 /** use writable mmap */ #define MDB_WRITEMAP 0x80000 - /** use asynchronous msync */ + /** use asynchronous msync when MDB_WRITEMAP is used */ #define MDB_MAPASYNC 0x100000 /** @} */ @@ -435,24 +436,43 @@ int mdb_env_create(MDB_env **env); * under that directory. With this option, \b path is used as-is for * the database main data file. The database lock file is the \b path * with "-lock" appended. - * <li>#MDB_NOSYNC - * Don't perform a synchronous flush after committing a transaction. This means - * transactions will exhibit the ACI (atomicity, consistency, and isolation) - * properties, but not D (durability); that is database integrity will be - * maintained but it is possible some number of the most recently committed - * transactions may be undone after a system crash. The number of transactions - * at risk is governed by how often the system flushes dirty buffers to disk - * and how often #mdb_env_sync() is called. This flag may be changed - * at any time using #mdb_env_set_flags(). - * <li>#MDB_NOMETASYNC - * Don't perform a synchronous flush of the meta page after committing - * a transaction. This is similar to the #MDB_NOSYNC case, but safer - * because the transaction data is still flushed. The meta page for any - * transaction N will be flushed by the data flush of transaction N+1. - * In case of a system crash, the last committed transaction may be - * lost. This flag may be changed at any time using #mdb_env_set_flags(). * <li>#MDB_RDONLY - * Open the environment in read-only mode. No write operations will be allowed. + * Open the environment in read-only mode. No write operations will be + * allowed. 
MDB will still modify the lock file - except on read-only + * filesystems, where MDB does not use locks. + * <li>#MDB_WRITEMAP + * Use a writeable memory map unless MDB_RDONLY is set. This is faster + * and uses fewer mallocs, but loses protection from application bugs + * like wild pointer writes and other bad updates into the database. + * Incompatible with nested transactions. + * <li>#MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omit the + * metadata flush. Defer that until the system flushes files to disk, + * or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization + * maintains database integrity, but a system crash may undo the last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database property. + * This flag may be changed at any time using #mdb_env_set_flags(). + * <li>#MDB_NOSYNC + * Don't flush system buffers to disk when committing a transaction. + * This optimization means a system crash can corrupt the database or + * lose the last transactions if buffers are not yet flushed to disk. + * The risk is governed by how often the system flushes dirty buffers + * to disk and how often #mdb_env_sync() is called. However, if the + * filesystem preserves write order and the #MDB_WRITEMAP flag is not + * used, transactions exhibit ACI (atomicity, consistency, isolation) + * properties and only lose D (durability). I.e. database integrity + * is maintained, but a system crash may undo the final transactions. + * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no + * hint for when to write transactions to disk, unless #mdb_env_sync() + * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using #mdb_env_set_flags(). + * <li>#MDB_MAPASYNC + * When using #MDB_WRITEMAP, use asynchronous flushes to disk. 
+ * As with #MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. Calling #mdb_env_sync() + * ensures on-disk database integrity until the next commit. + * This flag may be changed at any time using #mdb_env_set_flags(). * </ul> * @param[in] mode The UNIX permissions to set on created files. This parameter * is ignored on Windows. @@ -502,7 +522,7 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat); * Data is always written to disk when #mdb_txn_commit() is called, * but the operating system may keep it buffered. MDB always flushes * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC. + * opened with #MDB_NOSYNC or, in part, with #MDB_NOMETASYNC. * @param[in] env An environment handle returned by #mdb_env_create() * @param[in] force If non-zero, force a synchronous flush. Otherwise * if the environment has the #MDB_NOSYNC flag set the flushes @@ -731,7 +751,7 @@ int mdb_txn_renew(MDB_txn *txn); * by the given transaction. Only one thread should call this function; * it is not mutex-protected in a read-only transaction. * To use named databases (with name != NULL), #mdb_env_set_maxdbs() - * must be called before opening the enviorment. + * must be called before opening the environment. * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] name The name of the database to open. If only a single * database is needed in the environment, this value may be NULL. @@ -796,7 +816,7 @@ int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); * * This call is not mutex protected. Handles should only be closed by * a single thread, and only if no other threads are going to reference - * the database handle any further. + * the database handle or one of its cursors any further. 
* @param[in] env An environment handle returned by #mdb_env_create() * @param[in] dbi A database handle returned by #mdb_dbi_open() */ diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 67defdba2585ff511f0b4f265124e9cbf3617080..575e494487640b73641eaa4cc09a4afe006e696e 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -342,19 +342,31 @@ static txnid_t mdb_debug_start; /** The version number for a database's file format. */ #define MDB_VERSION 1 - /** The maximum size of a key in the database. - * While data items have essentially unbounded size, we require that - * keys all fit onto a regular page. This limit could be raised a bit - * further if needed; to something just under #MDB_PAGESIZE / #MDB_MINKEYS. + /** @brief The maximum size of a key in the database. + * + * We require that keys all fit onto a regular page. This limit + * could be raised a bit further if needed; to something just + * under #MDB_PAGESIZE / #MDB_MINKEYS. + * + * Note that data items in an #MDB_DUPSORT database are actually keys + * of a subDB, so they're also limited to this size. + */ +#ifndef MDB_MAXKEYSIZE +#define MDB_MAXKEYSIZE 511 +#endif + + /** @brief The maximum size of a data item. + * + * We only store a 32 bit value for node sizes. */ -#define MAXKEYSIZE 511 +#define MAXDATASIZE 0xffffffffUL #if MDB_DEBUG /** A key buffer. * @ingroup debug * This is used for printing a hex dump of a key's contents. */ -#define DKBUF char kbuf[(MAXKEYSIZE*2+1)] +#define DKBUF char kbuf[(MDB_MAXKEYSIZE*2+1)] /** Display a key in hex. * @ingroup debug * Invoke a function to display a key in hex. @@ -370,7 +382,7 @@ static txnid_t mdb_debug_start; */ #define P_INVALID (~(pgno_t)0) - /** Test if a flag \b f is set in a flag word \b w. */ + /** Test if the flags \b f are set in a flag word \b w. */ #define F_ISSET(w, f) (((w) & (f)) == (f)) /** Used for offsets within a single page. 
@@ -392,6 +404,8 @@ typedef uint16_t indx_t; * slot's address is saved in thread-specific data so that subsequent read * transactions started by the same thread need no further locking to proceed. * + * No reader table is used if the database is on a read-only filesystem. + * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. This table is used to keep track of which * readers are using data from which old transactions, so that we'll know @@ -798,8 +812,8 @@ struct MDB_txn { */ MDB_IDL mt_free_pgs; union { - MDB_ID2L dirty_list; /**< modified pages */ - MDB_reader *reader; /**< this thread's slot in the reader table */ + MDB_ID2L dirty_list; /**< for write txns: modified pages */ + MDB_reader *reader; /**< this thread's reader table slot or NULL */ } mt_u; /** Array of records for each DB known in the environment. */ MDB_dbx *mt_dbxs; @@ -812,7 +826,7 @@ struct MDB_txn { #define DB_DIRTY 0x01 /**< DB was written in this txn */ #define DB_STALE 0x02 /**< DB record is older than txnID */ /** @} */ - /** Array of cursors for each DB */ + /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; /** Array of flags for each DB */ unsigned char *mt_dbflags; @@ -929,7 +943,7 @@ struct MDB_env { pid_t me_pid; /**< process ID of this env */ char *me_path; /**< path to the DB files */ char *me_map; /**< the memory map of the data file */ - MDB_txninfo *me_txns; /**< the memory map of the lock file */ + MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ MDB_txn *me_txn; /**< current write transaction */ size_t me_mapsize; /**< size of the data memory map */ @@ -938,7 +952,7 @@ struct MDB_env { txnid_t me_pgfirst; /**< ID of first old page record we used */ txnid_t me_pglast; /**< ID of last old page record we used */ MDB_dbx *me_dbxs; /**< array of static DB info */ - uint16_t *me_dbflags; /**< array of DB flags */ + uint16_t 
*me_dbflags; /**< array of flags from MDB_db.md_flags */ MDB_oldpages *me_pghead; /**< list of old page records */ MDB_oldpages *me_pgfree; /**< list of page records to free */ pthread_key_t me_txkey; /**< thread-key for readers */ @@ -1081,8 +1095,8 @@ mdb_dkey(MDB_val *key, char *buf) char *ptr = buf; unsigned char *c = key->mv_data; unsigned int i; - if (key->mv_size > MAXKEYSIZE) - return "MAXKEYSIZE"; + if (key->mv_size > MDB_MAXKEYSIZE) + return "MDB_MAXKEYSIZE"; /* may want to make this a dynamic check: if the key is mostly * printable characters, print it as-is instead of converting to hex. */ @@ -2176,7 +2190,7 @@ free2: MDB_val key, data; /* make sure last page of freeDB is touched and on freelist */ - key.mv_size = MAXKEYSIZE+1; + key.mv_size = MDB_MAXKEYSIZE+1; key.mv_data = NULL; rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY); if (rc && rc != MDB_NOTFOUND) @@ -3954,7 +3968,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) if (key == NULL) /* Initialize cursor to first page. 
*/ i = 0; - else if (key->mv_size > MAXKEYSIZE && key->mv_data == NULL) { + else if (key->mv_size > MDB_MAXKEYSIZE && key->mv_data == NULL) { /* cursor to last page */ i = NUMKEYS(mp)-1; } else { @@ -4130,7 +4144,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, if (txn == NULL || !dbi || dbi >= txn->mt_numdbs) return EINVAL; - if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { return EINVAL; } @@ -4572,7 +4586,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { MDB_val lkey; - lkey.mv_size = MAXKEYSIZE+1; + lkey.mv_size = MDB_MAXKEYSIZE+1; lkey.mv_data = NULL; rc = mdb_page_search(mc, &lkey, 0); if (rc != MDB_SUCCESS) @@ -4656,7 +4670,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, case MDB_SET: case MDB_SET_KEY: case MDB_SET_RANGE: - if (key == NULL || key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + if (key == NULL || key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { rc = EINVAL; } else if (op == MDB_SET_RANGE) rc = mdb_cursor_set(mc, key, data, op, NULL); @@ -4793,13 +4807,24 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, size_t nsize; int rc, rc2; MDB_pagebuf pbuf; - char dbuf[MAXKEYSIZE+1]; + char dbuf[MDB_MAXKEYSIZE+1]; unsigned int nflags; DKBUF; if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY)) return EACCES; + if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) + return EINVAL; + + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT) && data->mv_size > MDB_MAXKEYSIZE) + return EINVAL; + +#if SIZE_MAX > MAXDATASIZE + if (data->mv_size > MAXDATASIZE) + return EINVAL; +#endif + DPRINTF("==> put db %u key [%s], size %zu, data size %zu", mc->mc_dbi, DKEY(key), key ? 
key->mv_size:0, data->mv_size); @@ -5039,8 +5064,10 @@ current: */ if (F_ISSET(flags, MDB_RESERVE)) data->mv_data = NODEDATA(leaf); - else + else if (data->mv_size) memcpy(NODEDATA(leaf), data->mv_data, data->mv_size); + else + memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); goto done; } mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0); @@ -5769,7 +5796,7 @@ mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key) #if MDB_DEBUG { MDB_val k2; - char kbuf2[(MAXKEYSIZE*2+1)]; + char kbuf2[(MDB_MAXKEYSIZE*2+1)]; k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %zu", @@ -6321,7 +6348,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, return EACCES; } - if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { return EINVAL; } @@ -6760,7 +6787,7 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, return EACCES; } - if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) { + if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { return EINVAL; }