Commit 7fdf6720 authored by Hallvard Furuseth
Browse files

Update MDB comments: Caveats, flags, etc.

parent 8e7bb204
......@@ -78,10 +78,11 @@
* database can grow quickly. Write transactions prevent
* other write transactions, since writes are serialized.
*
* ...when several processes can use a database concurrently:
*
* - Avoid suspending a process with active transactions. These
* would then be "long-lived" as above.
* would then be "long-lived" as above. Also, read transactions that
* are suspended when writers commit could sometimes see wrong data.
*
* ...when several processes can use a database concurrently:
*
* - Avoid aborting a process with an active transaction.
* The transaction becomes "long-lived" as above until the lockfile
......@@ -221,7 +222,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
* Values do not overlap Database Flags.
* @{
*/
/** mmap at a fixed address */
/** mmap at a fixed address (experimental) */
#define MDB_FIXEDMAP 0x01
/** no environment directory */
#define MDB_NOSUBDIR 0x4000
......@@ -233,7 +234,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
#define MDB_NOMETASYNC 0x40000
/** use writable mmap */
#define MDB_WRITEMAP 0x80000
/** use asynchronous msync */
/** use asynchronous msync when MDB_WRITEMAP is used */
#define MDB_MAPASYNC 0x100000
/** @} */
......@@ -435,24 +436,43 @@ int mdb_env_create(MDB_env **env);
* under that directory. With this option, \b path is used as-is for
* the database main data file. The database lock file is the \b path
* with "-lock" appended.
* <li>#MDB_NOSYNC
* Don't perform a synchronous flush after committing a transaction. This means
* transactions will exhibit the ACI (atomicity, consistency, and isolation)
* properties, but not D (durability); that is database integrity will be
* maintained but it is possible some number of the most recently committed
* transactions may be undone after a system crash. The number of transactions
* at risk is governed by how often the system flushes dirty buffers to disk
* and how often #mdb_env_sync() is called. This flag may be changed
* at any time using #mdb_env_set_flags().
* <li>#MDB_NOMETASYNC
* Don't perform a synchronous flush of the meta page after committing
* a transaction. This is similar to the #MDB_NOSYNC case, but safer
* because the transaction data is still flushed. The meta page for any
* transaction N will be flushed by the data flush of transaction N+1.
* In case of a system crash, the last committed transaction may be
* lost. This flag may be changed at any time using #mdb_env_set_flags().
* <li>#MDB_RDONLY
* Open the environment in read-only mode. No write operations will be allowed.
* Open the environment in read-only mode. No write operations will be
* allowed. MDB will still modify the lock file - except on read-only
* filesystems, where MDB does not use locks.
* <li>#MDB_WRITEMAP
* Use a writable memory map unless #MDB_RDONLY is set. This is faster
* and uses fewer mallocs, but loses protection from application bugs
* like wild pointer writes and other bad updates into the database.
* Incompatible with nested transactions.
* <li>#MDB_NOMETASYNC
* Flush system buffers to disk only once per transaction, omitting the
* metadata flush. That flush is deferred until the system flushes files
* to disk, or until the next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization
* maintains database integrity, but a system crash may undo the last
* committed transaction. I.e. it preserves the ACI (atomicity,
* consistency, isolation) but not D (durability) database property.
* This flag may be changed at any time using #mdb_env_set_flags().
* <li>#MDB_NOSYNC
* Don't flush system buffers to disk when committing a transaction.
* This optimization means a system crash can corrupt the database or
* lose the last transactions if buffers are not yet flushed to disk.
* The risk is governed by how often the system flushes dirty buffers
* to disk and how often #mdb_env_sync() is called. However, if the
* filesystem preserves write order and the #MDB_WRITEMAP flag is not
* used, transactions exhibit ACI (atomicity, consistency, isolation)
* properties and only lose D (durability). I.e. database integrity
* is maintained, but a system crash may undo the final transactions.
* Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no
* hint for when to write transactions to disk, unless #mdb_env_sync()
* is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable.
* This flag may be changed at any time using #mdb_env_set_flags().
* <li>#MDB_MAPASYNC
* When using #MDB_WRITEMAP, use asynchronous flushes to disk.
* As with #MDB_NOSYNC, a system crash can then corrupt the
* database or lose the last transactions. Calling #mdb_env_sync()
* ensures on-disk database integrity until the next commit.
* This flag may be changed at any time using #mdb_env_set_flags().
* </ul>
* @param[in] mode The UNIX permissions to set on created files. This parameter
* is ignored on Windows.
......@@ -502,7 +522,7 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
* Data is always written to disk when #mdb_txn_commit() is called,
* but the operating system may keep it buffered. MDB always flushes
* the OS buffers upon commit as well, unless the environment was
* opened with #MDB_NOSYNC.
* opened with #MDB_NOSYNC or, in part, #MDB_NOMETASYNC.
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] force If non-zero, force a synchronous flush. Otherwise
* if the environment has the #MDB_NOSYNC flag set the flushes
......@@ -731,7 +751,7 @@ int mdb_txn_renew(MDB_txn *txn);
* by the given transaction. Only one thread should call this function;
* it is not mutex-protected in a read-only transaction.
* To use named databases (with name != NULL), #mdb_env_set_maxdbs()
* must be called before opening the enviorment.
* must be called before opening the environment.
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] name The name of the database to open. If only a single
* database is needed in the environment, this value may be NULL.
......@@ -796,7 +816,7 @@ int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat);
*
* This call is not mutex protected. Handles should only be closed by
* a single thread, and only if no other threads are going to reference
* the database handle any further.
* the database handle or one of its cursors any further.
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] dbi A database handle returned by #mdb_dbi_open()
*/
......
......@@ -382,7 +382,7 @@ static txnid_t mdb_debug_start;
*/
#define P_INVALID (~(pgno_t)0)
/** Test if a flag \b f is set in a flag word \b w. */
/** Test if the flags \b f are set in a flag word \b w. */
#define F_ISSET(w, f) (((w) & (f)) == (f))
/** Used for offsets within a single page.
......@@ -404,6 +404,8 @@ typedef uint16_t indx_t;
* slot's address is saved in thread-specific data so that subsequent read
* transactions started by the same thread need no further locking to proceed.
*
* No reader table is used if the database is on a read-only filesystem.
*
* Since the database uses multi-version concurrency control, readers don't
* actually need any locking. This table is used to keep track of which
* readers are using data from which old transactions, so that we'll know
......@@ -810,8 +812,8 @@ struct MDB_txn {
*/
MDB_IDL mt_free_pgs;
union {
MDB_ID2L dirty_list; /**< modified pages */
MDB_reader *reader; /**< this thread's slot in the reader table */
MDB_ID2L dirty_list; /**< for write txns: modified pages */
MDB_reader *reader; /**< this thread's reader table slot or NULL */
} mt_u;
/** Array of records for each DB known in the environment. */
MDB_dbx *mt_dbxs;
......@@ -824,7 +826,7 @@ struct MDB_txn {
#define DB_DIRTY 0x01 /**< DB was written in this txn */
#define DB_STALE 0x02 /**< DB record is older than txnID */
/** @} */
/** Array of cursors for each DB */
/** In write txns, array of cursors for each DB */
MDB_cursor **mt_cursors;
/** Array of flags for each DB */
unsigned char *mt_dbflags;
......@@ -941,7 +943,7 @@ struct MDB_env {
pid_t me_pid; /**< process ID of this env */
char *me_path; /**< path to the DB files */
char *me_map; /**< the memory map of the data file */
MDB_txninfo *me_txns; /**< the memory map of the lock file */
MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
MDB_txn *me_txn; /**< current write transaction */
size_t me_mapsize; /**< size of the data memory map */
......@@ -950,7 +952,7 @@ struct MDB_env {
txnid_t me_pgfirst; /**< ID of first old page record we used */
txnid_t me_pglast; /**< ID of last old page record we used */
MDB_dbx *me_dbxs; /**< array of static DB info */
uint16_t *me_dbflags; /**< array of DB flags */
uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
MDB_oldpages *me_pghead; /**< list of old page records */
MDB_oldpages *me_pgfree; /**< list of page records to free */
pthread_key_t me_txkey; /**< thread-key for readers */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment