From 7fdf6720416d5eaadf5fd811c4513c87a2899a0f Mon Sep 17 00:00:00 2001
From: Hallvard Furuseth <hallvard@openldap.org>
Date: Wed, 16 Jan 2013 18:42:57 +0100
Subject: [PATCH] Update MDB comments: Caveats, flags, etc.

---
 libraries/liblmdb/lmdb.h | 70 ++++++++++++++++++++++++++--------------
 libraries/liblmdb/mdb.c  | 14 ++++----
 2 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h
index bd10bb6bac..7719e3294a 100644
--- a/libraries/liblmdb/lmdb.h
+++ b/libraries/liblmdb/lmdb.h
@@ -78,10 +78,11 @@
  *	  database can grow quickly.  Write transactions prevent
  *	  other write transactions, since writes are serialized.
  *
- *	...when several processes can use a database concurrently:
- *
  *	- Avoid suspending a process with active transactions.  These
- *	  would then be "long-lived" as above.
+ *	  would then be "long-lived" as above.  Also, read transactions
+ *	  suspended when writers commit could sometimes see wrong data.
+ *
+ *	...when several processes can use a database concurrently:
  *
  *	- Avoid aborting a process with an active transaction.
  *	  The transaction becomes "long-lived" as above until the lockfile
@@ -221,7 +222,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
  *	Values do not overlap Database Flags.
  *	@{
  */
-	/** mmap at a fixed address */
+	/** mmap at a fixed address (experimental) */
 #define MDB_FIXEDMAP	0x01
 	/** no environment directory */
 #define MDB_NOSUBDIR	0x4000
@@ -233,7 +234,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
 #define MDB_NOMETASYNC		0x40000
 	/** use writable mmap */
 #define MDB_WRITEMAP		0x80000
-	/** use asynchronous msync */
+	/** use asynchronous msync when MDB_WRITEMAP is used */
 #define MDB_MAPASYNC		0x100000
 /** @} */
 
@@ -435,24 +436,43 @@ int  mdb_env_create(MDB_env **env);
 	 *		under that directory. With this option, \b path is used as-is for
 	 *		the database main data file. The database lock file is the \b path
 	 *		with "-lock" appended.
-	 *	<li>#MDB_NOSYNC
-	 *		Don't perform a synchronous flush after committing a transaction. This means
-	 *		transactions will exhibit the ACI (atomicity, consistency, and isolation)
-	 *		properties, but not D (durability); that is database integrity will be
-	 *		maintained but it is possible some number of the most recently committed
-	 *		transactions may be undone after a system crash. The number of transactions
-	 *		at risk is governed by how often the system flushes dirty buffers to disk
-	 *		and how often #mdb_env_sync() is called. This flag may be changed
-	 *		at any time using #mdb_env_set_flags().
-	 *	<li>#MDB_NOMETASYNC
-	 *		Don't perform a synchronous flush of the meta page after committing
-	 *		a transaction. This is similar to the #MDB_NOSYNC case, but safer
-	 *		because the transaction data is still flushed. The meta page for any
-	 *		transaction N will be flushed by the data flush of transaction N+1.
-	 *		In case of a system crash, the last committed transaction may be
-	 *		lost. This flag may be changed at any time using #mdb_env_set_flags().
 	 *	<li>#MDB_RDONLY
-	 *		Open the environment in read-only mode. No write operations will be allowed.
+	 *		Open the environment in read-only mode. No write operations will be
+	 *		allowed. MDB will still modify the lock file - except on read-only
+	 *		filesystems, where MDB does not use locks.
+	 *	<li>#MDB_WRITEMAP
+	 *		Use a writable memory map unless #MDB_RDONLY is set. This is faster
+	 *		and uses fewer mallocs, but loses protection from application bugs
+	 *		like wild pointer writes and other bad updates into the database.
+	 *		Incompatible with nested transactions.
+	 *	<li>#MDB_NOMETASYNC
+	 *		Flush system buffers to disk only once per transaction, omit the
+	 *		metadata flush. Defer that until the system flushes files to disk,
+	 *		or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization
+	 *		maintains database integrity, but a system crash may undo the last
+	 *		committed transaction. I.e. it preserves the ACI (atomicity,
+	 *		consistency, isolation) but not D (durability) database property.
+	 *		This flag may be changed at any time using #mdb_env_set_flags().
+	 *	<li>#MDB_NOSYNC
+	 *		Don't flush system buffers to disk when committing a transaction.
+	 *		This optimization means a system crash can corrupt the database or
+	 *		lose the last transactions if buffers are not yet flushed to disk.
+	 *		The risk is governed by how often the system flushes dirty buffers
+	 *		to disk and how often #mdb_env_sync() is called.  However, if the
+	 *		filesystem preserves write order and the #MDB_WRITEMAP flag is not
+	 *		used, transactions exhibit ACI (atomicity, consistency, isolation)
+	 *		properties and only lose D (durability).  I.e. database integrity
+	 *		is maintained, but a system crash may undo the final transactions.
+	 *		Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no
+	 *		hint for when to write transactions to disk, unless #mdb_env_sync()
+	 *		is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable.
+	 *		This flag may be changed at any time using #mdb_env_set_flags().
+	 *	<li>#MDB_MAPASYNC
+	 *		When using #MDB_WRITEMAP, use asynchronous flushes to disk.
+	 *		As with #MDB_NOSYNC, a system crash can then corrupt the
+	 *		database or lose the last transactions. Calling #mdb_env_sync()
+	 *		ensures on-disk database integrity until the next commit.
+	 *		This flag may be changed at any time using #mdb_env_set_flags().
 	 * </ul>
 	 * @param[in] mode The UNIX permissions to set on created files. This parameter
 	 * is ignored on Windows.
@@ -502,7 +522,7 @@ int  mdb_env_info(MDB_env *env, MDB_envinfo *stat);
 	 * Data is always written to disk when #mdb_txn_commit() is called,
 	 * but the operating system may keep it buffered. MDB always flushes
 	 * the OS buffers upon commit as well, unless the environment was
-	 * opened with #MDB_NOSYNC.
+	 * opened with #MDB_NOSYNC or, in part, #MDB_NOMETASYNC.
 	 * @param[in] env An environment handle returned by #mdb_env_create()
 	 * @param[in] force If non-zero, force a synchronous flush.  Otherwise
 	 *  if the environment has the #MDB_NOSYNC flag set the flushes
@@ -731,7 +751,7 @@ int  mdb_txn_renew(MDB_txn *txn);
 	 * by the given transaction. Only one thread should call this function;
 	 * it is not mutex-protected in a read-only transaction.
 	 * To use named databases (with name != NULL), #mdb_env_set_maxdbs()
-	 * must be called before opening the enviorment.
+	 * must be called before opening the environment.
 	 * @param[in] txn A transaction handle returned by #mdb_txn_begin()
 	 * @param[in] name The name of the database to open. If only a single
 	 * 	database is needed in the environment, this value may be NULL.
@@ -796,7 +816,7 @@ int  mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat);
 	 *
 	 * This call is not mutex protected. Handles should only be closed by
 	 * a single thread, and only if no other threads are going to reference
-	 * the database handle any further.
+	 * the database handle or one of its cursors any further.
 	 * @param[in] env An environment handle returned by #mdb_env_create()
 	 * @param[in] dbi A database handle returned by #mdb_dbi_open()
 	 */
diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c
index 1a443cecef..e5013d6d56 100644
--- a/libraries/liblmdb/mdb.c
+++ b/libraries/liblmdb/mdb.c
@@ -382,7 +382,7 @@ static txnid_t mdb_debug_start;
 	 */
 #define P_INVALID	 (~(pgno_t)0)
 
-	/** Test if a flag \b f is set in a flag word \b w. */
+	/** Test if the flags \b f are set in a flag word \b w. */
 #define F_ISSET(w, f)	 (((w) & (f)) == (f))
 
 	/**	Used for offsets within a single page.
@@ -404,6 +404,8 @@ typedef uint16_t	 indx_t;
  *	slot's address is saved in thread-specific data so that subsequent read
  *	transactions started by the same thread need no further locking to proceed.
  *
+ *	No reader table is used if the database is on a read-only filesystem.
+ *
  *	Since the database uses multi-version concurrency control, readers don't
  *	actually need any locking. This table is used to keep track of which
  *	readers are using data from which old transactions, so that we'll know
@@ -810,8 +812,8 @@ struct MDB_txn {
 	 */
 	MDB_IDL		mt_free_pgs;
 	union {
-		MDB_ID2L	dirty_list;	/**< modified pages */
-		MDB_reader	*reader;	/**< this thread's slot in the reader table */
+		MDB_ID2L	dirty_list;	/**< for write txns: modified pages */
+		MDB_reader	*reader;	/**< this thread's reader table slot or NULL */
 	} mt_u;
 	/** Array of records for each DB known in the environment. */
 	MDB_dbx		*mt_dbxs;
@@ -824,7 +826,7 @@ struct MDB_txn {
 #define DB_DIRTY	0x01		/**< DB was written in this txn */
 #define DB_STALE	0x02		/**< DB record is older than txnID */
 /** @} */
-	/** Array of cursors for each DB */
+	/** In write txns, array of cursors for each DB */
 	MDB_cursor	**mt_cursors;
 	/** Array of flags for each DB */
 	unsigned char	*mt_dbflags;
@@ -941,7 +943,7 @@ struct MDB_env {
 	pid_t		me_pid;		/**< process ID of this env */
 	char		*me_path;		/**< path to the DB files */
 	char		*me_map;		/**< the memory map of the data file */
-	MDB_txninfo	*me_txns;		/**< the memory map of the lock file */
+	MDB_txninfo	*me_txns;		/**< the memory map of the lock file or NULL */
 	MDB_meta	*me_metas[2];	/**< pointers to the two meta pages */
 	MDB_txn		*me_txn;		/**< current write transaction */
 	size_t		me_mapsize;		/**< size of the data memory map */
@@ -950,7 +952,7 @@ struct MDB_env {
 	txnid_t		me_pgfirst;		/**< ID of first old page record we used */
 	txnid_t		me_pglast;		/**< ID of last old page record we used */
 	MDB_dbx		*me_dbxs;		/**< array of static DB info */
-	uint16_t	*me_dbflags;	/**< array of DB flags */
+	uint16_t	*me_dbflags;	/**< array of flags from MDB_db.md_flags */
 	MDB_oldpages *me_pghead;	/**< list of old page records */
 	MDB_oldpages *me_pgfree;	/**< list of page records to free */
 	pthread_key_t	me_txkey;	/**< thread-key for readers */
-- 
GitLab