diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 5cf8bd75ec0c2d1220eb3641eb9f150705fe5ff3..280b43795c4793c1526f42d926fb58d98c8ff835 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -66,6 +66,7 @@ * * - A thread can only use one transaction at a time, plus any child * transactions. Each transaction belongs to one thread. See below. + * The #MDB_NOTLS flag changes this for read-only transactions. * * - Use an MDB_env* in the process which opened it, without fork()ing. * @@ -249,6 +250,8 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_WRITEMAP 0x80000 /** use asynchronous msync when MDB_WRITEMAP is used */ #define MDB_MAPASYNC 0x100000 + /** tie reader locktable slots to #MDB_txn objects instead of to threads */ +#define MDB_NOTLS 0x200000 /** @} */ /** @defgroup mdb_dbi_open Database Flags @@ -370,7 +373,9 @@ typedef enum MDB_cursor_op { #define MDB_MAP_RESIZED (-30785) /** Database flags changed or would change */ #define MDB_INCOMPATIBLE (-30784) -#define MDB_LAST_ERRCODE MDB_INCOMPATIBLE + /** Invalid reuse of reader locktable slot */ +#define MDB_BAD_RSLOT (-30783) +#define MDB_LAST_ERRCODE MDB_BAD_RSLOT /** @} */ /** @brief Statistics for a database in the environment */ @@ -390,8 +395,8 @@ typedef struct MDB_envinfo { size_t me_mapsize; /**< Size of the data memory map */ size_t me_last_pgno; /**< ID of the last used page */ size_t me_last_txnid; /**< ID of the last committed transaction */ - unsigned int me_maxreaders; /**< maximum number of threads for the environment */ - unsigned int me_numreaders; /**< maximum number of threads used in the environment */ + unsigned int me_maxreaders; /**< max reader slots in the environment */ + unsigned int me_numreaders; /**< max reader slots used in the environment */ } MDB_envinfo; /** @brief Return the mdb library version information. 
@@ -490,6 +495,15 @@ int mdb_env_create(MDB_env **env); * database or lose the last transactions. Calling #mdb_env_sync() * ensures on-disk database integrity until next commit. * This flag may be changed at any time using #mdb_env_set_flags(). + * <li>#MDB_NOTLS + * Don't use Thread-Local Storage. Tie reader locktable slots to + * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps + * the slot reserved for the #MDB_txn object. A thread may use parallel + * read-only transactions. A read-only transaction may span threads if + * the user synchronizes its use. Applications that multiplex many + * user threads over individual OS threads need this option. Such an + * application must also serialize the write transactions in an OS + * thread, since MDB's write locking is unaware of the user threads. * </ul> * @param[in] mode The UNIX permissions to set on created files. This parameter * is ignored on Windows. @@ -624,13 +638,17 @@ int mdb_env_get_path(MDB_env *env, const char **path); */ int mdb_env_set_mapsize(MDB_env *env, size_t size); - /** @brief Set the maximum number of threads for the environment. + /** @brief Set the maximum number of threads/reader slots for the environment. * * This defines the number of slots in the lock table that is used to track readers in the * the environment. The default is 126. + * Starting a read-only transaction normally ties a lock table slot to the + * current thread until the environment closes or the thread exits. If + * MDB_NOTLS is in use, #mdb_txn_begin() instead ties the slot to the + * MDB_txn object until it or the #MDB_env object is destroyed. * This function may only be called after #mdb_env_create() and before #mdb_env_open(). * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] readers The maximum number of threads + * @param[in] readers The maximum number of reader lock table slots * @return A non-zero error value on failure and 0 on success. 
Some possible * errors are: * <ul> @@ -639,7 +657,7 @@ int mdb_env_set_mapsize(MDB_env *env, size_t size); */ int mdb_env_set_maxreaders(MDB_env *env, unsigned int readers); - /** @brief Get the maximum number of threads for the environment. + /** @brief Get the maximum number of threads/reader slots for the environment. * * @param[in] env An environment handle returned by #mdb_env_create() * @param[out] readers Address of an integer to store the number of readers @@ -670,8 +688,9 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); /** @brief Create a transaction for use with the environment. * * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit(). - * @note Transactions may not span threads; a transaction must only be used by a - * single thread. Also, a thread may only have a single transaction. + * @note A transaction and its cursors must only be used by a single + * thread, and a thread may only have a single transaction at a time. + * If #MDB_NOTLS is in use, this does not apply to read-only transactions. * @note Cursors may not span transactions; each cursor must be opened and closed * within a single transaction. * @param[in] env An environment handle returned by #mdb_env_create() @@ -692,7 +711,7 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); * errors are: * <ul> * <li>#MDB_PANIC - a fatal error occurred earlier and the environment -- * must be shut down. + * must be shut down. * <li>#MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's * mapsize and the environment must be shut down. * <li>#MDB_READERS_FULL - a read-only transaction was requested and @@ -728,11 +747,13 @@ void mdb_txn_abort(MDB_txn *txn); /** @brief Reset a read-only transaction. * - * This releases the current reader lock but doesn't free the - * transaction handle, allowing it to be used again later by #mdb_txn_renew(). 
- * It otherwise has the same effect as #mdb_txn_abort() but saves some memory - * allocation/deallocation overhead if a thread is going to start a new - * read-only transaction again soon. + * Abort the transaction like #mdb_txn_abort(), but keep the transaction + * handle. #mdb_txn_renew() may reuse the handle. This saves allocation + * overhead if the process will start a new read-only transaction soon, + * and also locking overhead if #MDB_NOTLS is in use. The reader table + * lock is released, but the table slot stays tied to its thread or + * #MDB_txn. Use mdb_txn_abort() to discard a reset handle, and to free + * its lock table slot if MDB_NOTLS is in use. * All cursors opened within the transaction must be closed before the transaction * is reset. * Reader locks generally don't interfere with writers, but they keep old @@ -1043,8 +1064,8 @@ int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); /** @brief Create a cursor handle. * - * Cursors are associated with a specific transaction and database and - * may not span threads. + * A cursor is associated with a specific transaction and database. + * It must be closed before its transaction ends. * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] dbi A database handle returned by #mdb_dbi_open() * @param[out] cursor Address where the new #MDB_cursor handle will be stored @@ -1065,8 +1086,9 @@ void mdb_cursor_close(MDB_cursor *cursor); /** @brief Renew a cursor handle. * - * Cursors are associated with a specific transaction and database and - * may not span threads. Cursors that are only used in read-only + * A cursor is associated with a specific transaction and database. + * It must be closed before its transaction ends. + * Cursors that are only used in read-only * transactions may be re-used, to avoid unnecessary malloc/free overhead. 
* The cursor may be associated with a new read-only transaction, and * referencing the same database handle as it was created with. diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 8a2875cf089e8f4262a5cc31ce7005c7d414f1da..09fc3e29e8c9a85cb4850f5c172e800d14838904 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -406,6 +406,8 @@ typedef uint16_t indx_t; * slot's address is saved in thread-specific data so that subsequent read * transactions started by the same thread need no further locking to proceed. * + * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. + * * No reader table is used if the database is on a read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't @@ -746,7 +748,8 @@ typedef struct MDB_db { } MDB_db; /** mdb_dbi_open flags */ -#define PERSISTENT_FLAGS 0x7fff +#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) @@ -830,8 +833,7 @@ struct MDB_txn { #define DB_DIRTY 0x01 /**< DB was written in this txn */ #define DB_STALE 0x02 /**< DB record is older than txnID */ #define DB_NEW 0x04 /**< DB handle opened in this txn */ -#define DB_VALID 0x08 /**< DB handle is valid */ -#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ +#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ /** @} */ /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; @@ -934,10 +936,10 @@ struct MDB_env { HANDLE me_mfd; /**< just for writing the meta pages */ /** Failed to update the meta page. Probably an I/O error. */ #define MDB_FATAL_ERROR 0x80000000U - /** Read-only Filesystem. Allow read access, no locking. */ -#define MDB_ROFS 0x40000000U /** Some fields are initialized. 
*/ #define MDB_ENV_ACTIVE 0x20000000U + /** me_txkey is set */ +#define MDB_ENV_TXKEY 0x10000000U uint32_t me_flags; /**< @ref mdb_env */ unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */ unsigned int me_maxreaders; /**< size of the reader table */ @@ -963,8 +965,8 @@ struct MDB_env { MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ /** IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; - /** ID2L of pages that were written during a write txn */ - MDB_ID2 me_dirty_list[MDB_IDL_UM_SIZE]; + /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ + MDB_ID2L me_dirty_list; /** Max number of freelist items that can fit in a single overflow page */ unsigned int me_maxfree_1pg; /** Max size of a node on a page */ @@ -1083,6 +1085,7 @@ static char *const mdb_errstr[] = { "MDB_PAGE_FULL: Internal error - page has no more space", "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", "MDB_INCOMPATIBLE: Database flags changed or would change", + "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", }; char * @@ -1769,9 +1772,7 @@ mdb_txn_reset0(MDB_txn *txn); /** Common code for #mdb_txn_begin() and #mdb_txn_renew(). * @param[in] txn the transaction handle to initialize - * @return 0 on success, non-zero on failure. This can only - * fail for read-only transactions, and then only if the - * reader table is full. + * @return 0 on success, non-zero on failure. */ static int mdb_txn_renew0(MDB_txn *txn) @@ -1786,13 +1787,17 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ if (txn->mt_flags & MDB_TXN_RDONLY) { - if (env->me_flags & MDB_ROFS) { + if (!env->me_txns) { i = mdb_env_pick_meta(env); txn->mt_txnid = env->me_metas[i]->mm_txnid; txn->mt_u.reader = NULL; } else { - MDB_reader *r = pthread_getspecific(env->me_txkey); - if (!r) { + MDB_reader *r = (env->me_flags & MDB_NOTLS) ? 
txn->mt_u.reader : + pthread_getspecific(env->me_txkey); + if (r) { + if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) + return MDB_BAD_RSLOT; + } else { pid_t pid = env->me_pid; pthread_t tid = pthread_self(); @@ -1812,7 +1817,8 @@ mdb_txn_renew0(MDB_txn *txn) env->me_numreaders = env->me_txns->mti_numreaders; UNLOCK_MUTEX_R(env); r = &env->me_txns->mti_readers[i]; - if ((rc = pthread_setspecific(env->me_txkey, r)) != 0) { + if (!(env->me_flags & MDB_NOTLS) && + (rc = pthread_setspecific(env->me_txkey, r)) != 0) { env->me_txns->mti_readers[i].mr_pid = 0; return rc; } @@ -1844,7 +1850,8 @@ mdb_txn_renew0(MDB_txn *txn) /* Copy the DB info and flags */ memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db)); for (i=2; i<txn->mt_numdbs; i++) { - txn->mt_dbs[i].md_flags = x = env->me_dbflags[i]; + x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; } txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; @@ -1862,7 +1869,7 @@ mdb_txn_renew(MDB_txn *txn) { int rc; - if (! (txn && (txn->mt_flags & MDB_TXN_RDONLY))) + if (!txn || txn->mt_numdbs || !(txn->mt_flags & MDB_TXN_RDONLY)) return EINVAL; if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { @@ -1979,6 +1986,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) } /** Common code for #mdb_txn_reset() and #mdb_txn_abort(). + * May be called twice for readonly txns: First reset it, then abort. 
* @param[in] txn the transaction handle to reset */ static void @@ -1998,8 +2006,12 @@ mdb_txn_reset0(MDB_txn *txn) } if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - if (!(env->me_flags & MDB_ROFS)) + if (txn->mt_u.reader) { txn->mt_u.reader->mr_txnid = (txnid_t)-1; + if (!(env->me_flags & MDB_NOTLS)) + txn->mt_u.reader = NULL; /* txn does not own reader */ + } + txn->mt_numdbs = 0; /* mark txn as reset, do not close DBs again */ } else { MDB_page *dp; @@ -2061,6 +2073,10 @@ mdb_txn_reset(MDB_txn *txn) txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); + /* This call is only valid for read-only txns */ + if (!(txn->mt_flags & MDB_TXN_RDONLY)) + return; + mdb_txn_reset0(txn); } @@ -2078,6 +2094,10 @@ mdb_txn_abort(MDB_txn *txn) mdb_txn_abort(txn->mt_child); mdb_txn_reset0(txn); + /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */ + if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) + txn->mt_u.reader->mr_pid = 0; + free(txn); } @@ -2769,13 +2789,8 @@ mdb_env_create(MDB_env **env) if (!e) return ENOMEM; - e->me_free_pgs = mdb_midl_alloc(); - if (!e->me_free_pgs) { - free(e); - return ENOMEM; - } e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = 2; + e->me_maxdbs = e->me_numdbs = 2; e->me_fd = INVALID_HANDLE_VALUE; e->me_lfd = INVALID_HANDLE_VALUE; e->me_mfd = INVALID_HANDLE_VALUE; @@ -3192,65 +3207,69 @@ mdb_hash_hex(MDB_val *val, char *hexbuf) * @param[in] lpath The pathname of the file used for the lock region. * @param[in] mode The Unix permissions for the file, if we create it. * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive + * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive * @return 0 on success, non-zero on failure. 
*/ static int mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { +#ifdef _WIN32 +# define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT +#else +# define MDB_ERRCODE_ROFS EROFS +#ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */ +# define MDB_CLOEXEC O_CLOEXEC +#else + int fdflags; +# define MDB_CLOEXEC 0 +#endif +#endif int rc; off_t size, rsize; - *excl = -1; - #ifdef _WIN32 - if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, + env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) { + FILE_ATTRIBUTE_NORMAL, NULL); +#else + env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); +#endif + if (env->me_lfd == INVALID_HANDLE_VALUE) { rc = ErrCode(); - if (rc == ERROR_WRITE_PROTECT && (env->me_flags & MDB_RDONLY)) { - env->me_flags |= MDB_ROFS; + if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { return MDB_SUCCESS; } goto fail_errno; } - /* Try to get exclusive lock. If we succeed, then - * nobody is using the lock region and we should initialize it. - */ - if ((rc = mdb_env_excl_lock(env, excl))) goto fail; - size = GetFileSize(env->me_lfd, NULL); - -#else -#if !(O_CLOEXEC) - { - int fdflags; - if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) { - rc = ErrCode(); - if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { - env->me_flags |= MDB_ROFS; - return MDB_SUCCESS; - } - goto fail_errno; - } - /* Lose record locks when exec*() */ - if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) +#if ! 
((MDB_CLOEXEC) || defined(_WIN32)) + /* Lose record locks when exec*() */ + if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) fcntl(env->me_lfd, F_SETFD, fdflags); - } -#else /* O_CLOEXEC on Linux: Open file and set FD_CLOEXEC atomically */ - if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT|O_CLOEXEC, mode)) == -1) { - rc = ErrCode(); - if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { - env->me_flags |= MDB_ROFS; - return MDB_SUCCESS; +#endif + + if (!(env->me_flags & MDB_NOTLS)) { + rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); + if (rc) + goto fail; + env->me_flags |= MDB_ENV_TXKEY; +#ifdef _WIN32 + /* Windows TLS callbacks need help finding their TLS info. */ + if (mdb_tls_nkeys >= MAX_TLS_KEYS) { + rc = MDB_TLS_FULL; + goto fail; } - goto fail_errno; - } + mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; #endif + } /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. */ if ((rc = mdb_env_excl_lock(env, excl))) goto fail; +#ifdef _WIN32 + size = GetFileSize(env->me_lfd, NULL); +#else size = lseek(env->me_lfd, 0, SEEK_END); #endif rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); @@ -3406,12 +3425,12 @@ fail: * environment and re-opening it with the new flags. 
*/ #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC) -#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS) int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) { - int oflags, rc, len, excl; + int oflags, rc, len, excl = -1; char *lpath, *dpath; if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) @@ -3436,11 +3455,27 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode sprintf(dpath, "%s" DATANAME, path); } + rc = MDB_SUCCESS; flags |= env->me_flags; - /* silently ignore WRITEMAP if we're only getting read access */ - if (F_ISSET(flags, MDB_RDONLY|MDB_WRITEMAP)) - flags ^= MDB_WRITEMAP; + if (flags & MDB_RDONLY) { + /* silently ignore WRITEMAP when we're only getting read access */ + flags &= ~MDB_WRITEMAP; + } else { + if (!((env->me_free_pgs = mdb_midl_alloc()) && + (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + rc = ENOMEM; + } env->me_flags = flags |= MDB_ENV_ACTIVE; + if (rc) + goto leave; + + env->me_path = strdup(path); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + if (!(env->me_dbxs && env->me_path && env->me_dbflags)) { + rc = ENOMEM; + goto leave; + } rc = mdb_env_setup_locks(env, lpath, mode, &excl); if (rc) @@ -3490,29 +3525,9 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode } } DPRINTF("opened dbenv %p", (void *) env); - rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); - if (rc) - goto leave; - env->me_numdbs = 2; /* this notes that me_txkey was set */ -#ifdef _WIN32 - /* Windows TLS callbacks need help finding their TLS info. 
*/ - if (mdb_tls_nkeys < MAX_TLS_KEYS) - mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; - else { - rc = MDB_TLS_FULL; - goto leave; - } -#endif if (excl > 0) { rc = mdb_env_share_locks(env, &excl); - if (rc) - goto leave; } - env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); - env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); - env->me_path = strdup(path); - if (!env->me_dbxs || !env->me_dbflags || !env->me_path) - rc = ENOMEM; } leave: @@ -3535,8 +3550,11 @@ mdb_env_close0(MDB_env *env, int excl) free(env->me_dbflags); free(env->me_dbxs); free(env->me_path); + free(env->me_dirty_list); + if (env->me_free_pgs) + mdb_midl_free(env->me_free_pgs); - if (env->me_numdbs) { + if (env->me_flags & MDB_ENV_TXKEY) { pthread_key_delete(env->me_txkey); #ifdef _WIN32 /* Delete our key from the global list */ @@ -3602,7 +3620,7 @@ mdb_env_close0(MDB_env *env, int excl) close(env->me_lfd); } - env->me_flags &= ~MDB_ENV_ACTIVE; + env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); } int @@ -3661,7 +3679,7 @@ mdb_env_copy(MDB_env *env, const char *path) if (rc) goto leave; - if (!(env->me_flags & MDB_ROFS)) { + if (env->me_txns) { /* We must start the actual read txn after blocking writers */ mdb_txn_reset0(txn); @@ -3686,7 +3704,7 @@ mdb_env_copy(MDB_env *env, const char *path) rc = write(newfd, env->me_map, wsize); rc = (rc == (int)wsize) ? MDB_SUCCESS : ErrCode(); #endif - if (! (env->me_flags & MDB_ROFS)) + if (env->me_txns) UNLOCK_MUTEX_W(env); if (rc) @@ -3752,7 +3770,6 @@ mdb_env_close(MDB_env *env) } mdb_env_close0(env, 0); - mdb_midl_free(env->me_free_pgs); free(env); } @@ -4111,6 +4128,28 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) return MDB_SUCCESS; } +/** Search for the lowest key under the current branch page. + * This just bypasses a NUMKEYS check in the current page + * before calling mdb_page_search_root(), because the callers + * are all in situations where the current page is known to + * be underfilled. 
+ */ +static int +mdb_page_search_lowest(MDB_cursor *mc) +{ + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NODEPTR(mp, 0); + int rc; + + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp))) + return rc; + + mc->mc_ki[mc->mc_top] = 0; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + return mdb_page_search_root(mc, NULL, 0); +} + /** Search for the page a given key should be in. * Pushes parent pages on the cursor stack. This function just sets up * the search; it finds the root page for \b mc's database and sets this @@ -4688,19 +4727,19 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (!(mc->mc_flags & C_EOF)) { - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - MDB_val lkey; + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + MDB_val lkey; - lkey.mv_size = MDB_MAXKEYSIZE+1; - lkey.mv_data = NULL; - rc = mdb_page_search(mc, &lkey, 0); - if (rc != MDB_SUCCESS) - return rc; - } - assert(IS_LEAF(mc->mc_pg[mc->mc_top])); + lkey.mv_size = MDB_MAXKEYSIZE+1; + lkey.mv_data = NULL; + rc = mdb_page_search(mc, &lkey, 0); + if (rc != MDB_SUCCESS) + return rc; + } + assert(IS_LEAF(mc->mc_pg[mc->mc_top])); - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; } + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; mc->mc_flags |= C_INITIALIZED|C_EOF; leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); @@ -5823,13 +5862,19 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { + unsigned flags; + if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) return EINVAL; if (txn->mt_cursors) return EINVAL; + flags = mc->mc_flags; + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + + mc->mc_flags |= (flags & C_ALLOCD); return MDB_SUCCESS; } @@ -6001,7 +6046,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) unsigned int snum = csrc->mc_snum; MDB_node *s2; /* must find the lowest key below src */ - mdb_page_search_root(csrc, NULL, 
0); + mdb_page_search_lowest(csrc); if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { key.mv_size = csrc->mc_db->md_pad; key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); @@ -6024,7 +6069,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) MDB_node *s2; MDB_val bkey; /* must find the lowest key below dst */ - mdb_page_search_root(cdst, NULL, 0); + mdb_page_search_lowest(cdst); if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) { bkey.mv_size = cdst->mc_db->md_pad; bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size); @@ -6188,7 +6233,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) unsigned int snum = csrc->mc_snum; MDB_node *s2; /* must find the lowest key below src */ - mdb_page_search_root(csrc, NULL, 0); + mdb_page_search_lowest(csrc); if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { key.mv_size = csrc->mc_db->md_pad; key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); @@ -6295,6 +6340,7 @@ mdb_rebalance(MDB_cursor *mc) unsigned int ptop, minkeys; MDB_cursor mn; + minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); #if MDB_DEBUG { pgno_t pgno; @@ -6305,7 +6351,8 @@ mdb_rebalance(MDB_cursor *mc) } #endif - if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD) { + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { #if MDB_DEBUG pgno_t pgno; COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno); @@ -6317,6 +6364,10 @@ mdb_rebalance(MDB_cursor *mc) if (mc->mc_snum < 2) { MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + DPUTS("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; + } if (NUMKEYS(mp) == 0) { DPUTS("tree is completely empty"); mc->mc_db->md_root = P_INVALID;