Commit 4009c9ae authored by Howard Chu's avatar Howard Chu
Browse files

Add mdb_env_copy2()

And mdb_env_copyfd2(). Perform compaction on the copy. Trims out
freed pages and renumbers data pages in sequential order. This is
more CPU-intensive since it copies and modifies data pages.
parent 4d65cf13
......@@ -622,6 +622,43 @@ int mdb_env_copy(MDB_env *env, const char *path);
*/
int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd);
/** @brief Copy an LMDB environment to the specified path, with compaction.
*
* This function may be used to make a backup of an existing environment.
* No lockfile is created, since it gets recreated at need. Unlike
* #mdb_env_copy(), which copies all pages from the environment, this
* function trims freed/unused pages from the copy and renumbers the
* remaining pages in sequential order. Because pages are copied and
* modified rather than written straight from the map, this function may
* execute more slowly than #mdb_env_copy() and will use more CPU time.
* @note This call can trigger significant file size growth if run in
* parallel with write transactions, because it employs a read-only
* transaction. See long-lived transactions under @ref caveats_sec.
* @param[in] env An environment handle returned by #mdb_env_create(). It
* must have already been opened successfully.
* @param[in] path The directory in which the copy will reside. This
* directory must already exist and be writable but must otherwise be
* empty.
* @return A non-zero error value on failure and 0 on success.
*/
int mdb_env_copy2(MDB_env *env, const char *path);
/** @brief Copy an LMDB environment to the specified file descriptor,
* with compaction.
*
* This function may be used to make a backup of an existing environment.
* No lockfile is created, since it gets recreated at need. See
* #mdb_env_copy2() for further details.
* @note This call can trigger significant file size growth if run in
* parallel with write transactions, because it employs a read-only
* transaction. See long-lived transactions under @ref caveats_sec.
* @param[in] env An environment handle returned by #mdb_env_create(). It
* must have already been opened successfully.
* @param[in] fd The file descriptor to write the copy to. It must
* have already been opened for write access.
* @return A non-zero error value on failure and 0 on success.
*/
int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd);
/** @brief Return statistics about the LMDB environment.
*
* @param[in] env An environment handle returned by #mdb_env_create()
......
......@@ -3301,6 +3301,20 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
return 0;
}
/** Fill in the meta-page fields shared by every freshly initialised meta page.
 * Only writes into @p meta; nothing is allocated and nothing is written to disk.
 * @param[in] env the environment handle supplying map size, page size and flags
 * @param[out] meta the meta structure to populate
 */
static void
mdb_env_init_meta0(MDB_env *env, MDB_meta *meta)
{
	unsigned int dbn;

	meta->mm_magic = MDB_MAGIC;
	meta->mm_version = MDB_DATA_VERSION;
	meta->mm_mapsize = env->me_mapsize;
	meta->mm_psize = env->me_psize;
	/* Pages 0 and 1 hold the two meta pages, so page 1 is the last one in use */
	meta->mm_last_pg = 1;
	/* Low 16 bits of the environment flags, with MDB_INTEGERKEY always set */
	meta->mm_flags = (env->me_flags & 0xffff) | MDB_INTEGERKEY;
	/* Neither of the two built-in DB records has a root page yet */
	for (dbn = 0; dbn < 2; dbn++)
		meta->mm_dbs[dbn].md_root = P_INVALID;
}
/** Write the environment parameters of a freshly created DB environment.
* @param[in] env the environment handle
* @param[out] meta address of where to store the meta information
......@@ -3330,15 +3344,7 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
psize = env->me_psize;
meta->mm_magic = MDB_MAGIC;
meta->mm_version = MDB_DATA_VERSION;
meta->mm_mapsize = env->me_mapsize;
meta->mm_psize = psize;
meta->mm_last_pg = 1;
meta->mm_flags = env->me_flags & 0xffff;
meta->mm_flags |= MDB_INTEGERKEY;
meta->mm_dbs[0].md_root = P_INVALID;
meta->mm_dbs[1].md_root = P_INVALID;
mdb_env_init_meta0(env, meta);
p = calloc(2, psize);
p->mp_pgno = 0;
......@@ -4443,167 +4449,6 @@ mdb_env_close0(MDB_env *env, int excl)
env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
}
/** Copy the environment, page for page, to an already-open file handle.
 *
 * The two meta pages are written while writers are blocked so that they
 * match the read transaction's snapshot; the remaining pages are then
 * streamed straight from the memory map in chunks of at most MAX_WRITE.
 * @param[in] env an opened environment handle
 * @param[in] fd destination handle, opened for writing; must be blocking
 * @return 0 on success, otherwise an error code (ErrCode() value, EIO when
 * a write reports zero bytes, or whatever the transaction calls return).
 */
int
mdb_env_copyfd(MDB_env *env, HANDLE fd)
{
	MDB_txn *txn = NULL;
	int rc;
	size_t wsize, total;
	char *ptr;
#ifdef _WIN32
	DWORD len, w2;
#define DO_WRITE(rc, fd, ptr, w2, len)	rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
	ssize_t len;
	size_t w2;
#define DO_WRITE(rc, fd, ptr, w2, len)	len = write(fd, ptr, w2); rc = (len >= 0)
#endif

	/* Do the lock/unlock of the reader mutex before starting the
	 * write txn.  Otherwise other read txns could block writers.
	 */
	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		return rc;

	if (env->me_txns) {
		/* We must start the actual read txn after blocking writers */
		mdb_txn_reset0(txn, "reset-stage1");

		/* Temporarily block writers until we snapshot the meta pages */
		LOCK_MUTEX_W(env);

		rc = mdb_txn_renew0(txn);
		if (rc) {
			UNLOCK_MUTEX_W(env);
			goto leave;
		}
	}

	/* Stage 1: the two meta pages, written while writers are blocked */
	wsize = env->me_psize * 2;
	ptr = env->me_map;
	w2 = wsize;
	while (w2 > 0) {
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			w2 -= len;
			continue;
		} else {
			/* Non-blocking or async handles are not supported */
			rc = EIO;
			break;
		}
	}
	if (env->me_txns)
		UNLOCK_MUTEX_W(env);

	if (rc)
		goto leave;

	/* Stage 2: everything up to the snapshot's next free page, clamped to
	 * the current size of the data file. The byte count is kept in a
	 * size_t of its own: w2 is only a DWORD on Windows and would truncate
	 * for environments larger than 4 GiB.
	 */
	total = txn->mt_next_pgno * env->me_psize;
#ifdef _WIN32
	{
		LARGE_INTEGER fsize;
		if (!GetFileSizeEx(env->me_fd, &fsize)) {
			rc = ErrCode();
			goto leave;
		}
		if (total > (size_t)fsize.QuadPart)
			total = (size_t)fsize.QuadPart;
	}
#else
	{
		struct stat st;
		if (fstat(env->me_fd, &st) != 0) {
			rc = ErrCode();
			goto leave;
		}
		if (total > (size_t)st.st_size)
			total = (size_t)st.st_size;
	}
#endif
	/* The meta pages are already out; never let the subtraction wrap */
	wsize = (total > wsize) ? total - wsize : 0;
	while (wsize > 0) {
		if (wsize > MAX_WRITE)
			w2 = MAX_WRITE;
		else
			w2 = wsize;
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			wsize -= len;
			continue;
		} else {
			rc = EIO;
			break;
		}
	}

leave:
	mdb_txn_abort(txn);
	return rc;
}
/** Copy the environment to a newly created data file and return the result
 * of #mdb_env_copyfd() on it.
 *
 * With MDB_NOSUBDIR set in the environment flags, @p path itself names the
 * destination file; otherwise DATANAME is appended to @p path.
 * @param[in] env an opened environment handle
 * @param[in] path destination path as described above
 * @return 0 on success, ENOMEM if the file name cannot be allocated, or an
 * ErrCode() value from creating, configuring, writing or closing the file.
 */
int
mdb_env_copy(MDB_env *env, const char *path)
{
	int rc, namelen;
	char *target;
	HANDLE outfd = INVALID_HANDLE_VALUE;

	if (!(env->me_flags & MDB_NOSUBDIR)) {
		/* sizeof(DATANAME) already accounts for the terminating NUL */
		namelen = strlen(path);
		namelen += sizeof(DATANAME);
		target = malloc(namelen);
		if (target == NULL)
			return ENOMEM;
		sprintf(target, "%s" DATANAME, path);
	} else {
		target = (char *)path;
	}

	/* The destination path must exist, but the destination file must not.
	 * We don't want the OS to cache the writes, since the source data is
	 * already in the OS cache.
	 */
#ifdef _WIN32
	outfd = CreateFile(target, GENERIC_WRITE, 0, NULL, CREATE_NEW,
				FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
#else
	outfd = open(target, O_WRONLY|O_CREAT|O_EXCL, 0666);
#endif
	if (outfd == INVALID_HANDLE_VALUE) {
		rc = ErrCode();
		goto leave;
	}

#ifdef O_DIRECT
	/* Set O_DIRECT if the file system supports it; failure is ignored */
	rc = fcntl(outfd, F_GETFL);
	if (rc != -1)
		(void) fcntl(outfd, F_SETFL, rc | O_DIRECT);
#endif
#ifdef F_NOCACHE	/* __APPLE__ */
	rc = fcntl(outfd, F_NOCACHE, 1);
	if (rc) {
		rc = ErrCode();
		goto leave;
	}
#endif

	rc = mdb_env_copyfd(env, outfd);

leave:
	/* Only free the name if it was allocated above */
	if (!(env->me_flags & MDB_NOSUBDIR))
		free(target);
	/* A close failure is reported only when nothing failed before it */
	if (outfd != INVALID_HANDLE_VALUE && close(outfd) < 0 && rc == MDB_SUCCESS)
		rc = ErrCode();

	return rc;
}
void
mdb_env_close(MDB_env *env)
......@@ -8165,6 +8010,489 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
return mdb_cursor_put(&mc, key, data, flags);
}
/** Size in bytes of each of the two staging buffers used by the compacting copy */
#define WBUF (64*1024)
/** State shared between the tree walker and the writer thread of a
 * compacting copy. Most per-slot arrays have two entries, indexed by slot
 * number (0 or 1): one slot is being filled while the other is being written.
 */
typedef struct mdb_copy {
/* One mutex per slot: held by the walker while it fills the slot and by
 * the writer thread while it writes it out.
 */
pthread_mutex_t mc_mutex[2];
/* Staging buffers of WBUF bytes each; both live inside the mc_free allocation */
char *mc_wbuf[2];
/* Extra data the writer thread sends right after mc_wbuf[slot] when
 * mc_olen[slot] is non-zero (the remainder of a multi-page overflow record).
 */
char *mc_over[2];
/* Spare per-slot pointer; mdb_env_copythr() never reads this field */
void *mc_obuf[2];
/* Base of the aligned allocation backing both staging buffers; freed when the copy ends */
void *mc_free;
/* Environment being copied */
MDB_env *mc_env;
/* Read-only transaction whose snapshot is being copied */
MDB_txn *mc_txn;
/* Bytes staged in mc_wbuf[slot]; the writer thread exits when it finds 0 here */
int mc_wlen[2];
/* Bytes to write from mc_over[slot] after the staging buffer; reset to 0 by the writer */
int mc_olen[2];
/* Page number that the next page emitted into the copy will receive */
pgno_t mc_next_pgno;
/* Destination handle */
HANDLE mc_fd;
/* 0 while writing succeeds; set by the writer thread to the error that stopped it */
int mc_status;
/* Slot currently owned by the walker */
int mc_toggle;
} mdb_copy;
/** Writer thread of the compacting copy.
 *
 * Alternates between the two slots of the #mdb_copy passed in @p arg.
 * For each slot it takes the slot mutex, writes the staged buffer and then
 * any pending overflow tail to mc_fd, releases the mutex and moves to the
 * other slot. A slot with mc_wlen == 0 ends the thread; a failed write
 * stores the error in mc_status and ends the thread as well.
 * @param[in] arg pointer to the shared #mdb_copy state
 * @return always NULL
 */
static void *
mdb_env_copythr(void *arg)
{
	mdb_copy *ctx = arg;
	char *out;
	int remaining;
	int slot = 0, nwritten, rc;
#ifdef _WIN32
#define DO_WRITE(rc, fd, ptr, w2, len)	rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
#define DO_WRITE(rc, fd, ptr, w2, len)	len = write(fd, ptr, w2); rc = (len >= 0)
#endif

	for (;;) {
		pthread_mutex_lock(&ctx->mc_mutex[slot]);
		if (ctx->mc_wlen[slot] == 0) {
			/* An empty slot is the signal to stop */
			pthread_mutex_unlock(&ctx->mc_mutex[slot]);
			break;
		}
		remaining = ctx->mc_wlen[slot];
		out = ctx->mc_wbuf[slot];

		/* First pass writes the staging buffer, an optional second pass
		 * writes the overflow tail.
		 */
		for (;;) {
			while (remaining > 0) {
				DO_WRITE(rc, ctx->mc_fd, out, remaining, nwritten);
				if (!rc) {
					rc = ErrCode();
					break;
				}
				if (nwritten <= 0) {
					/* A write that makes no progress is treated as an I/O error */
					rc = EIO;
					break;
				}
				rc = MDB_SUCCESS;
				out += nwritten;
				remaining -= nwritten;
			}
			if (rc || !ctx->mc_olen[slot])
				break;
			/* If there's an overflow page tail, write it too */
			remaining = ctx->mc_olen[slot];
			out = ctx->mc_over[slot];
			ctx->mc_olen[slot] = 0;
		}

		if (rc) {
			ctx->mc_status = rc;
			pthread_mutex_unlock(&ctx->mc_mutex[slot]);
			break;
		}
		pthread_mutex_unlock(&ctx->mc_mutex[slot]);
		slot ^= 1;
	}
	return NULL;
#undef DO_WRITE
}
/** Hand the walker's current slot to the writer thread and claim the other one.
 *
 * Releases the mutex of the slot in mc_toggle, then takes the mutex of the
 * opposite slot. If the writer thread has recorded an error, that mutex is
 * released again and the error is returned, leaving the caller holding no
 * slot mutex and mc_toggle unchanged. Otherwise the newly claimed slot is
 * emptied and becomes the current one.
 * NOTE(review): nothing here waits for the writer to have picked up the
 * slot that was just released; the handoff appears to rely on the writer
 * acquiring it before the walker comes back around - confirm.
 * @param[in,out] my the shared copy state
 * @return 0 on success, else the writer thread's error code
 */
static int
mdb_env_cthr_toggle(mdb_copy *my)
{
	int other = my->mc_toggle ^ 1;

	pthread_mutex_unlock(&my->mc_mutex[my->mc_toggle]);
	pthread_mutex_lock(&my->mc_mutex[other]);

	if (my->mc_status != 0) {
		pthread_mutex_unlock(&my->mc_mutex[other]);
		return my->mc_status;
	}

	/* Start the claimed slot from scratch */
	my->mc_wlen[other] = 0;
	my->mc_olen[other] = 0;
	my->mc_toggle = other;
	return 0;
}
/** Walk one B-tree depth-first and append its pages, renumbered, to the copy.
 *
 * Pages are emitted children-first, so the root of the tree is always the
 * last page this call emits; on return its new page number is
 * my->mc_next_pgno - 1. Every page gets the next sequential number from
 * my->mc_next_pgno, and each reference to a renumbered page is rewritten in
 * a private copy of the referring page before that page is staged:
 * child pointers in branch pages, overflow page numbers in F_BIGDATA
 * nodes, and the root stored in F_SUBDATA nodes. Pages in the map itself
 * are never modified.
 *
 * Sub-databases found in leaf nodes are walked recursively.
 * NOTE(review): NODEPTR() is applied to the entries of every leaf page; if
 * some leaf layouts in this file do not carry node headers they need to be
 * skipped here - confirm.
 * @param[in,out] my the shared copy state; the caller must own slot my->mc_toggle
 * @param[in] pg root page number of the tree; P_INVALID means the tree is
 * empty and nothing is emitted
 * @return 0 on success, ENOMEM, or an error from page lookup or from the
 * writer thread.
 */
static int
mdb_env_cwalk(mdb_copy *my, pgno_t pg)
{
	MDB_cursor mc;
	MDB_txn *txn = my->mc_txn;
	MDB_node *ni;
	MDB_page *mo, *mp, *leaf;
	char *buf, *ptr;
	int rc, toggle;
	unsigned int i;

	/* Empty tree: there is no root page to fetch */
	if (pg == P_INVALID)
		return MDB_SUCCESS;

	mc.mc_snum = 1;
	mc.mc_top = 0;
	mc.mc_txn = txn;

	rc = mdb_page_get(my->mc_txn, pg, &mc.mc_pg[0], NULL);
	if (rc)
		return rc;
	rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST);
	if (rc)
		return rc;

	/* Make cursor pages writable: one private page per stack level.
	 * Levels 0..mc_top-1 hold the branch pages; the extra last page is
	 * scratch space for whichever leaf needs its nodes patched. Sizing by
	 * mc_top+1 also avoids a zero-byte request for a single-leaf tree.
	 */
	buf = ptr = malloc(my->mc_env->me_psize * (mc.mc_top + 1));
	if (buf == NULL)
		return ENOMEM;

	for (i=0; i<mc.mc_top; i++) {
		mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
		mc.mc_pg[i] = (MDB_page *)ptr;
		ptr += my->mc_env->me_psize;
	}
	leaf = (MDB_page *)ptr;

	toggle = my->mc_toggle;
	while (mc.mc_snum > 0) {
		unsigned n;
		mp = mc.mc_pg[mc.mc_top];
		n = NUMKEYS(mp);

		if (IS_LEAF(mp)) {
			for (i=0; i<n; i++) {
				ni = NODEPTR(mp, i);
				if (ni->mn_flags & F_BIGDATA) {
					MDB_page *omp;
					pgno_t opg;

					memcpy(&opg, NODEDATA(ni), sizeof(opg));
					rc = mdb_page_get(txn, opg, &omp, NULL);
					if (rc)
						goto done;

					/* The node must point at the overflow page's new
					 * number, so patch it in a private copy of the leaf.
					 */
					if (mp != leaf) {
						mdb_page_copy(leaf, mp, my->mc_env->me_psize);
						mc.mc_pg[mc.mc_top] = leaf;
						mp = leaf;
						ni = NODEPTR(mp, i);
					}
					memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t));

					if (my->mc_wlen[toggle] >= WBUF) {
						rc = mdb_env_cthr_toggle(my);
						if (rc)
							goto done;
						toggle ^= 1;
					}
					/* Stage the first overflow page with its new number */
					mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
					memcpy(mo, omp, my->mc_env->me_psize);
					mo->mp_pgno = my->mc_next_pgno;
					my->mc_next_pgno += omp->mp_pages;
					my->mc_wlen[toggle] += my->mc_env->me_psize;
					/* The remaining pages are written directly from the
					 * map; mc_over is the field the writer thread reads.
					 */
					my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
					my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
					rc = mdb_env_cthr_toggle(my);
					if (rc)
						goto done;
					toggle ^= 1;
				} else if (ni->mn_flags & F_SUBDATA) {
					MDB_db db;

					memcpy(&db, NODEDATA(ni), sizeof(db));
					my->mc_toggle = toggle;
					rc = mdb_env_cwalk(my, db.md_root);
					if (rc)
						goto done;
					toggle = my->mc_toggle;

					if (db.md_root != P_INVALID) {
						/* The nested walk emitted its root last */
						db.md_root = my->mc_next_pgno - 1;
						if (mp != leaf) {
							mdb_page_copy(leaf, mp, my->mc_env->me_psize);
							mc.mc_pg[mc.mc_top] = leaf;
							mp = leaf;
							ni = NODEPTR(mp, i);
						}
						memcpy(NODEDATA(ni), &db, sizeof(db));
					}
				}
			}
		} else {
			/* Branch page: move on to its next child, if any is left */
			mc.mc_ki[mc.mc_top]++;
			if (mc.mc_ki[mc.mc_top] < n) {
				pgno_t cpg;
again:
				ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
				cpg = NODEPGNO(ni);
				rc = mdb_page_get(txn, cpg, &mp, NULL);
				if (rc)
					goto done;
				mc.mc_top++;
				mc.mc_snum++;
				mc.mc_ki[mc.mc_top] = 0;
				if (IS_BRANCH(mp)) {
					/* Refresh this level's private copy and keep descending */
					mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
					goto again;
				} else
					mc.mc_pg[mc.mc_top] = mp;
				continue;
			}
		}

		/* All children of mp are out: point the parent's private copy at
		 * the number mp is about to receive, then stage mp itself.
		 */
		if (mc.mc_top) {
			ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
			SETPGNO(ni, my->mc_next_pgno);
		}
		if (my->mc_wlen[toggle] >= WBUF) {
			rc = mdb_env_cthr_toggle(my);
			if (rc)
				goto done;
			toggle ^= 1;
		}
		mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
		mdb_page_copy(mo, mp, my->mc_env->me_psize);
		mo->mp_pgno = my->mc_next_pgno++;
		my->mc_wlen[toggle] += my->mc_env->me_psize;
		mdb_cursor_pop(&mc);
	}
done:
	free(buf);
	return rc;
}
/** Copy the environment, with compaction, to an already-open file handle.
 *
 * Builds two fresh meta pages from a read-only snapshot, then walks the
 * main database with mdb_env_cwalk() while a helper thread
 * (mdb_env_copythr()) writes the staged pages to @p fd.
 *
 * Both meta pages are synthesised from the transaction, not copied from the
 * map, so writers are not blocked while the snapshot is taken.
 * NOTE(review): if mdb_env_cthr_toggle() fails, no slot mutex is held when
 * the final unlock below runs; telling that case apart reliably needs a
 * change to the toggle contract.
 * @param[in] env an opened environment handle
 * @param[in] fd destination handle, opened for writing
 * @return 0 on success, otherwise an error from allocation, thread
 * creation, the transaction/cursor calls, the tree walk or the writer thread.
 */
int
mdb_env_copyfd2(MDB_env *env, HANDLE fd)
{
	MDB_meta *mm;
	MDB_page *mp;
	mdb_copy my;
	MDB_txn *txn = NULL;
	pthread_t thr;
	pgno_t root;
	int rc;

	rc = posix_memalign(&my.mc_free, env->me_psize, WBUF*2);
	if (rc)
		return rc;
	my.mc_wbuf[0] = my.mc_free;
	/* Offset the char pointer: arithmetic on void * is not standard C */
	my.mc_wbuf[1] = my.mc_wbuf[0] + WBUF;
	pthread_mutex_init(&my.mc_mutex[0], NULL);
	pthread_mutex_init(&my.mc_mutex[1], NULL);
	my.mc_wlen[0] = 0;
	my.mc_wlen[1] = 0;
	my.mc_olen[0] = 0;
	my.mc_olen[1] = 0;
	my.mc_next_pgno = 2;
	my.mc_status = 0;
	my.mc_toggle = 0;
	my.mc_env = env;
	my.mc_fd = fd;

	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		goto cleanup;	/* no txn to abort, but the buffer and mutexes exist */

	/* Meta page 0: defaults for a fresh environment */
	mp = (MDB_page *)my.mc_wbuf[0];
	memset(mp, 0, 2*env->me_psize);
	mp->mp_pgno = 0;
	mp->mp_flags = P_META;
	mm = (MDB_meta *)METADATA(mp);
	mdb_env_init_meta0(env, mm);
	mm->mm_address = env->me_metas[0]->mm_address;

	/* Meta page 1: starts as a copy of page 0, completed below */
	mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
	mp->mp_pgno = 1;
	mp->mp_flags = P_META;
	*(MDB_meta *)METADATA(mp) = *mm;
	mm = (MDB_meta *)METADATA(mp);

	root = txn->mt_dbs[1].md_root;

	/* Count the number of free pages, subtract from lastpg to find
	 * number of active pages
	 */
	{
		MDB_ID freecount = 0;
		MDB_cursor mc;
		MDB_val key, data;

		mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
		while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
			freecount += *(MDB_ID *)data.mv_data;
		/* Anything but running off the end means the count is incomplete */
		if (rc != MDB_NOTFOUND)
			goto leave;
		/* The free-list DB's own pages are not copied either */
		freecount += txn->mt_dbs[0].md_branch_pages +
			txn->mt_dbs[0].md_leaf_pages +
			txn->mt_dbs[0].md_overflow_pages;

		/* Set metapage 1 */
		mm->mm_last_pg = txn->mt_next_pgno - freecount - 1;
		mm->mm_dbs[1] = txn->mt_dbs[1];
		/* The walk emits the root last, so it lands on the last page.
		 * An empty main DB keeps its P_INVALID root.
		 */
		if (root != P_INVALID)
			mm->mm_dbs[1].md_root = mm->mm_last_pg;
		mm->mm_txnid = 1;
	}
	my.mc_wlen[0] = env->me_psize * 2;
	my.mc_txn = txn;

	/* Own slot 0 before the writer exists so it cannot run ahead of us */
	pthread_mutex_lock(&my.mc_mutex[0]);
	rc = pthread_create(&thr, NULL, mdb_env_copythr, &my);
	if (rc) {
		pthread_mutex_unlock(&my.mc_mutex[0]);
		goto leave;
	}

	rc = (root != P_INVALID) ? mdb_env_cwalk(&my, root) : MDB_SUCCESS;
	if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle])
		rc = mdb_env_cthr_toggle(&my);
	/* An empty slot tells the writer thread to finish */
	my.mc_wlen[my.mc_toggle] = 0;
	pthread_mutex_unlock(&my.mc_mutex[my.mc_toggle]);
	pthread_join(thr, NULL);
	/* A write failure after the walk finished must still fail the copy */
	if (rc == MDB_SUCCESS)
		rc = my.mc_status;

leave:
	mdb_txn_abort(txn);
cleanup:
	pthread_mutex_destroy(&my.mc_mutex[0]);
	pthread_mutex_destroy(&my.mc_mutex[1]);
	free(my.mc_free);
	return rc;
}
int
mdb_env_copyfd(MDB_env *env, HANDLE fd)
{
MDB_txn *txn = NULL;
int rc;
size_t wsize;
char *ptr;
#ifdef _WIN32
DWORD len, w2;
#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
ssize_t len;
size_t w2;
#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
#endif
/* Do the lock/unlock of the reader mutex before starting the
* write txn. Otherwise other read txns could block writers.
*/
rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
if (rc)
return rc;
if (env->me_txns) {
/* We must start the actual read txn after blocking writers */
mdb_txn_reset0(txn, "reset-stage1");
/* Temporarily block writers until we snapshot the meta pages */
LOCK_MUTEX_W(env);
rc = mdb_txn_renew0(txn);
if (rc) {
UNLOCK_MUTEX_W(env);
goto leave;
}
}
wsize = env->me_psize * 2;
ptr = env->me_map;
w2 = wsize;
while (w2 > 0) {
DO_WRITE(rc, fd, ptr, w2, len);
if (!rc) {
rc = ErrCode();
break;
} else if (len > 0) {
rc = MDB_SUCCESS;
ptr += len;
w2 -= len;