Commit 30736a0f authored by Howard Chu's avatar Howard Chu
Browse files

Ugly-as-sin initial Windows port

parent 2510bea7
......@@ -35,12 +35,16 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/uio.h>
#include <sys/mman.h>
#ifdef HAVE_SYS_FILE_H
#include <sys/file.h>
#endif
#include <fcntl.h>
#endif
#include <assert.h>
#include <errno.h>
......@@ -51,8 +55,42 @@
#include <string.h>
#include <time.h>
#include <unistd.h>
#ifdef _WIN32
#define pthread_t DWORD
#define pthread_mutex_t HANDLE
#define pthread_key_t DWORD
#define pthread_self() GetCurrentThreadId()
#define pthread_key_create(x,y) *(x) = TlsAlloc()
#define pthread_key_delete(x) TlsFree(x)
#define pthread_getspecific(x) TlsGetValue(x)
#define pthread_setspecific(x,y) TlsSetValue(x,y)
#define pthread_mutex_unlock(x) ReleaseMutex(x)
#define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE)
#define LOCK_MUTEX_R(env) pthread_mutex_lock(env->me_rmutex)
#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(env->me_rmutex)
#define LOCK_MUTEX_W(env) pthread_mutex_lock(env->me_wmutex)
#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(env->me_wmutex)
#define getpid() GetCurrentProcessId()
#else
#include <pthread.h>
#define LOCK_MUTEX_R(env) pthread_mutex_lock(&env->me_txns->mti_mutex)
#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&env->me_txns->mti_mutex)
#define LOCK_MUTEX_W(env) pthread_mutex_lock(&env->me_txns->mti_wmutex)
#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&env->me_txns->mti_wmutex)
#endif
#ifdef _WIN32
#define fdatasync(fd) !FlushFileBuffers(fd)
#define ErrCode() GetLastError()
#define GetPageSize(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
#define close(fd) CloseHandle(fd)
#define munmap(ptr,len) UnmapViewOfFile(ptr)
#else
#define ErrCode() errno
#define HANDLE int
#define INVALID_HANDLE_VALUE -1
#define GetPageSize(x) (x) = sysconf(_SC_PAGE_SIZE)
#endif
#include "mdb.h"
#define ULONG unsigned long
......@@ -156,7 +194,16 @@ typedef struct MDB_reader {
typedef struct MDB_txbody {
uint32_t mtb_magic;
uint32_t mtb_version;
/* For POSIX the actual mutexes reside in shared memory.
* On Windows, mutexes are allocated by the kernel; we store
* the name in shared memory so that other processes can
* grab them.
*/
#ifdef _WIN32
char mtb_rmname[32];
#else
pthread_mutex_t mtb_mutex;
#endif
ULONG mtb_txnid;
uint32_t mtb_numreaders;
uint32_t mtb_me_toggle;
......@@ -168,14 +215,20 @@ typedef struct MDB_txninfo {
#define mti_magic mt1.mtb.mtb_magic
#define mti_version mt1.mtb.mtb_version
#define mti_mutex mt1.mtb.mtb_mutex
#define mti_rmname mt1.mtb.mtb_rmname
#define mti_txnid mt1.mtb.mtb_txnid
#define mti_numreaders mt1.mtb.mtb_numreaders
#define mti_me_toggle mt1.mtb.mtb_me_toggle
char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
} mt1;
union {
#ifdef _WIN32
char mt2_wmname[32];
#define mti_wmname mt2.mt2_wmname
#else
pthread_mutex_t mt2_wmutex;
#define mti_wmutex mt2.mt2_wmutex
#endif
char pad[(sizeof(pthread_mutex_t)+CACHELINE-1) & ~(CACHELINE-1)];
} mt2;
MDB_reader mti_readers[1];
......@@ -351,9 +404,9 @@ typedef struct MDB_xcursor {
} MDB_xcursor;
struct MDB_env {
int me_fd;
int me_lfd;
int me_mfd; /* just for writing the meta pages */
HANDLE me_fd;
HANDLE me_lfd;
HANDLE me_mfd; /* just for writing the meta pages */
#define MDB_FATAL_ERROR 0x80000000U
uint32_t me_flags;
uint32_t me_extrapad; /* unused for now */
......@@ -378,6 +431,10 @@ struct MDB_env {
pgno_t me_free_pgs[MDB_IDL_UM_SIZE];
MIDL2 me_dirty_list[MDB_IDL_DB_SIZE];
LAZY_RWLOCK_DEF(me_dblock);
#ifdef _WIN32
HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */
HANDLE me_wmutex;
#endif
};
#define NODESIZE offsetof(MDB_node, mn_data)
......@@ -464,6 +521,12 @@ static int memncmp(const void *s1, size_t n1,
static int memnrcmp(const void *s1, size_t n1,
const void *s2, size_t n2);
#ifdef _WIN32
static SECURITY_DESCRIPTOR mdb_null_sd;
static SECURITY_ATTRIBUTES mdb_all_sa;
static int mdb_sec_inited;
#endif
static int
memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
{
......@@ -506,7 +569,7 @@ mdb_version(int *maj, int *min, int *pat)
return MDB_VERSION_STRING;
}
static char *const errstr[] = {
static char *const mdb_errstr[] = {
"MDB_KEYEXIST: Key/data pair already exists",
"MDB_NOTFOUND: No matching key/data pair found",
"MDB_PAGE_NOTFOUND: Requested page not found",
......@@ -522,7 +585,7 @@ mdb_strerror(int err)
return ("Successful return: 0");
if (err >= MDB_KEYEXIST && err <= MDB_VERSION_MISMATCH)
return errstr[err - MDB_KEYEXIST];
return mdb_errstr[err - MDB_KEYEXIST];
return strerror(err);
}
......@@ -731,7 +794,7 @@ mdb_env_sync(MDB_env *env, int force)
int rc = 0;
if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
if (fdatasync(env->me_fd))
rc = errno;
rc = ErrCode();
}
return rc;
}
......@@ -751,19 +814,19 @@ mdb_txn_renew0(MDB_txn *txn)
pid_t pid = getpid();
pthread_t tid = pthread_self();
pthread_mutex_lock(&env->me_txns->mti_mutex);
LOCK_MUTEX_R(env);
for (i=0; i<env->me_txns->mti_numreaders; i++)
if (env->me_txns->mti_readers[i].mr_pid == 0)
break;
if (i == env->me_maxreaders) {
pthread_mutex_unlock(&env->me_txns->mti_mutex);
UNLOCK_MUTEX_R(env);
return ENOMEM;
}
env->me_txns->mti_readers[i].mr_pid = pid;
env->me_txns->mti_readers[i].mr_tid = tid;
if (i >= env->me_txns->mti_numreaders)
env->me_txns->mti_numreaders = i+1;
pthread_mutex_unlock(&env->me_txns->mti_mutex);
UNLOCK_MUTEX_R(env);
r = &env->me_txns->mti_readers[i];
pthread_setspecific(env->me_txkey, r);
}
......@@ -772,7 +835,7 @@ mdb_txn_renew0(MDB_txn *txn)
r->mr_txnid = txn->mt_txnid;
txn->mt_u.reader = r;
} else {
pthread_mutex_lock(&env->me_txns->mti_wmutex);
LOCK_MUTEX_W(env);
txn->mt_txnid = env->me_txns->mti_txnid+1;
txn->mt_toggle = env->me_txns->mti_me_toggle;
......@@ -830,7 +893,7 @@ mdb_txn_begin(MDB_env *env, unsigned int flags, MDB_txn **ret)
return MDB_PANIC;
}
if ((txn = calloc(1, sizeof(MDB_txn) + env->me_maxdbs * sizeof(MDB_db))) == NULL) {
DPRINTF("calloc: %s", strerror(errno));
DPRINTF("calloc: %s", strerror(ErrCode()));
return ENOMEM;
}
txn->mt_dbs = (MDB_db *)(txn+1);
......@@ -884,7 +947,7 @@ mdb_txn_reset0(MDB_txn *txn)
env->me_txn = NULL;
for (i=2; i<env->me_numdbs; i++)
env->me_dbxs[i].md_dirty = 0;
pthread_mutex_unlock(&env->me_txns->mti_wmutex);
UNLOCK_MUTEX_W(env);
}
}
......@@ -925,7 +988,6 @@ mdb_txn_commit(MDB_txn *txn)
MDB_dpage *dp;
MDB_env *env;
pgno_t next;
struct iovec iov[MDB_COMMIT_PAGES];
assert(txn != NULL);
assert(txn->mt_env != NULL);
......@@ -1041,6 +1103,36 @@ mdb_txn_commit(MDB_txn *txn)
next = 0;
i = 1;
do {
#ifdef _WIN32
/* Windows actually supports scatter/gather I/O, but only on
* unbuffered file handles. Since we're relying on the OS page
* cache for all our data, that's self-defeating. So we just
* write pages one at a time. We use the ov structure to set
* the write offset, to at least save the overhead of a Seek
* system call.
*/
OVERLAPPED ov;
memset(&ov, 0, sizeof(ov));
for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
dp = txn->mt_u.dirty_list[i].mptr;
DPRINTF("committing page %lu", dp->p.mp_pgno);
size = dp->p.mp_pgno * env->me_psize;
ov.Offset = size & 0xffffffff;
ov.OffsetHigh = size >> 16;
ov.OffsetHigh >>= 16;
/* clear dirty flag */
dp->p.mp_flags &= ~P_DIRTY;
rc = WriteFile(env->me_fd, &dp->p, env->me_psize * dp->h.md_num, NULL, &ov);
if (!rc) {
n = ErrCode();
DPRINTF("WriteFile: %d", n);
mdb_txn_abort(txn);
return n;
}
}
done = 1;;
#else
struct iovec iov[MDB_COMMIT_PAGES];
n = 0;
done = 1;
size = 0;
......@@ -1051,11 +1143,11 @@ mdb_txn_commit(MDB_txn *txn)
DPRINTF("committing %u dirty pages", n);
rc = writev(env->me_fd, iov, n);
if (rc != size) {
n = errno;
n = ErrCode();
if (rc > 0)
DPUTS("short write, filesystem full?");
else
DPRINTF("writev: %s", strerror(errno));
DPRINTF("writev: %s", strerror(n));
mdb_txn_abort(txn);
return n;
}
......@@ -1085,15 +1177,15 @@ mdb_txn_commit(MDB_txn *txn)
DPRINTF("committing %u dirty pages", n);
rc = writev(env->me_fd, iov, n);
if (rc != size) {
n = errno;
n = ErrCode();
if (rc > 0)
DPUTS("short write, filesystem full?");
else
DPRINTF("writev: %s", strerror(errno));
DPRINTF("writev: %s", strerror(n));
mdb_txn_abort(txn);
return n;
}
#endif
} while (!done);
/* Drop the dirty pages.
......@@ -1141,7 +1233,7 @@ done:
LAZY_RWLOCK_UNLOCK(&env->me_dblock);
}
pthread_mutex_unlock(&env->me_txns->mti_wmutex);
UNLOCK_MUTEX_W(env);
free(txn);
return MDB_SUCCESS;
......@@ -1153,20 +1245,25 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta)
char page[PAGESIZE];
MDB_page *p;
MDB_meta *m;
int rc;
assert(env != NULL);
int rc, err;
/* We don't know the page size yet, so use a minimum value.
*/
if ((rc = pread(env->me_fd, page, PAGESIZE, 0)) == 0) {
#ifdef _WIN32
if (!ReadFile(env->me_fd, page, PAGESIZE, (DWORD *)&rc, NULL) || rc == 0)
#else
if ((rc = read(env->me_fd, page, PAGESIZE)) == 0)
#endif
{
return ENOENT;
} else if (rc != PAGESIZE) {
}
else if (rc != PAGESIZE) {
err = ErrCode();
if (rc > 0)
errno = EINVAL;
DPRINTF("read: %s", strerror(errno));
return errno;
err = EINVAL;
DPRINTF("read: %s", strerror(err));
return err;
}
p = (MDB_page *)page;
......@@ -1201,7 +1298,8 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
unsigned int psize;
DPUTS("writing new meta page");
psize = sysconf(_SC_PAGE_SIZE);
GetPageSize(psize);
meta->mm_magic = MDB_MAGIC;
meta->mm_version = MDB_VERSION;
......@@ -1227,9 +1325,18 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
m = METADATA(q);
memcpy(m, meta, sizeof(*meta));
#ifdef _WIN32
{
DWORD len;
rc = WriteFile(env->me_fd, p, psize * 2, &len, NULL);
rc = (len == psize * 2) ? MDB_SUCCESS : ErrCode();
}
#else
rc = write(env->me_fd, p, psize * 2);
rc = (rc == (int)psize * 2) ? MDB_SUCCESS : ErrCode();
#endif
free(p);
return (rc == (int)psize * 2) ? MDB_SUCCESS : errno;
return rc;
}
static int
......@@ -1240,6 +1347,9 @@ mdb_env_write_meta(MDB_txn *txn)
off_t off;
int rc, len, toggle;
char *ptr;
#ifdef _WIN32
OVERLAPPED ov;
#endif
assert(txn != NULL);
assert(txn->mt_env != NULL);
......@@ -1268,10 +1378,18 @@ mdb_env_write_meta(MDB_txn *txn)
off += PAGEHDRSZ;
/* Write to the SYNC fd */
#ifdef _WIN32
{
memset(&ov, 0, sizeof(ov));
ov.Offset = off;
WriteFile(env->me_mfd, ptr, len, (DWORD *)&rc, &ov);
}
#else
rc = pwrite(env->me_mfd, ptr, len, off);
#endif
if (rc != len) {
int r2;
rc = errno;
rc = ErrCode();
DPUTS("write failed, disk error?");
/* On a failure, the pagecache still contains the new data.
* Write some old data back, to prevent it from being used.
......@@ -1279,7 +1397,11 @@ mdb_env_write_meta(MDB_txn *txn)
*/
meta.mm_last_pg = metab.mm_last_pg;
meta.mm_txnid = metab.mm_txnid;
#ifdef _WIN32
WriteFile(env->me_fd, ptr, len, NULL, &ov);
#else
r2 = pwrite(env->me_fd, ptr, len, off);
#endif
env->me_flags |= MDB_FATAL_ERROR;
return rc;
}
......@@ -1323,9 +1445,9 @@ mdb_env_create(MDB_env **env)
e->me_maxreaders = DEFAULT_READERS;
e->me_maxdbs = 2;
e->me_fd = -1;
e->me_lfd = -1;
e->me_mfd = -1;
e->me_fd = INVALID_HANDLE_VALUE;
e->me_lfd = INVALID_HANDLE_VALUE;
e->me_mfd = INVALID_HANDLE_VALUE;
*env = e;
return MDB_SUCCESS;
}
......@@ -1388,13 +1510,41 @@ mdb_env_open2(MDB_env *env, unsigned int flags)
env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize;
}
#ifdef _WIN32
{
HANDLE mh;
LONG sizelo, sizehi;
sizelo = env->me_mapsize & 0xffffffff;
sizehi = env->me_mapsize >> 16; /* pointless on WIN32, only needed on W64 */
sizehi >>= 16;
/* Windows won't create mappings for zero length files.
* Just allocate the maxsize right now.
*/
if (newenv) {
SetFilePointer(env->me_fd, sizelo, sizehi ? &sizehi : NULL, 0);
if (!SetEndOfFile(env->me_fd))
return ErrCode();
SetFilePointer(env->me_fd, 0, NULL, 0);
}
mh = CreateFileMapping(env->me_fd, NULL, PAGE_READONLY,
sizehi, sizelo, NULL);
if (!mh)
return ErrCode();
env->me_map = MapViewOfFileEx(mh, FILE_MAP_READ, 0, 0, env->me_mapsize,
meta.mm_address);
CloseHandle(mh);
if (!env->me_map)
return ErrCode();
}
#else
i = MAP_SHARED;
if (meta.mm_address && (flags & MDB_FIXEDMAP))
i |= MAP_FIXED;
env->me_map = mmap(meta.mm_address, env->me_mapsize, PROT_READ, i,
env->me_fd, 0);
if (env->me_map == MAP_FAILED)
return errno;
return ErrCode();
#endif
if (newenv) {
meta.mm_mapsize = env->me_mapsize;
......@@ -1429,6 +1579,8 @@ mdb_env_open2(MDB_env *env, unsigned int flags)
return MDB_SUCCESS;
}
#ifndef _WIN32
/* Windows doesn't support destructor callbacks for thread-specific storage */
static void
mdb_env_reader_dest(void *ptr)
{
......@@ -1438,12 +1590,12 @@ mdb_env_reader_dest(void *ptr)
reader->mr_pid = 0;
reader->mr_tid = 0;
}
#endif
/* downgrade the exclusive lock on the region back to shared */
static void
mdb_env_share_locks(MDB_env *env)
{
struct flock lock_info;
int toggle = 0;
if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid)
......@@ -1451,12 +1603,28 @@ mdb_env_share_locks(MDB_env *env)
env->me_txns->mti_me_toggle = toggle;
env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid;
memset((void *)&lock_info, 0, sizeof(lock_info));
lock_info.l_type = F_RDLCK;
lock_info.l_whence = SEEK_SET;
lock_info.l_start = 0;
lock_info.l_len = 1;
fcntl(env->me_lfd, F_SETLK, &lock_info);
#ifdef _WIN32
{
OVERLAPPED ov;
/* First acquire a shared lock. The Unlock will
* then release the existing exclusive lock.
*/
memset(&ov, 0, sizeof(ov));
LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov);
UnlockFile(env->me_lfd, 0, 0, 1, 0);
}
#else
{
struct flock lock_info;
/* The shared lock replaces the existing lock */
memset((void *)&lock_info, 0, sizeof(lock_info));
lock_info.l_type = F_RDLCK;
lock_info.l_whence = SEEK_SET;
lock_info.l_start = 0;
lock_info.l_len = 1;
fcntl(env->me_lfd, F_SETLK, &lock_info);
}
#endif
}
static int
......@@ -1464,52 +1632,138 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
{
int rc;
off_t size, rsize;
struct flock lock_info;
*excl = 0;
#ifdef _WIN32
if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) {
rc = ErrCode();
return rc;
}
/* Try to get exclusive lock. If we succeed, then
* nobody is using the lock region and we should initialize it.
*/
{
if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
*excl = 1;
} else {
OVERLAPPED ov;
memset(&ov, 0, sizeof(ov));
if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
rc = ErrCode();
goto fail;
}
}
}
size = GetFileSize(env->me_lfd, NULL);
#else
if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) {
rc = errno;
rc = ErrCode();
return rc;
}
/* Try to get exclusive lock. If we succeed, then
* nobody is using the lock region and we should initialize it.
*/
memset((void *)&lock_info, 0, sizeof(lock_info));
lock_info.l_type = F_WRLCK;
lock_info.l_whence = SEEK_SET;
lock_info.l_start = 0;
lock_info.l_len = 1;
rc = fcntl(env->me_lfd, F_SETLK, &lock_info);
if (rc == 0) {
*excl = 1;
} else {
lock_info.l_type = F_RDLCK;
{
struct flock lock_info;
memset((void *)&lock_info, 0, sizeof(lock_info));
lock_info.l_type = F_WRLCK;
lock_info.l_whence = SEEK_SET;
lock_info.l_start = 0;
lock_info.l_len = 1;
rc = fcntl(env->me_lfd, F_SETLK, &lock_info);
if (rc) {
rc = errno;
goto fail;
if (rc == 0) {
*excl = 1;
} else {
lock_info.l_type = F_RDLCK;
rc = fcntl(env->me_lfd, F_SETLKW, &lock_info);
if (rc) {
rc = ErrCode();
goto fail;
}
}
}
size = lseek(env->me_lfd, 0, SEEK_END);
#endif
rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
if (size < rsize && *excl) {
#ifdef _WIN32
SetFilePointer(env->me_lfd, rsize, NULL, 0);
if (!SetEndOfFile(env->me_lfd)) {
rc = ErrCode();
goto fail;
}
#else
if (ftruncate(env->me_lfd, rsize) != 0) {
rc = errno;
rc = ErrCode();
goto fail;
}
#endif
} else {
rsize = size;
size = rsize - sizeof(MDB_txninfo);
env->me_maxreaders = size/sizeof(MDB_reader) + 1;
}
#ifdef _WIN32
{
HANDLE mh;
mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
0, 0, NULL);
if (!mh) {
rc = ErrCode();
goto fail;
}
env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
CloseHandle(mh);
if (!env->me_txns) {
rc = ErrCode();
goto fail;
}
}
#else
env->me_txns = mmap(0, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
env->me_lfd, 0);
if (env->me_txns == MAP_FAILED) {
rc = errno;
rc = ErrCode();
goto fail;
}
#endif
if (*excl) {