diff options
Diffstat (limited to 'plugins/Dbx_mdb')
-rw-r--r-- | plugins/Dbx_mdb/src/lmdb/lmdb.h | 97 | ||||
-rw-r--r-- | plugins/Dbx_mdb/src/lmdb/mdb.c | 1700 | ||||
-rw-r--r-- | plugins/Dbx_mdb/src/lmdb/midl.c | 8 | ||||
-rw-r--r-- | plugins/Dbx_mdb/src/lmdb/midl.h | 8 |
4 files changed, 702 insertions, 1111 deletions
diff --git a/plugins/Dbx_mdb/src/lmdb/lmdb.h b/plugins/Dbx_mdb/src/lmdb/lmdb.h index c4d05d1791..f318ad416d 100644 --- a/plugins/Dbx_mdb/src/lmdb/lmdb.h +++ b/plugins/Dbx_mdb/src/lmdb/lmdb.h @@ -40,9 +40,6 @@ * corrupt the database. Of course if your application code is known to * be bug-free (...) then this is not an issue. * - * If this is your first time using a transactional embedded key/value - * store, you may find the \ref starting page to be helpful. - * * @section caveats_sec Caveats * Troubleshooting the lock file, plus semaphores on BSD systems: * @@ -53,14 +50,14 @@ * * Fix: Check for stale readers periodically, using the * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. - * Stale writers will be cleared automatically on some systems: + * Stale writers will be cleared automatically on most systems: * - Windows - automatic + * - BSD, systems using SysV semaphores - automatic * - Linux, systems using POSIX mutexes with Robust option - automatic - * - not on BSD, systems using POSIX semaphores. * Otherwise just make all programs using the database close it; * the lockfile is always reset on first open of the environment. * - * - On BSD systems or others configured with MDB_USE_POSIX_SEM, + * - On BSD systems or others configured with MDB_USE_SYSV_SEM, * startup can fail due to semaphores owned by another userid. * * Fix: Open and close the database as the user which owns the @@ -112,9 +109,7 @@ * The transaction becomes "long-lived" as above until a check * for stale readers is performed or the lockfile is reset, * since the process may not remove it from the lockfile. - * - * This does not apply to write transactions if the system clears - * stale writers, see above. + * Except write-transactions on Unix with MDB_ROBUST or on Windows. * * - If you do that anyway, do a periodic check for stale readers. Or * close the environment once in a while, so the lockfile can get reset. @@ -129,7 +124,7 @@ * * @author Howard Chu, Symas Corporation. * - * @copyright Copyright 2011-2016 Howard Chu, Symas Corp. All rights reserved. + * @copyright Copyright 2011-2014 Howard Chu, Symas Corp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -194,7 +189,7 @@ typedef int mdb_filehandle_t; /** Library minor version */ #define MDB_VERSION_MINOR 9 /** Library patch version */ -#define MDB_VERSION_PATCH 18 +#define MDB_VERSION_PATCH 14 /** Combine args a,b,c into a single integer for easy version comparisons */ #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) @@ -204,7 +199,7 @@ typedef int mdb_filehandle_t; MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) /** The release date of this library version */ -#define MDB_VERSION_DATE "February 5, 2016" +#define MDB_VERSION_DATE "September 20, 2014" /** A stringifier for the version info */ #define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" @@ -306,12 +301,12 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_REVERSEKEY 0x02 /** use sorted duplicates */ #define MDB_DUPSORT 0x04 - /** numeric keys in native byte order: either unsigned int or size_t. + /** numeric keys in native byte order. * The keys must all be of the same size. */ #define MDB_INTEGERKEY 0x08 /** with #MDB_DUPSORT, sorted dup items have fixed size */ #define MDB_DUPFIXED 0x10 - /** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ + /** with #MDB_DUPSORT, dups are numeric in native byte order */ #define MDB_INTEGERDUP 0x20 /** with #MDB_DUPSORT, use reverse string dups */ #define MDB_REVERSEDUP 0x40 @@ -423,18 +418,11 @@ typedef enum MDB_cursor_op { #define MDB_PAGE_FULL (-30786) /** Database contents grew beyond environment mapsize */ #define MDB_MAP_RESIZED (-30785) - /** Operation and DB incompatible, or DB type changed. This can mean: - * <ul> - * <li>The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database. - * <li>Opening a named DB when the unnamed DB has #MDB_DUPSORT / #MDB_INTEGERKEY. - * <li>Accessing a data record as a database, or vice versa. - * <li>The database was dropped and recreated with different flags. - * </ul> - */ + /** MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed */ #define MDB_INCOMPATIBLE (-30784) /** Invalid reuse of reader locktable slot */ #define MDB_BAD_RSLOT (-30783) - /** Transaction must abort, has a child, or is invalid */ + /** Transaction cannot recover - it must be aborted */ #define MDB_BAD_TXN (-30782) /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ #define MDB_BAD_VALSIZE (-30781) @@ -460,7 +448,7 @@ typedef struct MDB_envinfo { void *me_mapaddr; /**< Address of map, if fixed */ size_t me_mapsize; /**< Size of the data memory map */ size_t me_last_pgno; /**< ID of the last used page */ - size_t me_last_txnid; /**< ID of the last committed transaction */ + size_t me_last_txnid; /**< ID of the last committed transaction */ unsigned int me_maxreaders; /**< max reader slots in the environment */ unsigned int me_numreaders; /**< max reader slots used in the environment */ } MDB_envinfo; @@ -529,14 +517,12 @@ int mdb_env_create(MDB_env **env); * allowed. LMDB will still modify the lock file - except on read-only * filesystems, where LMDB does not use locks. * <li>#MDB_WRITEMAP - * Use a writeable memory map unless MDB_RDONLY is set. This uses - * fewer mallocs but loses protection from application bugs + * Use a writeable memory map unless MDB_RDONLY is set. This is faster + * and uses fewer mallocs, but loses protection from application bugs * like wild pointer writes and other bad updates into the database. - * This may be slightly faster for DBs that fit entirely in RAM, but - * is slower for DBs larger than RAM. * Incompatible with nested transactions. - * Do not mix processes with and without MDB_WRITEMAP on the same - * environment. This can defeat durability (#mdb_env_sync etc). + * Processes with and without MDB_WRITEMAP on the same environment do + * not cooperate well. * <li>#MDB_NOMETASYNC * Flush system buffers to disk only once per transaction, omit the * metadata flush. Defer that until the system flushes files to disk, @@ -607,8 +593,8 @@ int mdb_env_create(MDB_env **env); * reserved in that case. * This flag may be changed at any time using #mdb_env_set_flags(). * </ul> - * @param[in] mode The UNIX permissions to set on created files and semaphores. - * This parameter is ignored on Windows. + * @param[in] mode The UNIX permissions to set on created files. This parameter + * is ignored on Windows. * @return A non-zero error value on failure and 0 on success. Some possible * errors are: * <ul> @@ -717,8 +703,7 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat); * Data is always written to disk when #mdb_txn_commit() is called, * but the operating system may keep it buffered. LMDB always flushes * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is - * not valid if the environment was opened with #MDB_RDONLY. + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. * @param[in] env An environment handle returned by #mdb_env_create() * @param[in] force If non-zero, force a synchronous flush. Otherwise * if the environment has the #MDB_NOSYNC flag set the flushes @@ -726,7 +711,6 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat); * @return A non-zero error value on failure and 0 on success. Some possible * errors are: * <ul> - * <li>EACCES - the environment is read-only. * <li>EINVAL - an invalid parameter was specified. * <li>EIO - an error occurred during synchronization. * </ul> @@ -748,6 +732,7 @@ void mdb_env_close(MDB_env *env); * This may be used to set some flags in addition to those from * #mdb_env_open(), or to unset these flags. If several threads * change the flags at the same time, the result is undefined. + * Most flags cannot be changed after #mdb_env_open(). * @param[in] env An environment handle returned by #mdb_env_create() * @param[in] flags The flags to change, bitwise OR'ed together * @param[in] onoff A non-zero value sets the flags, zero clears them. @@ -1051,22 +1036,19 @@ int mdb_txn_renew(MDB_txn *txn); * The database handle may be discarded by calling #mdb_dbi_close(). * The old database handle is returned if the database was already open. * The handle may only be closed once. - * * The database handle will be private to the current transaction until * the transaction is successfully committed. If the transaction is * aborted the handle will be closed automatically. - * After a successful commit the handle will reside in the shared - * environment, and may be used by other transactions. - * - * This function must not be called from multiple concurrent - * transactions in the same process. A transaction that uses - * this function must finish (either commit or abort) before + * After a successful commit the + * handle will reside in the shared environment, and may be used + * by other transactions. This function must not be called from + * multiple concurrent transactions in the same process. A transaction + * that uses this function must finish (either commit or abort) before * any other transaction in the process may use this function. * * To use named databases (with name != NULL), #mdb_env_set_maxdbs() - * must be called before opening the environment. Database names are - * keys in the unnamed database, and may be read but not written. - * + * must be called before opening the environment. Database names + * are kept as keys in the unnamed database. * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] name The name of the database to open. If only a single * database is needed in the environment, this value may be NULL. @@ -1083,9 +1065,9 @@ int mdb_txn_renew(MDB_txn *txn); * keys may have multiple data items, stored in sorted order.) By default * keys must be unique and may have only a single data item. * <li>#MDB_INTEGERKEY - * Keys are binary integers in native byte order, either unsigned int - * or size_t, and will be sorted as such. - * The keys must all be of the same size. + * Keys are binary integers in native byte order. Setting this option + * requires all keys to be the same size, typically sizeof(int) + * or sizeof(size_t). * <li>#MDB_DUPFIXED * This flag may only be used in combination with #MDB_DUPSORT. This option * tells the library that the data items for this database are all the same @@ -1093,8 +1075,8 @@ int mdb_txn_renew(MDB_txn *txn); * all data items are the same size, the #MDB_GET_MULTIPLE and #MDB_NEXT_MULTIPLE * cursor operations may be used to retrieve multiple items at once. * <li>#MDB_INTEGERDUP - * This option specifies that duplicate data items are binary integers, - * similar to #MDB_INTEGERKEY keys. + * This option specifies that duplicate data items are also integers, and + * should be sorted as such. * <li>#MDB_REVERSEDUP * This option specifies that duplicate data items should be compared as * strings in reverse order. @@ -1303,8 +1285,7 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); * the next update operation or the transaction ends. This saves * an extra memcpy if the data is being generated later. * LMDB does nothing else with this memory, the caller is expected - * to modify all of the space requested. This flag must not be - * specified if the database was opened with #MDB_DUPSORT. + * to modify all of the space requested. * <li>#MDB_APPEND - append the given key/data pair to the end of the * database. This option allows fast bulk loading when keys are * already known to be in the correct order. Loading unsorted keys @@ -1460,15 +1441,13 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * the database supports duplicates (#MDB_DUPSORT). * <li>#MDB_RESERVE - reserve space for data of the given size, but * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. This flag - * must not be specified if the database was opened with #MDB_DUPSORT. + * reserved space, which the caller can fill in later. This saves + * an extra memcpy if the data is being generated later. * <li>#MDB_APPEND - append the given key/data pair to the end of the * database. No key comparisons are performed. This option allows * fast bulk loading when keys are already known to be in the * correct order. Loading unsorted keys with this flag will cause - * a #MDB_KEYEXIST error. + * data corruption. * <li>#MDB_APPENDDUP - as above, but for sorted dup data. * <li>#MDB_MULTIPLE - store multiple contiguous data elements in a * single request. This flag may only be specified if the database @@ -1486,7 +1465,7 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * <ul> * <li>#MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). * <li>#MDB_TXN_FULL - the transaction has too many dirty pages. - * <li>EACCES - an attempt was made to write in a read-only transaction. + * <li>EACCES - an attempt was made to modify a read-only database. * <li>EINVAL - an invalid parameter was specified. * </ul> */ @@ -1506,7 +1485,7 @@ int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * @return A non-zero error value on failure and 0 on success. Some possible * errors are: * <ul> - * <li>EACCES - an attempt was made to write in a read-only transaction. + * <li>EACCES - an attempt was made to modify a read-only database. * <li>EINVAL - an invalid parameter was specified. * </ul> */ diff --git a/plugins/Dbx_mdb/src/lmdb/mdb.c b/plugins/Dbx_mdb/src/lmdb/mdb.c index d2e81e4e9c..2b0bad3dd1 100644 --- a/plugins/Dbx_mdb/src/lmdb/mdb.c +++ b/plugins/Dbx_mdb/src/lmdb/mdb.c @@ -5,7 +5,7 @@ * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2016 Howard Chu, Symas Corp. + * Copyright 2011-2014 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -42,6 +42,12 @@ * as int64 which is wrong. MSVC doesn't define it at all, so just * don't use it. */ +#ifdef _WIN64 +typedef signed __int64 ssize_t; +#else +typedef signed int ssize_t; +#endif + #define MDB_PID_T int #define MDB_THR_T DWORD #include <sys/types.h> @@ -85,25 +91,17 @@ extern int cacheflush(char *addr, int nbytes, int cache); * define MDB_FDATASYNC_WORKS if this code will only be run * on kernels 3.6 and newer. */ -#define BROKEN_FDATASYNC +//#define BROKEN_FDATASYNC #endif #include <errno.h> #include <limits.h> #include <stddef.h> -#include <inttypes.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> -#ifdef _MSC_VER -#include <io.h> -typedef SSIZE_T ssize_t; -#else -#include <unistd.h> -#endif - #if defined(__sun) || defined(ANDROID) /* Most platforms have posix_memalign, older may only have memalign */ #define HAVE_MEMALIGN 1 @@ -116,7 +114,7 @@ typedef SSIZE_T ssize_t; #endif #if defined(__APPLE__) || defined (BSD) -# define MDB_USE_POSIX_SEM 1 +# define MDB_USE_SYSV_SEM 1 # define MDB_FDATASYNC fsync #elif defined(ANDROID) # define MDB_FDATASYNC fsync @@ -124,18 +122,18 @@ typedef SSIZE_T ssize_t; #ifndef _WIN32 #include <pthread.h> -#ifdef MDB_USE_POSIX_SEM -# define MDB_USE_HASH 1 -#include <semaphore.h> -#else -#define MDB_USE_POSIX_MUTEX 1 -#endif -#endif - -#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \ - + defined(MDB_USE_POSIX_MUTEX) != 1 -# error "Ambiguous shared-lock implementation" -#endif +#ifdef MDB_USE_SYSV_SEM +#include <sys/ipc.h> +#include <sys/sem.h> +#ifdef _SEM_SEMUN_UNDEFINED +union semun { + int val; + struct semid_ds *buf; + unsigned short *array; +}; +#endif /* _SEM_SEMUN_UNDEFINED */ +#endif /* MDB_USE_SYSV_SEM */ +#endif /* !_WIN32 */ #ifdef USE_VALGRIND #include <valgrind/memcheck.h> @@ -198,12 +196,6 @@ typedef SSIZE_T ssize_t; #define ESECT #endif -#ifdef _WIN32 -#define CALL_CONV WINAPI -#else -#define CALL_CONV -#endif - /** @defgroup internal LMDB Internals * @{ */ @@ -220,6 +212,10 @@ typedef SSIZE_T ssize_t; #define MDB_DEVEL 0 #endif +#if defined(_WIN32) || (defined(EOWNERDEAD) && !defined(MDB_USE_SYSV_SEM)) +#define MDB_ROBUST_SUPPORTED 1 +#endif + /** Wrapper around __func__, which is a C99 feature */ #if __STDC_VERSION__ >= 199901L # define mdb_func_ __func__ @@ -230,53 +226,14 @@ typedef SSIZE_T ssize_t; # define mdb_func_ "<mdb_unknown>" #endif -/* Internal error codes, not exposed outside liblmdb */ -#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) -#ifdef _WIN32 -#define MDB_OWNERDEAD ((int) WAIT_ABANDONED) -#elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD) -#define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ -#endif - -#ifdef __GLIBC__ -#define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__) -#endif -/** Some platforms define the EOWNERDEAD error code - * even though they don't support Robust Mutexes. - * Compile with -DMDB_USE_ROBUST=0, or use some other - * mechanism like -DMDB_USE_SYSV_SEM instead of - * -DMDB_USE_POSIX_MUTEX. (SysV semaphores are - * also Robust, but some systems don't support them - * either.) - */ -#ifndef MDB_USE_ROBUST -/* Android currently lacks Robust Mutex support. So does glibc < 2.4. */ -# if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \ - (defined(__GLIBC__) && GLIBC_VER < 0x020004)) -# define MDB_USE_ROBUST 0 -# else -# define MDB_USE_ROBUST 1 -/* glibc < 2.12 only provided _np API */ -# if defined(__GLIBC__) && GLIBC_VER < 0x02000c -# define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP -# define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) -# define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) -# endif -# endif -#endif /* MDB_USE_ROBUST */ - -#if defined(MDB_OWNERDEAD) && MDB_USE_ROBUST -#define MDB_ROBUST_SUPPORTED 1 -#endif - #ifdef _WIN32 #define MDB_USE_HASH 1 #define MDB_PIDLOCK 0 -#define THREAD_RET DWORD +#define THREAD_RET DWORD WINAPI #define pthread_t HANDLE #define pthread_mutex_t HANDLE #define pthread_cond_t HANDLE -typedef HANDLE mdb_mutex_t, mdb_mutexref_t; +typedef HANDLE mdb_mutex_t; #define pthread_key_t DWORD #define pthread_self() GetCurrentThreadId() #define pthread_key_create(x,y) \ @@ -284,15 +241,16 @@ typedef HANDLE mdb_mutex_t, mdb_mutexref_t; #define pthread_key_delete(x) TlsFree(x) #define pthread_getspecific(x) TlsGetValue(x) #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) +#define pthread_mutex_consistent(mutex) 0 #define pthread_mutex_unlock(x) ReleaseMutex(*x) #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) #define pthread_cond_signal(x) SetEvent(*x) #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) #define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL) #define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE) +#define MDB_MUTEX(env, rw) ((env)->me_##rw##mutex) #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) -#define mdb_mutex_consistent(mutex) 0 #define getpid() GetCurrentProcessId() #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) @@ -315,30 +273,38 @@ typedef HANDLE mdb_mutex_t, mdb_mutexref_t; /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ #define MDB_PIDLOCK 1 -#ifdef MDB_USE_POSIX_SEM +#ifdef MDB_USE_SYSV_SEM -typedef sem_t *mdb_mutex_t, *mdb_mutexref_t; +typedef struct mdb_mutex { + int semid; + int semnum; +} mdb_mutex_t; + +#define MDB_MUTEX(env, rw) (&(env)->me_##rw##mutex) #define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) -#define UNLOCK_MUTEX(mutex) sem_post(mutex) +#define UNLOCK_MUTEX(mutex) do { \ + struct sembuf sb = { 0, 1, SEM_UNDO }; \ + sb.sem_num = (mutex)->semnum; \ + semop((mutex)->semid, &sb, 1); \ +} while(0) static int -mdb_sem_wait(sem_t *sem) +mdb_sem_wait(mdb_mutex_t *sem) { int rc; - while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; + struct sembuf sb = { 0, -1, SEM_UNDO }; + sb.sem_num = sem->semnum; + while ((rc = semop(sem->semid, &sb, 1)) && (rc = errno) == EINTR) ; return rc; } -#else /* MDB_USE_POSIX_MUTEX: */ - /** Shared mutex/semaphore as it is stored (mdb_mutex_t), and as - * local variables keep it (mdb_mutexref_t). - * - * When #mdb_mutexref_t is a pointer declaration and #mdb_mutex_t is - * not, then it is array[size 1] so it can be assigned to a pointer. - * @{ +#else + /** Pointer/HANDLE type of shared mutex/semaphore. */ -typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t; - /* @} */ +typedef pthread_mutex_t mdb_mutex_t; + /** Mutex for the reader table (rw = r) or write transaction (rw = w). + */ +#define MDB_MUTEX(env, rw) (&(env)->me_txns->mti_##rw##mutex) /** Lock the reader or writer mutex. * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). */ @@ -346,10 +312,7 @@ typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t; /** Unlock the reader or writer mutex. */ #define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex) - /** Mark mutex-protected data as repaired, after death of previous owner. - */ -#define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex) -#endif /* MDB_USE_POSIX_SEM */ +#endif /* MDB_USE_SYSV_SEM */ /** Get the error code for the last failed system function. */ @@ -374,8 +337,10 @@ typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t; #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) #endif -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) +#if defined(_WIN32) #define MNAME_LEN 32 +#elif defined(MDB_USE_SYSV_SEM) +#define MNAME_LEN 0 #else #define MNAME_LEN (sizeof(pthread_mutex_t)) #endif @@ -389,7 +354,7 @@ typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t; #define LOCK_MUTEX(rc, env, mutex) \ (((rc) = LOCK_MUTEX0(mutex)) && \ ((rc) = mdb_mutex_failed(env, mutex, rc))) -static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); +static int mdb_mutex_failed(MDB_env *env, mdb_mutex_t *mutex, int rc); #else #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) #define mdb_mutex_failed(env, mutex, rc) (rc) @@ -403,13 +368,10 @@ static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); * * @note If O_DSYNC is undefined but exists in /usr/include, * preferably set some compiler flag to get the definition. + * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC. */ #ifndef MDB_DSYNC -# ifdef O_DSYNC # define MDB_DSYNC O_DSYNC -# else -# define MDB_DSYNC O_SYNC -# endif #endif #endif @@ -521,19 +483,14 @@ static txnid_t mdb_debug_start; /** The version number for a database's datafile format. */ #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) /** The version number for a database's lockfile format. */ -#define MDB_LOCK_VERSION 1 +#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) - /** @brief The max size of a key we can write, or 0 for computed max. - * - * This macro should normally be left alone or set to 0. - * Note that a database with big keys or dupsort data cannot be - * reliably modified by a liblmdb which uses a smaller max. - * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. + /** @brief The max size of a key we can write, or 0 for dynamic max. * - * Other values are allowed, for backwards compat. However: - * A value bigger than the computed max can break if you do not - * know what you are doing, and liblmdb <= 0.9.10 can break when - * modifying a DB with keys/dupsort data bigger than its max. + * Define this as 0 to compute the max from the page size. 511 + * is default for backwards compat: liblmdb <= 0.9.10 can break + * when modifying a DB with keys/dupsort data bigger than its max. + * #MDB_DEVEL sets the default to 0. * * Data items in an #MDB_DUPSORT database are also limited to * this size, since they're actually keys of a sub-DB. Keys and @@ -710,13 +667,15 @@ typedef struct MDB_txbody { uint32_t mtb_magic; /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ uint32_t mtb_format; -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) +#if defined(_WIN32) char mtb_rmname[MNAME_LEN]; +#elif defined(MDB_USE_SYSV_SEM) + int mtb_semid; #else /** Mutex protecting access to this table. - * This is the reader table lock used with LOCK_MUTEX(). + * This is the #MDB_MUTEX(env,r) reader table lock. */ - mdb_mutex_t mtb_rmutex; + pthread_mutex_t mtb_rmutex; #endif /** The ID of the last transaction committed to the database. * This is recorded here only for convenience; the value can always @@ -742,16 +701,20 @@ typedef struct MDB_txninfo { #define mti_numreaders mt1.mtb.mtb_numreaders char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; } mt1; +#ifdef MDB_USE_SYSV_SEM +#define mti_semid mt1.mtb.mtb_semid +#else union { -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) +#if defined(_WIN32) char mt2_wmname[MNAME_LEN]; #define mti_wmname mt2.mt2_wmname #else - mdb_mutex_t mt2_wmutex; + pthread_mutex_t mt2_wmutex; #define mti_wmutex mt2.mt2_wmutex #endif char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; } mt2; +#endif MDB_reader mti_readers[1]; } MDB_txninfo; @@ -760,6 +723,7 @@ typedef struct MDB_txninfo { ((uint32_t) \ ((MDB_LOCK_VERSION) \ /* Flags which describe functionality */ \ + + (((MNAME_LEN) == 0) << 18) /* MDB_USE_SYSV_SEM */ \ + (((MDB_PIDLOCK) != 0) << 16))) /** @} */ @@ -965,7 +929,7 @@ typedef struct MDB_db { pgno_t md_branch_pages; /**< number of internal pages */ pgno_t md_leaf_pages; /**< number of leaf pages */ pgno_t md_overflow_pages; /**< number of overflow pages */ - size_t md_entries; /**< number of data items */ + uint64_t md_entries; /**< number of data items */ pgno_t md_root; /**< the root page of this tree */ } MDB_db; @@ -979,11 +943,6 @@ typedef struct MDB_db { #define FREE_DBI 0 /** Handle for the default DB. */ #define MAIN_DBI 1 - /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ -#define CORE_DBS 2 - - /** Number of meta pages - also hardcoded elsewhere */ -#define NUM_METAS 2 /** Meta page content. * A meta page is the start point for accessing a database snapshot. @@ -996,12 +955,12 @@ typedef struct MDB_meta { /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ uint32_t mm_version; void *mm_address; /**< address for fixed mapping */ - size_t mm_mapsize; /**< size of mmap region */ - MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ + uint64_t mm_mapsize; /**< size of mmap region */ + MDB_db mm_dbs[2]; /**< first is free space, 2nd is main db */ /** The size of pages used in this DB */ -#define mm_psize mm_dbs[FREE_DBI].md_pad +#define mm_psize mm_dbs[0].md_pad /** Any persistent environment flags. @ref mdb_env */ -#define mm_flags mm_dbs[FREE_DBI].md_flags +#define mm_flags mm_dbs[0].md_flags pgno_t mm_last_pg; /**< last used page in file */ volatile txnid_t mm_txnid; /**< txnid that committed this page */ } MDB_meta; @@ -1036,8 +995,7 @@ typedef struct MDB_dbx { */ struct MDB_txn { MDB_txn *mt_parent; /**< parent of a nested txn */ - /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ - MDB_txn *mt_child; + MDB_txn *mt_child; /**< nested txn under this txn */ pgno_t mt_next_pgno; /**< next unallocated page */ /** The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction @@ -1079,15 +1037,13 @@ struct MDB_txn { #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ -#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ /** @} */ /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; /** Array of flags for each DB */ unsigned char *mt_dbflags; - /** Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. + /** Number of DB records in use. This number only ever increments; + * we don't decrement it when individual DB handles are closed. */ MDB_dbi mt_numdbs; @@ -1095,18 +1051,10 @@ struct MDB_txn { * @ingroup internal * @{ */ - /** #mdb_txn_begin() flags */ -#define MDB_TXN_BEGIN_FLAGS MDB_RDONLY -#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ - /* internal txn flags */ -#define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ -#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ +#define MDB_TXN_RDONLY 0x01 /**< read-only transaction */ #define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ -#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ - /** most operations on the txn are currently illegal */ -#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) /** @} */ unsigned int mt_flags; /**< @ref mdb_txn */ /** #dirty_list room: Array size - \#dirty pages visible to this txn. @@ -1161,6 +1109,7 @@ struct MDB_cursor { #define C_EOF 0x02 /**< No more data */ #define C_SUB 0x04 /**< Cursor is a sub-cursor */ #define C_DEL 0x08 /**< last op was a cursor_del */ +#define C_SPLITTING 0x20 /**< Cursor is in page_split */ #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ /** @} */ unsigned int mc_flags; /**< @ref mdb_cursor */ @@ -1207,19 +1156,18 @@ struct MDB_env { unsigned int me_psize; /**< DB page size, inited from me_os_psize */ unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ unsigned int me_maxreaders; /**< size of the reader table */ - /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ - volatile int me_close_readers; + unsigned int me_numreaders; /**< max numreaders set by this env */ MDB_dbi me_numdbs; /**< number of DBs opened */ MDB_dbi me_maxdbs; /**< size of the DB table */ MDB_PID_T me_pid; /**< process ID of this env */ char *me_path; /**< path to the DB files */ char *me_map; /**< the memory map of the data file */ MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ - MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ + MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ void *me_pbuf; /**< scratch area for DUPSORT put() */ MDB_txn *me_txn; /**< current write transaction */ MDB_txn *me_txn0; /**< prealloc'd write transaction */ - size_t me_mapsize; /**< size of the data memory map */ + uint64_t me_mapsize; /**< size of the data memory map */ off_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ MDB_dbx *me_dbxs; /**< array of static DB info */ @@ -1246,10 +1194,8 @@ struct MDB_env { #ifdef _WIN32 int me_pidquery; /**< Used in OpenProcess */ #endif -#ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ -# define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ -# define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */ -#else +#if defined(_WIN32) || defined(MDB_USE_SYSV_SEM) + /* Windows mutexes/SysV semaphores do not reside in shared mem */ mdb_mutex_t me_rmutex; mdb_mutex_t me_wmutex; #endif @@ -1271,11 +1217,11 @@ typedef struct MDB_ntxn { #endif /** max bytes to write in one call */ -#define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4)) +#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) /** Check \b txn and \b dbi arguments to a function */ -#define TXN_DBI_EXIST(txn, dbi, validity) \ - ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) +#define TXN_DBI_EXIST(txn, dbi) \ + ((txn) && (dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & DB_VALID)) /** Check for misused \b dbi handles */ #define TXN_DBI_CHANGED(txn, dbi) \ @@ -1285,19 +1231,6 @@ static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); static int mdb_page_touch(MDB_cursor *mc); -#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ - "reset-tmp", "fail-begin", "fail-beginchild"} -enum { - /* mdb_txn_end operation number, for logging */ - MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, - MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD -}; -#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ -#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ -#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ -#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ -static void mdb_txn_end(MDB_txn *txn, unsigned mode); - static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); static int mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); @@ -1314,9 +1247,9 @@ static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned int nflags); static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); -static MDB_meta *mdb_env_pick_meta(const MDB_env *env); +static int mdb_env_pick_meta(const MDB_env *env); static int mdb_env_write_meta(MDB_txn *txn); -#ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */ +#if !(defined(_WIN32) || defined(MDB_USE_SYSV_SEM)) /* Drop unused excl arg */ # define mdb_env_close0(env, excl) mdb_env_close1(env) #endif static void mdb_env_close0(MDB_env *env, int excl); @@ -1326,7 +1259,7 @@ static int mdb_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); static void mdb_node_del(MDB_cursor *mc, int ksize); static void mdb_node_shrink(MDB_page *mp, indx_t indx); -static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); +static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst); static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data); static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); static size_t mdb_branch_size(MDB_env *env, MDB_val *key); @@ -1350,7 +1283,6 @@ static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); static void mdb_xcursor_init0(MDB_cursor *mc); static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); -static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); static int mdb_drop0(MDB_cursor *mc, int subs); static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); @@ -1360,23 +1292,14 @@ static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; /** @endcond */ -/** Compare two items pointing at size_t's of unknown alignment. */ -#ifdef MISALIGNED_OK -# define mdb_cmp_clong mdb_cmp_long -#else -# define mdb_cmp_clong mdb_cmp_cint -#endif - #ifdef _WIN32 static SECURITY_DESCRIPTOR mdb_null_sd; static SECURITY_ATTRIBUTES mdb_all_sa; static int mdb_sec_inited; - -static int utf8_to_utf16(const char *src, int srcsize, wchar_t **dst, int *dstsize); #endif /** Return the library version info. */ -char * ESECT +char * mdb_version(int *major, int *minor, int *patch) { if (major) *major = MDB_VERSION_MAJOR; @@ -1404,7 +1327,7 @@ static char *const mdb_errstr[] = { "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", - "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", + "MDB_BAD_TXN: Transaction cannot recover - it must be aborted", "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", }; @@ -1448,7 +1371,7 @@ mdb_strerror(int err) ; } buf[0] = 0; - FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | + FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, err, 0, ptr, sizeof(buf), (va_list *)pad); return ptr; @@ -1460,7 +1383,7 @@ mdb_strerror(int err) /** assert(3) variant in cursor context */ #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) /** assert(3) variant in transaction context */ -#define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) +#define mdb_tassert(mc, expr) mdb_assert0((txn)->mt_env, expr, #expr) /** assert(3) variant in environment context */ #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) @@ -1468,7 +1391,7 @@ mdb_strerror(int err) # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) -static void ESECT +static void mdb_assert_fail(MDB_env *env, const char *expr_txt, const char *func, const char *file, int line) { @@ -1604,7 +1527,7 @@ mdb_cursor_chk(MDB_cursor *mc) MDB_node *node; MDB_page *mp; - if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; + if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return; for (i=0; i<mc->mc_top; i++) { mp = mc->mc_pg[i]; node = NODEPTR(mp, mc->mc_ki[i]); @@ -1613,13 +1536,6 @@ mdb_cursor_chk(MDB_cursor *mc) } if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) printf("ack!\n"); - if (mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && - mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { - printf("blah!\n"); - } - } } #endif @@ -1673,10 +1589,9 @@ static void mdb_audit(MDB_txn *txn) mdb_tassert(txn, rc == MDB_NOTFOUND); } } - if (freecount + count + NUM_METAS != txn->mt_next_pgno) { + if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) { fprintf(stderr, "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", - txn->mt_txnid, freecount, count+NUM_METAS, - freecount+count+NUM_METAS, txn->mt_next_pgno); + txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno); } } #endif @@ -1690,12 +1605,7 @@ mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { - MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; -#if UINT_MAX < SIZE_MAX - if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) - dcmp = mdb_cmp_clong; -#endif - return dcmp(a, b); + return txn->mt_dbxs[dbi].md_dcmp(a, b); } /** Allocate memory for a page. @@ -1945,7 +1855,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) /* Estimate how much space this op will take */ i = m0->mc_db->md_depth; /* Named DBs also dirty the main DB */ - if (m0->mc_dbi >= CORE_DBS) + if (m0->mc_dbi > MAIN_DBI) i += txn->mt_dbs[MAIN_DBI].md_depth; /* For puts, roughly factor in the key+data size */ if (key) @@ -2054,7 +1964,7 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp) MDB_ID2 mid; int rc, (*insert)(MDB_ID2L, MDB_ID2 *); - if (txn->mt_flags & MDB_TXN_WRITEMAP) { + if (txn->mt_env->me_flags & MDB_WRITEMAP) { insert = mdb_mid2l_append; } else { insert = mdb_mid2l_insert; @@ -2425,15 +2335,14 @@ done: } else { for (; m2; m2=m2->mc_next) { if (m2->mc_snum < mc->mc_snum) continue; - if (m2 == mc) continue; if (m2->mc_pg[mc->mc_top] == mp) { m2->mc_pg[mc->mc_top] = np; if ((mc->mc_db->md_flags & MDB_DUPSORT) && IS_LEAF(np) && - (m2->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { - MDB_node *leaf = NODEPTR(np, m2->mc_ki[mc->mc_top]); - if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); + if (!(leaf->mn_flags & F_SUBDATA)) m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); } } @@ -2497,15 +2406,14 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) *bk = *mc; mc->mc_backup = bk; mc->mc_db = &dst->mt_dbs[i]; - /* Kill pointers into src to reduce abuse: The - * user may not use mc until dst ends. But we need a valid - * txn pointer here for cursor fixups to keep working. + /* Kill pointers into src - and dst to reduce abuse: The + * user may not use mc until dst ends. Otherwise we'd... */ - mc->mc_txn = dst; - mc->mc_dbflag = &dst->mt_dbflags[i]; + mc->mc_txn = NULL; /* ...set this to dst */ + mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */ if ((mx = mc->mc_xcursor) != NULL) { *(MDB_xcursor *)(bk+1) = *mx; - mx->mx_cursor.mc_txn = dst; + mx->mx_cursor.mc_txn = NULL; /* ...and dst. */ } mc->mc_next = dst->mt_cursors[i]; dst->mt_cursors[i] = mc; @@ -2555,6 +2463,12 @@ mdb_cursors_close(MDB_txn *txn, unsigned merge) } } +#if !(MDB_DEBUG) +#define mdb_txn_reset0(txn, act) mdb_txn_reset0(txn) +#endif +static void +mdb_txn_reset0(MDB_txn *txn, const char *act); + #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ enum Pidlock_op { Pidset, Pidcheck @@ -2619,13 +2533,16 @@ mdb_txn_renew0(MDB_txn *txn) MDB_env *env = txn->mt_env; MDB_txninfo *ti = env->me_txns; MDB_meta *meta; - unsigned int i, nr, flags = txn->mt_flags; + unsigned int i, nr; uint16_t x; int rc, new_notls = 0; - if ((flags &= MDB_TXN_RDONLY) != 0) { + if (txn->mt_flags & MDB_TXN_RDONLY) { + /* Setup db info */ + txn->mt_numdbs = env->me_numdbs; + txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ if (!ti) { - meta = mdb_env_pick_meta(env); + meta = env->me_metas[ mdb_env_pick_meta(env) ]; txn->mt_txnid = meta->mm_txnid; txn->mt_u.reader = NULL; } else { @@ -2637,7 +2554,7 @@ mdb_txn_renew0(MDB_txn *txn) } else { MDB_PID_T pid = env->me_pid; MDB_THR_T tid = pthread_self(); - mdb_mutexref_t rmutex = env->me_rmutex; + mdb_mutex_t *rmutex = MDB_MUTEX(env, r); if (!env->me_live_reader) { rc = mdb_reader_pid(env, Pidset, pid); @@ -2657,19 +2574,13 @@ mdb_txn_renew0(MDB_txn *txn) return MDB_READERS_FULL; } r = &ti->mti_readers[i]; - /* Claim the reader slot, carefully since other code - * uses the reader table un-mutexed: First reset the - * slot, next publish it in mti_numreaders. After - * that, it is safe for mdb_env_close() to touch it. - * When it will be closed, we can finally claim it. - */ - r->mr_pid = 0; r->mr_txnid = (txnid_t)-1; r->mr_tid = tid; + r->mr_pid = pid; /* should be written last, see ITS#7971. */ if (i == nr) ti->mti_numreaders = ++nr; - env->me_close_readers = nr; - r->mr_pid = pid; + /* Save numreaders for un-mutexed mdb_env_close() */ + env->me_numreaders = nr; UNLOCK_MUTEX(rmutex); new_notls = (env->me_flags & MDB_NOTLS); @@ -2685,23 +2596,33 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_u.reader = r; meta = env->me_metas[txn->mt_txnid & 1]; } - } else { - /* Not yet touching txn == env->me_txn0, it may be active */ if (ti) { - if (LOCK_MUTEX(rc, env, env->me_wmutex)) + if (LOCK_MUTEX(rc, env, MDB_MUTEX(env, w))) return rc; +#ifdef MDB_USE_SYSV_SEM + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; + /* Update mti_txnid like mdb_mutex_failed() would, + * in case last writer crashed before updating it. + */ + ti->mti_txnid = txn->mt_txnid; +#else txn->mt_txnid = ti->mti_txnid; meta = env->me_metas[txn->mt_txnid & 1]; +#endif } else { - meta = mdb_env_pick_meta(env); + meta = env->me_metas[ mdb_env_pick_meta(env) ]; txn->mt_txnid = meta->mm_txnid; } + /* Setup db info */ + txn->mt_numdbs = env->me_numdbs; txn->mt_txnid++; #if MDB_DEBUG if (txn->mt_txnid == mdb_debug_start) mdb_debug = 1; #endif + txn->mt_flags = 0; txn->mt_child = NULL; txn->mt_loose_pgs = NULL; txn->mt_loose_count = 0; @@ -2716,33 +2637,28 @@ mdb_txn_renew0(MDB_txn *txn) } /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); + memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); /* Moved to here to avoid a data race in read TXNs */ txn->mt_next_pgno = meta->mm_last_pg+1; - txn->mt_flags = flags; - - /* Setup db info */ - txn->mt_numdbs = env->me_numdbs; - for (i=CORE_DBS; i<txn->mt_numdbs; i++) { + for (i=2; i<txn->mt_numdbs; i++) { x = env->me_dbflags[i]; txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; - txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0; + txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; } - txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; - txn->mt_dbflags[FREE_DBI] = DB_VALID; + txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; - if (env->me_flags & MDB_FATAL_ERROR) { - DPUTS("environment had fatal error, must shutdown!"); - rc = MDB_PANIC; - } else if (env->me_maxpg < txn->mt_next_pgno) { - rc = MDB_MAP_RESIZED; - } else { - return MDB_SUCCESS; + if (env->me_maxpg < txn->mt_next_pgno) { + mdb_txn_reset0(txn, "renew0-mapfail"); + if (new_notls) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } + return MDB_MAP_RESIZED; } - mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); - return rc; + + return MDB_SUCCESS; } int @@ -2750,9 +2666,14 @@ mdb_txn_renew(MDB_txn *txn) { int rc; - if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) + if (!txn || txn->mt_dbxs) /* A reset txn has mt_dbxs==NULL */ return EINVAL; + if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; + } + rc = mdb_txn_renew0(txn); if (rc == MDB_SUCCESS) { DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", @@ -2767,47 +2688,60 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) { MDB_txn *txn; MDB_ntxn *ntxn; - int rc, size, tsize; - - flags &= MDB_TXN_BEGIN_FLAGS; - flags |= env->me_flags & MDB_WRITEMAP; + int rc, size, tsize = sizeof(MDB_txn); - if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */ + if (env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; + } + if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY)) return EACCES; - if (parent) { /* Nested transactions: Max 1 child, write txns only, no writemap */ - flags |= parent->mt_flags; - if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) { + if (parent->mt_child || + (flags & MDB_RDONLY) || + (parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) || + (env->me_flags & MDB_WRITEMAP)) + { return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; } - /* Child txns save MDB_pgstate and use own copy of cursors */ - size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); - size += tsize = sizeof(MDB_ntxn); - } else if (flags & MDB_RDONLY) { - size = env->me_maxdbs * (sizeof(MDB_db)+1); - size += tsize = sizeof(MDB_txn); - } else { - /* Reuse preallocated write txn. However, do not touch it until - * mdb_txn_renew0() succeeds, since it currently may be active. - */ - txn = env->me_txn0; - goto renew; + tsize = sizeof(MDB_ntxn); + } + size = tsize; + if (!(flags & MDB_RDONLY)) { + if (!parent) { + txn = env->me_txn0; /* just reuse preallocated write txn */ + goto ok; + } + /* child txns use own copy of cursors */ + size += env->me_maxdbs * sizeof(MDB_cursor *); } + size += env->me_maxdbs * (sizeof(MDB_db)+1); + if ((txn = calloc(1, size)) == NULL) { DPRINTF(("calloc: %s", strerror(errno))); return ENOMEM; } - txn->mt_dbxs = env->me_dbxs; /* static */ txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); - txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; - txn->mt_flags = flags; + if (flags & MDB_RDONLY) { + txn->mt_flags |= MDB_TXN_RDONLY; + txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = env->me_dbiseqs; + } else { + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + if (parent) { + txn->mt_dbiseqs = parent->mt_dbiseqs; + txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs); + } else { + txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + } + } txn->mt_env = env; +ok: if (parent) { unsigned int i; - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = parent->mt_dbiseqs; txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); if (!txn->mt_u.dirty_list || !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) @@ -2821,10 +2755,11 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) txn->mt_u.dirty_list[0].mid = 0; txn->mt_spill_pgs = NULL; txn->mt_next_pgno = parent->mt_next_pgno; - parent->mt_flags |= MDB_TXN_HAS_CHILD; parent->mt_child = txn; txn->mt_parent = parent; txn->mt_numdbs = parent->mt_numdbs; + txn->mt_flags = parent->mt_flags; + txn->mt_dbxs = parent->mt_dbxs; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); /* Copy parent's mt_dbflags, but clear DB_NEW */ for (i=0; i<txn->mt_numdbs; i++) @@ -2843,20 +2778,17 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) if (!rc) rc = mdb_cursor_shadow(parent, txn); if (rc) - mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); - } else { /* MDB_RDONLY */ - txn->mt_dbiseqs = env->me_dbiseqs; -renew: + mdb_txn_reset0(txn, "beginchild-fail"); + } else { rc = mdb_txn_renew0(txn); } if (rc) { if (txn != env->me_txn0) free(txn); } else { - txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ *ret = txn; DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", - txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); } @@ -2886,7 +2818,7 @@ mdb_dbis_update(MDB_txn *txn, int keep) MDB_env *env = txn->mt_env; unsigned char *tdbflags = txn->mt_dbflags; - for (i = n; --i >= CORE_DBS;) { + for (i = n; --i >= 2;) { if (tdbflags[i] & DB_NEW) { if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; @@ -2906,68 +2838,52 @@ mdb_dbis_update(MDB_txn *txn, int keep) env->me_numdbs = n; } -/** End a transaction, except successful commit of a nested transaction. +/** Common code for #mdb_txn_reset() and #mdb_txn_abort(). * May be called twice for readonly txns: First reset it, then abort. - * @param[in] txn the transaction handle to end - * @param[in] mode why and how to end the transaction + * @param[in] txn the transaction handle to reset + * @param[in] act why the transaction is being reset */ static void -mdb_txn_end(MDB_txn *txn, unsigned mode) +mdb_txn_reset0(MDB_txn *txn, const char *act) { MDB_env *env = txn->mt_env; -#if MDB_DEBUG - static const char *const names[] = MDB_END_NAMES; -#endif - /* Export or close DBI handles opened in this txn */ - mdb_dbis_update(txn, mode & MDB_END_UPDATE); + /* Close any DBI handles opened in this txn */ + mdb_dbis_update(txn, 0); DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", - names[mode & MDB_END_OPMASK], - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { if (txn->mt_u.reader) { txn->mt_u.reader->mr_txnid = (txnid_t)-1; - if (!(env->me_flags & MDB_NOTLS)) { + if (!(env->me_flags & MDB_NOTLS)) txn->mt_u.reader = NULL; /* txn does not own reader */ - } else if (mode & MDB_END_SLOT) { - txn->mt_u.reader->mr_pid = 0; - txn->mt_u.reader = NULL; - } /* else txn owns the slot until it does MDB_END_SLOT */ } - txn->mt_numdbs = 0; /* prevent further DBI activity */ - txn->mt_flags |= MDB_TXN_FINISHED; - - } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { + txn->mt_numdbs = 0; /* close nothing if called again */ + txn->mt_dbxs = NULL; /* mark txn as reset */ + } else { pgno_t *pghead = env->me_pghead; - if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ - mdb_cursors_close(txn, 0); + mdb_cursors_close(txn, 0); if (!(env->me_flags & MDB_WRITEMAP)) { mdb_dlist_free(txn); } - txn->mt_numdbs = 0; - txn->mt_flags = MDB_TXN_FINISHED; - if (!txn->mt_parent) { - mdb_midl_shrink(&txn->mt_free_pgs); - env->me_free_pgs = txn->mt_free_pgs; + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; /* me_pgstate: */ env->me_pghead = NULL; env->me_pglast = 0; env->me_txn = NULL; - mode = 0; /* txn == env->me_txn0, do not free() it */ - /* The writer mutex was locked in mdb_txn_begin. */ if (env->me_txns) - UNLOCK_MUTEX(env->me_wmutex); + UNLOCK_MUTEX(MDB_MUTEX(env, w)); } else { txn->mt_parent->mt_child = NULL; - txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; mdb_midl_free(txn->mt_free_pgs); mdb_midl_free(txn->mt_spill_pgs); @@ -2976,9 +2892,6 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) mdb_midl_free(pghead); } - - if (mode & MDB_END_FREE) - free(txn); } void @@ -2991,7 +2904,7 @@ mdb_txn_reset(MDB_txn *txn) if (!(txn->mt_flags & MDB_TXN_RDONLY)) return; - mdb_txn_end(txn, MDB_END_RESET); + mdb_txn_reset0(txn, "reset"); } void @@ -3003,7 +2916,13 @@ mdb_txn_abort(MDB_txn *txn) if (txn->mt_child) mdb_txn_abort(txn->mt_child); - mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); + mdb_txn_reset0(txn, "abort"); + /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */ + if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) + txn->mt_u.reader->mr_pid = 0; + + if (txn != txn->mt_env->me_txn0) + free(txn); } /** Save the freelist as of this transaction to the freeDB. @@ -3279,7 +3198,6 @@ mdb_page_flush(MDB_txn *txn, int keep) /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { if (n) { -retry_write: /* Write previous page(s) */ #ifdef MDB_USE_PWRITEV wres = pwritev(env->me_fd, iov, n, wpos); @@ -3287,11 +3205,8 @@ retry_write: if (n == 1) { wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); } else { -retry_seek: if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { rc = ErrCode(); - if (rc == EINTR) - goto retry_seek; DPRINTF(("lseek: %s", strerror(rc))); return rc; } @@ -3301,8 +3216,6 @@ retry_seek: if (wres != wsize) { if (wres < 0) { rc = ErrCode(); - if (rc == EINTR) - goto retry_write; DPRINTF(("Write error: %s", strerror(rc))); } else { rc = EIO; /* TODO: Use which error code? */ @@ -3354,17 +3267,15 @@ int mdb_txn_commit(MDB_txn *txn) { int rc; - unsigned int i, end_mode; + unsigned int i; MDB_env *env; - if (txn == NULL) + if (txn == NULL || txn->mt_env == NULL) return EINVAL; - /* mdb_txn_end() mode for a commit which writes nothing */ - end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; - if (txn->mt_child) { rc = mdb_txn_commit(txn->mt_child); + txn->mt_child = NULL; if (rc) goto fail; } @@ -3372,11 +3283,14 @@ mdb_txn_commit(MDB_txn *txn) env = txn->mt_env; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - goto done; + mdb_dbis_update(txn, 1); + txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */ + mdb_txn_abort(txn); + return MDB_SUCCESS; } - if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) { - DPUTS("txn has failed/finished, can't commit"); + if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { + DPUTS("error flag is set, can't commit"); if (txn->mt_parent) txn->mt_parent->mt_flags |= MDB_TXN_ERROR; rc = MDB_BAD_TXN; @@ -3408,9 +3322,9 @@ mdb_txn_commit(MDB_txn *txn) /* Update parent's DB table. */ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); parent->mt_numdbs = txn->mt_numdbs; - parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; - parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; - for (i=CORE_DBS; i<txn->mt_numdbs; i++) { + parent->mt_dbflags[0] = txn->mt_dbflags[0]; + parent->mt_dbflags[1] = txn->mt_dbflags[1]; + for (i=2; i<txn->mt_numdbs; i++) { /* preserve parent's DB_NEW status */ x = parent->mt_dbflags[i] & DB_NEW; parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; @@ -3439,25 +3353,6 @@ mdb_txn_commit(MDB_txn *txn) pspill[0] = y; } - /* Remove anything in our spill list from parent's dirty list */ - if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { - for (i=1; i<=txn->mt_spill_pgs[0]; i++) { - MDB_ID pn = txn->mt_spill_pgs[i]; - if (pn & 1) - continue; /* deleted spillpg */ - pn >>= 1; - y = mdb_mid2l_search(dst, pn); - if (y <= dst[0].mid && dst[y].mid == pn) { - free(dst[y].mptr); - while (y < dst[0].mid) { - dst[y] = dst[y+1]; - y++; - } - dst[0].mid--; - } - } - } - /* Find len = length of merging our dirty list with parent's */ x = dst[0].mid; dst[0].mid = 0; /* simplify loops */ @@ -3503,7 +3398,7 @@ mdb_txn_commit(MDB_txn *txn) } /* Append our loose page list to parent's */ - for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) + for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp)) ; *lp = txn->mt_loose_pgs; parent->mt_loose_count += txn->mt_loose_count; @@ -3530,22 +3425,21 @@ mdb_txn_commit(MDB_txn *txn) txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); /* Update DB root pointers */ - if (txn->mt_numdbs > CORE_DBS) { + if (txn->mt_numdbs > 2) { MDB_cursor mc; MDB_dbi i; MDB_val data; data.mv_size = sizeof(MDB_db); mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + for (i = 2; i < txn->mt_numdbs; i++) { if (txn->mt_dbflags[i] & DB_DIRTY) { if (TXN_DBI_CHANGED(txn, i)) { rc = MDB_BAD_DBI; goto fail; } data.mv_data = &txn->mt_dbs[i]; - rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, - F_SUBDATA); + rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); if (rc) goto fail; } @@ -3558,7 +3452,8 @@ mdb_txn_commit(MDB_txn *txn) mdb_midl_free(env->me_pghead); env->me_pghead = NULL; - mdb_midl_shrink(&txn->mt_free_pgs); + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; #if (MDB_DEBUG) > 2 mdb_audit(txn); @@ -3568,10 +3463,21 @@ mdb_txn_commit(MDB_txn *txn) (rc = mdb_env_sync(env, 0)) || (rc = mdb_env_write_meta(txn))) goto fail; - end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; + + /* Free P_LOOSE pages left behind in dirty_list */ + if (!(env->me_flags & MDB_WRITEMAP)) + mdb_dlist_free(txn); done: - mdb_txn_end(txn, end_mode); + env->me_pglast = 0; + env->me_txn = NULL; + mdb_dbis_update(txn, 1); + + if (env->me_txns) + UNLOCK_MUTEX(MDB_MUTEX(env, w)); + if (txn != env->me_txn0) + free(txn); + return MDB_SUCCESS; fail: @@ -3598,7 +3504,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) * Read both meta pages so we can use the latest one. */ - for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) { + for (i=off=0; i<2; i++, off = meta->mm_psize) { #ifdef _WIN32 DWORD len; OVERLAPPED ov; @@ -3651,11 +3557,11 @@ mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) meta->mm_version = MDB_DATA_VERSION; meta->mm_mapsize = env->me_mapsize; meta->mm_psize = env->me_psize; - meta->mm_last_pg = NUM_METAS-1; + meta->mm_last_pg = 1; meta->mm_flags = env->me_flags & 0xffff; - meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ - meta->mm_dbs[FREE_DBI].md_root = P_INVALID; - meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; + meta->mm_flags |= MDB_INTEGERKEY; + meta->mm_dbs[0].md_root = P_INVALID; + meta->mm_dbs[1].md_root = P_INVALID; } /** Write the environment parameters of a freshly created DB environment. @@ -3680,18 +3586,14 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) int len; #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ len = pwrite(fd, ptr, size, pos); \ - if (len == -1 && ErrCode() == EINTR) continue; \ - rc = (len >= 0); break; } while(1) + rc = (len >= 0); } while(0) #endif DPUTS("writing new meta page"); psize = env->me_psize; - p = calloc(NUM_METAS, psize); - if (!p) - return ENOMEM; - + p = calloc(2, psize); p->mp_pgno = 0; p->mp_flags = P_META; *(MDB_meta *)METADATA(p) = *meta; @@ -3701,10 +3603,10 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) q->mp_flags = P_META; *(MDB_meta *)METADATA(q) = *meta; - DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); + DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0); if (!rc) rc = ErrCode(); - else if ((unsigned) len == psize * NUM_METAS) + else if ((unsigned) len == psize * 2) rc = MDB_SUCCESS; else rc = ENOSPC; @@ -3721,7 +3623,6 @@ mdb_env_write_meta(MDB_txn *txn) { MDB_env *env; MDB_meta meta, metab, *mp; - unsigned flags; size_t mapsize; off_t off; int rc, len, toggle; @@ -3738,33 +3639,34 @@ mdb_env_write_meta(MDB_txn *txn) toggle, txn->mt_dbs[MAIN_DBI].md_root)); env = txn->mt_env; - flags = env->me_flags; mp = env->me_metas[toggle]; mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; /* Persist any increases of mapsize config */ if (mapsize < env->me_mapsize) mapsize = env->me_mapsize; - if (flags & MDB_WRITEMAP) { + if (env->me_flags & MDB_WRITEMAP) { mp->mm_mapsize = mapsize; - mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; - mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + mp->mm_dbs[0] = txn->mt_dbs[0]; + mp->mm_dbs[1] = txn->mt_dbs[1]; mp->mm_last_pg = txn->mt_next_pgno - 1; -#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \ - !(defined(__i386__) || defined(__x86_64__)) +#if !(defined(_MSC_VER) || defined(__i386__) || defined(__x86_64__)) /* LY: issue a memory barrier, if not x86. ITS#7969 */ __sync_synchronize(); #endif mp->mm_txnid = txn->mt_txnid; - if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { + if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { unsigned meta_size = env->me_psize; rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; - ptr = (char *)mp - PAGEHDRSZ; + ptr = env->me_map; + if (toggle) { #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ - r2 = (ptr - env->me_map) & (env->me_os_psize - 1); - ptr -= r2; - meta_size += r2; + if (meta_size < env->me_os_psize) + meta_size += meta_size; + else #endif + ptr += meta_size; + } if (MDB_MSYNC(ptr, meta_size, rc)) { rc = ErrCode(); goto fail; @@ -3772,22 +3674,25 @@ mdb_env_write_meta(MDB_txn *txn) } goto done; } - metab.mm_txnid = mp->mm_txnid; - metab.mm_last_pg = mp->mm_last_pg; + metab.mm_txnid = env->me_metas[toggle]->mm_txnid; + metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; meta.mm_mapsize = mapsize; - meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; - meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_dbs[0] = txn->mt_dbs[0]; + meta.mm_dbs[1] = txn->mt_dbs[1]; meta.mm_last_pg = txn->mt_next_pgno - 1; meta.mm_txnid = txn->mt_txnid; off = offsetof(MDB_meta, mm_mapsize); ptr = (char *)&meta + off; len = sizeof(MDB_meta) - off; - off += (char *)mp - env->me_map; + if (toggle) + off += env->me_psize; + off += PAGEHDRSZ; /* Write to the SYNC fd */ - mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; + mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? + env->me_fd : env->me_mfd; #ifdef _WIN32 { memset(&ov, 0, sizeof(ov)); @@ -3796,15 +3701,10 @@ mdb_env_write_meta(MDB_txn *txn) rc = -1; } #else -retry_write: rc = pwrite(mfd, ptr, len, off); #endif if (rc != len) { rc = rc < 0 ? ErrCode() : EIO; -#ifndef _WIN32 - if (rc == EINTR) - goto retry_write; -#endif DPUTS("write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Write some old data back, to prevent it from being used. @@ -3841,13 +3741,12 @@ done: /** Check both meta pages to see which one is newer. * @param[in] env the environment handle - * @return newest #MDB_meta. + * @return meta toggle (0 or 1). */ -static MDB_meta * +static int mdb_env_pick_meta(const MDB_env *env) { - MDB_meta *const *metas = env->me_metas; - return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ]; + return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid); } int ESECT @@ -3860,13 +3759,13 @@ mdb_env_create(MDB_env **env) return ENOMEM; e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = e->me_numdbs = CORE_DBS; + e->me_maxdbs = e->me_numdbs = 2; e->me_fd = INVALID_HANDLE_VALUE; e->me_lfd = INVALID_HANDLE_VALUE; e->me_mfd = INVALID_HANDLE_VALUE; -#ifdef MDB_USE_POSIX_SEM - e->me_rmutex = SEM_FAILED; - e->me_wmutex = SEM_FAILED; +#ifdef MDB_USE_SYSV_SEM + e->me_rmutex.semid = -1; + e->me_wmutex.semid = -1; #endif e->me_pid = getpid(); GET_PAGESIZE(e->me_os_psize); @@ -3971,7 +3870,7 @@ mdb_env_set_mapsize(MDB_env *env, size_t size) void *old; if (env->me_txn) return EINVAL; - meta = mdb_env_pick_meta(env); + meta = env->me_metas[mdb_env_pick_meta(env)]; if (!size) size = meta->mm_mapsize; { @@ -3998,7 +3897,7 @@ mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { if (env->me_map) return EINVAL; - env->me_maxdbs = dbs + CORE_DBS; + env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */ return MDB_SUCCESS; } @@ -4178,12 +4077,12 @@ mdb_env_open2(MDB_env *env) #if MDB_DEBUG { - MDB_meta *meta = mdb_env_pick_meta(env); - MDB_db *db = &meta->mm_dbs[MAIN_DBI]; + int toggle = mdb_env_pick_meta(env); + MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI]; DPRINTF(("opened database version %u, pagesize %u", - meta->mm_version, env->me_psize)); - DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); + env->me_metas[0]->mm_version, env->me_psize)); + DPRINTF(("using meta page %d", toggle)); DPRINTF(("depth: %u", db->md_depth)); DPRINTF(("entries: %"Z"u", db->md_entries)); DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); @@ -4270,10 +4169,9 @@ PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; static int ESECT mdb_env_share_locks(MDB_env *env, int *excl) { - int rc = 0; - MDB_meta *meta = mdb_env_pick_meta(env); + int rc = 0, toggle = mdb_env_pick_meta(env); - env->me_txns->mti_txnid = meta->mm_txnid; + env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid; #ifdef _WIN32 { @@ -4338,8 +4236,8 @@ mdb_env_excl_lock(MDB_env *env, int *excl) if (!rc) { *excl = 1; } else -# ifndef MDB_USE_POSIX_MUTEX - if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */ +# ifdef MDB_USE_SYSV_SEM + if (*excl < 0) /* always true when !MDB_USE_SYSV_SEM */ # endif { lock_info.l_type = F_RDLCK; @@ -4422,7 +4320,7 @@ mdb_hash_val(MDB_val *val, mdb_hash_t hval) */ static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; -static void ESECT +static void mdb_pack85(unsigned long l, char *out) { int i; @@ -4433,7 +4331,7 @@ mdb_pack85(unsigned long l, char *out) } } -static void ESECT +static void mdb_hash_enc(MDB_val *val, char *encbuf) { mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); @@ -4469,14 +4367,9 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) off_t size, rsize; #ifdef _WIN32 - wchar_t *wlpath; - rc = utf8_to_utf16(lpath, -1, &wlpath, NULL); - if (rc) - return rc; - env->me_lfd = CreateFileW(wlpath, GENERIC_READ|GENERIC_WRITE, - FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, + env->me_lfd = CreateFileA(lpath, GENERIC_READ|GENERIC_WRITE, + FILE_SHARE_READ, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); - free(wlpath); #else env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); #endif @@ -4578,45 +4471,27 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) mdb_hash_enc(&val, encbuf); sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); - env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); + env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); + env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); if (!env->me_wmutex) goto fail_errno; -#elif defined(MDB_USE_POSIX_SEM) - struct stat stbuf; - struct { - dev_t dev; - ino_t ino; - } idbuf; - MDB_val val; - char encbuf[11]; +#elif defined(MDB_USE_SYSV_SEM) + union semun semu; + unsigned short vals[2] = {1, 1}; + int semid = semget(IPC_PRIVATE, 2, mode); + if (semid < 0) + goto fail_errno; -#if defined(__NetBSD__) -#define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ -#endif - if (fstat(env->me_lfd, &stbuf)) goto fail_errno; - idbuf.dev = stbuf.st_dev; - idbuf.ino = stbuf.st_ino; - val.mv_data = &idbuf; - val.mv_size = sizeof(idbuf); - mdb_hash_enc(&val, encbuf); -#ifdef MDB_SHORT_SEMNAMES - encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */ -#endif - sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf); - sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf); - /* Clean up after a previous run, if needed: Try to - * remove both semaphores before doing anything else. - */ - sem_unlink(env->me_txns->mti_rmname); - sem_unlink(env->me_txns->mti_wmname); - env->me_rmutex = sem_open(env->me_txns->mti_rmname, - O_CREAT|O_EXCL, mode, 1); - if (env->me_rmutex == SEM_FAILED) goto fail_errno; - env->me_wmutex = sem_open(env->me_txns->mti_wmname, - O_CREAT|O_EXCL, mode, 1); - if (env->me_wmutex == SEM_FAILED) goto fail_errno; -#else /* MDB_USE_POSIX_MUTEX: */ + env->me_rmutex.semid = semid; + env->me_wmutex.semid = semid; + env->me_rmutex.semnum = 0; + env->me_wmutex.semnum = 1; + + semu.array = vals; + if (semctl(semid, 0, SETALL, semu) < 0) + goto fail_errno; + env->me_txns->mti_semid = semid; +#else /* MDB_USE_SYSV_SEM */ pthread_mutexattr_t mattr; if ((rc = pthread_mutexattr_init(&mattr)) @@ -4624,11 +4499,11 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) #ifdef MDB_ROBUST_SUPPORTED || (rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST)) #endif - || (rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr)) - || (rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr))) + || (rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr)) + || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr))) goto fail; pthread_mutexattr_destroy(&mattr); -#endif /* _WIN32 || MDB_USE_POSIX_SEM */ +#endif /* _WIN32 || MDB_USE_SYSV_SEM */ env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_format = MDB_LOCK_FORMAT; @@ -4636,6 +4511,11 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) env->me_txns->mti_numreaders = 0; } else { +#ifdef MDB_USE_SYSV_SEM + struct semid_ds buf; + union semun semu; + int semid; +#endif if (env->me_txns->mti_magic != MDB_MAGIC) { DPUTS("lock region has invalid magic"); rc = MDB_INVALID; @@ -4652,15 +4532,25 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) goto fail; } #ifdef _WIN32 - env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); + env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); + env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); if (!env->me_wmutex) goto fail_errno; -#elif defined(MDB_USE_POSIX_SEM) - env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); - if (env->me_rmutex == SEM_FAILED) goto fail_errno; - env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); - if (env->me_wmutex == SEM_FAILED) goto fail_errno; +#elif defined(MDB_USE_SYSV_SEM) + semid = env->me_txns->mti_semid; + semu.buf = &buf; + + /* check for read access */ + if (semctl(semid, 0, IPC_STAT, semu) < 0) + goto fail_errno; + /* check for write access */ + if (semctl(semid, 0, IPC_SET, semu) < 0) + goto fail_errno; + + env->me_rmutex.semid = semid; + env->me_wmutex.semid = semid; + env->me_rmutex.semnum = 0; + env->me_wmutex.semnum = 1; #endif } return MDB_SUCCESS; @@ -4694,9 +4584,6 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode { int oflags, rc, len, excl = -1; char *lpath, *dpath; -#ifdef _WIN32 - wchar_t *wpath; -#endif if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) return EINVAL; @@ -4742,7 +4629,6 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode rc = ENOMEM; goto leave; } - env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */ /* For RDONLY, get lockfile after we know datafile exists */ if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { @@ -4760,12 +4646,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode len = OPEN_ALWAYS; } mode = FILE_ATTRIBUTE_NORMAL; - rc = utf8_to_utf16(dpath, -1, &wpath, NULL); - if (rc) - goto leave; - env->me_fd = CreateFileW(wpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, - NULL, len, mode, NULL); - free(wpath); + env->me_fd = CreateFileA(dpath, oflags, FILE_SHARE_READ, NULL, len, mode, NULL); #else if (F_ISSET(flags, MDB_RDONLY)) oflags = O_RDONLY; @@ -4794,13 +4675,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode */ #ifdef _WIN32 len = OPEN_EXISTING; - rc = utf8_to_utf16(dpath, -1, &wpath, NULL); - if (rc) - goto leave; - env->me_mfd = CreateFileW(wpath, oflags, - FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, - mode | FILE_FLAG_WRITE_THROUGH, NULL); - free(wpath); + env->me_mfd = CreateFileA(dpath, oflags, FILE_SHARE_READ, NULL, len, mode | FILE_FLAG_WRITE_THROUGH, NULL); #else oflags &= ~O_CREAT; env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode); @@ -4816,20 +4691,21 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode if (rc) goto leave; } + if (!((flags & MDB_RDONLY) || + (env->me_pbuf = calloc(1, env->me_psize)))) + rc = ENOMEM; if (!(flags & MDB_RDONLY)) { MDB_txn *txn; int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1); - if ((env->me_pbuf = calloc(1, env->me_psize)) && - (txn = calloc(1, size))) - { + txn = calloc(1, size); + if (txn) { txn->mt_dbs = (MDB_db *)((char *)txn + tsize); txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; - txn->mt_flags = MDB_TXN_FINISHED; env->me_txn0 = txn; } else { rc = ENOMEM; @@ -4855,15 +4731,13 @@ mdb_env_close0(MDB_env *env, int excl) return; /* Doing this here since me_dbxs may not exist during mdb_env_close */ - if (env->me_dbxs) { - for (i = env->me_maxdbs; --i >= CORE_DBS; ) - free(env->me_dbxs[i].md_name.mv_data); - free(env->me_dbxs); - } + for (i = env->me_maxdbs; --i > MAIN_DBI; ) + free(env->me_dbxs[i].md_name.mv_data); free(env->me_pbuf); free(env->me_dbiseqs); free(env->me_dbflags); + free(env->me_dbxs); free(env->me_path); free(env->me_dirty_list); free(env->me_txn0); @@ -4893,12 +4767,8 @@ mdb_env_close0(MDB_env *env, int excl) MDB_PID_T pid = env->me_pid; /* Clearing readers is done in this function because * me_txkey with its destructor must be disabled first. - * - * We skip the the reader mutex, so we touch only - * data owned by this process (me_close_readers and - * our readers), and clear each reader atomically. */ - for (i = env->me_close_readers; --i >= 0; ) + for (i = env->me_numreaders; --i >= 0; ) if (env->me_txns->mti_readers[i].mr_pid == pid) env->me_txns->mti_readers[i].mr_pid = 0; #ifdef _WIN32 @@ -4909,20 +4779,15 @@ mdb_env_close0(MDB_env *env, int excl) /* Windows automatically destroys the mutexes when * the last handle closes. */ -#elif defined(MDB_USE_POSIX_SEM) - if (env->me_rmutex != SEM_FAILED) { - sem_close(env->me_rmutex); - if (env->me_wmutex != SEM_FAILED) - sem_close(env->me_wmutex); +#elif defined(MDB_USE_SYSV_SEM) + if (env->me_rmutex.semid != -1) { /* If we have the filelock: If we are the * only remaining user, clean up semaphores. */ if (excl == 0) mdb_env_excl_lock(env, &excl); - if (excl > 0) { - sem_unlink(env->me_txns->mti_rmname); - sem_unlink(env->me_txns->mti_wmname); - } + if (excl > 0) + semctl(env->me_rmutex.semid, 0, IPC_RMID); } #endif munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); @@ -4969,11 +4834,7 @@ mdb_cmp_long(const MDB_val *a, const MDB_val *b) *(size_t *)a->mv_data > *(size_t *)b->mv_data; } -/** Compare two items pointing at aligned unsigned int's. - * - * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, - * but #mdb_cmp_clong() is called instead if the data type is size_t. - */ +/** Compare two items pointing at aligned unsigned int's */ static int mdb_cmp_int(const MDB_val *a, const MDB_val *b) { @@ -5011,6 +4872,13 @@ mdb_cmp_cint(const MDB_val *a, const MDB_val *b) #endif } +/** Compare two items pointing at size_t's of unknown alignment. */ +#ifdef MISALIGNED_OK +# define mdb_cmp_clong mdb_cmp_long +#else +# define mdb_cmp_clong mdb_cmp_cint +#endif + /** Compare two items lexically */ static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) @@ -5091,7 +4959,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) cmp = mdb_cmp_long; - else + else if(NODEPTR(mp, 1)->mn_ksize == sizeof(int)) cmp = mdb_cmp_int; } @@ -5173,15 +5041,15 @@ static void mdb_cursor_pop(MDB_cursor *mc) { if (mc->mc_snum) { - DPRINTF(("popping page %"Z"u off db %d cursor %p", - mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); - +#if MDB_DEBUG + MDB_page *top = mc->mc_pg[mc->mc_top]; +#endif mc->mc_snum--; - if (mc->mc_snum) { + if (mc->mc_snum) mc->mc_top--; - } else { - mc->mc_flags &= ~C_INITIALIZED; - } + + DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno, + DDBI(mc), (void *) mc)); } } @@ -5218,7 +5086,7 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) MDB_page *p = NULL; int level; - if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { + if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { MDB_txn *tx2 = txn; level = 1; do { @@ -5279,11 +5147,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) indx_t i; DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); - /* Don't assert on branch pages in the FreeDB. We can get here - * while in the process of rebalancing a FreeDB branch page; we must - * let that proceed. ITS#8336 - */ - mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); + mdb_cassert(mc, NUMKEYS(mp) > 1); DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { @@ -5379,8 +5243,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) /* Make sure the txn is still viable, then find the root from * the txn's db table and set it as the root of the cursor's stack. */ - if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) { - DPUTS("transaction may not be used now"); + if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { + DPUTS("transaction has failed, must abort"); return MDB_BAD_TXN; } else { /* Make sure we're using an up-to-date root */ @@ -5400,8 +5264,6 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) &mc->mc_dbx->md_name, &exact); if (!exact) return MDB_NOTFOUND; - if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) - return MDB_INCOMPATIBLE; /* not a named DB */ rc = mdb_node_read(mc->mc_txn, leaf, &data); if (rc) return rc; @@ -5501,7 +5363,6 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) return MDB_CORRUPTED; } } - txn->mt_dirty_room++; if (!(env->me_flags & MDB_WRITEMAP)) mdb_dpage_free(env, mp); release: @@ -5565,10 +5426,10 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); - if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; - if (txn->mt_flags & MDB_TXN_BLOCKED) + if (txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; mdb_cursor_init(&mc, txn, dbi, &mx); @@ -5668,10 +5529,8 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); - if (mc->mc_flags & C_DEL) { - mc->mc_flags ^= C_DEL; + if (mc->mc_flags & C_DEL) goto skip; - } if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { DPUTS("=====> move to next sibling page"); @@ -5750,8 +5609,6 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); - mc->mc_flags &= ~(C_EOF|C_DEL); - if (mc->mc_ki[mc->mc_top] == 0) { DPUTS("=====> move to prev sibling page"); if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { @@ -5763,6 +5620,8 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } else mc->mc_ki[mc->mc_top]--; + mc->mc_flags &= ~C_EOF; + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); @@ -5898,8 +5757,6 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, } else return MDB_NOTFOUND; } - } else { - mc->mc_pg[0] = 0; } rc = mdb_page_search(mc, key, 0); @@ -5918,10 +5775,8 @@ set2: if (leaf == NULL) { DPUTS("===> inexact leaf not found, goto sibling"); - if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { - mc->mc_flags |= C_EOF; + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) return rc; /* no entries matched */ - } mp = mc->mc_pg[mc->mc_top]; mdb_cassert(mc, IS_LEAF(mp)); leaf = NODEPTR(mp, 0); @@ -5959,21 +5814,15 @@ set1: return rc; } } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { - MDB_val olddata; - MDB_cmp_func *dcmp; - if ((rc = mdb_node_read(mc->mc_txn, leaf, &olddata)) != MDB_SUCCESS) + MDB_val d2; + if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS) return rc; - dcmp = mc->mc_dbx->md_dcmp; -#if UINT_MAX < SIZE_MAX - if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) - dcmp = mdb_cmp_clong; -#endif - rc = dcmp(data, &olddata); + rc = mc->mc_dbx->md_dcmp(data, &d2); if (rc) { if (op == MDB_GET_BOTH || rc > 0) return MDB_NOTFOUND; rc = 0; - *data = olddata; + *data = d2; } } else { @@ -6093,7 +5942,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (mc == NULL) return EINVAL; - if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) + if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; switch (op) { @@ -6117,6 +5966,8 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_GET_KEY(leaf, key); if (data) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (mc->mc_flags & C_DEL) + mdb_xcursor_init1(mc, leaf); rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); } else { rc = mdb_node_read(mc->mc_txn, leaf, data); @@ -6262,7 +6113,7 @@ mdb_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; - if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & DB_DIRTY)) { + if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { MDB_cursor mc2; MDB_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) @@ -6290,9 +6141,10 @@ int mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned int flags) { + enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */ MDB_env *env; MDB_node *leaf = NULL; - MDB_page *fp, *mp, *sub_root = NULL; + MDB_page *fp, *mp; uint16_t fp_flags; MDB_val xdata, *rdata, dkey, olddata; MDB_db dummy; @@ -6321,7 +6173,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, nospill = flags & MDB_NOSPILL; flags &= ~MDB_NOSPILL; - if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; if (key->mv_size-1 >= ENV_MAXKEY(env)) @@ -6446,18 +6298,16 @@ fix_parent: * update branch key if there is a parent page */ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned short dtop = 1; + unsigned short top = mc->mc_top; mc->mc_top--; /* slot 0 is always an empty key, find real slot */ - while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) mc->mc_top--; - dtop++; - } if (mc->mc_ki[mc->mc_top]) rc2 = mdb_update_key(mc, key); else rc2 = MDB_SUCCESS; - mc->mc_top += dtop; + mc->mc_top = top; if (rc2) return rc2; } @@ -6482,18 +6332,17 @@ more: /* Was a single item before, must convert now */ if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - MDB_cmp_func *dcmp; /* Just overwrite the current item */ if (flags == MDB_CURRENT) goto current; - dcmp = mc->mc_dbx->md_dcmp; + #if UINT_MAX < SIZE_MAX - if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) - dcmp = mdb_cmp_clong; + if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) + mc->mc_dbx->md_dcmp = mdb_cmp_clong; #endif /* does data match? */ - if (!dcmp(data, &olddata)) { - if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) + if (!mc->mc_dbx->md_dcmp(data, &olddata)) { + if (flags & MDB_NODUPDATA) return MDB_KEYEXIST; /* overwrite it */ goto current; @@ -6574,7 +6423,6 @@ prep_subDB: offset = env->me_psize - olddata.mv_size; flags |= F_DUPDATA|F_SUBDATA; dummy.md_root = mp->mp_pgno; - sub_root = mp; } if (mp != fp) { mp->mp_flags = fp_flags | P_DIRTY; @@ -6599,9 +6447,6 @@ prep_subDB: goto new_sub; } current: - /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ - if ((leaf->mn_flags ^ flags) & F_SUBDATA) - return MDB_INCOMPATIBLE; /* overflow page overwrites need special handling */ if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDB_page *omp; @@ -6638,7 +6483,6 @@ current: return ENOMEM; id2.mid = pg; id2.mptr = np; - /* Note - this page is already counted in parent's dirty_room */ rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); mdb_cassert(mc, rc2 == 0); if (!(flags & MDB_RESERVE)) { @@ -6695,7 +6539,7 @@ new_sub: } else { /* There is room already in this leaf page. */ rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); - if (rc == 0) { + if (rc == 0 && insert_key) { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; @@ -6707,15 +6551,10 @@ new_sub: m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; - if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) { m3->mc_ki[i]++; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - MDB_node *n2 = NODEPTR(mp, m3->mc_ki[i]); - if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); - } } } } @@ -6727,7 +6566,7 @@ new_sub: * DB are all zero size. */ if (do_sub) { - int xflags, new_dupdata; + int xflags; size_t ecount; put_sub: xdata.mv_size = 0; @@ -6740,38 +6579,27 @@ put_sub: xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; } - if (sub_root) - mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; - new_dupdata = (int)dkey.mv_size; /* converted, write the original data first */ if (dkey.mv_size) { rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); if (rc) goto bad_sub; - /* we've done our job */ - dkey.mv_size = 0; - } - if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2; - MDB_xcursor *mx = mc->mc_xcursor; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - int nkeys = NUMKEYS(mp); - - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[i] == mp) { - if (m2->mc_ki[i] == mc->mc_ki[i]) { - mdb_xcursor_init2(m2, mx, new_dupdata); - } else if (!insert_key && m2->mc_ki[i] < nkeys) { - MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]); - if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) { + mdb_xcursor_init1(m2, leaf); } } } + /* we've done our job */ + dkey.mv_size = 0; } ecount = mc->mc_xcursor->mx_db.md_entries; if (flags & MDB_APPENDDUP) @@ -6823,7 +6651,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) MDB_page *mp; int rc; - if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; if (!(mc->mc_flags & C_INITIALIZED)) @@ -6848,7 +6676,6 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (flags & MDB_NODUPDATA) { /* mdb_cursor_del0() will subtract the final entry */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); @@ -6868,25 +6695,17 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - /* fix other sub-DB cursors pointed at fake pages on this page */ + /* fix other sub-DB cursors pointed at this fake page */ for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[mc->mc_top] == mp) { - if (m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } else { - MDB_node *n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); - if (!(n2->mn_flags & F_SUBDATA)) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); - } - } + if (m2->mc_pg[mc->mc_top] == mp && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); } } mc->mc_db->md_entries--; + mc->mc_flags |= C_DEL; return rc; - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } /* otherwise fall thru and delete the sub-DB */ } @@ -6898,11 +6717,6 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) goto fail; } } - /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ - else if ((leaf->mn_flags ^ flags) & F_SUBDATA) { - rc = MDB_INCOMPATIBLE; - goto fail; - } /* add overflow pages to free list */ if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { @@ -7034,7 +6848,6 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, MDB_node *node; MDB_page *mp = mc->mc_pg[mc->mc_top]; MDB_page *ofp = NULL; /* overflow page */ - void *ndata; DKBUF; mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); @@ -7065,7 +6878,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, if (key != NULL) node_size += key->mv_size; if (IS_LEAF(mp)) { - mdb_cassert(mc, key && data); + mdb_cassert(mc, data); if (F_ISSET(flags, F_BIGDATA)) { /* Data already on overflow page. */ node_size += sizeof(pgno_t); @@ -7116,21 +6929,23 @@ update: memcpy(NODEKEY(node), key->mv_data, key->mv_size); if (IS_LEAF(mp)) { - ndata = NODEDATA(node); + mdb_cassert(mc, key); if (ofp == NULL) { if (F_ISSET(flags, F_BIGDATA)) - memcpy(ndata, data->mv_data, sizeof(pgno_t)); + memcpy(node->mn_data + key->mv_size, data->mv_data, + sizeof(pgno_t)); else if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = ndata; + data->mv_data = node->mn_data + key->mv_size; else - memcpy(ndata, data->mv_data, data->mv_size); + memcpy(node->mn_data + key->mv_size, data->mv_data, + data->mv_size); } else { - memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); - ndata = METADATA(ofp); + memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno, + sizeof(pgno_t)); if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = ndata; + data->mv_data = METADATA(ofp); else - memcpy(ndata, data->mv_data, data->mv_size); + memcpy(METADATA(ofp), data->mv_data, data->mv_size); } } @@ -7212,38 +7027,45 @@ mdb_node_shrink(MDB_page *mp, indx_t indx) MDB_node *node; MDB_page *sp, *xp; char *base; - indx_t delta, nsize, len, ptr; - int i; + int nsize, delta; + indx_t i, numkeys, ptr; node = NODEPTR(mp, indx); sp = (MDB_page *)NODEDATA(node); delta = SIZELEFT(sp); - nsize = NODEDSZ(node) - delta; + xp = (MDB_page *)((char *)sp + delta); - /* Prepare to shift upward, set len = length(subpage part to shift) */ + /* shift subpage upward */ if (IS_LEAF2(sp)) { - len = nsize; + nsize = NUMKEYS(sp) * sp->mp_pad; if (nsize & 1) return; /* do not make the node uneven-sized */ + memmove(METADATA(xp), METADATA(sp), nsize); } else { - xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ - for (i = NUMKEYS(sp); --i >= 0; ) + int i; + numkeys = NUMKEYS(sp); + for (i=numkeys-1; i>=0; i--) xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; - len = PAGEHDRSZ; } - sp->mp_upper = sp->mp_lower; - COPY_PGNO(sp->mp_pgno, mp->mp_pgno); - SETDSZ(node, nsize); + xp->mp_upper = sp->mp_lower; + xp->mp_lower = sp->mp_lower; + xp->mp_flags = sp->mp_flags; + xp->mp_pad = sp->mp_pad; + COPY_PGNO(xp->mp_pgno, mp->mp_pgno); - /* Shift <lower nodes...initial part of subpage> upward */ - base = (char *)mp + mp->mp_upper + PAGEBASE; - memmove(base + delta, base, (char *)sp + len - base); + nsize = NODEDSZ(node) - delta; + SETDSZ(node, nsize); + /* shift lower nodes upward */ ptr = mp->mp_ptrs[indx]; - for (i = NUMKEYS(mp); --i >= 0; ) { + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { if (mp->mp_ptrs[i] <= ptr) mp->mp_ptrs[i] += delta; } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node)); mp->mp_upper += delta; } @@ -7296,7 +7118,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_cursor.mc_flags = C_SUB; } else { MDB_page *fp = NODEDATA(node); - mx->mx_db.md_pad = 0; + mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad; mx->mx_db.md_flags = 0; mx->mx_db.md_depth = 1; mx->mx_db.md_branch_pages = 0; @@ -7318,45 +7140,13 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) } DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ + mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ #if UINT_MAX < SIZE_MAX if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) mx->mx_dbx.md_cmp = mdb_cmp_clong; #endif } - -/** Fixup a sorted-dups cursor due to underlying update. - * Sets up some fields that depend on the data from the main cursor. - * Almost the same as init1, but skips initialization steps if the - * xcursor had already been used. - * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. - * @param[in] src_mx The xcursor of an up-to-date cursor. - * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. - */ -static void -mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) -{ - MDB_xcursor *mx = mc->mc_xcursor; - - if (new_dupdata) { - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags |= C_INITIALIZED; - mx->mx_cursor.mc_ki[0] = 0; - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ -#if UINT_MAX < SIZE_MAX - mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; -#endif - } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { - return; - } - mx->mx_db = src_mx->mx_db; - mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; - DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, - mx->mx_db.md_root)); -} - /** Initialize a cursor for a given transaction and database. */ static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) @@ -7371,7 +7161,6 @@ mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) mc->mc_snum = 0; mc->mc_top = 0; mc->mc_pg[0] = 0; - mc->mc_ki[0] = 0; mc->mc_flags = 0; if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { mdb_tassert(txn, mx != NULL); @@ -7391,13 +7180,14 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) MDB_cursor *mc; size_t size = sizeof(MDB_cursor); - if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) + if (!ret || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; - if (txn->mt_flags & MDB_TXN_BLOCKED) + if (txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; - if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + /* Allow read access to the freelist */ + if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) return EINVAL; if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) @@ -7422,13 +7212,13 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { - if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) + if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi)) return EINVAL; if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) return EINVAL; - if (txn->mt_flags & MDB_TXN_BLOCKED) + if (txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); @@ -7447,7 +7237,7 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp) if (mc->mc_xcursor == NULL) return MDB_INCOMPATIBLE; - if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) + if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; if (!(mc->mc_flags & C_INITIALIZED)) @@ -7573,26 +7363,10 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) static void mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); -/** Perform \b act while tracking temporary cursor \b mn */ -#define WITH_CURSOR_TRACKING(mn, act) do { \ - MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ - if ((mn).mc_flags & C_SUB) { \ - dummy.mc_flags = C_INITIALIZED; \ - dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ - tracked = &dummy; \ - } else { \ - tracked = &(mn); \ - } \ - tracked->mc_next = *tp; \ - *tp = tracked; \ - { act; } \ - *tp = tracked->mc_next; \ -} while (0) - /** Move a node from csrc to cdst. */ static int -mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) +mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) { MDB_node *srcnode; MDB_val key, data; @@ -7644,7 +7418,6 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) data.mv_size = NODEDSZ(srcnode); data.mv_data = NODEDATA(srcnode); } - mn.mc_xcursor = NULL; if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { unsigned int snum = cdst->mc_snum; MDB_node *s2; @@ -7691,64 +7464,18 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mpd, *mps; - - mps = csrc->mc_pg[csrc->mc_top]; - /* If we're adding on the left, bump others up */ - if (fromleft) { - mpd = cdst->mc_pg[csrc->mc_top]; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) - continue; - if (m3 != cdst && - m3->mc_pg[csrc->mc_top] == mpd && - m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { - m3->mc_ki[csrc->mc_top]++; - } - if (m3 !=csrc && - m3->mc_pg[csrc->mc_top] == mps && - m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - m3->mc_ki[csrc->mc_top-1]++; - } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(mps)) { - MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } - } - } else - /* Adding on the right, bump others down */ - { - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) - continue; - if (m3->mc_pg[csrc->mc_top] == mps) { - if (!m3->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - m3->mc_ki[csrc->mc_top-1]--; - } else { - m3->mc_ki[csrc->mc_top]--; - } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(mps)) { - MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } - } + MDB_page *mp = csrc->mc_pg[csrc->mc_top]; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] == + csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; } } } @@ -7769,10 +7496,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) mdb_cursor_copy(csrc, &mn); mn.mc_snum--; mn.mc_top--; - /* We want mdb_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, - rc = mdb_update_key(&mn, &key)); - if (rc) + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) return rc; } if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { @@ -7800,10 +7524,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) mdb_cursor_copy(cdst, &mn); mn.mc_snum--; mn.mc_top--; - /* We want mdb_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, - rc = mdb_update_key(&mn, &key)); - if (rc) + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) return rc; } if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { @@ -7813,7 +7534,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) cdst->mc_ki[cdst->mc_top] = 0; rc = mdb_update_key(cdst, &nullkey); cdst->mc_ki[cdst->mc_top] = ix; - mdb_cassert(cdst, rc == MDB_SUCCESS); + mdb_cassert(csrc, rc == MDB_SUCCESS); } } @@ -7850,9 +7571,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) if ((rc = mdb_page_touch(cdst))) return rc; - /* get dst page again now that we've touched it. */ - pdst = cdst->mc_pg[cdst->mc_top]; - /* Move all nodes from src to dst. */ j = nkeys = NUMKEYS(pdst); @@ -7872,7 +7590,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) MDB_cursor mn; MDB_node *s2; mdb_cursor_copy(csrc, &mn); - mn.mc_xcursor = NULL; /* must find the lowest key below src */ rc = mdb_page_search_lowest(&mn); if (rc) @@ -7931,7 +7648,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = csrc->mc_dbi; - unsigned int top = csrc->mc_top; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (csrc->mc_flags & C_SUB) @@ -7940,19 +7656,9 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3 = m2; if (m3 == csrc) continue; if (m3->mc_snum < csrc->mc_snum) continue; - if (m3->mc_pg[top] == psrc) { - m3->mc_pg[top] = pdst; - m3->mc_ki[top] += nkeys; - m3->mc_ki[top-1] = cdst->mc_ki[top-1]; - } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && - m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { - m3->mc_ki[top-1]--; - } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(psrc)) { - MDB_node *node = NODEPTR(m3->mc_pg[top], m3->mc_ki[top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + if (m3->mc_pg[csrc->mc_top] == psrc) { + m3->mc_pg[csrc->mc_top] = pdst; + m3->mc_ki[csrc->mc_top] += nkeys; } } } @@ -7961,9 +7667,9 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) uint16_t depth = cdst->mc_db->md_depth; mdb_cursor_pop(cdst); rc = mdb_rebalance(cdst); - /* Did the tree height change? */ - if (depth != cdst->mc_db->md_depth) - snum += cdst->mc_db->md_depth - depth; + /* Did the tree shrink? */ + if (depth > cdst->mc_db->md_depth) + snum--; cdst->mc_snum = snum; cdst->mc_top = snum-1; } @@ -8002,24 +7708,18 @@ static int mdb_rebalance(MDB_cursor *mc) { MDB_node *node; - int rc, fromleft; - unsigned int ptop, minkeys, thresh; + int rc; + unsigned int ptop, minkeys; MDB_cursor mn; indx_t oldki; - if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { - minkeys = 2; - thresh = 1; - } else { - minkeys = 1; - thresh = FILL_THRESHOLD; - } + minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); - if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD && NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); @@ -8053,8 +7753,7 @@ mdb_rebalance(MDB_cursor *mc) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; - if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) - continue; + if (m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[0] == mp) { m3->mc_snum = 0; m3->mc_top = 0; @@ -8089,16 +7788,14 @@ mdb_rebalance(MDB_cursor *mc) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; - if (m3 == mc) continue; - if (!(m3->mc_flags & C_INITIALIZED)) - continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[0] == mp) { - for (i=0; i<mc->mc_db->md_depth; i++) { + m3->mc_snum--; + m3->mc_top--; + for (i=0; i<m3->mc_snum; i++) { m3->mc_pg[i] = m3->mc_pg[i+1]; m3->mc_ki[i] = m3->mc_ki[i+1]; } - m3->mc_snum--; - m3->mc_top--; } } } @@ -8135,7 +7832,6 @@ mdb_rebalance(MDB_cursor *mc) return rc; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); - fromleft = 0; } else { /* There is at least one neighbor to the left. */ @@ -8147,7 +7843,6 @@ mdb_rebalance(MDB_cursor *mc) return rc; mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; mc->mc_ki[mc->mc_top] = 0; - fromleft = 1; } DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", @@ -8158,21 +7853,19 @@ mdb_rebalance(MDB_cursor *mc) * move one key from it. Otherwise we should try to merge them. * (A branch page must never have less than 2 keys.) */ - if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { - rc = mdb_node_move(&mn, mc, fromleft); - if (fromleft) { - /* if we inserted on left, bump position up */ + minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top])); + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { + rc = mdb_node_move(&mn, mc); + if (mc->mc_ki[ptop]) { oldki++; } } else { - if (!fromleft) { + if (mc->mc_ki[ptop] == 0) { rc = mdb_page_merge(&mn, mc); } else { oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - /* We want mdb_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, - rc = mdb_page_merge(mc, &mn)); + rc = mdb_page_merge(mc, &mn); mdb_cursor_copy(&mn, mc); } mc->mc_flags &= ~C_EOF; @@ -8189,59 +7882,43 @@ mdb_cursor_del0(MDB_cursor *mc) MDB_page *mp; indx_t ki; unsigned int nkeys; - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; mdb_node_del(mc, mc->mc_db->md_pad); mc->mc_db->md_entries--; - { - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3 == mc || m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDB_DUPSORT) - m3->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; - } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } - } - } - } rc = mdb_rebalance(mc); if (rc == MDB_SUCCESS) { - /* DB is totally empty now, just bail out. - * Other cursors adjustments were already done - * by mdb_rebalance and aren't needed here. - */ - if (!mc->mc_snum) - return rc; + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; mp = mc->mc_pg[mc->mc_top]; nkeys = NUMKEYS(mp); + /* if mc points past last node in page, find next sibling */ + if (mc->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(mc, 1); + if (rc == MDB_NOTFOUND) { + mc->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + } + } + /* Adjust other cursors pointing to mp */ for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) continue; - if (m3->mc_snum < mc->mc_snum) + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= ki) { + m3->mc_flags |= C_DEL; + if (m3->mc_ki[mc->mc_top] > ki) + m3->mc_ki[mc->mc_top]--; + else if (mc->mc_db->md_flags & MDB_DUPSORT) + m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF; + } if (m3->mc_ki[mc->mc_top] >= nkeys) { rc = mdb_cursor_sibling(m3, 1); if (rc == MDB_NOTFOUND) { @@ -8263,10 +7940,10 @@ int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { - if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + if (!key || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; - if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { @@ -8359,27 +8036,20 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* Create a right sibling. */ if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) return rc; - rp->mp_pad = mp->mp_pad; DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); - /* Usually when splitting the root page, the cursor - * height is 1. But when called from mdb_update_key, - * the cursor height may be greater because it walks - * up the stack while finding the branch slot to update. - */ - if (mc->mc_top < 1) { + if (mc->mc_snum < 2) { if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) goto done; /* shift current top to make room for new parent */ - for (i=mc->mc_snum; i>0; i--) { - mc->mc_pg[i] = mc->mc_pg[i-1]; - mc->mc_ki[i] = mc->mc_ki[i-1]; - } + mc->mc_pg[1] = mc->mc_pg[0]; + mc->mc_ki[1] = mc->mc_ki[0]; mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); - new_root = mc->mc_db->md_depth++; + mc->mc_db->md_depth++; + new_root = 1; /* Add left (implicit) pointer. */ if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { @@ -8390,16 +8060,16 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mc->mc_db->md_depth--; goto done; } - mc->mc_snum++; - mc->mc_top++; + mc->mc_snum = 2; + mc->mc_top = 1; ptop = 0; } else { ptop = mc->mc_top-1; DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); } + mc->mc_flags |= C_SPLITTING; mdb_cursor_copy(mc, &mn); - mn.mc_xcursor = NULL; mn.mc_pg[mn.mc_top] = rp; mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; @@ -8449,6 +8119,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rp->mp_lower += sizeof(indx_t); rp->mp_upper -= ksize - sizeof(indx_t); mc->mc_ki[mc->mc_top] = x; + mc->mc_pg[mc->mc_top] = rp; } } else { int psize, nsize, k; @@ -8499,7 +8170,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno psize = 0; if (newindx <= split_indx || newindx >= nkeys) { i = 0; j = 1; - k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp); + k = newindx >= nkeys ? nkeys : split_indx+2; } else { i = nkeys; j = -1; k = split_indx-1; @@ -8541,18 +8212,21 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* Copy separator key to the parent. */ if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { - int snum = mc->mc_snum; mn.mc_snum--; mn.mc_top--; did_split = 1; - /* We want other splits to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, - rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); + rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); if (rc) goto done; /* root split? */ - if (mc->mc_snum > snum) { + if (mn.mc_snum == mc->mc_snum) { + mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; + mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; + mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; + mc->mc_snum++; + mc->mc_top++; ptop++; } /* Right page might now have changed parent. @@ -8578,6 +8252,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); mn.mc_top++; } + mc->mc_flags ^= C_SPLITTING; if (rc != MDB_SUCCESS) { goto done; } @@ -8647,26 +8322,12 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* reset back to original page */ if (newindx < split_indx) { mc->mc_pg[mc->mc_top] = mp; - } else { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. - */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; i<=ptop; i++) { - mc->mc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } + if (nflags & MDB_RESERVE) { + node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); } - } - if (nflags & MDB_RESERVE) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } - } else { - if (newindx >= split_indx) { + } else { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[ptop]++; /* Make sure mc_ki is still valid. @@ -8685,7 +8346,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; - nkeys = NUMKEYS(mp); + int fixup = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (mc->mc_flags & C_SUB) @@ -8696,17 +8357,16 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno continue; if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) continue; + if (m3->mc_flags & C_SPLITTING) + continue; if (new_root) { int k; - /* sub cursors may be on different DB */ - if (m3->mc_pg[0] != mp) - continue; /* root split */ - for (k=new_root; k>=0; k--) { + for (k=m3->mc_top; k>=0; k--) { m3->mc_ki[k+1] = m3->mc_ki[k]; m3->mc_pg[k+1] = m3->mc_pg[k]; } - if (m3->mc_ki[0] >= nkeys) { + if (m3->mc_ki[0] >= split_indx) { m3->mc_ki[0] = 1; } else { m3->mc_ki[0] = 0; @@ -8718,24 +8378,15 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= nkeys) { + if (m3->mc_ki[mc->mc_top] >= fixup) { m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= nkeys; - for (i=0; i<mc->mc_top; i++) { - m3->mc_ki[i] = mn.mc_ki[i]; - m3->mc_pg[i] = mn.mc_pg[i]; - } + m3->mc_ki[mc->mc_top] -= fixup; + m3->mc_ki[ptop] = mn.mc_ki[ptop]; } } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { m3->mc_ki[ptop]++; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(mp)) { - MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } } } DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); @@ -8754,23 +8405,15 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, { MDB_cursor mc; MDB_xcursor mx; - int rc; - if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + if (!key || !data || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; - if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) + if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) return EINVAL; - if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - mdb_cursor_init(&mc, txn, dbi, &mx); - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - rc = mdb_cursor_put(&mc, key, data, flags); - txn->mt_cursors[dbi] = mc.mc_next; - return rc; + return mdb_cursor_put(&mc, key, data, flags); } #ifndef MDB_WBUF @@ -8796,7 +8439,7 @@ typedef struct mdb_copy { } mdb_copy; /** Dedicated writer thread for compacting copy. */ -static THREAD_RET ESECT CALL_CONV +static THREAD_RET ESECT mdb_env_copythr(void *arg) { mdb_copy *my = arg; @@ -9073,7 +8716,7 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) my.mc_wlen[1] = 0; my.mc_olen[0] = 0; my.mc_olen[1] = 0; - my.mc_next_pgno = NUM_METAS; + my.mc_next_pgno = 2; my.mc_status = 0; my.mc_new = 1; my.mc_toggle = 0; @@ -9086,7 +8729,7 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) return rc; mp = (MDB_page *)my.mc_wbuf[0]; - memset(mp, 0, NUM_METAS * env->me_psize); + memset(mp, 0, 2*env->me_psize); mp->mp_pgno = 0; mp->mp_flags = P_META; mm = (MDB_meta *)METADATA(mp); @@ -9109,27 +8752,27 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) mdb_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) freecount += *(MDB_ID *)data.mv_data; - freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + - txn->mt_dbs[FREE_DBI].md_leaf_pages + - txn->mt_dbs[FREE_DBI].md_overflow_pages; + freecount += txn->mt_dbs[0].md_branch_pages + + txn->mt_dbs[0].md_leaf_pages + + txn->mt_dbs[0].md_overflow_pages; /* Set metapage 1 */ mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; - mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - if (mm->mm_last_pg > NUM_METAS-1) { - mm->mm_dbs[MAIN_DBI].md_root = mm->mm_last_pg; + mm->mm_dbs[1] = txn->mt_dbs[1]; + if (mm->mm_last_pg > 1) { + mm->mm_dbs[1].md_root = mm->mm_last_pg; mm->mm_txnid = 1; } else { - mm->mm_dbs[MAIN_DBI].md_root = P_INVALID; + mm->mm_dbs[1].md_root = P_INVALID; } } - my.mc_wlen[0] = env->me_psize * NUM_METAS; + my.mc_wlen[0] = env->me_psize * 2; my.mc_txn = txn; pthread_mutex_lock(&my.mc_mutex); while(my.mc_new) pthread_cond_wait(&my.mc_cond, &my.mc_mutex); pthread_mutex_unlock(&my.mc_mutex); - rc = mdb_env_cwalk(&my, &txn->mt_dbs[MAIN_DBI].md_root, 0); + rc = mdb_env_cwalk(&my, &txn->mt_dbs[1].md_root, 0); if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) rc = mdb_env_cthr_toggle(&my, 1); mdb_env_cthr_toggle(&my, -1); @@ -9157,9 +8800,9 @@ static int ESECT mdb_env_copyfd0(MDB_env *env, HANDLE fd) { MDB_txn *txn = NULL; - mdb_mutexref_t wmutex = NULL; + mdb_mutex_t *wmutex = NULL; int rc; - size_t wsize, w3; + size_t wsize; char *ptr; #ifdef _WIN32 DWORD len, w2; @@ -9179,10 +8822,10 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) if (env->me_txns) { /* We must start the actual read txn after blocking writers */ - mdb_txn_end(txn, MDB_END_RESET_TMP); + mdb_txn_reset0(txn, "reset-stage1"); /* Temporarily block writers until we snapshot the meta pages */ - wmutex = env->me_wmutex; + wmutex = MDB_MUTEX(env, w); if (LOCK_MUTEX(rc, env, wmutex)) goto leave; @@ -9193,7 +8836,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) } } - wsize = env->me_psize * NUM_METAS; + wsize = env->me_psize * 2; ptr = env->me_map; w2 = wsize; while (w2 > 0) { @@ -9218,15 +8861,15 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) if (rc) goto leave; - w3 = txn->mt_next_pgno * env->me_psize; + w2 = txn->mt_next_pgno * env->me_psize; { size_t fsize = 0; if ((rc = mdb_fsize(env->me_fd, &fsize))) goto leave; - if (w3 > fsize) - w3 = fsize; + if (w2 > fsize) + w2 = fsize; } - wsize = w3 - wsize; + wsize = w2 - wsize; while (wsize > 0) { if (wsize > MAX_WRITE) w2 = MAX_WRITE; @@ -9273,9 +8916,6 @@ mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) int rc, len; char *lpath; HANDLE newfd = INVALID_HANDLE_VALUE; -#ifdef _WIN32 - wchar_t *wpath; -#endif if (env->me_flags & MDB_NOSUBDIR) { lpath = (char *)path; @@ -9293,12 +8933,8 @@ mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) * already in the OS cache. */ #ifdef _WIN32 - rc = utf8_to_utf16(lpath, -1, &wpath, NULL); - if (rc) - goto leave; - newfd = CreateFileW(wpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + newfd = CreateFileA(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); - free(wpath); #else newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); #endif @@ -9343,7 +8979,7 @@ mdb_env_copy(MDB_env *env, const char *path) int ESECT mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) { - if (flag & ~CHANGEABLE) + if (flag & (env->me_map ? ~CHANGEABLE : ~(CHANGEABLE|CHANGELESS))) return EINVAL; if (onoff) env->me_flags |= flag; @@ -9358,7 +8994,7 @@ mdb_env_get_flags(MDB_env *env, unsigned int *arg) if (!env || !arg) return EINVAL; - *arg = env->me_flags & (CHANGEABLE|CHANGELESS); + *arg = env->me_flags; return MDB_SUCCESS; } @@ -9430,32 +9066,36 @@ mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) int ESECT mdb_env_stat(MDB_env *env, MDB_stat *arg) { - MDB_meta *meta; + int toggle; if (env == NULL || arg == NULL) return EINVAL; - meta = mdb_env_pick_meta(env); + toggle = mdb_env_pick_meta(env); - return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); + return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg); } int ESECT mdb_env_info(MDB_env *env, MDB_envinfo *arg) { - MDB_meta *meta; + int toggle; if (env == NULL || arg == NULL) return EINVAL; - meta = mdb_env_pick_meta(env); - arg->me_mapaddr = meta->mm_address; - arg->me_last_pgno = meta->mm_last_pg; - arg->me_last_txnid = meta->mm_txnid; - + toggle = mdb_env_pick_meta(env); + arg->me_mapaddr = env->me_metas[toggle]->mm_address; arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; - arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0; + + /* me_numreaders may be zero if this process never used any readers. Use + * the shared numreader count if it exists. + */ + arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : env->me_numreaders; + + arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; + arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; return MDB_SUCCESS; } @@ -9490,12 +9130,15 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db MDB_db dummy; int rc, dbflag, exact; unsigned int unused = 0, seq; - char *namedup; size_t len; - if (flags & ~VALID_FLAGS) + if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, FREE_DBI); + } + + if ((flags & VALID_FLAGS) != flags) return EINVAL; - if (txn->mt_flags & MDB_TXN_BLOCKED) + if (txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; /* main DB? */ @@ -9519,7 +9162,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db /* Is the DB already open? */ len = strlen(name); - for (i=CORE_DBS; i<txn->mt_numdbs; i++) { + for (i=2; i<txn->mt_numdbs; i++) { if (!txn->mt_dbxs[i].md_name.mv_size) { /* Remember this free slot */ if (!unused) unused = i; @@ -9541,7 +9184,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; /* Find the DB info */ - dbflag = DB_NEW|DB_VALID|DB_USRVALID; + dbflag = DB_NEW|DB_VALID; exact = 0; key.mv_size = len; key.mv_data = (void *)name; @@ -9550,18 +9193,10 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db if (rc == MDB_SUCCESS) { /* make sure this is actually a DB */ MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) + if (!(node->mn_flags & F_SUBDATA)) return MDB_INCOMPATIBLE; - } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { - return rc; - } - - /* Done here so we cannot fail after creating a new DB */ - if ((namedup = strdup(name)) == NULL) - return ENOMEM; - - if (rc) { - /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ + } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { + /* Create if requested */ data.mv_size = sizeof(MDB_db); data.mv_data = &dummy; memset(&dummy, 0, sizeof(dummy)); @@ -9571,12 +9206,10 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db dbflag |= DB_DIRTY; } - if (rc) { - free(namedup); - } else { - /* Got info, register DBI in this txn */ + /* OK, got info, add to table */ + if (rc == MDB_SUCCESS) { unsigned int slot = unused ? unused : txn->mt_numdbs; - txn->mt_dbxs[slot].md_name.mv_data = namedup; + txn->mt_dbxs[slot].md_name.mv_data = strdup(name); txn->mt_dbxs[slot].md_name.mv_size = len; txn->mt_dbxs[slot].md_rel = NULL; txn->mt_dbflags[slot] = dbflag; @@ -9597,13 +9230,12 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db return rc; } -int ESECT -mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) +int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) { - if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) + if (!arg || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; - if (txn->mt_flags & MDB_TXN_BLOCKED) + if (txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; if (txn->mt_dbflags[dbi] & DB_STALE) { @@ -9618,7 +9250,7 @@ mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) { char *ptr; - if (dbi < CORE_DBS || dbi >= env->me_maxdbs) + if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) return; ptr = env->me_dbxs[dbi].md_name.mv_data; /* If there was no name, this was already closed */ @@ -9634,7 +9266,7 @@ void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) { /* We could return the flags for the FREE_DBI too but what's the point? */ - if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; return MDB_SUCCESS; @@ -9657,10 +9289,8 @@ mdb_drop0(MDB_cursor *mc, int subs) MDB_cursor mx; unsigned int i; - /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. - * This also avoids any P_LEAF2 pages, which have no nodes. - */ - if (mc->mc_flags & C_SUB) + /* LEAF2 pages have no nodes, cannot have sub-DBs */ + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) mdb_cursor_pop(mc); mdb_cursor_copy(mc, &mx); @@ -9726,7 +9356,6 @@ done: } else if (rc == MDB_NOTFOUND) { rc = MDB_SUCCESS; } - mc->mc_flags &= ~C_INITIALIZED; return rc; } @@ -9735,13 +9364,13 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) MDB_cursor *mc, *m2; int rc; - if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + if ((unsigned)del > 1 || dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) return EACCES; - if (TXN_DBI_CHANGED(txn, dbi)) + if (dbi > MAIN_DBI && TXN_DBI_CHANGED(txn, dbi)) return MDB_BAD_DBI; rc = mdb_cursor_open(txn, dbi, &mc); @@ -9756,8 +9385,8 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) goto leave; /* Can't delete the main DB */ - if (del && dbi >= CORE_DBS) { - rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + if (del && dbi > MAIN_DBI) { + rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, 0); if (!rc) { txn->mt_dbflags[dbi] = DB_STALE; mdb_dbi_close(txn->mt_env, dbi); @@ -9783,7 +9412,7 @@ leave: int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { - if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; txn->mt_dbxs[dbi].md_cmp = cmp; @@ -9792,7 +9421,7 @@ int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { - if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; txn->mt_dbxs[dbi].md_dcmp = cmp; @@ -9801,7 +9430,7 @@ int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) { - if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; txn->mt_dbxs[dbi].md_rel = rel; @@ -9810,7 +9439,7 @@ int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) { - if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) + if (dbi == FREE_DBI || !TXN_DBI_EXIST(txn, dbi)) return EINVAL; txn->mt_dbxs[dbi].md_relctx = ctx; @@ -9912,10 +9541,9 @@ mdb_reader_check(MDB_env *env, int *dead) } /** As #mdb_reader_check(). rlocked = <caller locked the reader mutex>. */ -static int ESECT -mdb_reader_check0(MDB_env *env, int rlocked, int *dead) +static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead) { - mdb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex; + mdb_mutex_t *rmutex = rlocked ? NULL : MDB_MUTEX(env, r); unsigned int i, j, rdrs; MDB_reader *mr; MDB_PID_T *pids, pid; @@ -9966,28 +9594,29 @@ mdb_reader_check0(MDB_env *env, int rlocked, int *dead) #ifdef MDB_ROBUST_SUPPORTED /** Handle #LOCK_MUTEX0() failure. - * Try to repair the lock file if the mutex owner died. + * With #MDB_ROBUST, try to repair the lock file if the mutex owner died. * @param[in] env the environment handle * @param[in] mutex LOCK_MUTEX0() mutex * @param[in] rc LOCK_MUTEX0() error (nonzero) * @return 0 on success with the mutex locked, or an error code on failure. */ -static int ESECT -mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) +static int mdb_mutex_failed(MDB_env *env, mdb_mutex_t *mutex, int rc) { - int rlocked, rc2; - MDB_meta *meta; + int toggle, rlocked, rc2; +#ifndef _WIN32 + enum { WAIT_ABANDONED = EOWNERDEAD }; +#endif - if (rc == MDB_OWNERDEAD) { + if (rc == (int) WAIT_ABANDONED) { /* We own the mutex. Clean up after dead previous owner. */ rc = MDB_SUCCESS; - rlocked = (mutex == env->me_rmutex); + rlocked = (mutex == MDB_MUTEX(env, r)); if (!rlocked) { /* Keep mti_txnid updated, otherwise next writer can * overwrite data which latest meta page refers to. */ - meta = mdb_env_pick_meta(env); - env->me_txns->mti_txnid = meta->mm_txnid; + toggle = mdb_env_pick_meta(env); + env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid; /* env is hosed if the dead thread was ours */ if (env->me_txn) { env->me_flags |= MDB_FATAL_ERROR; @@ -9999,7 +9628,7 @@ mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) (rc ? "this process' env is hosed" : "recovering"))); rc2 = mdb_reader_check0(env, rlocked, NULL); if (rc2 == 0) - rc2 = mdb_mutex_consistent(mutex); + rc2 = pthread_mutex_consistent(mutex); if (rc || (rc = rc2)) { DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc))); UNLOCK_MUTEX(mutex); @@ -10015,24 +9644,3 @@ mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) } #endif /* MDB_ROBUST_SUPPORTED */ /** @} */ - -#if defined(_WIN32) -static int utf8_to_utf16(const char *src, int srcsize, wchar_t **dst, int *dstsize) -{ - int need; - wchar_t *result; - need = MultiByteToWideChar(CP_UTF8, 0, src, srcsize, NULL, 0); - if (need == 0xFFFD) - return EILSEQ; - if (need == 0) - return EINVAL; - result = malloc(sizeof(wchar_t) * need); - if (!result) - return ENOMEM; - MultiByteToWideChar(CP_UTF8, 0, src, srcsize, result, need); - if (dstsize) - *dstsize = need; - *dst = result; - return 0; -} -#endif /* defined(_WIN32) */ diff --git a/plugins/Dbx_mdb/src/lmdb/midl.c b/plugins/Dbx_mdb/src/lmdb/midl.c index 5c6d841a7a..88a3aff10c 100644 --- a/plugins/Dbx_mdb/src/lmdb/midl.c +++ b/plugins/Dbx_mdb/src/lmdb/midl.c @@ -3,7 +3,7 @@ /* $OpenLDAP$ */ /* This work is part of OpenLDAP Software <http://www.openldap.org/>. * - * Copyright 2000-2015 The OpenLDAP Foundation. + * Copyright 2000-2014 The OpenLDAP Foundation. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,15 +116,17 @@ void mdb_midl_free(MDB_IDL ids) free(ids-1); } -void mdb_midl_shrink( MDB_IDL *idp ) +int mdb_midl_shrink( MDB_IDL *idp ) { MDB_IDL ids = *idp; if (*(--ids) > MDB_IDL_UM_MAX && - (ids = realloc(ids, (MDB_IDL_UM_MAX+2) * sizeof(MDB_ID)))) + (ids = realloc(ids, (MDB_IDL_UM_MAX+1) * sizeof(MDB_ID)))) { *ids++ = MDB_IDL_UM_MAX; *idp = ids; + return 1; } + return 0; } static int mdb_midl_grow( MDB_IDL *idp, int num ) diff --git a/plugins/Dbx_mdb/src/lmdb/midl.h b/plugins/Dbx_mdb/src/lmdb/midl.h index 2331e78398..3d0e774406 100644 --- a/plugins/Dbx_mdb/src/lmdb/midl.h +++ b/plugins/Dbx_mdb/src/lmdb/midl.h @@ -11,7 +11,7 @@ /* $OpenLDAP$ */ /* This work is part of OpenLDAP Software <http://www.openldap.org/>. * - * Copyright 2000-2015 The OpenLDAP Foundation. + * Copyright 2000-2014 The OpenLDAP Foundation. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -27,6 +27,7 @@ #define _MDB_MIDL_H_ #include <stddef.h> +#include <stdint.h> #ifdef __cplusplus extern "C" { @@ -42,7 +43,7 @@ extern "C" { /** A generic unsigned ID number. These were entryIDs in back-bdb. * Preferably it should have the same size as a pointer. */ -typedef size_t MDB_ID; +typedef uint64_t MDB_ID; /** An IDL is an ID List, a sorted array of IDs. The first * element of the array is a counter for how many actual @@ -98,8 +99,9 @@ void mdb_midl_free(MDB_IDL ids); /** Shrink an IDL. * Return the IDL to the default size if it has grown larger. * @param[in,out] idp Address of the IDL to shrink. + * @return 0 on no change, non-zero if shrunk. */ -void mdb_midl_shrink(MDB_IDL *idp); +int mdb_midl_shrink(MDB_IDL *idp); /** Make room for num additional elements in an IDL. * @param[in,out] idp Address of the IDL. |