| author | George Hazan <ghazan@miranda.im> | 2020-12-04 16:00:34 +0300 |
|---|---|---|
| committer | George Hazan <ghazan@miranda.im> | 2020-12-04 16:00:34 +0300 |
| commit | 85370ab3a02ba4e167f20424d547d5e17f240352 (patch) | |
| tree | 80ae78814db3425562d7e343597cc06f498bb157 /libs/libmdbx/src/mdbx.c | |
| parent | e81c72fefdc67622976c0444cba59ba4d77fee70 (diff) | |
libmdbx: promoting to 0.9.2.0 release
Diffstat (limited to 'libs/libmdbx/src/mdbx.c')
-rw-r--r-- | libs/libmdbx/src/mdbx.c | 1959 |
1 file changed, 1201 insertions, 758 deletions
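Among the changes in this diff is a new commit variant, `mdbx_txn_commit_ex()`, which optionally reports per-phase timing through an `MDBX_commit_latency` argument (the old `mdbx_txn_commit()` becomes an inline wrapper). A minimal usage sketch, not part of the patch itself; the 16.16 fixed-point-seconds interpretation of the counters is an assumption inferred from `mdbx_osal_monotime_to_16dot16()` in the corresponding hunk:

```c
#include "mdbx.h"
#include <stdio.h>

/* Commit a write transaction and print where the commit time went.
 * Field names (preparation, gc, audit, write, sync, ending, whole)
 * match the latency-> assignments visible in the hunk further down. */
static int commit_with_timing(MDBX_txn *txn) {
  MDBX_commit_latency latency;
  int rc = mdbx_txn_commit_ex(txn, &latency);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* assumption: each counter is a 16.16 fixed-point number of seconds */
  printf("gc %.3fs, write %.3fs, sync %.3fs, whole %.3fs\n",
         latency.gc / 65536.0, latency.write / 65536.0,
         latency.sync / 65536.0, latency.whole / 65536.0);
  return MDBX_SUCCESS;
}
```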
diff --git a/libs/libmdbx/src/mdbx.c b/libs/libmdbx/src/mdbx.c index 2fa4734952..3bcb83b536 100644 --- a/libs/libmdbx/src/mdbx.c +++ b/libs/libmdbx/src/mdbx.c @@ -12,11 +12,16 @@ * <http://www.OpenLDAP.org/license.html>. */ #define MDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 3b5677a6062b714f1e138b0066c5590ee3c9ebf3bf8cfa3bb9503515ea0d1f02_v0_9_1_18_g1d31ebdc1c +#define MDBX_BUILD_SOURCERY 47492323531afee427a3de6ddaeae26eed45bfd1b52d92fd121a5a13a9747dbb_v0_9_2_0_g092ab09 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif +#define LIBMDBX_INTERNALS +#ifdef MDBX_TOOLS +#define MDBX_DEPRECATED +#endif /* MDBX_TOOLS */ + /* *INDENT-OFF* */ /* clang-format off */ @@ -97,11 +102,6 @@ #pragma warning(disable : 4505) /* unreferenced local function has been removed */ #endif /* _MSC_VER (warnings) */ -#if defined(MDBX_TOOLS) -#undef MDBX_DEPRECATED -#define MDBX_DEPRECATED -#endif /* MDBX_TOOLS */ - #include "mdbx.h" /* * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> @@ -817,7 +817,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) #ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +/* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ #if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ @@ -993,6 +993,35 @@ typedef union MDBX_srwlock { #ifdef __cplusplus extern void mdbx_osal_jitter(bool tiny); #else + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#include <cstdatomic> +#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include <stdatomic.h> +#elif defined(__GNUC__) || defined(__clang__) +/* LY: nothing required */ +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' */ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include <libkern/OSAtomic.h> +#else +#error FIXME atomic-ops +#endif + /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ @@ -1034,8 +1063,8 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) || __has_extension(cxx_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__ATOMIC_SEQ_CST) __atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__clang__) || defined(__GNUC__) @@ -1088,8 +1117,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #if defined(__linux__) || defined(__gnu_linux__) MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; 
-MDBX_INTERNAL_VAR bool - mdbx_RunningOnWSL /* Windows Subsystem for Linux is mad and trouble-full */; +MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef mdbx_strdup @@ -1160,7 +1188,8 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4 + MDBX_OPEN_COPY = 4, + MDBX_OPEN_DELETE = 5 }; MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, @@ -1169,7 +1198,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, mdbx_mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 @@ -1429,32 +1460,6 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; #endif /* Windows */ -/*----------------------------------------------------------------------------*/ -/* Atomics */ - -#if !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) && \ - (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ - !(defined(__GNUC__) || defined(__clang__))) -#include <stdatomic.h> -#elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ -#elif defined(_MSC_VER) -#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ -#pragma warning(disable : 4133) /* 'function': incompatible types - from \ - 'size_t' to 'LONGLONG' */ -#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ - 'std::size_t', possible loss of data */ -#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ - 'long', possible loss of data */ -#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) -#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) -#elif defined(__APPLE__) -#include <libkern/OSAtomic.h> -#else -#error FIXME atomic-ops -#endif - #endif /* !__cplusplus */ /*----------------------------------------------------------------------------*/ @@ -1890,7 +1895,7 @@ typedef struct MDBX_db { pgno_t md_overflow_pages; /* number of overflow pages */ uint64_t md_seq; /* table sequence counter */ uint64_t md_entries; /* number of data items */ - uint64_t md_mod_txnid; /* txnid of last commited modification */ + uint64_t md_mod_txnid; /* txnid of last committed modification */ } MDBX_db; /* database size-related parameters */ @@ -1974,7 +1979,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ + uint64_t mp_txnid; /* txnid that committed this page */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -2222,7 +2227,7 @@ typedef struct MDBX_lockinfo { #if defined(_WIN32) || defined(_WIN64) #define MAX_MAPSIZE32 UINT32_C(0x38000000) #else -#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#define MAX_MAPSIZE32 UINT32_C(0x7f000000) #endif #define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) @@ -2381,8 +2386,6 @@ struct MDBX_txn { MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; - /* In write txns, array of cursors for each DB */ - 
MDBX_cursor **mt_cursors; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2409,6 +2412,8 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + /* In write txns, array of cursors for each DB */ + MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ pgno_t loose_refund_wl /* FIXME: describe */; @@ -2546,7 +2551,7 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck - unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ @@ -2556,7 +2561,7 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_path; /* path to the DB files */ + char *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ @@ -2832,7 +2837,7 @@ static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) /* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_LMDB_ERRCODE + 10) +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ @@ -3064,7 +3069,7 @@ static __maybe_unused void static_checks(void) { /*------------------------------------------------------------------------------ - * Internal inlines */ + * Internal inline functions */ MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); @@ -3806,12 +3811,24 @@ static __always_inline void atomic_yield(void) { #if MDBX_64BIT_CAS static __always_inline bool atomic_cas64(volatile uint64_t *p, uint64_t c, uint64_t v) { -#if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_LLONG_LOCK_FREE) +#if !defined(__STDC_NO_ATOMICS__) && \ + (defined(ATOMIC_VAR_INIT) || defined(ATOMIC_LLONG_LOCK_FREE) || \ + __has_extension(c_atomic)) STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); -#ifndef __COVERITY__ - STATIC_ASSERT(atomic_is_lock_free(p)); -#endif /* Workaround for Coverity */ - return atomic_compare_exchange_strong((_Atomic uint64_t *)p, &c, v); +#ifdef ATOMIC_LLONG_LOCK_FREE + STATIC_ASSERT(ATOMIC_LLONG_LOCK_FREE > 0); +#if ATOMIC_LLONG_LOCK_FREE < 2 + assert(atomic_is_lock_free(p)); +#endif +#else + assert(atomic_is_lock_free(p)); +#endif +#ifdef __clang__ + STATIC_ASSERT(sizeof(_Atomic uint64_t) == sizeof(uint64_t)); + return atomic_compare_exchange_strong((_Atomic volatile uint64_t *)p, &c, v); +#else + return atomic_compare_exchange_strong(p, &c, v); +#endif #elif defined(__GNUC__) || defined(__clang__) return __sync_bool_compare_and_swap(p, c, v); #elif defined(_MSC_VER) @@ -3827,12 +3844,24 @@ static __always_inline bool atomic_cas64(volatile uint64_t *p, uint64_t c, static __always_inline bool atomic_cas32(volatile uint32_t *p, uint32_t c, uint32_t v) { -#if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) +#if !defined(__STDC_NO_ATOMICS__) && \ + (defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) || \ + 
__has_extension(c_atomic)) STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); -#ifndef __COVERITY__ - STATIC_ASSERT(atomic_is_lock_free(p)); -#endif /* Workaround for Coverity */ - return atomic_compare_exchange_strong((_Atomic uint32_t *)p, &c, v); +#ifdef ATOMIC_INT_LOCK_FREE + STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); +#if ATOMIC_INT_LOCK_FREE < 2 + assert(atomic_is_lock_free(p)); +#endif +#else + assert(atomic_is_lock_free(p)); +#endif +#ifdef __clang__ + STATIC_ASSERT(sizeof(_Atomic uint32_t) == sizeof(uint32_t)); + return atomic_compare_exchange_strong((_Atomic volatile uint32_t *)p, &c, v); +#else + return atomic_compare_exchange_strong(p, &c, v); +#endif #elif defined(__GNUC__) || defined(__clang__) return __sync_bool_compare_and_swap(p, c, v); #elif defined(_MSC_VER) @@ -3846,12 +3875,24 @@ static __always_inline bool atomic_cas32(volatile uint32_t *p, uint32_t c, } static __always_inline uint32_t atomic_add32(volatile uint32_t *p, uint32_t v) { -#if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) +#if !defined(__STDC_NO_ATOMICS__) && \ + (defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) || \ + __has_extension(c_atomic)) STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); -#ifndef __COVERITY__ - STATIC_ASSERT(atomic_is_lock_free(p)); -#endif /* Workaround for Coverity */ - return atomic_fetch_add((_Atomic uint32_t *)p, v); +#ifdef ATOMIC_INT_LOCK_FREE + STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); +#if ATOMIC_INT_LOCK_FREE < 2 + assert(atomic_is_lock_free(p)); +#endif +#else + assert(atomic_is_lock_free(p)); +#endif +#ifdef __clang__ + STATIC_ASSERT(sizeof(_Atomic uint32_t) == sizeof(uint32_t)); + return atomic_fetch_add((_Atomic volatile uint32_t *)p, v); +#else + return atomic_fetch_add(p, v); +#endif #elif defined(__GNUC__) || defined(__clang__) return __sync_fetch_and_add(p, v); #elif defined(_MSC_VER) @@ -6128,7 +6169,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode); static int __must_check_result mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **mp, int *lvl, - const txnid_t pp_txnid); + txnid_t pp_txnid); static int __must_check_result mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, int flags); @@ -6204,8 +6245,9 @@ static int __must_check_result mdbx_cursor_del0(MDBX_cursor *mc); static int __must_check_result mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, const MDBX_val *data, unsigned flags); -static int __must_check_result mdbx_cursor_sibling(MDBX_cursor *mc, - int move_right); +#define SIBLING_LEFT 0 +#define SIBLING_RIGHT 2 +static int __must_check_result mdbx_cursor_sibling(MDBX_cursor *mc, int dir); static int __must_check_result mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op); @@ -6229,7 +6271,7 @@ static int __must_check_result mdbx_xcursor_init1(MDBX_cursor *mc, static int __must_check_result mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, bool new_dupdata); -static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); +static void cursor_copy_internal(const MDBX_cursor *csrc, MDBX_cursor *cdst); static int __must_check_result mdbx_drop0(MDBX_cursor *mc, int subs); static int __must_check_result mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); @@ -6573,7 +6615,7 @@ static __maybe_unused void mdbx_page_list(MDBX_page *mp) { /*----------------------------------------------------------------------------*/ -/* Check if there is an inited xcursor, so XCURSOR_REFRESH() is proper */ +/* Check if there is an initialized xcursor, so XCURSOR_REFRESH() is proper */ #define 
XCURSOR_INITED(mc) \ ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) @@ -6588,16 +6630,26 @@ static __maybe_unused void mdbx_page_list(MDBX_page *mp) { (mc)->mc_xcursor->mx_cursor.mc_pg[0] = node_data(xr_node); \ } while (0) +static __maybe_unused bool cursor_is_tracked(const MDBX_cursor *mc) { + for (MDBX_cursor *scan = mc->mc_txn->tw.cursors[mc->mc_dbi]; scan; + scan = scan->mc_next) + if (mc == ((mc->mc_flags & C_SUB) ? &scan->mc_xcursor->mx_cursor : scan)) + return true; + return false; +} + /* Perform act while tracking temporary cursor mn */ #define WITH_CURSOR_TRACKING(mn, act) \ do { \ mdbx_cassert(&(mn), \ - mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ + mn.mc_txn->tw.cursors != NULL /* must be not rdonly txt */); \ + mdbx_cassert(&(mn), !cursor_is_tracked(&(mn))); \ MDBX_cursor mc_dummy; \ - MDBX_cursor **tracking_head = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + MDBX_cursor **tracking_head = &(mn).mc_txn->tw.cursors[mn.mc_dbi]; \ MDBX_cursor *tracked = &(mn); \ if ((mn).mc_flags & C_SUB) { \ mc_dummy.mc_flags = C_INITIALIZED; \ + mc_dummy.mc_top = 0; \ mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn); \ tracked = &mc_dummy; \ } \ @@ -7155,7 +7207,7 @@ static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) { MDBX_txn *txn = mc->mc_txn; MDBX_cursor *m3, *m0 = mc; MDBX_xcursor *mx; - MDBX_page *dp, *mp; + MDBX_page *mp; unsigned i, j; int rc = MDBX_SUCCESS; @@ -7180,7 +7232,7 @@ static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) { } } mc = mc->mc_next; - for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) + for (; !mc || mc == m0; mc = txn->tw.cursors[--i]) if (i == 0) goto mark_done; } @@ -7193,11 +7245,8 @@ mark_done: pgno_t pgno = txn->mt_dbs[i].md_root; if (pgno == P_INVALID) continue; - int level; - if (unlikely((rc = mdbx_page_get(m0, pgno, &dp, &level, - txn->mt_txnid)) != MDBX_SUCCESS)) - break; - if ((dp->mp_flags & Mask) == pflags && level <= 1) + MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno); + if (dp && (dp->mp_flags & Mask) == pflags) dp->mp_flags ^= P_KEEP; } } @@ -7652,7 +7701,7 @@ static __always_inline __maybe_unused int ignore_enosys(int err) { #endif /* defined(_WIN32) || defined(_WIN64) */ /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ -static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset, +static __cold int mdbx_set_readahead(MDBX_env *env, const size_t offset, const size_t length, const bool enable) { assert(length > 0); mdbx_notice("readahead %s %u..%u", enable ? 
"ON" : "OFF", @@ -7729,18 +7778,18 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); + const size_t prev_size = env->me_dxb_mmap.current; + const size_t prev_limit = env->me_dxb_mmap.limit; + const void *const prev_addr = env->me_map; mdbx_verbose("resize datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR, - env->me_dxb_mmap.current, size_bytes, env->me_dxb_mmap.limit, - limit_bytes); + prev_size, size_bytes, prev_limit, limit_bytes); mdbx_assert(env, limit_bytes >= size_bytes); mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno); mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno); - const size_t prev_limit = env->me_dxb_mmap.limit; - const void *const prev_addr = env->me_map; #if defined(_WIN32) || defined(_WIN64) /* Acquire guard in exclusive mode for: @@ -7785,9 +7834,11 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, goto bailout; if (limit_bytes != env->me_dxb_mmap.limit && env->me_lck && !implicit) { - rc = mdbx_rdt_lock(env) /* lock readers table until remap done */; - if (unlikely(rc != MDBX_SUCCESS)) + int err = mdbx_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; goto bailout; + } /* looking for readers from this process */ MDBX_lockinfo *const lck = env->me_lck; @@ -7807,7 +7858,6 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #endif /* ! Windows */ - const size_t prev_size = env->me_dxb_mmap.current; if (size_bytes < prev_size) { mdbx_notice("resize-MADV_%s %u..%u", (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", @@ -7846,7 +7896,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes, mapping_can_be_moved); if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_NORDAHEAD) == 0) { - const int readahead = mdbx_is_readahead_reasonable(size_bytes, 0); + const int readahead = + mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size); if (readahead == MDBX_RESULT_FALSE) rc = mdbx_set_readahead( env, 0, (size_bytes > prev_size) ? size_bytes : prev_size, false); @@ -7889,14 +7940,12 @@ bailout: mdbx_error("failed resize datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - env->me_dxb_mmap.current, size_bytes, env->me_dxb_mmap.limit, - limit_bytes, rc); + prev_size, size_bytes, prev_limit, limit_bytes, rc); } else { mdbx_warning("unable resize datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - env->me_dxb_mmap.current, size_bytes, env->me_dxb_mmap.limit, - limit_bytes, rc); + prev_size, size_bytes, prev_limit, limit_bytes, rc); } if (!env->me_dxb_mmap.address) { env->me_flags |= MDBX_FATAL_ERROR; @@ -8054,7 +8103,7 @@ __hot static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, if (unlikely(mc->mc_flags & C_RECLAIMING)) { /* If mc is updating the GC, then the retired-list cannot play * catch-up with itself by growing while trying to save it. 
*/ - flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE | MDBX_LIFORECLAIM); + flags &= ~MDBX_ALLOC_GC; } else if (unlikely(txn->mt_dbs[FREE_DBI].md_entries == 0)) { /* avoid (recursive) search inside empty tree and while tree is updating, * https://github.com/erthink/libmdbx/issues/31 */ @@ -8096,12 +8145,6 @@ skip_cache: const unsigned wanna_range = num - 1; while (true) { /* hsr-kick retry loop */ - /* If our dirty list is already full, we can't do anything */ - if (unlikely(txn->tw.dirtyroom == 0)) { - rc = MDBX_TXN_FULL; - goto fail; - } - MDBX_cursor_couple recur; for (MDBX_cursor_op op = MDBX_FIRST;; op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) { @@ -8142,6 +8185,11 @@ skip_cache: } if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ + if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) && + !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)) { + /* If our dirty list is already full, we can't touch GC */ + flags &= ~MDBX_ALLOC_GC; + } if (unlikely(!(flags & MDBX_ALLOC_GC))) break /* reclaiming is prohibited for now */; @@ -8242,7 +8290,7 @@ skip_cache: } } - /* Append PNL from GC record to me_reclaimed_pglist */ + /* Append PNL from GC record to tw.reclaimed_pglist */ mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); pgno_t *gc_pnl = (pgno_t *)data.iov_base; mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); @@ -8252,6 +8300,24 @@ skip_cache: goto fail; } const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); + /* TODO: provide a user-configurable threshold */ + const unsigned threshold_2_stop_gc_reclaiming = MDBX_PNL_MAX / 4; + if (unlikely(gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > + threshold_2_stop_gc_reclaiming) && + (pgno_add(txn->mt_next_pgno, num) <= txn->mt_geo.upper || + gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >= + MDBX_PNL_MAX / 16 * 15)) { + /* Stop reclaiming to avoid overflow the page list. + * This is a rare case while search for a continuously multi-page region + * in a large database. 
https://github.com/erthink/libmdbx/issues/123 */ + flags &= ~MDBX_ALLOC_GC; + if (unlikely((flags & MDBX_ALLOC_ALL) == 0)) { + /* Oh, we can't do anything */ + rc = MDBX_TXN_FULL; + goto fail; + } + break; + } rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -8474,7 +8540,7 @@ done: mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); mdbx_tassert(txn, pgno < txn->mt_next_pgno); mdbx_tassert(txn, pgno == re_list[range_begin]); - /* Cutoff allocated pages from me_reclaimed_pglist */ + /* Cutoff allocated pages from tw.reclaimed_pglist */ #if MDBX_PNL_ASCENDING for (unsigned i = range_begin + num; i <= re_len;) re_list[range_begin++] = re_list[i++]; @@ -8666,7 +8732,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { done: /* Adjust cursors pointing to mp */ mc->mc_pg[mc->mc_top] = np; - m2 = txn->mt_cursors[mc->mc_dbi]; + m2 = txn->tw.cursors[mc->mc_dbi]; if (mc->mc_flags & C_SUB) { for (; m2; m2 = m2->mc_next) { m3 = &m2->mc_xcursor->mx_cursor; @@ -8818,43 +8884,42 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { return mdbx_env_sync_internal(env, force, nonblock); } -__cold int mdbx_env_sync(MDBX_env *env) { - return mdbx_env_sync_ex(env, true, false); -} +__cold int mdbx_env_sync(MDBX_env *env) { return __inline_mdbx_env_sync(env); } __cold int mdbx_env_sync_poll(MDBX_env *env) { - return mdbx_env_sync_ex(env, false, true); + return __inline_mdbx_env_sync_poll(env); } /* Back up parent txn's cursors, then grab the originals for tracking */ -static int mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { - MDBX_cursor *mc, *bk; - MDBX_xcursor *mx; - - for (int i = src->mt_numdbs; --i >= 0;) { - dst->mt_cursors[i] = NULL; - if ((mc = src->mt_cursors[i]) != NULL) { - size_t size = sizeof(MDBX_cursor); - if (mc->mc_xcursor) - size += sizeof(MDBX_xcursor); - for (; mc; mc = bk->mc_next) { +static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { + for (int i = parent->mt_numdbs; --i >= 0;) { + nested->tw.cursors[i] = NULL; + MDBX_cursor *mc = parent->tw.cursors[i]; + if (mc != NULL) { + size_t size = mc->mc_xcursor ? sizeof(MDBX_cursor) + sizeof(MDBX_xcursor) + : sizeof(MDBX_cursor); + for (MDBX_cursor *bk; mc; mc = bk->mc_next) { + bk = mc; + if (mc->mc_signature != MDBX_MC_LIVE) + continue; bk = mdbx_malloc(size); if (unlikely(!bk)) return MDBX_ENOMEM; *bk = *mc; mc->mc_backup = bk; - mc->mc_db = &dst->mt_dbs[i]; /* Kill pointers into src to reduce abuse: The * user may not use mc until dst ends. But we need a valid * txn pointer here for cursor fixups to keep working. */ - mc->mc_txn = dst; - mc->mc_dbistate = &dst->mt_dbistate[i]; - if ((mx = mc->mc_xcursor) != NULL) { + mc->mc_txn = nested; + mc->mc_db = &nested->mt_dbs[i]; + mc->mc_dbistate = &nested->mt_dbistate[i]; + MDBX_xcursor *mx = mc->mc_xcursor; + if (mx != NULL) { *(MDBX_xcursor *)(bk + 1) = *mx; - mx->mx_cursor.mc_txn = dst; + mx->mx_cursor.mc_txn = nested; } - mc->mc_next = dst->mt_cursors[i]; - dst->mt_cursors[i] = mc; + mc->mc_next = nested->tw.cursors[i]; + nested->tw.cursors[i] = mc; } } } @@ -8867,47 +8932,57 @@ static int mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { * [in] merge true to keep changes to parent cursors, false to revert. * * Returns 0 on success, non-zero on failure. 
*/ -static void mdbx_cursors_eot(MDBX_txn *txn, unsigned merge) { - MDBX_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; - MDBX_xcursor *mx; - int i; - - for (i = txn->mt_numdbs; --i >= 0;) { - for (mc = cursors[i]; mc; mc = next) { - unsigned stage = mc->mc_signature; - mdbx_ensure(txn->mt_env, - stage == MDBX_MC_LIVE || stage == MDBX_MC_WAIT4EOT); +static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { + mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + for (int i = txn->mt_numdbs; --i >= 0;) { + MDBX_cursor *next, *mc = txn->tw.cursors[i]; + if (!mc) + continue; + txn->tw.cursors[i] = NULL; + do { + const unsigned stage = mc->mc_signature; + MDBX_cursor *bk = mc->mc_backup; next = mc->mc_next; - mdbx_tassert(txn, !next || next->mc_signature == MDBX_MC_LIVE || - next->mc_signature == MDBX_MC_WAIT4EOT); - if ((bk = mc->mc_backup) != NULL) { - if (merge) { - /* Commit changes to parent txn */ + mdbx_ensure(txn->mt_env, + stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); + mdbx_cassert(mc, mc->mc_dbi == (unsigned)i); + if (bk) { + MDBX_xcursor *mx = mc->mc_xcursor; + mdbx_cassert(mc, mx == bk->mc_xcursor); + mdbx_tassert(txn, txn->mt_parent != NULL); + mdbx_ensure(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) + mc->mc_signature = stage /* Promote closed state to parent txn */; + else if (merge) { + /* Preserve changes from nested to parent txn */ mc->mc_next = bk->mc_next; mc->mc_backup = bk->mc_backup; mc->mc_txn = bk->mc_txn; + *bk->mc_db = *mc->mc_db; mc->mc_db = bk->mc_db; + *bk->mc_dbistate = *mc->mc_dbistate; mc->mc_dbistate = bk->mc_dbistate; - if ((mx = mc->mc_xcursor) != NULL) + if (mx) { + if (mx != bk->mc_xcursor) { + *bk->mc_xcursor = *mx; + mx = bk->mc_xcursor; + } mx->mx_cursor.mc_txn = bk->mc_txn; + } } else { - /* Abort nested txn */ + /* Restore from backup, i.e. 
rollback/abort nested txn */ *mc = *bk; - if ((mx = mc->mc_xcursor) != NULL) + if (mx) *mx = *(MDBX_xcursor *)(bk + 1); } bk->mc_signature = 0; mdbx_free(bk); - } - if (stage == MDBX_MC_WAIT4EOT) { - mc->mc_signature = 0; - mdbx_free(mc); } else { - mc->mc_signature = MDBX_MC_READY4CLOSE; + mdbx_ensure(txn->mt_env, stage == MDBX_MC_LIVE); + mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; mc->mc_flags = 0 /* reset C_UNTRACK */; } - } - cursors[i] = NULL; + } while ((mc = next) != NULL); } } @@ -9471,7 +9546,7 @@ int mdbx_txn_renew(MDBX_txn *txn) { int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret) { - return mdbx_txn_begin_ex(env, parent, flags, ret, nullptr); + return __inline_mdbx_txn_begin(env, parent, flags, ret); } int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) { @@ -9552,7 +9627,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent) { mdbx_tassert(txn, mdbx_dirtylist_check(parent)); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = parent->mt_dbiseqs; txn->tw.dirtylist = mdbx_malloc(sizeof(MDBX_DP) * (MDBX_DPL_TXNFULL + 1)); txn->tw.reclaimed_pglist = @@ -9596,7 +9671,8 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); /* Copy parent's mt_dbistate, but clear DB_NEW */ for (unsigned i = 0; i < txn->mt_numdbs; i++) - txn->mt_dbistate[i] = parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT); + txn->mt_dbistate[i] = + parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); mdbx_tassert(parent, parent->mt_parent || parent->tw.dirtyroom + parent->tw.dirtylist->length == @@ -9773,6 +9849,7 @@ int mdbx_txn_flags(const MDBX_txn *txn) { /* Export or close DBI handles opened in this txn. */ static void mdbx_dbis_update(MDBX_txn *txn, int keep) { + mdbx_tassert(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); MDBX_dbi n = txn->mt_numdbs; if (n) { bool locked = false; @@ -9874,10 +9951,8 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { if (txn == env->me_txn0) mdbx_txn_valgrind(env, nullptr); #endif - /* Export or close DBI handles created in this txn */ - mdbx_dbis_update(txn, mode & MDBX_END_UPDATE); if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ - mdbx_cursors_eot(txn, 0); + mdbx_cursors_eot(txn, false); if (!(env->me_flags & MDBX_WRITEMAP)) mdbx_dlist_free(txn); @@ -9886,17 +9961,20 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { env->me_txn = txn->mt_parent; if (txn == env->me_txn0) { mdbx_assert(env, txn->mt_parent == NULL); + /* Export or close DBI handles created in this txn */ + mdbx_dbis_update(txn, mode & MDBX_END_UPDATE); mdbx_pnl_shrink(&txn->tw.retired_pages); mdbx_pnl_shrink(&txn->tw.reclaimed_pglist); /* The writer mutex was locked in mdbx_txn_begin. 
*/ mdbx_txn_unlock(env); } else { mdbx_assert(env, txn->mt_parent != NULL); + MDBX_txn *const parent = txn->mt_parent; + mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); + mdbx_assert(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); mdbx_assert(env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); - MDBX_txn *const parent = txn->mt_parent; - env->me_txn->mt_child = NULL; - env->me_txn->mt_flags &= ~MDBX_TXN_HAS_CHILD; mdbx_pnl_free(txn->tw.reclaimed_pglist); mdbx_pnl_free(txn->tw.spill_pages); @@ -9917,6 +9995,8 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { } mdbx_free(txn->tw.dirtylist); + parent->mt_child = NULL; + parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { @@ -10088,7 +10168,7 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; } } - rc = mdbx_cursor_sibling(&cx.outer, 1); + rc = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); } mdbx_tassert(txn, rc == MDBX_NOTFOUND); } @@ -10200,8 +10280,8 @@ static int mdbx_update_gc(MDBX_txn *txn) { goto bailout_notracking; couple.outer.mc_flags |= C_RECLAIMING; - couple.outer.mc_next = txn->mt_cursors[FREE_DBI]; - txn->mt_cursors[FREE_DBI] = &couple.outer; + couple.outer.mc_next = txn->tw.cursors[FREE_DBI]; + txn->tw.cursors[FREE_DBI] = &couple.outer; retry: ++loop; @@ -10233,8 +10313,10 @@ retry_noaccount: mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); - if (txn->tw.lifo_reclaimed) { - if (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { + if (lifo) { + if (cleaned_gc_slot < (txn->tw.lifo_reclaimed + ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)) { settled = 0; cleaned_gc_slot = 0; reused_gc_slot = 0; @@ -10265,7 +10347,7 @@ retry_noaccount: } } else { /* If using records from GC which we have not yet deleted, - * now delete them and any we reserved for me_reclaimed_pglist. */ + * now delete them and any we reserved for tw.reclaimed_pglist. */ while (cleaned_gc_id <= txn->tw.last_reclaimed) { gc_rid = cleaned_gc_id; settled = 0; @@ -10325,13 +10407,13 @@ retry_noaccount: /* handle loose pages - put ones into the reclaimed- or retired-list */ if (txn->tw.loose_pages) { - /* Return loose page numbers to me_reclaimed_pglist, + /* Return loose page numbers to tw.reclaimed_pglist, * though usually none are left at this point. * The pages themselves remain in dirtylist. */ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { if (txn->tw.loose_count > 0) { /* Put loose page numbers in tw.retired_pages, - * since unable to return them to me_reclaimed_pglist. */ + * since unable to return them to tw.reclaimed_pglist. */ if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, txn->tw.loose_count)) != 0)) goto bailout; @@ -10479,9 +10561,12 @@ retry_noaccount: env->me_maxgc_ov1page) { /* LY: need just a txn-id for save page list. 
*/ - couple.outer.mc_flags &= ~C_RECLAIMING; bool need_cleanup = false; + txnid_t snap_oldest; + retry_rid: + couple.outer.mc_flags &= ~C_RECLAIMING; do { + snap_oldest = mdbx_find_oldest(txn); rc = mdbx_page_alloc(&couple.outer, 0, NULL, MDBX_ALLOC_GC); if (likely(rc == MDBX_SUCCESS)) { mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, @@ -10509,7 +10594,13 @@ retry_noaccount: gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { mdbx_tassert(txn, txn->tw.last_reclaimed == 0); - txn->tw.last_reclaimed = gc_rid = mdbx_find_oldest(txn) - 1; + if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) + /* should retry mdbx_page_alloc(MDBX_ALLOC_GC) + * if the oldest reader changes since the last attempt */ + goto retry_rid; + /* no reclaimable GC entries, + * therefore no entries with ID < mdbx_find_oldest(txn) */ + txn->tw.last_reclaimed = gc_rid = snap_oldest - 1; mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode, gc_rid); } @@ -10842,7 +10933,7 @@ retry_noaccount: cleaned_gc_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); bailout: - txn->mt_cursors[FREE_DBI] = couple.outer.mc_next; + txn->tw.cursors[FREE_DBI] = couple.outer.mc_next; bailout_notracking: MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; @@ -10855,24 +10946,23 @@ static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov, size_t iov_bytes) { MDBX_env *const env = txn->mt_env; mdbx_assert(env, iov_items > 0); + int rc; if (likely(iov_items == 1)) { mdbx_assert(env, iov->iov_len == iov_bytes); - int rc = mdbx_pwrite(env->me_lazy_fd, iov->iov_base, iov_bytes, iov_off); - mdbx_dpage_free(env, (MDBX_page *)iov->iov_base, - bytes2pgno(env, iov_bytes)); - return rc; + rc = mdbx_pwrite(env->me_lazy_fd, iov->iov_base, iov_bytes, iov_off); } else { - int rc = mdbx_pwritev(env->me_lazy_fd, iov, iov_items, iov_off, iov_bytes); - if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_error("Write error: %s", mdbx_strerror(rc)); - txn->mt_flags |= MDBX_TXN_ERROR; - } + rc = mdbx_pwritev(env->me_lazy_fd, iov, iov_items, iov_off, iov_bytes); + } - for (unsigned i = 0; i < iov_items; i++) - mdbx_dpage_free(env, (MDBX_page *)iov[i].iov_base, - bytes2pgno(env, iov[i].iov_len)); - return rc; + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_error("Write error: %s", mdbx_strerror(rc)); + txn->mt_flags |= MDBX_TXN_ERROR; } + + for (unsigned i = 0; i < iov_items; i++) + mdbx_dpage_free(env, (MDBX_page *)iov[i].iov_base, + bytes2pgno(env, iov[i].iov_len)); + return rc; } /* Flush (some) dirty pages to the map, after clearing their dirty flag. @@ -11011,12 +11101,18 @@ static __always_inline bool mdbx_txn_dbi_exists(MDBX_txn *txn, MDBX_dbi dbi, return mdbx_txn_import_dbi(txn, dbi); } -int mdbx_txn_commit(MDBX_txn *txn) { +int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } + +int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); + const uint64_t ts_0 = latency ? 
mdbx_osal_monotime() : 0; + uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0; + uint32_t audit_duration = 0; + int rc = check_txn(txn, MDBX_TXN_FINISHED); if (unlikely(rc != MDBX_SUCCESS)) - return rc; + goto provide_latency; if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) { rc = MDBX_RESULT_TRUE; @@ -11027,7 +11123,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { #if MDBX_ENV_CHECKPID if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; - return MDBX_PANIC; + rc = MDBX_PANIC; + goto provide_latency; } #endif /* MDBX_ENV_CHECKPID */ @@ -11038,7 +11135,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { goto done; if (txn->mt_child) { - rc = mdbx_txn_commit(txn->mt_child); + rc = mdbx_txn_commit_ex(txn->mt_child, NULL); mdbx_tassert(txn, txn->mt_child == NULL); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -11051,8 +11148,32 @@ int mdbx_txn_commit(MDBX_txn *txn) { } if (txn->mt_parent) { + mdbx_assert(env, txn != env->me_txn0); MDBX_txn *const parent = txn->mt_parent; - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); + mdbx_assert(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + mdbx_assert(env, mdbx_dirtylist_check(txn)); + + if (txn->tw.dirtylist->length == 0 && + (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0 && + parent->mt_numdbs == txn->mt_numdbs) { + for (int i = txn->mt_numdbs; --i >= 0;) { + mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + if ((txn->mt_dbistate[i] & DBI_STALE) && + !(parent->mt_dbistate[i] & DBI_STALE)) + mdbx_tassert(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], + sizeof(MDBX_db)) == 0); + } + + mdbx_tassert(txn, memcmp(&parent->mt_geo, &txn->mt_geo, + sizeof(parent->mt_geo)) == 0); + mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary, + sizeof(parent->mt_canary)) == 0); + + end_mode = MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE; + goto done; + } /* Preserve space for spill list to avoid parent's state corruption * if allocation fails. */ @@ -11081,7 +11202,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY; /* Merge our cursors into parent's and close them */ - mdbx_cursors_eot(txn, 1); + mdbx_cursors_eot(txn, true); + end_mode |= MDBX_END_EOTDONE; /* Update parent's DB table. */ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); @@ -11093,6 +11215,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_dbistate[i] = txn->mt_dbistate[i] | (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH)); } + ts_1 = latency ? mdbx_osal_monotime() : 0; /* Remove refunded pages from parent's dirty & spill lists */ MDBX_DPL dst = mdbx_dpl_sort(parent->tw.dirtylist); @@ -11264,6 +11387,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_flags |= MDBX_TXN_SPILLS; } + ts_2 = latency ? mdbx_osal_monotime() : 0; /* Append our loose page list to parent's */ if (txn->tw.loose_pages) { MDBX_page **lp = &parent->tw.loose_pages; @@ -11285,8 +11409,6 @@ int mdbx_txn_commit(MDBX_txn *txn) { env->me_txn = parent; parent->mt_child = NULL; - txn->mt_signature = 0; - mdbx_free(txn); mdbx_tassert(parent, mdbx_dirtylist_check(parent)); /* Scan parent's loose page for suitable for refund */ @@ -11296,13 +11418,18 @@ int mdbx_txn_commit(MDBX_txn *txn) { break; } } + + ts_4 = ts_3 = latency ? 
mdbx_osal_monotime() : 0; + txn->mt_signature = 0; + mdbx_free(txn); mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - return MDBX_SUCCESS; + rc = MDBX_SUCCESS; + goto provide_latency; } mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == MDBX_DPL_TXNFULL); - mdbx_cursors_eot(txn, 0); + mdbx_cursors_eot(txn, false); end_mode |= MDBX_END_EOTDONE; if (txn->tw.dirtylist->length == 0 && @@ -11346,17 +11473,23 @@ int mdbx_txn_commit(MDBX_txn *txn) { } } + ts_1 = latency ? mdbx_osal_monotime() : 0; rc = mdbx_update_gc(txn); if (unlikely(rc != MDBX_SUCCESS)) goto fail; + ts_2 = latency ? mdbx_osal_monotime() : 0; if (mdbx_audit_enabled()) { rc = mdbx_audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); + const uint64_t audit_end = mdbx_osal_monotime(); + audit_duration = mdbx_osal_monotime_to_16dot16(audit_end - ts_2); + ts_2 = audit_end; if (unlikely(rc != MDBX_SUCCESS)) goto fail; } rc = mdbx_page_flush(txn, 0); + ts_3 = latency ? mdbx_osal_monotime() : 0; if (likely(rc == MDBX_SUCCESS)) { if (txn->mt_dbs[MAIN_DBI].md_flags & DBI_DIRTY) txn->mt_dbs[MAIN_DBI].md_mod_txnid = pp_txnid2chk(txn); @@ -11378,6 +11511,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { rc = mdbx_sync_locked( env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); } + ts_4 = latency ? mdbx_osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { env->me_flags |= MDBX_FATAL_ERROR; goto fail; @@ -11386,19 +11520,34 @@ int mdbx_txn_commit(MDBX_txn *txn) { end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: - return mdbx_txn_end(txn, end_mode); + rc = mdbx_txn_end(txn, end_mode); + +provide_latency: + if (latency) { + latency->audit = audit_duration; + latency->preparation = + ts_1 ? mdbx_osal_monotime_to_16dot16(ts_1 - ts_0) : 0; + latency->gc = + (ts_1 && ts_2) ? mdbx_osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->write = + (ts_2 && ts_3) ? mdbx_osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->sync = + (ts_3 && ts_4) ? mdbx_osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + const uint64_t ts_5 = mdbx_osal_monotime(); + latency->ending = ts_4 ? mdbx_osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + latency->whole = mdbx_osal_monotime_to_16dot16(ts_5 - ts_0); + } + return rc; fail: mdbx_txn_abort(txn); - return rc; + goto provide_latency; } -static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, - uint64_t *filesize, - const MDBX_page *const page, - const unsigned meta_number, - MDBX_meta *dest, - const unsigned guess_pagesize) { +static __cold int +mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize, + const MDBX_page *const page, const unsigned meta_number, + MDBX_meta *dest, const unsigned guess_pagesize) { if (meta->mm_magic_and_version != MDBX_DATA_MAGIC && meta->mm_magic_and_version != MDBX_DATA_MAGIC_DEVEL) { mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, @@ -11590,7 +11739,7 @@ static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, /* Read the environment parameters of a DB environment * before mapping it into memory. 
*/ -static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest, +static __cold int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, uint64_t *filesize, const int lck_exclusive) { int rc = mdbx_filesize(env->me_lazy_fd, filesize); @@ -12080,7 +12229,7 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2); } -int __cold mdbx_env_create(MDBX_env **penv) { +__cold int mdbx_env_create(MDBX_env **penv) { MDBX_env *env = mdbx_calloc(1, sizeof(MDBX_env)); if (unlikely(!env)) return MDBX_ENOMEM; @@ -12422,9 +12571,11 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, rc = MDBX_EPERM; goto bailout; } - rc = mdbx_rdt_lock(env); - if (unlikely(rc != MDBX_SUCCESS)) + int err = mdbx_rdt_lock(env); + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; goto bailout; + } /* Check if there are any reading threads that do not use the SRWL */ const size_t CurrentTid = GetCurrentThreadId(); @@ -12481,11 +12632,11 @@ bailout: return rc; } -int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { - return mdbx_env_set_geometry(env, size, size, size, -1, -1, -1); +__cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) { + return __inline_mdbx_env_set_mapsize(env, size); } -int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { +__cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12500,7 +12651,7 @@ int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { return MDBX_SUCCESS; } -int __cold mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs) { +__cold int mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12512,7 +12663,7 @@ int __cold mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs) { return MDBX_SUCCESS; } -int __cold mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { +__cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12527,7 +12678,7 @@ int __cold mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { return MDBX_SUCCESS; } -int __cold mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { +__cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12540,7 +12691,7 @@ int __cold mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { } /* Further setup required for opening an MDBX environment */ -static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { +static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { uint64_t filesize_before; MDBX_meta meta; int rc = MDBX_RESULT_FALSE; @@ -12711,6 +12862,10 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { mdbx_verbose("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", bootid.x, bootid.y, (bootid.x | bootid.y) ? "" : "not-"); + /* calculate readahead hint before mmap with zero redundant pages */ + const bool readahead = + (env->me_flags & MDBX_NORDAHEAD) == 0 && + mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; err = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, env->me_dbgeo.upper, lck_rc ? 
MMAP_OPTION_TRUNCATE : 0); if (unlikely(err != MDBX_SUCCESS)) @@ -12974,9 +13129,6 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { #endif /* MADV_DONTNEED */ } - const bool readahead = (env->me_flags & MDBX_NORDAHEAD) == 0 && - mdbx_is_readahead_reasonable(env->me_dxb_mmap.current, - 0) == MDBX_RESULT_TRUE; err = mdbx_set_readahead(env, 0, used_bytes, readahead); if (err != MDBX_SUCCESS && lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) return err; @@ -12987,7 +13139,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { /******************************************************************************/ /* Open and/or initialize the lock region for the environment. */ -static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, +static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, mdbx_mode_t mode) { mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE); @@ -13001,7 +13153,9 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, /* ensure the file system is read-only */ err = mdbx_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); - if (err != MDBX_SUCCESS) + if (err != MDBX_SUCCESS && + /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ + !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) return err; /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ @@ -13394,22 +13548,21 @@ __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, 0); } -__cold int mdbx_env_open(MDBX_env *env, const char *pathname, - MDBX_env_flags_t flags, mdbx_mode_t mode) { - int rc = check_env(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - +typedef struct { + void *buffer_for_free; + char *lck, *dxb; + size_t ent_len; +} MDBX_handle_env_pathname; + +__cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, + const char *pathname, + MDBX_env_flags_t *flags, + const mdbx_mode_t mode) { + int rc; + memset(ctx, 0, sizeof(*ctx)); if (unlikely(!pathname)) return MDBX_EINVAL; - if (flags & ~ENV_USABLE_FLAGS) - return MDBX_EINVAL; - - if (env->me_lazy_fd != INVALID_HANDLE_VALUE || - (env->me_flags & MDBX_ENV_ACTIVE) != 0) - return MDBX_EPERM; - #if defined(_WIN32) || defined(_WIN64) const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) @@ -13417,33 +13570,28 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) return ERROR_INVALID_NAME; -#endif /* Windows */ - - /* pickup previously mdbx_env_set_flags(), - * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ - flags = merge_sync_flags(flags, env->me_flags); -#if defined(_WIN32) || defined(_WIN64) const DWORD dwAttrib = GetFileAttributesW(pathnameW); if (dwAttrib == INVALID_FILE_ATTRIBUTES) { rc = GetLastError(); if (rc != MDBX_ENOFILE) return rc; - if (mode == 0 || (flags & MDBX_RDONLY) != 0) + if (mode == 0 || (*flags & MDBX_RDONLY) != 0) /* can't open existing */ return rc; /* auto-create directory if requested */ - if ((flags & MDBX_NOSUBDIR) == 0 && !CreateDirectoryW(pathnameW, nullptr)) { + if ((*flags & MDBX_NOSUBDIR) == 0 && + !CreateDirectoryW(pathnameW, nullptr)) { rc = GetLastError(); if (rc != ERROR_ALREADY_EXISTS) return rc; } } else { /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - flags |= MDBX_NOSUBDIR; + *flags |= MDBX_NOSUBDIR; if (dwAttrib & 
FILE_ATTRIBUTE_DIRECTORY) - flags -= MDBX_NOSUBDIR; + *flags -= MDBX_NOSUBDIR; } #else struct stat st; @@ -13451,7 +13599,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, rc = errno; if (rc != MDBX_ENOFILE) return rc; - if (mode == 0 || (flags & MDBX_RDONLY) != 0) + if (mode == 0 || (*flags & MDBX_RDONLY) != 0) /* can't open existing */ return rc; @@ -13462,41 +13610,151 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, /* always add read/write/search for owner */ S_IRWXU | ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) | ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0); - if ((flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { + if ((*flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { rc = errno; if (rc != EEXIST) return rc; } } else { /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - flags |= MDBX_NOSUBDIR; + *flags |= MDBX_NOSUBDIR; if (S_ISDIR(st.st_mode)) - flags -= MDBX_NOSUBDIR; + *flags -= MDBX_NOSUBDIR; } #endif - size_t len_full, len = strlen(pathname); - if (flags & MDBX_NOSUBDIR) { - len_full = len + sizeof(MDBX_LOCK_SUFFIX) + len + 1; - } else { - len_full = len + sizeof(MDBX_LOCKNAME) + len + sizeof(MDBX_DATANAME); + static const char dxb_name[] = MDBX_DATANAME; + static const size_t dxb_name_len = sizeof(dxb_name) - 1; + static const char lck_name[] = MDBX_LOCKNAME; + static const char lock_suffix[] = MDBX_LOCK_SUFFIX; + + ctx->ent_len = strlen(pathname); + if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len >= dxb_name_len && + !memcmp(dxb_name, pathname + ctx->ent_len - dxb_name_len, dxb_name_len)) { + *flags -= MDBX_NOSUBDIR; + ctx->ent_len -= dxb_name_len; } - char *lck_pathname = mdbx_malloc(len_full); - if (!lck_pathname) + + const size_t bytes_needed = + ctx->ent_len * 2 + ((*flags & MDBX_NOSUBDIR) + ? sizeof(lock_suffix) + 1 + : sizeof(lck_name) + sizeof(dxb_name)); + ctx->buffer_for_free = mdbx_malloc(bytes_needed); + if (!ctx->buffer_for_free) return MDBX_ENOMEM; - char *dxb_pathname; - if (flags & MDBX_NOSUBDIR) { - dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCK_SUFFIX); - sprintf(lck_pathname, "%s" MDBX_LOCK_SUFFIX, pathname); - strcpy(dxb_pathname, pathname); + ctx->lck = ctx->buffer_for_free; + if (*flags & MDBX_NOSUBDIR) { + ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lock_suffix); + sprintf(ctx->lck, "%s%s", pathname, lock_suffix); + strcpy(ctx->dxb, pathname); } else { - dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCKNAME); - sprintf(lck_pathname, "%s" MDBX_LOCKNAME, pathname); - sprintf(dxb_pathname, "%s" MDBX_DATANAME, pathname); + ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lck_name); + sprintf(ctx->lck, "%.*s%s", (int)ctx->ent_len, pathname, lck_name); + sprintf(ctx->dxb, "%.*s%s", (int)ctx->ent_len, pathname, dxb_name); } - rc = MDBX_SUCCESS; + return MDBX_SUCCESS; +} + +__cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { + switch (mode) { + default: + return MDBX_EINVAL; + case MDBX_ENV_JUST_DELETE: + case MDBX_ENV_ENSURE_UNUSED: + case MDBX_ENV_WAIT_FOR_UNUSED: + break; + } + + MDBX_env dummy_env; + memset(&dummy_env, 0, sizeof(dummy_env)); + dummy_env.me_flags = + (mode == MDBX_ENV_ENSURE_UNUSED) ? 
MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; + dummy_env.me_psize = dummy_env.me_os_psize = (unsigned)mdbx_syspagesize(); + dummy_env.me_pathname = (char *)pathname; + + MDBX_handle_env_pathname env_pathname; + STATIC_ASSERT(sizeof(dummy_env.me_flags) == sizeof(MDBX_env_flags_t)); + int rc = MDBX_RESULT_TRUE, + err = mdbx_handle_env_pathname( + &env_pathname, pathname, (MDBX_env_flags_t *)&dummy_env.me_flags, 0); + if (likely(err == MDBX_SUCCESS)) { + mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, + dxb_handle = INVALID_HANDLE_VALUE; + if (mode > MDBX_ENV_JUST_DELETE) { + err = mdbx_openfile(MDBX_OPEN_DELETE, &dummy_env, env_pathname.dxb, + &dxb_handle, 0); + err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; + if (err == MDBX_SUCCESS) { + err = mdbx_openfile(MDBX_OPEN_DELETE, &dummy_env, env_pathname.lck, + &clk_handle, 0); + err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; + } + if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) + err = mdbx_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE) + err = mdbx_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + } + + if (err == MDBX_SUCCESS) { + err = mdbx_removefile(env_pathname.dxb); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } + + if (err == MDBX_SUCCESS) { + err = mdbx_removefile(env_pathname.lck); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } + + if (err == MDBX_SUCCESS && !(dummy_env.me_flags & MDBX_NOSUBDIR)) { + err = mdbx_removedirectory(pathname); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } + + if (dxb_handle != INVALID_HANDLE_VALUE) + mdbx_closefile(dxb_handle); + if (clk_handle != INVALID_HANDLE_VALUE) + mdbx_closefile(clk_handle); + } else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + + mdbx_free(env_pathname.buffer_for_free); + return (err == MDBX_SUCCESS) ? 
rc : err; +} + +__cold int mdbx_env_open(MDBX_env *env, const char *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { + int rc = check_env(env); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (flags & ~ENV_USABLE_FLAGS) + return MDBX_EINVAL; + + if (env->me_lazy_fd != INVALID_HANDLE_VALUE || + (env->me_flags & MDBX_ENV_ACTIVE) != 0) + return MDBX_EPERM; + + /* pickup previously mdbx_env_set_flags(), + * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ + const uint32_t saved_me_flags = env->me_flags; + flags = merge_sync_flags(flags, env->me_flags); + + MDBX_handle_env_pathname env_pathname; + rc = mdbx_handle_env_pathname(&env_pathname, pathname, &flags, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (flags & MDBX_RDONLY) { /* LY: silently ignore irrelevant flags when * we're only getting read access */ @@ -13524,38 +13782,39 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, rc = MDBX_ENOMEM; } - const uint32_t saved_me_flags = env->me_flags; env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - env->me_path = mdbx_strdup(pathname); + env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1); env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); env->me_dbiseqs = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); - if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { + if (!(env->me_dbxs && env->me_pathname && env->me_dbflags && + env->me_dbiseqs)) { rc = MDBX_ENOMEM; goto bailout; } + memcpy(env->me_pathname, env_pathname.dxb, env_pathname.ent_len); env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; rc = mdbx_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ : MDBX_OPEN_DXB_LAZY, - env, dxb_pathname, &env->me_lazy_fd, mode); + env, env_pathname.dxb, &env->me_lazy_fd, mode); if (rc != MDBX_SUCCESS) goto bailout; mdbx_assert(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { - rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, dxb_pathname, + rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, &env->me_dsync_fd, 0); mdbx_ensure(env, (rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE)); } #if MDBX_LOCKING == MDBX_LOCKING_SYSV - env->me_sysv_ipc.key = ftok(dxb_pathname, 42); + env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42); if (env->me_sysv_ipc.key == -1) { rc = errno; goto bailout; @@ -13565,6 +13824,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, #if !(defined(_WIN32) || defined(_WIN64)) if (mode == 0) { /* pickup mode for lck-file */ + struct stat st; if (fstat(env->me_lazy_fd, &st)) { rc = errno; goto bailout; @@ -13577,12 +13837,19 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0); #endif /* !Windows */ - const int lck_rc = mdbx_setup_lck(env, lck_pathname, mode); + const int lck_rc = mdbx_setup_lck(env, env_pathname.lck, mode); if (MDBX_IS_ERROR(lck_rc)) { rc = lck_rc; goto bailout; } + /* Set the position in files outside of the data to avoid corruption + * due to erroneous use of file descriptors in the application code. 
*/ + mdbx_fseek(env->me_lfd, UINT64_C(1) << 63); + mdbx_fseek(env->me_lazy_fd, UINT64_C(1) << 63); + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) + mdbx_fseek(env->me_dsync_fd, UINT64_C(1) << 63); + const MDBX_env_flags_t rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC | @@ -13667,8 +13934,8 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_txn *txn = mdbx_calloc(1, size); if (txn) { txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); + txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned *)(txn->tw.cursors + env->me_maxdbs); txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; @@ -13712,12 +13979,12 @@ bailout: mdbx_txn_valgrind(env, nullptr); #endif } - mdbx_free(lck_pathname); + mdbx_free(env_pathname.buffer_for_free); return rc; } /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -static int __cold mdbx_env_close0(MDBX_env *env) { +static __cold int mdbx_env_close0(MDBX_env *env) { env->me_stuck_meta = -1; if (!(env->me_flags & MDBX_ENV_ACTIVE)) { mdbx_ensure(env, env->me_lcklist_next == nullptr); @@ -13773,7 +14040,7 @@ static int __cold mdbx_env_close0(MDBX_env *env) { mdbx_memalign_free(env->me_pbuf); mdbx_free(env->me_dbiseqs); mdbx_free(env->me_dbflags); - mdbx_free(env->me_path); + mdbx_free(env->me_pathname); mdbx_free(env->me_dirtylist); if (env->me_txn0) { mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed); @@ -13786,7 +14053,7 @@ static int __cold mdbx_env_close0(MDBX_env *env) { return rc; } -int __cold mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { +__cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { MDBX_page *dp; int rc = MDBX_SUCCESS; @@ -13866,7 +14133,7 @@ int __cold mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { } __cold int mdbx_env_close(MDBX_env *env) { - return mdbx_env_close_ex(env, false); + return __inline_mdbx_env_close(env); } /* Compare two items pointing at aligned unsigned int's. */ @@ -13903,7 +14170,7 @@ static int __hot cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { } } -/* Compare two items pointing at unsigneds of unknown alignment. +/* Compare two items pointing at unsigned values with unknown alignment. * * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ static int __hot cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { @@ -14059,7 +14326,7 @@ static MDBX_node *__hot mdbx_node_search(MDBX_cursor *mc, const MDBX_val *key, static void mdbx_cursor_adjust(MDBX_cursor *mc, func) { MDBX_cursor *m2; - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { func(mc, m2); } @@ -14068,13 +14335,11 @@ static void mdbx_cursor_adjust(MDBX_cursor *mc, func) { #endif /* Pop a page off the top of the cursor's stack. 
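/* For context on the open/close paths reworked above, a hedged usage sketch
 * of the public API; the path, mode and flags are placeholders and error
 * handling is abbreviated: */
#include "mdbx.h"

static int open_and_close_example(void) {
  MDBX_env *env = NULL;
  int rc = mdbx_env_create(&env);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* MDBX_NOSUBDIR: the pathname is the datafile itself, no subdirectory */
  rc = mdbx_env_open(env, "./example.db", MDBX_NOSUBDIR, 0644);
  if (rc == MDBX_SUCCESS)
    rc = mdbx_env_close_ex(env, /* dont_sync = */ false);
  else
    mdbx_env_close(env);
  return rc;
}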
*/ -static void mdbx_cursor_pop(MDBX_cursor *mc) { +static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { if (mc->mc_snum) { mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p", mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); - - mc->mc_snum--; - if (mc->mc_snum) { + if (--mc->mc_snum) { mc->mc_top--; } else { mc->mc_flags &= ~C_INITIALIZED; @@ -14084,7 +14349,7 @@ static void mdbx_cursor_pop(MDBX_cursor *mc) { /* Push a page onto the top of the cursor's stack. * Set MDBX_TXN_ERROR on failure. */ -static int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { +static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc), (void *)mc); @@ -14113,7 +14378,7 @@ static int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { * * Returns 0 on success, non-zero on failure. */ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, - int *lvl, const txnid_t pp_txnid) { + int *lvl, txnid_t pp_txnid) { MDBX_txn *txn = mc->mc_txn; if (unlikely(pgno >= txn->mt_next_pgno)) { mdbx_error("page #%" PRIaPGNO " beyond next-pgno", pgno); @@ -14138,8 +14403,11 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, * because the dirty list got full. Bring this page * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). */ - if (txn->tw.spill_pages && mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) + if (txn->tw.spill_pages && + mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) { + pp_txnid = txn->mt_txnid; goto spilled; + } p = mdbx_dpl_find(txn->tw.dirtylist, pgno); if (p) goto dirty; @@ -14495,9 +14763,6 @@ int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { - DKBUF; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); - int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14516,21 +14781,7 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDBX_val save_data = *data; - int exact = 0; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_RANGE, &exact); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (exact && (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) != 0) { - *data = save_data; - exact = 0; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_GET_BOTH_RANGE, &exact); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - - return exact ? MDBX_SUCCESS : MDBX_RESULT_TRUE; + return mdbx_cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); } int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, @@ -14584,15 +14835,15 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, * Replaces the page at the top of the cursor's stack with the specified * sibling, if one exists. * - * [in] mc The cursor for this operation. - * [in] move_right Non-zero if the right sibling is requested, - * otherwise the left sibling. + * [in] mc The cursor for this operation. + * [in] dir SIBLING_LEFT or SIBLING_RIGHT. * * Returns 0 on success, non-zero on failure. 
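/* The mdbx_get_equal_or_great() simplification above delegates to the new
 * MDBX_SET_LOWERBOUND cursor operation; a hedged sketch of how a caller can
 * interpret the result codes (txn and dbi are assumed to be set up already):
 *   MDBX_SUCCESS     - exact key/value match, contents updated in place;
 *   MDBX_RESULT_TRUE - positioned at the nearest greater pair;
 *   MDBX_NOTFOUND    - nothing at or above the requested key. */
static int lower_bound_example(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
                               MDBX_val *data) {
  int rc = mdbx_get_equal_or_great(txn, dbi, key, data);
  switch (rc) {
  case MDBX_SUCCESS:     /* exact match */
  case MDBX_RESULT_TRUE: /* nearest greater pair */
    return MDBX_SUCCESS;
  default:
    return rc;           /* MDBX_NOTFOUND or a real error */
  }
}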
*/ -static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { +static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { int rc; - MDBX_node *indx; + MDBX_node *node; MDBX_page *mp; + assert(dir == SIBLING_LEFT || dir == SIBLING_RIGHT); if (unlikely(mc->mc_snum < 2)) return MDBX_NOTFOUND; /* root has no siblings */ @@ -14601,29 +14852,28 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { mdbx_debug("parent page is page %" PRIaPGNO ", index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); - if (move_right + if ((dir == SIBLING_RIGHT) ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) : (mc->mc_ki[mc->mc_top] == 0)) { - mdbx_debug("no more keys left, moving to %s sibling", - move_right ? "right" : "left"); - if (unlikely((rc = mdbx_cursor_sibling(mc, move_right)) != MDBX_SUCCESS)) { + mdbx_debug("no more keys aside, moving to next %s sibling", + dir ? "right" : "left"); + if (unlikely((rc = mdbx_cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { /* undo cursor_pop before returning */ mc->mc_top++; mc->mc_snum++; return rc; } } else { - if (move_right) - mc->mc_ki[mc->mc_top]++; - else - mc->mc_ki[mc->mc_top]--; - mdbx_debug("just moving to %s index key %u", move_right ? "right" : "left", + assert((dir - 1) == -1 || (dir - 1) == 1); + mc->mc_ki[mc->mc_top] += dir - 1; + mdbx_debug("just moving to %s index key %u", + (dir == SIBLING_RIGHT) ? "right" : "left", mc->mc_ki[mc->mc_top]); } mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); - indx = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdbx_page_get(mc, node_pgno(indx), &mp, NULL, + node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, NULL, pp_txnid4chk(mp, mc->mc_txn))) != 0)) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -14633,9 +14883,9 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { rc = mdbx_cursor_push(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (!move_right) - mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; + mc->mc_ki[mc->mc_top] = + (indx_t)((dir == SIBLING_LEFT) ? 
page_numkeys(mp) - 1 : 0); return MDBX_SUCCESS; } @@ -14685,17 +14935,21 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, goto skip; } - if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp)) { + int ki = mc->mc_ki[mc->mc_top]; + mc->mc_ki[mc->mc_top] = (indx_t)++ki; + const int numkeys = page_numkeys(mp); + if (unlikely(ki >= numkeys)) { mdbx_debug("%s", "=====> move to next sibling page"); - if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDBX_SUCCESS)) { + mc->mc_ki[mc->mc_top] = numkeys - 1; + if (unlikely((rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT)) != + MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; } mp = mc->mc_pg[mc->mc_top]; mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); - } else - mc->mc_ki[mc->mc_top]++; + } skip: mdbx_debug("==> cursor points to page %" PRIaPGNO @@ -14718,18 +14972,14 @@ skip: rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (data) { + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { if (unlikely((rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn))) != MDBX_SUCCESS)) return rc; - - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } } get_key_optional(node, key); @@ -14781,22 +15031,24 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_flags &= ~(C_EOF | C_DEL); - if (mc->mc_ki[mc->mc_top] == 0) { + int ki = mc->mc_ki[mc->mc_top]; + mc->mc_ki[mc->mc_top] = (indx_t)--ki; + if (unlikely(ki < 0)) { + mc->mc_ki[mc->mc_top] = 0; mdbx_debug("%s", "=====> move to prev sibling page"); - if ((rc = mdbx_cursor_sibling(mc, 0)) != MDBX_SUCCESS) { + if ((rc = mdbx_cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) return rc; - } mp = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; mdbx_debug("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); - } else - mc->mc_ki[mc->mc_top]--; - + } mdbx_debug("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + if (unlikely(!IS_LEAF(mp))) + return MDBX_CORRUPTED; + if (IS_LEAF2(mp)) { if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; @@ -14805,25 +15057,20 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_SUCCESS; } - mdbx_cassert(mc, IS_LEAF(mp)); node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (data) { + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { if (unlikely((rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn))) != MDBX_SUCCESS)) return rc; - - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } } get_key_optional(node, key); @@ -14878,8 +15125,9 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); mp = mc->mc_pg[mc->mc_top]; - if (!page_numkeys(mp)) { + if (unlikely(!page_numkeys(mp))) { mc->mc_ki[mc->mc_top] = 0; + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } if 
(IS_LEAF2(mp)) { @@ -14895,6 +15143,9 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, * was the one we wanted. */ mc->mc_ki[mc->mc_top] = 0; *exactp = 1; + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto set1; } if (rc > 0) { @@ -14913,6 +15164,9 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1); *exactp = 1; + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto set1; } if (rc < 0) { @@ -14929,6 +15183,9 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (rc == 0) { /* current node was the one we wanted */ *exactp = 1; + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto set1; } } @@ -14946,17 +15203,22 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* There are no other pages */ mdbx_cassert(mc, nkeys <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } } if (!mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = 0; - if (op == MDBX_SET_RANGE && exactp == &stub_exactp) { + if (op == MDBX_SET_RANGE) { rc = 0; goto set1; - } else + } else { + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); return MDBX_NOTFOUND; + } } } else { mc->mc_pg[0] = 0; @@ -14971,21 +15233,28 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, set2: node = mdbx_node_search(mc, &aligned_key, exactp); - if (exactp != &stub_exactp && !*exactp) { + if (!*exactp && op != MDBX_SET_RANGE) { /* MDBX_SET specified and not an exact match. */ + if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } if (node == NULL) { mdbx_debug("%s", "===> inexact leaf not found, goto sibling"); - if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDBX_SUCCESS)) { + if (unlikely((rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT)) != + MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; /* no entries matched */ } mp = mc->mc_pg[mc->mc_top]; mdbx_cassert(mc, IS_LEAF(mp)); - node = page_node(mp, 0); + if (!IS_LEAF2(mp)) + node = page_node(mp, 0); } + mdbx_cassert(mc, + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); set1: mc->mc_flags |= C_INITIALIZED; @@ -15003,19 +15272,16 @@ set1: rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (likely(data)) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { - if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - } else { - int ex2 = 0, *ex2p = (op == MDBX_GET_BOTH) ? 
&ex2 : NULL; - rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_SET_RANGE, ex2p); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - } else if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { + if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_SET_RANGE, NULL); + } + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { + if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { mdbx_cassert(mc, !"Invalid data-size"); @@ -15050,20 +15316,20 @@ set1: return rc; rc = mc->mc_dbx->md_dcmp(&aligned_data, &olddata); if (rc) { + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); if (op != MDBX_GET_BOTH_RANGE || rc > 0) return MDBX_NOTFOUND; + *exactp = 0; rc = 0; } *data = olddata; - } else { - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) - return rc; - } + } else if (unlikely((rc = mdbx_node_read(mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], + mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; } /* The key already matches in all other cases */ @@ -15095,28 +15361,29 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { mc->mc_ki[mc->mc_top] = 0; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + } return MDBX_SUCCESS; } MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], 0); - if (likely(data)) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) - return rc; - } + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read( + mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; } + get_key_optional(node, key); return MDBX_SUCCESS; } @@ -15128,12 +15395,10 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (likely((mc->mc_flags & (C_EOF | C_DEL)) != C_EOF)) { - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } if (unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) @@ -15143,28 +15408,28 @@ static int 
mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { mc->mc_flags |= C_INITIALIZED | C_EOF; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], - key->iov_len); + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], + mc->mc_ki[mc->mc_top], key->iov_len); + } return MDBX_SUCCESS; } MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (likely(data)) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) - return rc; - } + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read( + mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; } get_key_optional(node, key); @@ -15177,7 +15442,8 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -15188,12 +15454,13 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, switch (op) { case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return MDBX_EINVAL; + return MDBX_ENODATA; MDBX_page *mp = mc->mc_pg[mc->mc_top]; const unsigned nkeys = page_numkeys(mp); if (mc->mc_ki[mc->mc_top] >= nkeys) { mdbx_cassert(mc, nkeys <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } mdbx_cassert(mc, nkeys > 0); @@ -15214,14 +15481,17 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; + } else { + rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_GET_CURRENT); + if (unlikely(rc)) + return rc; } - rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_GET_CURRENT); } else { rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn)); + if (unlikely(rc)) + return rc; } - if (unlikely(rc)) - return rc; } } break; @@ -15239,8 +15509,13 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_SET_RANGE: if (unlikely(key == NULL)) return MDBX_EINVAL; - rc = mdbx_cursor_set(mc, key, data, op, - op == MDBX_SET_RANGE ? 
NULL : &exact); + rc = mdbx_cursor_set(mc, key, data, op, &exact); + if (mc->mc_flags & C_INITIALIZED) { + mdbx_cassert(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); + } break; case MDBX_GET_MULTIPLE: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) @@ -15248,8 +15523,8 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) return MDBX_INCOMPATIBLE; rc = MDBX_SUCCESS; - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || - (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + if ((mc->mc_xcursor->mx_cursor.mc_flags & (C_INITIALIZED | C_EOF)) != + C_INITIALIZED) break; goto fetchm; case MDBX_NEXT_MULTIPLE: @@ -15283,7 +15558,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (rc == MDBX_SUCCESS) { MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor; if (mx->mc_flags & C_INITIALIZED) { - rc = mdbx_cursor_sibling(mx, 0); + rc = mdbx_cursor_sibling(mx, SIBLING_LEFT); if (rc == MDBX_SUCCESS) goto fetchm; } else { @@ -15313,6 +15588,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_INCOMPATIBLE; if (mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) { mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]); + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } { @@ -15334,6 +15610,33 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_LAST_DUP: mfunc = mdbx_cursor_last; goto mmove; + case MDBX_SET_LOWERBOUND: { + if (unlikely(key == NULL || data == NULL)) + return MDBX_EINVAL; + MDBX_val save_data = *data; + rc = mdbx_cursor_set(mc, key, data, MDBX_SET_RANGE, &exact); + if (rc == MDBX_SUCCESS && exact && mc->mc_xcursor) { + mc->mc_flags &= ~C_DEL; + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + *data = save_data; + exact = 0; + rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_SET_RANGE, &exact); + if (rc == MDBX_NOTFOUND) { + mdbx_cassert(mc, !exact); + rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + } + } else { + int cmp = mc->mc_dbx->md_dcmp(&save_data, data); + exact = (cmp == 0); + if (cmp > 0) + rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + } + } + if (rc == MDBX_SUCCESS && !exact) + rc = MDBX_RESULT_TRUE; + break; + } default: mdbx_debug("unhandled/unimplemented cursor operation %u", op); return MDBX_EINVAL; @@ -15363,6 +15666,7 @@ static int mdbx_cursor_touch(MDBX_cursor *mc) { if (unlikely(rc)) return rc; *mc->mc_dbistate |= DBI_DIRTY; + mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; } mc->mc_top = 0; if (mc->mc_snum) { @@ -15387,12 +15691,14 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; + mdbx_cassert(mc, cursor_is_tracked(mc)); env = mc->mc_txn->mt_env; /* Check this first so counter will always be zero on any early failures. 
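/* The cursor movement paths reworked above (first/next/prev/last and the
 * C_EOF bookkeeping) are exercised by a plain scan; a hedged usage sketch,
 * also noting that MDBX_GET_CURRENT now reports MDBX_ENODATA for an
 * unpositioned cursor instead of MDBX_EINVAL: */
static int iterate_all_example(MDBX_txn *txn, MDBX_dbi dbi) {
  MDBX_cursor *cur = NULL;
  int rc = mdbx_cursor_open(txn, dbi, &cur);
  if (rc != MDBX_SUCCESS)
    return rc;
  MDBX_val key, data;
  for (rc = mdbx_cursor_get(cur, &key, &data, MDBX_FIRST); rc == MDBX_SUCCESS;
       rc = mdbx_cursor_get(cur, &key, &data, MDBX_NEXT)) {
    /* consume key/data here */
  }
  mdbx_cursor_close(cur);
  return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
}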
*/ @@ -15570,7 +15876,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top]++; /* step forward for appending */ rc = MDBX_NOTFOUND; } else { - if (unlikely(rc != 0 || !(flags & MDBX_APPENDDUP))) + if (unlikely(rc != MDBX_SUCCESS || !(flags & MDBX_APPENDDUP))) /* new-key < last-key * or new-key == last-key without MDBX_APPENDDUP */ return MDBX_EKEYMISMATCH; @@ -15667,6 +15973,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, data->iov_len); } *mc->mc_dbistate |= DBI_DIRTY; + mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) np->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; @@ -16050,18 +16357,17 @@ new_sub:; rc = mdbx_node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, nflags); if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ - MDBX_cursor *m2, *m3; - MDBX_dbi dbi = mc->mc_dbi; - unsigned i = mc->mc_top; - MDBX_page *mp = mc->mc_pg[i]; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + const MDBX_dbi dbi = mc->mc_dbi; + const unsigned i = mc->mc_top; + MDBX_page *const mp = mc->mc_pg[i]; + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; + m2 = m2->mc_next) { + MDBX_cursor *m3 = + (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; - if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { - m3->mc_ki[i]++; - } + if (m3->mc_ki[i] >= mc->mc_ki[i]) + m3->mc_ki[i] += insert_key; if (XCURSOR_INITED(m3)) XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); } @@ -16111,7 +16417,7 @@ new_sub:; MDBX_page *mp = mc->mc_pg[i]; const int nkeys = page_numkeys(mp); - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) @@ -16184,7 +16490,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -16205,6 +16512,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return rc; MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (unlikely(!IS_LEAF(mp))) + return MDBX_CORRUPTED; if (IS_LEAF2(mp)) goto del_key; @@ -16215,9 +16524,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { - if (!F_ISSET(node_flags(node), F_SUBDATA)) { + if (!F_ISSET(node_flags(node), F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); if (unlikely(rc)) return rc; @@ -16235,7 +16543,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { node = page_node(mp, mc->mc_ki[mc->mc_top]); mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); /* fix other sub-DB cursors pointed at fake pages on this page */ - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) @@ -16847,7 +17155,7 @@ int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) { if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EINVAL; + return MDBX_EBADSIGN; MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer); couple->mc_userctx = ctx; @@ -16870,26 +17178,9 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(!mc)) return MDBX_EINVAL; - if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE)) { - if (unlikely(mc->mc_signature != MDBX_MC_LIVE || mc->mc_backup)) - return MDBX_EINVAL; - if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_PROBLEM; - if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) - prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; - } - mc->mc_signature = MDBX_MC_READY4CLOSE; - mc->mc_flags = 0; - mc->mc_dbi = UINT_MAX; - } - - assert(!mc->mc_backup && !mc->mc_flags); - if (unlikely(mc->mc_backup || mc->mc_flags)) - return MDBX_PROBLEM; + if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && + mc->mc_signature != MDBX_MC_LIVE)) + return MDBX_EBADSIGN; int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -16901,13 +17192,52 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) return MDBX_EACCESS; + if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ { + mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE); + if (unlikely(mc->mc_dbi != dbi || + /* paranoia */ mc->mc_signature != MDBX_MC_LIVE || + mc->mc_txn != txn)) + return MDBX_EINVAL; + + assert(mc->mc_db == &txn->mt_dbs[dbi]); + assert(mc->mc_dbx == &txn->mt_dbxs[dbi]); + assert(mc->mc_dbi == dbi); + assert(mc->mc_dbistate == &txn->mt_dbistate[dbi]); + return likely(mc->mc_dbi == dbi && + /* paranoia */ mc->mc_signature == MDBX_MC_LIVE && + mc->mc_txn == txn) + ? 
MDBX_SUCCESS + : MDBX_EINVAL /* Disallow change DBI in nested transactions */; + } + + if (mc->mc_signature == MDBX_MC_LIVE) { + if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_PROBLEM; + if (mc->mc_flags & C_UNTRACK) { + mdbx_cassert(mc, !(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY)); + MDBX_cursor **prev = &mc->mc_txn->tw.cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + mdbx_cassert(mc, *prev == mc); + *prev = mc->mc_next; + } + mc->mc_signature = MDBX_MC_READY4CLOSE; + mc->mc_flags = 0; + mc->mc_dbi = UINT_MAX; + mc->mc_next = NULL; + mc->mc_db = NULL; + mc->mc_dbx = NULL; + mc->mc_dbistate = NULL; + } + mdbx_cassert(mc, !(mc->mc_flags & C_UNTRACK)); + rc = mdbx_cursor_init(mc, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (txn->mt_cursors) { - mc->mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = mc; + if (!(txn->mt_flags & MDBX_TXN_RDONLY)) { + mc->mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = mc; mc->mc_flags |= C_UNTRACK; } @@ -16937,43 +17267,39 @@ int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { return likely(mc) ? mdbx_cursor_bind(txn, mc, mc->mc_dbi) : MDBX_EINVAL; } -/* Return the count of duplicate data items for the current key */ -int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { - if (unlikely(mc == NULL)) +int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { + if (unlikely(!src)) return MDBX_EINVAL; + if (unlikely(src->mc_signature != MDBX_MC_LIVE)) + return (src->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; - - int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + int rc = mdbx_cursor_bind(src->mc_txn, dest, src->mc_dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED))) - return MDBX_EINVAL; - - if (!mc->mc_snum) { - *countp = 0; - return MDBX_NOTFOUND; + assert(dest->mc_db == src->mc_db); + assert(dest->mc_dbi == src->mc_dbi); + assert(dest->mc_dbx == src->mc_dbx); + assert(dest->mc_dbistate == src->mc_dbistate); +again: + assert(dest->mc_txn == src->mc_txn); + dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK; + dest->mc_top = src->mc_top; + dest->mc_snum = src->mc_snum; + for (unsigned i = 0; i < src->mc_snum; ++i) { + dest->mc_ki[i] = src->mc_ki[i]; + dest->mc_pg[i] = src->mc_pg[i]; } - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) { - *countp = 0; - return MDBX_NOTFOUND; + if (src->mc_xcursor) { + dest->mc_xcursor->mx_db = src->mc_xcursor->mx_db; + dest->mc_xcursor->mx_dbx = src->mc_xcursor->mx_dbx; + src = &src->mc_xcursor->mx_cursor; + dest = &dest->mc_xcursor->mx_cursor; + goto again; } - *countp = 1; - if (mc->mc_xcursor != NULL) { - MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & - C_INITIALIZED)); - *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) - ? PTRDIFF_MAX - : (size_t)mc->mc_xcursor->mx_db.md_entries; - } - } return MDBX_SUCCESS; } @@ -16985,17 +17311,19 @@ void mdbx_cursor_close(MDBX_cursor *mc) { /* Remove from txn, if tracked. * A read-only txn (!C_UNTRACK) may have been freed already, * so do not peek inside it. Only write txns track cursors. 
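/* The mdbx_cursor_bind()/mdbx_cursor_renew() rework above keeps the pattern
 * of reusing one cursor object across read-only transactions (per the comment
 * just above, read-only txns do not track cursors, so the cursor outlives
 * them). A hedged sketch of that pattern, with error handling abbreviated: */
static int reuse_cursor_example(MDBX_env *env, MDBX_dbi dbi) {
  MDBX_txn *txn = NULL;
  int rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &txn);
  if (rc != MDBX_SUCCESS)
    return rc;
  MDBX_cursor *cur = NULL;
  rc = mdbx_cursor_open(txn, dbi, &cur);
  mdbx_txn_abort(txn); /* a cursor from a read-only txn survives the txn */
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &txn);
  if (rc == MDBX_SUCCESS) {
    rc = mdbx_cursor_renew(txn, cur); /* re-bind to the same dbi */
    mdbx_txn_abort(txn);
  }
  mdbx_cursor_close(cur);
  return rc;
}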
*/ - if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + if (mc->mc_flags & C_UNTRACK) { + mdbx_cassert(mc, !(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY)); + MDBX_cursor **prev = &mc->mc_txn->tw.cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; + mdbx_cassert(mc, *prev == mc); + *prev = mc->mc_next; } mc->mc_signature = 0; + mc->mc_next = mc; mdbx_free(mc); } else { - /* cursor closed before nested txn ends */ + /* Cursor closed before nested txn ends */ mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE); mc->mc_signature = MDBX_MC_WAIT4EOT; } @@ -17019,6 +17347,47 @@ MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) { return mc->mc_dbi; } +/* Return the count of duplicate data items for the current key */ +int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED))) + return MDBX_EINVAL; + + if (!mc->mc_snum) { + *countp = 0; + return MDBX_NOTFOUND; + } + + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) { + *countp = 0; + return MDBX_NOTFOUND; + } + + *countp = 1; + if (mc->mc_xcursor != NULL) { + MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & + C_INITIALIZED)); + *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) + ? PTRDIFF_MAX + : (size_t)mc->mc_xcursor->mx_db.md_entries; + } + } + return MDBX_SUCCESS; +} + /* Replace the key for a branch node with a new key. * Set MDBX_TXN_ERROR on failure. * [in] mc Cursor pointing to the node to operate on. 
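/* mdbx_cursor_count(), relocated above, reports how many duplicate data
 * items the current key holds; a hedged usage sketch for a MDBX_DUPSORT
 * table (the cursor is assumed to be already opened on that table): */
static int count_duplicates_example(MDBX_cursor *cur, MDBX_val *key,
                                    size_t *count) {
  MDBX_val data;
  int rc = mdbx_cursor_get(cur, key, &data, MDBX_SET_KEY);
  if (rc != MDBX_SUCCESS)
    return rc;
  /* *count is 1 for a plain key, >1 when the key carries duplicates */
  return mdbx_cursor_count(cur, count);
}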
@@ -17033,6 +17402,7 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { int ptr, i, nkeys, indx; DKBUF; + mdbx_cassert(mc, cursor_is_tracked(mc)); indx = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; node = page_node(mp, indx); @@ -17042,8 +17412,8 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; k2.iov_base = node_key(node); k2.iov_len = node_ks(node); - mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %" PRIaPGNO, indx, - ptr, mdbx_dump_val(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), + mdbx_debug("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, + indx, ptr, mdbx_dump_val(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), mp->mp_pgno); } @@ -17151,7 +17521,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { const unsigned snum = cdst->mc_snum; mdbx_cassert(csrc, snum > 0); MDBX_cursor mn; - mdbx_cursor_copy(cdst, &mn); + cursor_copy_internal(cdst, &mn); mn.mc_xcursor = NULL; /* must find the lowest key below dst */ rc = mdbx_page_search_lowest(&mn); @@ -17190,7 +17560,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - rc = mdbx_update_key(&mn, &key); + WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); if (unlikely(rc)) return rc; } else { @@ -17274,7 +17644,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); if (fromleft) { /* If we're adding on the left, bump others up */ - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) continue; @@ -17294,7 +17664,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { } } else { /* Adding on the right, bump others down */ - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (csrc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; if (m3 == csrc) continue; @@ -17333,7 +17703,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]", psrc->mp_pgno, DKEY(&key)); MDBX_cursor mn; - mdbx_cursor_copy(csrc, &mn); + cursor_copy_internal(csrc, &mn); mn.mc_xcursor = NULL; mdbx_cassert(csrc, mn.mc_snum > 0); mn.mc_snum--; @@ -17368,7 +17738,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]", pdst->mp_pgno, DKEY(&key)); MDBX_cursor mn; - mdbx_cursor_copy(cdst, &mn); + cursor_copy_internal(cdst, &mn); mn.mc_xcursor = NULL; mdbx_cassert(cdst, mn.mc_snum > 0); mn.mc_snum--; @@ -17405,6 +17775,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { int rc; mdbx_cassert(csrc, csrc != cdst); + mdbx_cassert(csrc, cursor_is_tracked(csrc)); + mdbx_cassert(cdst, cursor_is_tracked(cdst)); const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, @@ -17449,7 +17821,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { key.iov_base = node_key(srcnode); if (pagetype & P_BRANCH) { MDBX_cursor mn; - mdbx_cursor_copy(csrc, &mn); + cursor_copy_internal(csrc, &mn); mn.mc_xcursor = NULL; /* must find the lowest key below src */ rc = mdbx_page_search_lowest(&mn); @@ -17534,7 +17906,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { const MDBX_dbi dbi = csrc->mc_dbi; const unsigned top = csrc->mc_top; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; if (m3 == csrc || top >= m3->mc_snum) continue; @@ -17645,7 +18017,7 @@ bailout: /* Copy the contents of a cursor. * [in] csrc The cursor to copy from. * [out] cdst The cursor to copy to. */ -static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { +static void cursor_copy_internal(const MDBX_cursor *csrc, MDBX_cursor *cdst) { mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= *csrc->mc_txn->mt_env->me_oldest); cdst->mc_txn = csrc->mc_txn; @@ -17666,6 +18038,7 @@ static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { * [in] mc Cursor pointing to the page where rebalancing should begin. * Returns 0 on success, non-zero on failure. */ static int mdbx_rebalance(MDBX_cursor *mc) { + mdbx_cassert(mc, cursor_is_tracked(mc)); mdbx_cassert(mc, mc->mc_snum > 0); mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); @@ -17723,8 +18096,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mc->mc_db->md_overflow_pages == 0 && mc->mc_db->md_leaf_pages == 1); /* Adjust cursors pointing to mp */ - const MDBX_dbi dbi = mc->mc_dbi; - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; @@ -17758,10 +18130,10 @@ static int mdbx_rebalance(MDBX_cursor *mc) { } /* Adjust other cursors pointing to mp */ - MDBX_cursor *m2, *m3; - MDBX_dbi dbi = mc->mc_dbi; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { + MDBX_cursor *m3 = + (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) continue; if (m3->mc_pg[0] == mp) { @@ -17802,7 +18174,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { /* Find neighbors. */ MDBX_cursor mn; - mdbx_cursor_copy(mc, &mn); + cursor_copy_internal(mc, &mn); mn.mc_xcursor = NULL; MDBX_page *left = nullptr, *right = nullptr; @@ -17839,7 +18211,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { /* We want mdbx_rebalance to find mn when doing fixups */ WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); if (likely(rc != MDBX_RESULT_TRUE)) { - mdbx_cursor_copy(&mn, mc); + cursor_copy_internal(&mn, mc); mc->mc_ki[mc->mc_top] = new_ki; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; @@ -17852,7 +18224,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = nkeys; - rc = mdbx_page_merge(&mn, mc); + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = ki_top; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); @@ -17867,7 +18239,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mn.mc_ki[mn.mc_top - 1] = ki_pre_top - 1; mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1); mc->mc_ki[mc->mc_top] = 0; - rc = mdbx_node_move(&mn, mc, true); + WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, true)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = ki_top + 1; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); @@ -17880,7 +18252,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = nkeys; - rc = mdbx_node_move(&mn, mc, false); + WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, false)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = ki_top; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); @@ -17908,19 +18280,20 @@ static int mdbx_rebalance(MDBX_cursor *mc) { /* We want mdbx_rebalance to find mn when doing fixups */ WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); if (likely(rc != MDBX_RESULT_TRUE)) { - mdbx_cursor_copy(&mn, mc); + cursor_copy_internal(&mn, mc); mc->mc_ki[mc->mc_top] = new_ki; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } - } else if (likely(right)) { + } + if (likely(right)) { /* try merge with right */ mdbx_cassert(mc, page_numkeys(right) >= minkeys); mn.mc_pg[mn.mc_top] = right; mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = nkeys; - rc = mdbx_page_merge(&mn, mc); + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = ki_top; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); @@ -18058,7 +18431,7 @@ static __cold int mdbx_page_check(MDBX_cursor *const mc, lp->mp_pgno); continue; } - if (unlikely(number_of_ovpages(env, dsize) != lp->mp_pages)) + if (unlikely(number_of_ovpages(env, dsize) > lp->mp_pages)) rc = bad_page(mp, "big-node size (%zu) mismatch n-pages size (%u)\n", dsize, lp->mp_pages); @@ -18278,135 +18651,115 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { MDBX_page *mp; indx_t ki; unsigned nkeys; - MDBX_cursor *m2, *m3; MDBX_dbi dbi = 
mc->mc_dbi; + mdbx_cassert(mc, cursor_is_tracked(mc)); mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; mdbx_node_del(mc, mc->mc_db->md_xsize); mc->mc_db->md_entries--; - { - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - } - continue; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; + + /* Adjust other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDBX_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } } + rc = mdbx_rebalance(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - if (likely(rc == MDBX_SUCCESS)) { + if (unlikely(!mc->mc_snum)) { /* DB is totally empty now, just bail out. * Other cursors adjustments were already done * by mdbx_rebalance and aren't needed here. */ - if (!mc->mc_snum) { - mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && - mc->mc_db->md_root == P_INVALID); - mc->mc_flags |= C_DEL | C_EOF; - return rc; - } - - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - nkeys = page_numkeys(mp); - mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || - ((mc->mc_flags & C_SUB) && - mc->mc_db->md_entries == 0 && nkeys == 0)); + mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && + mc->mc_db->md_root == P_INVALID); + mc->mc_flags |= C_EOF; + return MDBX_SUCCESS; + } - /* Adjust THIS and other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdbx_cursor_sibling(m3, true); - if (rc == MDBX_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; - continue; - } else if (unlikely(rc != MDBX_SUCCESS)) - break; + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + nkeys = page_numkeys(mp); + mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || + ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && + nkeys == 0)); + + /* Adjust this and other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdbx_cursor_sibling(m3, SIBLING_RIGHT); + if (rc == MDBX_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDBX_SUCCESS; + continue; } - if (m3->mc_ki[mc->mc_top] >= ki || m3->mc_pg[mc->mc_top] != mp) { - if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && - (m3->mc_flags & C_EOF) == 0) { - MDBX_node *node = - page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. */ - if (node_flags(node) & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - break; - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + if (m3->mc_ki[mc->mc_top] >= ki || + /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { + if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { + MDBX_node *node = + page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not inited it must be reinited. + * Else if node points to a subDB, nothing is needed. 
*/ + if (node_flags(node) & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node_flags(node) & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); + } else { + rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = mdbx_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; } } + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; } + m3->mc_flags |= C_DEL; } } - - if (mc->mc_ki[mc->mc_top] >= nkeys) { - rc = mdbx_cursor_sibling(mc, true); - if (rc == MDBX_NOTFOUND) { - mc->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; - } - } - if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && - (mc->mc_flags & C_EOF) == 0) { - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. */ - if (node_flags(node) & F_DUPDATA) { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); - if (likely(rc != MDBX_SUCCESS)) - mc->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - } - } - mc->mc_flags |= C_DEL; } - if (unlikely(rc)) - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - else if (mdbx_audit_enabled()) + mdbx_cassert(mc, rc == MDBX_SUCCESS); + if (mdbx_audit_enabled()) rc = mdbx_cursor_check(mc, 0); + return rc; +bailout: + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; } @@ -18461,10 +18814,10 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, * is larger than the current one, the parent page may * run out of space, triggering a split. We need this * cursor to be consistent until the end of the rebalance. */ - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; rc = mdbx_cursor_del(&cx.outer, flags); - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; } return rc; } @@ -18551,7 +18904,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, mdbx_debug("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); } - mdbx_cursor_copy(mc, &mn); + cursor_copy_internal(mc, &mn); mn.mc_xcursor = NULL; mn.mc_pg[mn.mc_top] = rp; mn.mc_ki[mn.mc_top] = 0; @@ -18744,7 +19097,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, } else { /* find right page's left sibling */ mc->mc_ki[ptop] = mn.mc_ki[ptop]; - rc = mdbx_cursor_sibling(mc, false); + rc = mdbx_cursor_sibling(mc, SIBLING_LEFT); } } } else { @@ -18900,7 +19253,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, MDBX_dbi dbi = mc->mc_dbi; nkeys = page_numkeys(mp); - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + for (m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; if (m3 == mc) continue; @@ -18976,8 +19329,8 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, rc = mdbx_cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; /* LY: support for update (explicit overwrite) */ if (flags & MDBX_CURRENT) { @@ -18998,7 +19351,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, if (likely(rc == MDBX_SUCCESS)) rc = mdbx_cursor_put(&cx.outer, key, data, flags); - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; return rc; } @@ -19008,7 +19361,6 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, #ifndef MDBX_WBUF #define MDBX_WBUF ((size_t)1024 * 1024) #endif -#define MDBX_EOF 0x10 /* mdbx_env_copythr() is done reading */ /* State needed for a double-buffering compacting copy. */ typedef struct mdbx_copy { @@ -19020,53 +19372,67 @@ typedef struct mdbx_copy { size_t mc_wlen[2]; size_t mc_olen[2]; mdbx_filehandle_t mc_fd; - volatile int mc_error; - pgno_t mc_next_pgno; - short mc_toggle; /* Buffer number in provider */ - short mc_new; /* (0-2 buffers to write) | (MDBX_EOF at end) */ /* Error code. Never cleared if set. Both threads can set nonzero * to fail the copy. Not mutex-protected, MDBX expects atomic int. */ + volatile int mc_error; + pgno_t mc_next_pgno; + volatile unsigned mc_head; + volatile unsigned mc_tail; } mdbx_copy; /* Dedicated writer thread for compacting copy. */ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { mdbx_copy *my = arg; - uint8_t *ptr; - int toggle = 0; + +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + sigset_t sigset; + sigemptyset(&sigset); + sigaddset(&sigset, SIGPIPE); + my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); +#endif /* EPIPE */ mdbx_condpair_lock(&my->mc_condpair); while (!my->mc_error) { - while (!my->mc_new && !my->mc_error) { + while (my->mc_tail == my->mc_head && !my->mc_error) { int err = mdbx_condpair_wait(&my->mc_condpair, true); if (err != MDBX_SUCCESS) { my->mc_error = err; goto bailout; } } - if (my->mc_new == 0 + MDBX_EOF) /* 0 buffers, just EOF */ - break; + const unsigned toggle = my->mc_tail & 1; size_t wsize = my->mc_wlen[toggle]; - ptr = my->mc_wbuf[toggle]; + if (wsize == 0) { + my->mc_tail += 1; + break /* EOF */; + } + my->mc_wlen[toggle] = 0; + uint8_t *ptr = my->mc_wbuf[toggle]; again: - if (wsize > 0 && !my->mc_error) { + if (!my->mc_error) { int err = mdbx_write(my->mc_fd, ptr, wsize); if (err != MDBX_SUCCESS) { +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + if (err == EPIPE) { + /* Collect the pending SIGPIPE, + * otherwise at least OS X gives it to the process on thread-exit. 
*/ + int unused; + sigwait(&sigset, &unused); + } +#endif /* EPIPE */ my->mc_error = err; goto bailout; } } /* If there's an overflow page tail, write it too */ - if (my->mc_olen[toggle]) { - wsize = my->mc_olen[toggle]; - ptr = my->mc_over[toggle]; + wsize = my->mc_olen[toggle]; + if (wsize) { my->mc_olen[toggle] = 0; + ptr = my->mc_over[toggle]; goto again; } - my->mc_wlen[toggle] = 0; - toggle ^= 1; - /* Return the empty buffer to provider */ - my->mc_new--; + my->mc_tail += 1; mdbx_condpair_signal(&my->mc_condpair, false); } bailout: @@ -19074,24 +19440,19 @@ bailout: return (THREAD_RESULT)0; } -/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. - * - * [in] my control structure. - * [in] adjust (1 to hand off 1 buffer) | (MDBX_EOF when ending). */ -static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { +/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */ +static __cold int mdbx_env_cthr_toggle(mdbx_copy *my) { mdbx_condpair_lock(&my->mc_condpair); - my->mc_new += (short)adjust; + mdbx_assert(my->mc_env, my->mc_head - my->mc_tail < 2 || my->mc_error); + my->mc_head += 1; mdbx_condpair_signal(&my->mc_condpair, true); - while (!my->mc_error && (my->mc_new & 2) /* both buffers in use */) { + while (!my->mc_error && + my->mc_head - my->mc_tail == 2 /* both buffers in use */) { int err = mdbx_condpair_wait(&my->mc_condpair, false); if (err != MDBX_SUCCESS) my->mc_error = err; } mdbx_condpair_unlock(&my->mc_condpair); - - my->mc_toggle ^= (adjust & 1); - /* Both threads reset mc_wlen, to be safe from threading errors */ - my->mc_wlen[my->mc_toggle] = 0; return my->mc_error; } @@ -19099,11 +19460,11 @@ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { * [in] my control structure. * [in,out] pg database root. * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. */ -static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { +static __cold int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { MDBX_cursor_couple couple; MDBX_page *mo, *mp, *leaf; char *buf, *ptr; - int rc, toggle; + int rc; unsigned i; /* Empty DB, nothing to do */ @@ -19139,11 +19500,9 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { /* This is writable space for a leaf page. Usually not needed. 
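/* The compacting-copy machinery above replaces the old mc_new/mc_toggle
 * bookkeeping with monotonically increasing mc_head/mc_tail counters: the
 * producer fills buffer (head & 1) and then advances head, the writer thread
 * drains buffer (tail & 1) and advances tail, and head - tail never exceeds
 * 2. A generic stand-alone sketch of that handoff, assuming POSIX threads
 * and omitting error handling: */
#include <pthread.h>

typedef struct {
  pthread_mutex_t lock;
  pthread_cond_t filled;  /* signaled by the producer: a buffer is ready */
  pthread_cond_t drained; /* signaled by the writer: a buffer is free again */
  unsigned head, tail;    /* monotonically increasing counters */
} handoff_t;

static void producer_submit(handoff_t *h) {
  pthread_mutex_lock(&h->lock);
  h->head += 1; /* publish the buffer just filled; (head & 1) now names the next one */
  pthread_cond_signal(&h->filled);
  while (h->head - h->tail == 2) /* both buffers busy, wait for the writer */
    pthread_cond_wait(&h->drained, &h->lock);
  pthread_mutex_unlock(&h->lock);
}

static void writer_drain(handoff_t *h) {
  pthread_mutex_lock(&h->lock);
  while (h->tail == h->head) /* nothing queued yet */
    pthread_cond_wait(&h->filled, &h->lock);
  /* ... write out buffer (h->tail & 1) here ... */
  h->tail += 1;
  pthread_cond_signal(&h->drained);
  pthread_mutex_unlock(&h->lock);
}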
*/ leaf = (MDBX_page *)ptr; - toggle = my->mc_toggle; while (couple.outer.mc_snum > 0) { - unsigned n; mp = couple.outer.mc_pg[couple.outer.mc_top]; - n = page_numkeys(mp); + unsigned n = page_numkeys(mp); if (IS_LEAF(mp)) { if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { @@ -19166,11 +19525,12 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { pp_txnid4chk(mp, my->mc_txn)); if (unlikely(rc != MDBX_SUCCESS)) goto done; - if (my->mc_wlen[toggle] >= MDBX_WBUF) { - rc = mdbx_env_cthr_toggle(my, 1); + unsigned toggle = my->mc_head & 1; + if (my->mc_wlen[toggle] + my->mc_env->me_psize > MDBX_WBUF) { + rc = mdbx_env_cthr_toggle(my); if (unlikely(rc != MDBX_SUCCESS)) goto done; - toggle = my->mc_toggle; + toggle = my->mc_head & 1; } mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); memcpy(mo, omp, my->mc_env->me_psize); @@ -19180,10 +19540,10 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { if (omp->mp_pages > 1) { my->mc_olen[toggle] = pgno2bytes(my->mc_env, omp->mp_pages - 1); my->mc_over[toggle] = (uint8_t *)omp + my->mc_env->me_psize; - rc = mdbx_env_cthr_toggle(my, 1); + rc = mdbx_env_cthr_toggle(my); if (unlikely(rc != MDBX_SUCCESS)) goto done; - toggle = my->mc_toggle; + toggle = my->mc_head & 1; } } else if (node_flags(node) & F_SUBDATA) { if (node_ds(node) != sizeof(MDBX_db)) { @@ -19201,11 +19561,9 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { MDBX_db db; memcpy(&db, node_data(node), sizeof(MDBX_db)); - my->mc_toggle = (short)toggle; rc = mdbx_env_cwalk(my, &db.md_root, node_flags(node) & F_DUPDATA); if (rc) goto done; - toggle = my->mc_toggle; memcpy(node_data(node), &db, sizeof(MDBX_db)); } } @@ -19234,11 +19592,12 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { continue; } } - if (my->mc_wlen[toggle] >= MDBX_WBUF) { - rc = mdbx_env_cthr_toggle(my, 1); + unsigned toggle = my->mc_head & 1; + if (my->mc_wlen[toggle] + my->mc_wlen[toggle] > MDBX_WBUF) { + rc = mdbx_env_cthr_toggle(my); if (unlikely(rc != MDBX_SUCCESS)) goto done; - toggle = my->mc_toggle; + toggle = my->mc_head & 1; } mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); mdbx_page_copy(mo, mp, my->mc_env->me_psize); @@ -19298,7 +19657,7 @@ static __cold void make_sizeable(MDBX_meta *meta) { } /* Copy environment with compaction. */ -static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, +static __cold int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, mdbx_filehandle_t fd, uint8_t *buffer, const bool dest_is_pipe, const int flags) { const size_t meta_bytes = pgno2bytes(env, NUM_METAS); @@ -19375,8 +19734,12 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, } if (rc == MDBX_SUCCESS) rc = mdbx_env_cwalk(&ctx, &root, 0); - mdbx_env_cthr_toggle(&ctx, 1 | MDBX_EOF); + mdbx_env_cthr_toggle(&ctx); + mdbx_env_cthr_toggle(&ctx); thread_err = mdbx_thread_join(thread); + mdbx_assert(env, (ctx.mc_tail == ctx.mc_head && + ctx.mc_wlen[ctx.mc_head & 1] == 0) || + ctx.mc_error); mdbx_condpair_destroy(&ctx.mc_condpair); } if (unlikely(thread_err != MDBX_SUCCESS)) @@ -19435,7 +19798,7 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, } /* Copy environment as-is. 
*/ -static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, +static __cold int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, mdbx_filehandle_t fd, uint8_t *buffer, const bool dest_is_pipe, const int flags) { /* We must start the actual read txn after blocking writers */ @@ -19542,7 +19905,7 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, return rc; } -int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, +__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, unsigned flags) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) @@ -19607,7 +19970,7 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, return rc; } -int __cold mdbx_env_copy(MDBX_env *env, const char *dest_path, +__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) @@ -19669,7 +20032,7 @@ int __cold mdbx_env_copy(MDBX_env *env, const char *dest_path, /******************************************************************************/ -int __cold mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, +__cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, bool onoff) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) @@ -19697,7 +20060,7 @@ int __cold mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, return MDBX_SUCCESS; } -int __cold mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { +__cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19709,7 +20072,7 @@ int __cold mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { return MDBX_SUCCESS; } -int __cold mdbx_env_set_userctx(MDBX_env *env, void *ctx) { +__cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19722,7 +20085,7 @@ void *__cold mdbx_env_get_userctx(const MDBX_env *env) { return env ? 
env->me_userctx : NULL; } -int __cold mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { +__cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19736,7 +20099,7 @@ int __cold mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { #endif } -int __cold mdbx_env_get_path(const MDBX_env *env, const char **arg) { +__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19744,11 +20107,11 @@ int __cold mdbx_env_get_path(const MDBX_env *env, const char **arg) { if (unlikely(!arg)) return MDBX_EINVAL; - *arg = env->me_path; + *arg = env->me_pathname; return MDBX_SUCCESS; } -int __cold mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { +__cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19778,11 +20141,11 @@ static void mdbx_stat0(const MDBX_env *env, const MDBX_db *db, MDBX_stat *dest, dest->ms_mod_txnid = db->md_mod_txnid; } -int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *dest, size_t bytes) { - return mdbx_env_stat_ex(env, NULL, dest, bytes); +__cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, size_t bytes) { + return __inline_mdbx_env_stat(env, stat, bytes); } -int __cold mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, +__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_stat *dest, size_t bytes) { if (unlikely((env == NULL && txn == NULL) || dest == NULL)) return MDBX_EINVAL; @@ -19820,7 +20183,7 @@ int __cold mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, } } -int __cold mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, +__cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, uint32_t *mask) { int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -19871,11 +20234,12 @@ int __cold mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, return (rc == MDBX_NOTFOUND) ? 
MDBX_SUCCESS : rc; } -int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { - return mdbx_env_info_ex(env, NULL, arg, bytes); +__cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info, + size_t bytes) { + return __inline_mdbx_env_info(env, info, bytes); } -int __cold mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, +__cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *arg, size_t bytes) { if (unlikely((env == NULL && txn == NULL) || arg == NULL)) return MDBX_EINVAL; @@ -20249,6 +20613,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, goto later_bailout; dbiflags |= DBI_DIRTY | DBI_CREAT; + txn->mt_flags |= MDBX_TXN_DIRTY; } /* Got info, register DBI in this txn */ @@ -20266,6 +20631,8 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, txn->mt_dbistate[slot] = (uint8_t)dbiflags; txn->mt_dbxs[slot].md_name.iov_base = namedup; txn->mt_dbxs[slot].md_name.iov_len = len; + if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) + txn->tw.cursors[slot] = NULL; txn->mt_numdbs += (slot == txn->mt_numdbs); if ((dbiflags & DBI_CREAT) == 0) { env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; @@ -20294,7 +20661,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, return dbi_open(txn, table_name, table_flags, dbi, keycmp, datacmp); } -int __cold mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, +__cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, size_t bytes) { int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -20376,8 +20743,7 @@ int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, } int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { - unsigned state; - return mdbx_dbi_flags_ex(txn, dbi, flags, &state); + return __inline_mdbx_dbi_flags(txn, dbi, flags); } /* Add all the DB's pages to the free list. 
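The compacting-copy hunks earlier in this section replace the old mc_new/mc_toggle/MDBX_EOF bookkeeping with two monotonic counters, mc_head and mc_tail: the low bit of each counter selects one of the two write buffers, at most two submissions may be outstanding (head - tail <= 2), and a zero-length buffer now tells the writer thread to stop. A minimal, self-contained sketch of that hand-off scheme, using plain pthreads and hypothetical names (dbuf_t, dbuf_writer, dbuf_submit) rather than libmdbx's mdbx_condpair helpers:

/* Double-buffered producer/consumer hand-off driven by monotonic counters.
 * Hypothetical illustration only; not part of the libmdbx API. */
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DBUF_SIZE 4096

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t nonempty; /* signaled by the producer after head += 1 */
  pthread_cond_t nonfull;  /* signaled by the writer after tail += 1 */
  unsigned head, tail;     /* monotonic; (counter & 1) picks the buffer */
  size_t len[2];
  uint8_t buf[2][DBUF_SIZE];
  FILE *out;
} dbuf_t;

/* Writer thread: drain buffers in submission order until a zero-length one. */
static void *dbuf_writer(void *arg) {
  dbuf_t *d = arg;
  pthread_mutex_lock(&d->mutex);
  for (;;) {
    while (d->tail == d->head)
      pthread_cond_wait(&d->nonempty, &d->mutex);
    const unsigned slot = d->tail & 1;
    const size_t wlen = d->len[slot];
    if (wlen == 0) {
      d->tail += 1; /* empty buffer == EOF marker */
      break;
    }
    pthread_mutex_unlock(&d->mutex);
    fwrite(d->buf[slot], 1, wlen, d->out); /* I/O outside the lock */
    pthread_mutex_lock(&d->mutex);
    d->len[slot] = 0;
    d->tail += 1;
    pthread_cond_signal(&d->nonfull);
  }
  pthread_mutex_unlock(&d->mutex);
  return NULL;
}

/* Producer: hand off one buffer (n <= DBUF_SIZE), waiting while both are busy.
 * Submitting n == 0 delivers the EOF marker. */
static void dbuf_submit(dbuf_t *d, const void *data, size_t n) {
  pthread_mutex_lock(&d->mutex);
  while (d->head - d->tail == 2) /* both buffers in flight */
    pthread_cond_wait(&d->nonfull, &d->mutex);
  const unsigned slot = d->head & 1;
  d->len[slot] = n;
  if (n)
    memcpy(d->buf[slot], data, n);
  d->head += 1;
  pthread_cond_signal(&d->nonempty);
  pthread_mutex_unlock(&d->mutex);
}

Initialise the mutex and both condition variables, start dbuf_writer in its own thread, call dbuf_submit for each filled buffer, and finish with dbuf_submit(d, NULL, 0) before joining the thread; that final empty submission mirrors the two trailing mdbx_env_cthr_toggle calls followed by mdbx_thread_join in mdbx_env_compact.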
@@ -20404,7 +20770,7 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { if (unlikely(rc)) goto done; - mdbx_cursor_copy(mc, &mx); + cursor_copy_internal(mc, &mx); while (mc->mc_snum > 0) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; unsigned n = page_numkeys(mp); @@ -20446,7 +20812,7 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { break; mdbx_cassert(mc, i <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (indx_t)i; - rc = mdbx_cursor_sibling(mc, 1); + rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT); if (rc) { if (unlikely(rc != MDBX_NOTFOUND)) goto done; @@ -20501,7 +20867,7 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { rc = mdbx_drop0(mc, mc->mc_db->md_flags & MDBX_DUPSORT); /* Invalidate the dropped DB's cursors */ - for (MDBX_cursor *m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + for (MDBX_cursor *m2 = txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) m2->mc_flags &= ~(C_INITIALIZED | C_EOF); if (unlikely(rc)) goto bailout; @@ -20566,7 +20932,7 @@ int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { return MDBX_SUCCESS; } -int __cold mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, +__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, void *ctx) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) @@ -20666,7 +21032,7 @@ static bool __cold mdbx_pid_insert(uint32_t *ids, uint32_t pid) { return true; } -int __cold mdbx_reader_check(MDBX_env *env, int *dead) { +__cold int mdbx_reader_check(MDBX_env *env, int *dead) { if (dead) *dead = 0; return mdbx_cleanup_dead_readers(env, false, dead); @@ -20676,9 +21042,8 @@ int __cold mdbx_reader_check(MDBX_env *env, int *dead) { * MDBX_RESULT_TRUE - done and mutex recovered * MDBX_SUCCESS - done * Otherwise errcode. */ -MDBX_INTERNAL_FUNC int __cold mdbx_cleanup_dead_readers(MDBX_env *env, - int rdt_locked, - int *dead) { +MDBX_INTERNAL_FUNC __cold int +mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20775,7 +21140,7 @@ MDBX_INTERNAL_FUNC int __cold mdbx_cleanup_dead_readers(MDBX_env *env, return rc; } -int __cold mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { +__cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { const int rc = mdbx_runtime_flags | (mdbx_loglevel << 16); if (loglevel != MDBX_LOG_DONTCHANGE) @@ -20893,7 +21258,7 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env, return mdbx_find_oldest(env->me_txn); } -int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { +__cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20913,7 +21278,7 @@ int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { return MDBX_SUCCESS; } -int __cold mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { +__cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20933,7 +21298,7 @@ int __cold mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { return MDBX_SUCCESS; } -int __cold mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { +__cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20990,7 +21355,7 @@ typedef struct mdbx_walk_ctx { bool mw_dont_check_keys_ordering; } mdbx_walk_ctx_t; -static int __cold 
mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, +static __cold int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, const char *name, int deep); static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { @@ -21011,7 +21376,7 @@ static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { } /* Depth-first tree traversal. */ -static int __cold mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, +static __cold int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, const char *name, int deep, txnid_t parent_txnid) { assert(pgno != P_INVALID); @@ -21249,7 +21614,7 @@ static int __cold mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, return MDBX_SUCCESS; } -static int __cold mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, +static __cold int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, const char *name, int deep) { if (unlikely(db->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ @@ -21272,7 +21637,7 @@ static int __cold mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, return rc; } -int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, +__cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *user, bool dont_check_keys_ordering) { int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -21333,10 +21698,11 @@ int mdbx_cursor_on_first(const MDBX_cursor *mc) { return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_FALSE; + return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; for (unsigned i = 0; i < mc->mc_snum; ++i) { if (mc->mc_ki[i]) @@ -21351,10 +21717,11 @@ int mdbx_cursor_on_last(const MDBX_cursor *mc) { return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_FALSE; + return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; for (unsigned i = 0; i < mc->mc_snum; ++i) { unsigned nkeys = page_numkeys(mc->mc_pg[i]); @@ -21370,7 +21737,8 @@ int mdbx_cursor_eof(const MDBX_cursor *mc) { return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; if ((mc->mc_flags & C_INITIALIZED) == 0) return MDBX_RESULT_TRUE; @@ -21401,9 +21769,13 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, r->level = 0; r->root_nkeys = 0; - if (unlikely(y->mc_signature != MDBX_MC_LIVE || - x->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + if (unlikely(x->mc_signature != MDBX_MC_LIVE)) + return (x->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + if (unlikely(y->mc_signature != MDBX_MC_LIVE)) + return (y->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn(x->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -21569,7 +21941,8 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(cursor->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (cursor->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn(cursor->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -21579,7 +21952,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, return MDBX_ENODATA; MDBX_cursor_couple next; - mdbx_cursor_copy(cursor, &next.outer); + cursor_copy_internal(cursor, &next.outer); next.outer.mc_xcursor = NULL; if (cursor->mc_db->md_flags & MDBX_DUPSORT) { next.outer.mc_xcursor = &next.inner; @@ -21587,7 +21960,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner; - mdbx_cursor_copy(&mx->mx_cursor, &next.inner.mx_cursor); + cursor_copy_internal(&mx->mx_cursor, &next.inner.mx_cursor); } MDBX_val stub = {0, 0}; @@ -21824,8 +22197,8 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, rc = mdbx_cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; MDBX_val present_key = *key; if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) { @@ -21901,7 +22274,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS); bailout: - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; return rc; } @@ -22040,6 +22413,14 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, /*----------------------------------------------------------------------------*/ +__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_min(void) { + return __inline_mdbx_limits_pgsize_min(); +} + +__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_max(void) { + return __inline_mdbx_limits_pgsize_max(); +} + __cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) { if (pagesize < 1) pagesize = (intptr_t)mdbx_syspagesize(); @@ -22143,6 +22524,14 @@ uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) { return float2key(ieee754_32bit); } +MDBX_NOTHROW_CONST_FUNCTION uint64_t mdbx_key_from_int64(const int64_t i64) { + return __inline_mdbx_key_from_int64(i64); +} + +MDBX_NOTHROW_CONST_FUNCTION uint32_t mdbx_key_from_int32(const int32_t i32) { + return __inline_mdbx_key_from_int32(i32); +} + #define IEEE754_DOUBLE_MANTISSA_SIZE 52 #define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF #define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF @@ -22409,10 +22798,10 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, rc = mdbx_cursor_set(&cx.outer, key, &old_data, MDBX_SET, NULL); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND && data) { - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; rc = mdbx_cursor_put_attr(&cx.outer, key, data, attr, 0); - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; } return rc; } @@ -22427,11 +22816,11 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, old_data.iov_len) == 0))) return MDBX_SUCCESS; - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; rc = mdbx_cursor_put_attr(&cx.outer, key, data ? 
data : &old_data, attr, MDBX_CURRENT); - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; return rc; } #endif /* MDBX_NEXENTA_ATTRS */ @@ -22805,6 +23194,9 @@ typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 { #ifndef STATUS_INVALID_DEVICE_REQUEST #define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L) #endif +#ifndef STATUS_NOT_SUPPORTED +#define STATUS_NOT_SUPPORTED ((NTSTATUS)0xC00000BBL) +#endif #ifndef FILE_DEVICE_FILE_SYSTEM #define FILE_DEVICE_FILE_SYSTEM 0x00000009 @@ -23206,6 +23598,20 @@ MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { #endif } +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname) { +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + return RemoveDirectoryW(pathnameW) ? MDBX_SUCCESS : GetLastError(); +#else + return rmdir(pathname) ? errno : MDBX_SUCCESS; +#endif +} + MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, const MDBX_env *env, const char *pathname, mdbx_filehandle_t *fd, @@ -23256,6 +23662,12 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, FlagsAndAttributes |= (env->me_psize < env->me_os_psize) ? 0 : FILE_FLAG_NO_BUFFERING; break; + case MDBX_OPEN_DELETE: + CreationDisposition = OPEN_EXISTING; + ShareMode |= FILE_SHARE_DELETE; + DesiredAccess = + FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES | DELETE | SYNCHRONIZE; + break; } *fd = CreateFileW(pathnameW, DesiredAccess, ShareMode, NULL, @@ -23304,6 +23716,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, flags |= O_FSYNC; #endif break; + case MDBX_OPEN_DELETE: + flags = O_RDWR; + break; } const bool direct_nocache_for_copy = @@ -23729,7 +24144,8 @@ static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { if (!(flags & MDBX_EXCLUSIVE)) return ERROR_REMOTE_STORAGE_MEDIA_ERROR; } else if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED && - rc != STATUS_INVALID_DEVICE_REQUEST) + rc != STATUS_INVALID_DEVICE_REQUEST && + rc != STATUS_NOT_SUPPORTED) return ntstatus2errcode(rc); } @@ -24113,10 +24529,9 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, LARGE_INTEGER SectionSize; int err, rc = MDBX_SUCCESS; - if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current) { + if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current && + /* workaround for Wine */ mdbx_NtExtendSection) { /* growth rw-section */ - if (!mdbx_NtExtendSection) - return MDBX_UNABLE_EXTEND_MAPSIZE /* workaround for Wine */; SectionSize.QuadPart = size; status = mdbx_NtExtendSection(map->section, &SectionSize); if (!NT_SUCCESS(status)) @@ -24173,6 +24588,7 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, return err; } +retry_file_and_section: /* resizing of the file may take a while, * therefore we reserve address space to avoid occupy it by other threads */ ReservedAddress = map->address; @@ -24188,7 +24604,6 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, map->address = NULL; } -retry_file_and_section: err = mdbx_filesize(map->fd, &map->filesize); if (err != MDBX_SUCCESS) goto bailout; @@ -24255,7 +24670,7 @@ retry_mapview:; * but will return MDBX_UNABLE_EXTEND_MAPSIZE on 
success */ rc = MDBX_UNABLE_EXTEND_MAPSIZE; size = map->current; - limit = map->limit; + ReservedSize = limit = map->limit; goto retry_file_and_section; } @@ -24997,10 +25412,10 @@ __dll_export const struct MDBX_version_info mdbx_version = { 0, 9, - 1, - 18, - {"2020-10-08T01:50:18+03:00", "5cbfdfdc65b62937f7fc927e55cec6304bf04f7c", "1d31ebdc1c930ceb1b223691ac27e507d86cf8e2", - "v0.9.1-18-g1d31ebdc1c"}, + 2, + 0, + {"2020-11-27T10:09:03+03:00", "481b41a5b377ff247625bd2cec60c0ca84f2f2f5", "092ab094c4f8e7c704030568db62bad5d106755f", + "v0.9.2-0-g092ab09"}, sourcery}; __dll_export @@ -25221,6 +25636,15 @@ MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { mdbx_srwlock_ReleaseShared(&env->me_remap_guard); } +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { + return flock(fd, + wait ? LCK_EXCLUSIVE | LCK_WAITFOR + : LCK_EXCLUSIVE | LCK_DONTWAIT, + 0, LCK_MAXLEN) + ? MDBX_SUCCESS + : GetLastError(); +} + static int suspend_and_append(mdbx_handle_array_t **array, const DWORD ThreadId) { const unsigned limit = (*array)->limit; @@ -25866,13 +26290,17 @@ static void mdbx_winnt_import(void) { #ifndef MDBX_ALLOY uint32_t mdbx_linux_kernel_version; -bool mdbx_RunningOnWSL; +bool mdbx_RunningOnWSL1; #endif /* MDBX_ALLOY */ -static __cold bool probe_for_WSL(const char *tag) { - /* "Official" way of detecting WSL but not WSL2 - * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 */ - return strstr(tag, "Microsoft") || strstr(tag, "WSL"); +static __cold uint8_t probe_for_WSL(const char *tag) { + const char *const WSL = strstr(tag, "WSL"); + if (WSL && WSL[3] >= '2' && WSL[3] <= '9') + return WSL[3] - '0'; + const char *const wsl = strstr(tag, "wsl"); + if (wsl && wsl[3] >= '2' && wsl[3] <= '9') + return wsl[3] - '0'; + return (WSL || wsl || strcasestr(tag, "Microsoft")) ? 1 : 0; } #endif /* Linux */ @@ -25882,9 +26310,16 @@ mdbx_global_constructor(void) { #if defined(__linux__) || defined(__gnu_linux__) struct utsname buffer; if (uname(&buffer) == 0) { - mdbx_RunningOnWSL = probe_for_WSL(buffer.version) || - probe_for_WSL(buffer.sysname) || - probe_for_WSL(buffer.release); + /* "Official" way of detecting WSL1 but not WSL2 + * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 + * + * WARNING: False negative detection of WSL1 will result in DATA LOSS! + * So, the REQUIREMENTS for this code: + * 1. MUST detect WSL1 without false-negatives. + * 2. DESIRABLE detect WSL2 but without the risk of violating the first. */ + mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 || + probe_for_WSL(buffer.sysname) == 1 || + probe_for_WSL(buffer.release) == 1; int i = 0; char *p = buffer.release; while (*p && i < 4) { @@ -26030,6 +26465,14 @@ static int lck_op(mdbx_filehandle_t fd, int cmd, int lck, off_t offset, } } +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { +#if MDBX_USE_OFDLOCKS + if (unlikely(op_setlk == 0)) + choice_fcntl(); +#endif /* MDBX_USE_OFDLOCKS */ + return lck_op(fd, wait ? 
op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX); +} + MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_pid > 0); @@ -26146,10 +26589,10 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) { int rc = MDBX_SUCCESS; #if defined(__linux__) || defined(__gnu_linux__) - if (unlikely(mdbx_RunningOnWSL)) { + if (unlikely(mdbx_RunningOnWSL1)) { rc = ENOLCK /* No record locks available */; mdbx_error("%s, err %u", - "WSL (Windows Subsystem for Linux) is mad and trouble-full, " + "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " "injecting failure to avoid data loss", rc); return rc; |
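The hunk just above adds a POSIX mdbx_lockfile() that acquires an exclusive write lock over the whole file through the existing lck_op() helper (blocking via op_setlkw or non-blocking via op_setlk, which choice_fcntl() may map to the OFD-lock variants). A minimal sketch of the same whole-file lock in plain fcntl() terms, with a hypothetical name and without the OFD-lock selection:

/* Exclusive whole-file lock; returns 0 on success, otherwise an errno value.
 * Simplified illustration: the library itself locks an explicit 0..OFF_T_MAX
 * range, while l_len == 0 here means "to the end of the file and beyond". */
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int lock_whole_file_exclusive(int fd, int wait) {
  struct flock lk;
  memset(&lk, 0, sizeof(lk));
  lk.l_type = F_WRLCK;   /* exclusive (write) lock */
  lk.l_whence = SEEK_SET;
  lk.l_start = 0;
  lk.l_len = 0;          /* whole file */
  if (fcntl(fd, wait ? F_SETLKW : F_SETLK, &lk) == 0)
    return 0;
  return errno;          /* e.g. EAGAIN or EACCES when already locked and !wait */
}

The Windows counterpart added earlier in this diff reaches the same goal through the flock() wrapper with LCK_EXCLUSIVE plus LCK_WAITFOR or LCK_DONTWAIT instead of fcntl record locks.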