diff options
Diffstat (limited to 'libs/libmdbx/src')
-rw-r--r-- | libs/libmdbx/src/CMakeLists.txt | 19 | ||||
-rw-r--r-- | libs/libmdbx/src/ChangeLog.md | 46 | ||||
-rw-r--r-- | libs/libmdbx/src/GNUmakefile | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/README.md | 10 | ||||
-rw-r--r-- | libs/libmdbx/src/VERSION | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/cmake/utils.cmake | 37 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.c | 1959 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.c++ | 249 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.h | 300 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.h++ | 120 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_chk.c | 97 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_copy.c | 97 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_dump.c | 111 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_load.c | 193 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_stat.c | 97 |
15 files changed, 2118 insertions, 1221 deletions
diff --git a/libs/libmdbx/src/CMakeLists.txt b/libs/libmdbx/src/CMakeLists.txt index 20a50a4537..d41686bf82 100644 --- a/libs/libmdbx/src/CMakeLists.txt +++ b/libs/libmdbx/src/CMakeLists.txt @@ -340,10 +340,12 @@ if(NOT DEFINED MDBX_CXX_STANDARD) set(MDBX_CXX_STANDARD 98) endif() endif() -if(NOT HAS_C11 LESS 0) - set(MDBX_C_STANDARD 11) -else() +# MSVC >= 19.28 (Microsoft Visual Studio 16.8) is mad! +# It unable process Windows SDK headers in the C11 mode! +if(HAS_C11 LESS 0 OR (MSVC AND MSVC_VERSION GREATER 1927)) set(MDBX_C_STANDARD 99) +else() + set(MDBX_C_STANDARD 11) endif() if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND EXISTS "${MDBX_SOURCE_DIR}/ntdll.def") @@ -756,14 +758,21 @@ if(NOT CMAKE_CONFIGURATION_TYPES) endif() endif() +# choice target to fetch definitions and options +if(MDBX_BUILD_SHARED_LIBRARY) + set(target4fetch mdbx) +else() + set(target4fetch mdbx-static) +endif() + # get definitions -get_target_property(defs_list mdbx-static COMPILE_DEFINITIONS) +get_target_property(defs_list ${target4fetch} COMPILE_DEFINITIONS) if(defs_list) list(APPEND MDBX_BUILD_FLAGS ${defs_list}) endif() # get target compile options -get_target_property(options_list mdbx-static COMPILE_OPTIONS) +get_target_property(options_list ${target4fetch} COMPILE_OPTIONS) if(options_list) list(APPEND MDBX_BUILD_FLAGS ${options_list}) endif() diff --git a/libs/libmdbx/src/ChangeLog.md b/libs/libmdbx/src/ChangeLog.md index 34be056b8d..e7cf79534f 100644 --- a/libs/libmdbx/src/ChangeLog.md +++ b/libs/libmdbx/src/ChangeLog.md @@ -1,14 +1,38 @@ ChangeLog --------- -## v0.9.2 (in development) - -TODO: +## v0.9.3 (in development) + - Engage new terminology (https://github.com/erthink/libmdbx/issues/137). - Rework/speedup the implementation of the dirty page list (lazy compactification, lazy sorting via merge). - - Finalize C++ API (few typos and trivia bugs are likely for now). 
+ - Resolve a few TODOs (https://github.com/erthink/libmdbx/issues/123, https://github.com/erthink/libmdbx/issues/124, + https://github.com/erthink/libmdbx/issues/127, https://github.com/erthink/libmdbx/issues/128, + https://github.com/erthink/libmdbx/issues/132, https://github.com/erthink/libmdbx/issues/115). + - Finalize C++ API (few typos and trivial bugs are still likely for now). - Packages for ROSA Linux, ALT Linux, Fedora/RHEL, Debian/Ubuntu. +## v0.9.2 scheduled at 2020-11-27 + +Acknowledgements: + + - Jens Alfke (Mobile Architect at [Couchbase](https://www.couchbase.com/)) for [NimDBX](https://github.com/snej/nimdbx). + - Clément Renault (CTO at [MeiliSearch](https://www.meilisearch.com/)) for [mdbx-rs](https://github.com/Kerollmops/mdbx-rs). + - Alex Sharov (Go-Lang Tech Lead at [TurboGeth/Ethereum](https://ethereum.org/)) for extreme test cases and bug reporting. + - George Hazan (CTO at [Miranda NG](https://www.miranda-ng.org/)) for bug reporting. + - [Positive Technologies](https://www.ptsecurity.com/) for funding and [The Standoff](https://standoff365.com/). + +Added features: + + - Provided package for [buildroot](https://buildroot.org/). + - Binding for Nim is [available](https://github.com/snej/nimdbx) now by Jens Alfke. + - Added `mdbx_env_delete()` for deleting environment files in a proper and multiprocess-safe way. + - Added `mdbx_txn_commit_ex()` with collecting latency information. + - Fast completion of pure nested transactions. + - Added `LIBMDBX_INLINE_API` macro and inline versions of some API functions. + - Added `mdbx_cursor_copy()` function. + - Extended tests for checking cursor tracking. + - Added `MDBX_SET_LOWERBOUND` operation for `mdbx_cursor_get()`. + Fixes: - Fixed missing installation of `mdbx.h++`. @@ -21,6 +45,20 @@ Fixes: - Fixed opening DB on a network shares (in the exclusive mode). - Fixed copy&paste typos. - Fixed minor false-positive GCC warning. 
+ - Added workaround for broken `DEFINE_ENUM_FLAG_OPERATORS` from Windows SDK. + - Fixed cursor state after multimap/dupsort repeated deletes (https://github.com/erthink/libmdbx/issues/121). + - Added `SIGPIPE` suppression for internal thread during `mdbx_env_copy()`. + - Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://github.com/erthink/libmdbx/issues/131). + - Fixed spilled pages checking (https://github.com/erthink/libmdbx/issues/126). + - Fixed `mdbx_load` for 'plain text' and without `-s name` cases (https://github.com/erthink/libmdbx/issues/136). + - Fixed save/restore/commit of cursors for nested transactions. + - Fixed cursors state in rare/special cases (move next beyond end-of-data, after deletion and so on). + - Added workaround for MSVC 19.28 (Visual Studio 16.8) (but may still hang during compilation). + - Fixed paranoidal Clang C++ UB for bitwise operations with flags defined by enums. + - Fixed large pages checking (for compatibility and to avoid false-positive errors from `mdbx_chk`). + - Added workaround for Wine (https://github.com/miranda-ng/miranda-ng/issues/1209). + - Fixed `ERROR_NOT_SUPPORTED` while opening DB by UNC pathnames (https://github.com/miranda-ng/miranda-ng/issues/2627). + ## v0.9.1 2020-09-30 diff --git a/libs/libmdbx/src/GNUmakefile b/libs/libmdbx/src/GNUmakefile index 82d168b2b8..b14000f5f3 100644 --- a/libs/libmdbx/src/GNUmakefile +++ b/libs/libmdbx/src/GNUmakefile @@ -84,7 +84,7 @@ libmdbx.$(SO_SUFFIX): mdbx-dylib.o mdbx++-dylib.o ################################################################################ -# Amalgamated source code, i.e. distributed after `make dists` +# Amalgamated source code, i.e. 
distributed after `make dist` MAN_SRCDIR := man1/ config.h: mdbx.c $(lastword $(MAKEFILE_LIST)) diff --git a/libs/libmdbx/src/README.md b/libs/libmdbx/src/README.md index 90f2924608..3f643d2706 100644 --- a/libs/libmdbx/src/README.md +++ b/libs/libmdbx/src/README.md @@ -1,12 +1,15 @@ <!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences --> -libmdbx -======== - > Please refer to the online [documentation](https://erthink.github.io/libmdbx/) > with [`C` API description](https://erthink.github.io/libmdbx/group__c__api.html) > and pay attention to the preliminary [`C++` API](https://github.com/erthink/libmdbx/blob/devel/mdbx.h%2B%2B). +> > Questions, feedback and suggestions are welcome to the [Telegram' group](https://t.me/libmdbx). +> +> For NEWS take a look to the [ChangeLog](./ChangeLog.md). + +libmdbx +======== <!-- section-begin overview --> _libmdbx_ is an extremely fast, compact, powerful, embedded, @@ -480,6 +483,7 @@ Bindings | Runtime | GitHub | Author | | ------- | ------ | ------ | +| [Nim](https://en.wikipedia.org/wiki/Nim_(programming_language)) | [NimDBX](https://github.com/snej/nimdbx) | [Jens Alfke](https://github.com/snej) | Rust | [mdbx-rs](https://github.com/Kerollmops/mdbx-rs) | [Clément Renault](https://github.com/Kerollmops) | | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | | .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | diff --git a/libs/libmdbx/src/VERSION b/libs/libmdbx/src/VERSION index 4cef1f9b06..594150e32c 100644 --- a/libs/libmdbx/src/VERSION +++ b/libs/libmdbx/src/VERSION @@ -1 +1 @@ -0.9.1.18 +0.9.2.0 diff --git a/libs/libmdbx/src/cmake/utils.cmake b/libs/libmdbx/src/cmake/utils.cmake index dc6a240e1b..4a48a15a63 100644 --- a/libs/libmdbx/src/cmake/utils.cmake +++ b/libs/libmdbx/src/cmake/utils.cmake @@ -17,8 +17,8 @@ cmake_minimum_required(VERSION 
3.8.2) cmake_policy(PUSH) cmake_policy(VERSION 3.8.2) -macro(add_compile_flags langs) - foreach(_lang ${langs}) +macro(add_compile_flags languages) + foreach(_lang ${languages}) string(REPLACE ";" " " _flags "${ARGN}") if(CMAKE_CXX_COMPILER_LOADED AND _lang STREQUAL "CXX") set("${_lang}_FLAGS" "${${_lang}_FLAGS} ${_flags}") @@ -113,23 +113,34 @@ macro(fetch_version name source_root_directory parent_scope) message(FATAL_ERROR "Please install latest version of git ('show --no-patch --format=%H HEAD' failed)") endif() - execute_process(COMMAND ${GIT} tag --sort=-version:refname - OUTPUT_VARIABLE tag_list + execute_process(COMMAND ${GIT} describe --tags --abbrev=0 "--match=v[0-9]*" + OUTPUT_VARIABLE last_release_tag OUTPUT_STRIP_TRAILING_WHITESPACE WORKING_DIRECTORY ${source_root_directory} RESULT_VARIABLE rc) if(rc) - message(FATAL_ERROR "Please install latest version of git ('tag --sort=-version:refname' failed)") + message(FATAL_ERROR "Please install latest version of git ('describe --tags --abbrev=0 --match=v[0-9]*' failed)") endif() - string(REGEX REPLACE "\n" ";" tag_list "${tag_list}") - set(last_release_tag "") - set(git_revlist_arg "HEAD") - foreach(tag IN LISTS tag_list) - if(NOT last_release_tag) - string(REGEX MATCH "^v[0-9]+(\.[0-9]+)+" last_release_tag "${tag}") - set(git_revlist_arg "${tag}..HEAD") + if (last_release_tag) + set(git_revlist_arg "${last_release_tag}..HEAD") + else() + execute_process(COMMAND ${GIT} tag --sort=-version:refname + OUTPUT_VARIABLE tag_list + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${source_root_directory} + RESULT_VARIABLE rc) + if(rc) + message(FATAL_ERROR "Please install latest version of git ('tag --sort=-version:refname' failed)") endif() - endforeach(tag) + string(REGEX REPLACE "\n" ";" tag_list "${tag_list}") + set(git_revlist_arg "HEAD") + foreach(tag IN LISTS tag_list) + if(NOT last_release_tag) + string(REGEX MATCH "^v[0-9]+(\.[0-9]+)+" last_release_tag "${tag}") + set(git_revlist_arg "${tag}..HEAD") + 
endif() + endforeach(tag) + endif() execute_process(COMMAND ${GIT} rev-list --count "${git_revlist_arg}" OUTPUT_VARIABLE ${name}_GIT_REVISION OUTPUT_STRIP_TRAILING_WHITESPACE diff --git a/libs/libmdbx/src/mdbx.c b/libs/libmdbx/src/mdbx.c index 2fa4734952..3bcb83b536 100644 --- a/libs/libmdbx/src/mdbx.c +++ b/libs/libmdbx/src/mdbx.c @@ -12,11 +12,16 @@ * <http://www.OpenLDAP.org/license.html>. */ #define MDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 3b5677a6062b714f1e138b0066c5590ee3c9ebf3bf8cfa3bb9503515ea0d1f02_v0_9_1_18_g1d31ebdc1c +#define MDBX_BUILD_SOURCERY 47492323531afee427a3de6ddaeae26eed45bfd1b52d92fd121a5a13a9747dbb_v0_9_2_0_g092ab09 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif +#define LIBMDBX_INTERNALS +#ifdef MDBX_TOOLS +#define MDBX_DEPRECATED +#endif /* MDBX_TOOLS */ + /* *INDENT-OFF* */ /* clang-format off */ @@ -97,11 +102,6 @@ #pragma warning(disable : 4505) /* unreferenced local function has been removed */ #endif /* _MSC_VER (warnings) */ -#if defined(MDBX_TOOLS) -#undef MDBX_DEPRECATED -#define MDBX_DEPRECATED -#endif /* MDBX_TOOLS */ - #include "mdbx.h" /* * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> @@ -817,7 +817,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) #ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +/* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ #if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ @@ -993,6 +993,35 @@ typedef union MDBX_srwlock { #ifdef __cplusplus extern void mdbx_osal_jitter(bool tiny); #else + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#include <cstdatomic> +#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ + 
!defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include <stdatomic.h> +#elif defined(__GNUC__) || defined(__clang__) +/* LY: nothing required */ +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' */ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include <libkern/OSAtomic.h> +#else +#error FIXME atomic-ops +#endif + /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ @@ -1034,8 +1063,8 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) || __has_extension(cxx_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__ATOMIC_SEQ_CST) __atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__clang__) || defined(__GNUC__) @@ -1088,8 +1117,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #if defined(__linux__) || defined(__gnu_linux__) MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; -MDBX_INTERNAL_VAR bool - mdbx_RunningOnWSL /* Windows Subsystem for Linux is mad and trouble-full */; +MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef mdbx_strdup @@ -1160,7 +1188,8 @@ enum 
mdbx_openfile_purpose { MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4 + MDBX_OPEN_COPY = 4, + MDBX_OPEN_DELETE = 5 }; MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, @@ -1169,7 +1198,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, mdbx_mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 @@ -1429,32 +1460,6 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; #endif /* Windows */ -/*----------------------------------------------------------------------------*/ -/* Atomics */ - -#if !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) && \ - (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ - !(defined(__GNUC__) || defined(__clang__))) -#include <stdatomic.h> -#elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ -#elif defined(_MSC_VER) -#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ -#pragma warning(disable : 4133) /* 'function': incompatible types - from \ - 'size_t' to 'LONGLONG' */ -#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ - 'std::size_t', possible loss of data */ -#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ - 'long', possible loss of data */ -#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) -#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) -#elif defined(__APPLE__) -#include <libkern/OSAtomic.h> -#else -#error FIXME atomic-ops -#endif - #endif /* !__cplusplus */ 
/*----------------------------------------------------------------------------*/ @@ -1890,7 +1895,7 @@ typedef struct MDBX_db { pgno_t md_overflow_pages; /* number of overflow pages */ uint64_t md_seq; /* table sequence counter */ uint64_t md_entries; /* number of data items */ - uint64_t md_mod_txnid; /* txnid of last commited modification */ + uint64_t md_mod_txnid; /* txnid of last committed modification */ } MDBX_db; /* database size-related parameters */ @@ -1974,7 +1979,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ + uint64_t mp_txnid; /* txnid that committed this page */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -2222,7 +2227,7 @@ typedef struct MDBX_lockinfo { #if defined(_WIN32) || defined(_WIN64) #define MAX_MAPSIZE32 UINT32_C(0x38000000) #else -#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#define MAX_MAPSIZE32 UINT32_C(0x7f000000) #endif #define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) @@ -2381,8 +2386,6 @@ struct MDBX_txn { MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; - /* In write txns, array of cursors for each DB */ - MDBX_cursor **mt_cursors; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2409,6 +2412,8 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + /* In write txns, array of cursors for each DB */ + MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ pgno_t loose_refund_wl /* FIXME: describe */; @@ -2546,7 +2551,7 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck - unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint8_t 
me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ @@ -2556,7 +2561,7 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_path; /* path to the DB files */ + char *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ @@ -2832,7 +2837,7 @@ static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) /* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_LMDB_ERRCODE + 10) +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ @@ -3064,7 +3069,7 @@ static __maybe_unused void static_checks(void) { /*------------------------------------------------------------------------------ - * Internal inlines */ + * Internal inline functions */ MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); @@ -3806,12 +3811,24 @@ static __always_inline void atomic_yield(void) { #if MDBX_64BIT_CAS static __always_inline bool atomic_cas64(volatile uint64_t *p, uint64_t c, uint64_t v) { -#if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_LLONG_LOCK_FREE) +#if !defined(__STDC_NO_ATOMICS__) && \ + (defined(ATOMIC_VAR_INIT) || defined(ATOMIC_LLONG_LOCK_FREE) || \ + __has_extension(c_atomic)) STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); -#ifndef __COVERITY__ - STATIC_ASSERT(atomic_is_lock_free(p)); -#endif /* Workaround for Coverity */ - return atomic_compare_exchange_strong((_Atomic uint64_t *)p, &c, v); +#ifdef ATOMIC_LLONG_LOCK_FREE 
+ STATIC_ASSERT(ATOMIC_LLONG_LOCK_FREE > 0); +#if ATOMIC_LLONG_LOCK_FREE < 2 + assert(atomic_is_lock_free(p)); +#endif +#else + assert(atomic_is_lock_free(p)); +#endif +#ifdef __clang__ + STATIC_ASSERT(sizeof(_Atomic uint64_t) == sizeof(uint64_t)); + return atomic_compare_exchange_strong((_Atomic volatile uint64_t *)p, &c, v); +#else + return atomic_compare_exchange_strong(p, &c, v); +#endif #elif defined(__GNUC__) || defined(__clang__) return __sync_bool_compare_and_swap(p, c, v); #elif defined(_MSC_VER) @@ -3827,12 +3844,24 @@ static __always_inline bool atomic_cas64(volatile uint64_t *p, uint64_t c, static __always_inline bool atomic_cas32(volatile uint32_t *p, uint32_t c, uint32_t v) { -#if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) +#if !defined(__STDC_NO_ATOMICS__) && \ + (defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) || \ + __has_extension(c_atomic)) STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); -#ifndef __COVERITY__ - STATIC_ASSERT(atomic_is_lock_free(p)); -#endif /* Workaround for Coverity */ - return atomic_compare_exchange_strong((_Atomic uint32_t *)p, &c, v); +#ifdef ATOMIC_INT_LOCK_FREE + STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); +#if ATOMIC_INT_LOCK_FREE < 2 + assert(atomic_is_lock_free(p)); +#endif +#else + assert(atomic_is_lock_free(p)); +#endif +#ifdef __clang__ + STATIC_ASSERT(sizeof(_Atomic uint32_t) == sizeof(uint32_t)); + return atomic_compare_exchange_strong((_Atomic volatile uint32_t *)p, &c, v); +#else + return atomic_compare_exchange_strong(p, &c, v); +#endif #elif defined(__GNUC__) || defined(__clang__) return __sync_bool_compare_and_swap(p, c, v); #elif defined(_MSC_VER) @@ -3846,12 +3875,24 @@ static __always_inline bool atomic_cas32(volatile uint32_t *p, uint32_t c, } static __always_inline uint32_t atomic_add32(volatile uint32_t *p, uint32_t v) { -#if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) +#if !defined(__STDC_NO_ATOMICS__) && \ + (defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) 
|| \ + __has_extension(c_atomic)) STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); -#ifndef __COVERITY__ - STATIC_ASSERT(atomic_is_lock_free(p)); -#endif /* Workaround for Coverity */ - return atomic_fetch_add((_Atomic uint32_t *)p, v); +#ifdef ATOMIC_INT_LOCK_FREE + STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); +#if ATOMIC_INT_LOCK_FREE < 2 + assert(atomic_is_lock_free(p)); +#endif +#else + assert(atomic_is_lock_free(p)); +#endif +#ifdef __clang__ + STATIC_ASSERT(sizeof(_Atomic uint32_t) == sizeof(uint32_t)); + return atomic_fetch_add((_Atomic volatile uint32_t *)p, v); +#else + return atomic_fetch_add(p, v); +#endif #elif defined(__GNUC__) || defined(__clang__) return __sync_fetch_and_add(p, v); #elif defined(_MSC_VER) @@ -6128,7 +6169,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode); static int __must_check_result mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **mp, int *lvl, - const txnid_t pp_txnid); + txnid_t pp_txnid); static int __must_check_result mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, int flags); @@ -6204,8 +6245,9 @@ static int __must_check_result mdbx_cursor_del0(MDBX_cursor *mc); static int __must_check_result mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, const MDBX_val *data, unsigned flags); -static int __must_check_result mdbx_cursor_sibling(MDBX_cursor *mc, - int move_right); +#define SIBLING_LEFT 0 +#define SIBLING_RIGHT 2 +static int __must_check_result mdbx_cursor_sibling(MDBX_cursor *mc, int dir); static int __must_check_result mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op); @@ -6229,7 +6271,7 @@ static int __must_check_result mdbx_xcursor_init1(MDBX_cursor *mc, static int __must_check_result mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, bool new_dupdata); -static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); +static void cursor_copy_internal(const MDBX_cursor *csrc, MDBX_cursor *cdst); static int __must_check_result 
mdbx_drop0(MDBX_cursor *mc, int subs); static int __must_check_result mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); @@ -6573,7 +6615,7 @@ static __maybe_unused void mdbx_page_list(MDBX_page *mp) { /*----------------------------------------------------------------------------*/ -/* Check if there is an inited xcursor, so XCURSOR_REFRESH() is proper */ +/* Check if there is an initialized xcursor, so XCURSOR_REFRESH() is proper */ #define XCURSOR_INITED(mc) \ ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) @@ -6588,16 +6630,26 @@ static __maybe_unused void mdbx_page_list(MDBX_page *mp) { (mc)->mc_xcursor->mx_cursor.mc_pg[0] = node_data(xr_node); \ } while (0) +static __maybe_unused bool cursor_is_tracked(const MDBX_cursor *mc) { + for (MDBX_cursor *scan = mc->mc_txn->tw.cursors[mc->mc_dbi]; scan; + scan = scan->mc_next) + if (mc == ((mc->mc_flags & C_SUB) ? &scan->mc_xcursor->mx_cursor : scan)) + return true; + return false; +} + /* Perform act while tracking temporary cursor mn */ #define WITH_CURSOR_TRACKING(mn, act) \ do { \ mdbx_cassert(&(mn), \ - mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ + mn.mc_txn->tw.cursors != NULL /* must be not rdonly txt */); \ + mdbx_cassert(&(mn), !cursor_is_tracked(&(mn))); \ MDBX_cursor mc_dummy; \ - MDBX_cursor **tracking_head = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + MDBX_cursor **tracking_head = &(mn).mc_txn->tw.cursors[mn.mc_dbi]; \ MDBX_cursor *tracked = &(mn); \ if ((mn).mc_flags & C_SUB) { \ mc_dummy.mc_flags = C_INITIALIZED; \ + mc_dummy.mc_top = 0; \ mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn); \ tracked = &mc_dummy; \ } \ @@ -7155,7 +7207,7 @@ static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) { MDBX_txn *txn = mc->mc_txn; MDBX_cursor *m3, *m0 = mc; MDBX_xcursor *mx; - MDBX_page *dp, *mp; + MDBX_page *mp; unsigned i, j; int rc = MDBX_SUCCESS; @@ -7180,7 +7232,7 @@ static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) { } } mc = 
mc->mc_next; - for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) + for (; !mc || mc == m0; mc = txn->tw.cursors[--i]) if (i == 0) goto mark_done; } @@ -7193,11 +7245,8 @@ mark_done: pgno_t pgno = txn->mt_dbs[i].md_root; if (pgno == P_INVALID) continue; - int level; - if (unlikely((rc = mdbx_page_get(m0, pgno, &dp, &level, - txn->mt_txnid)) != MDBX_SUCCESS)) - break; - if ((dp->mp_flags & Mask) == pflags && level <= 1) + MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno); + if (dp && (dp->mp_flags & Mask) == pflags) dp->mp_flags ^= P_KEEP; } } @@ -7652,7 +7701,7 @@ static __always_inline __maybe_unused int ignore_enosys(int err) { #endif /* defined(_WIN32) || defined(_WIN64) */ /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ -static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset, +static __cold int mdbx_set_readahead(MDBX_env *env, const size_t offset, const size_t length, const bool enable) { assert(length > 0); mdbx_notice("readahead %s %u..%u", enable ? 
"ON" : "OFF", @@ -7729,18 +7778,18 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); + const size_t prev_size = env->me_dxb_mmap.current; + const size_t prev_limit = env->me_dxb_mmap.limit; + const void *const prev_addr = env->me_map; mdbx_verbose("resize datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR, - env->me_dxb_mmap.current, size_bytes, env->me_dxb_mmap.limit, - limit_bytes); + prev_size, size_bytes, prev_limit, limit_bytes); mdbx_assert(env, limit_bytes >= size_bytes); mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno); mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno); - const size_t prev_limit = env->me_dxb_mmap.limit; - const void *const prev_addr = env->me_map; #if defined(_WIN32) || defined(_WIN64) /* Acquire guard in exclusive mode for: @@ -7785,9 +7834,11 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, goto bailout; if (limit_bytes != env->me_dxb_mmap.limit && env->me_lck && !implicit) { - rc = mdbx_rdt_lock(env) /* lock readers table until remap done */; - if (unlikely(rc != MDBX_SUCCESS)) + int err = mdbx_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; goto bailout; + } /* looking for readers from this process */ MDBX_lockinfo *const lck = env->me_lck; @@ -7807,7 +7858,6 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #endif /* ! Windows */ - const size_t prev_size = env->me_dxb_mmap.current; if (size_bytes < prev_size) { mdbx_notice("resize-MADV_%s %u..%u", (env->me_flags & MDBX_WRITEMAP) ? 
"REMOVE" : "DONTNEED", @@ -7846,7 +7896,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes, mapping_can_be_moved); if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_NORDAHEAD) == 0) { - const int readahead = mdbx_is_readahead_reasonable(size_bytes, 0); + const int readahead = + mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size); if (readahead == MDBX_RESULT_FALSE) rc = mdbx_set_readahead( env, 0, (size_bytes > prev_size) ? size_bytes : prev_size, false); @@ -7889,14 +7940,12 @@ bailout: mdbx_error("failed resize datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - env->me_dxb_mmap.current, size_bytes, env->me_dxb_mmap.limit, - limit_bytes, rc); + prev_size, size_bytes, prev_limit, limit_bytes, rc); } else { mdbx_warning("unable resize datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - env->me_dxb_mmap.current, size_bytes, env->me_dxb_mmap.limit, - limit_bytes, rc); + prev_size, size_bytes, prev_limit, limit_bytes, rc); } if (!env->me_dxb_mmap.address) { env->me_flags |= MDBX_FATAL_ERROR; @@ -8054,7 +8103,7 @@ __hot static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, if (unlikely(mc->mc_flags & C_RECLAIMING)) { /* If mc is updating the GC, then the retired-list cannot play * catch-up with itself by growing while trying to save it. 
*/ - flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE | MDBX_LIFORECLAIM); + flags &= ~MDBX_ALLOC_GC; } else if (unlikely(txn->mt_dbs[FREE_DBI].md_entries == 0)) { /* avoid (recursive) search inside empty tree and while tree is updating, * https://github.com/erthink/libmdbx/issues/31 */ @@ -8096,12 +8145,6 @@ skip_cache: const unsigned wanna_range = num - 1; while (true) { /* hsr-kick retry loop */ - /* If our dirty list is already full, we can't do anything */ - if (unlikely(txn->tw.dirtyroom == 0)) { - rc = MDBX_TXN_FULL; - goto fail; - } - MDBX_cursor_couple recur; for (MDBX_cursor_op op = MDBX_FIRST;; op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) { @@ -8142,6 +8185,11 @@ skip_cache: } if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ + if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) && + !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)) { + /* If our dirty list is already full, we can't touch GC */ + flags &= ~MDBX_ALLOC_GC; + } if (unlikely(!(flags & MDBX_ALLOC_GC))) break /* reclaiming is prohibited for now */; @@ -8242,7 +8290,7 @@ skip_cache: } } - /* Append PNL from GC record to me_reclaimed_pglist */ + /* Append PNL from GC record to tw.reclaimed_pglist */ mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); pgno_t *gc_pnl = (pgno_t *)data.iov_base; mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); @@ -8252,6 +8300,24 @@ skip_cache: goto fail; } const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); + /* TODO: provide a user-configurable threshold */ + const unsigned threshold_2_stop_gc_reclaiming = MDBX_PNL_MAX / 4; + if (unlikely(gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > + threshold_2_stop_gc_reclaiming) && + (pgno_add(txn->mt_next_pgno, num) <= txn->mt_geo.upper || + gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >= + MDBX_PNL_MAX / 16 * 15)) { + /* Stop reclaiming to avoid overflow the page list. + * This is a rare case while search for a continuously multi-page region + * in a large database. 
https://github.com/erthink/libmdbx/issues/123 */ + flags &= ~MDBX_ALLOC_GC; + if (unlikely((flags & MDBX_ALLOC_ALL) == 0)) { + /* Oh, we can't do anything */ + rc = MDBX_TXN_FULL; + goto fail; + } + break; + } rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -8474,7 +8540,7 @@ done: mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); mdbx_tassert(txn, pgno < txn->mt_next_pgno); mdbx_tassert(txn, pgno == re_list[range_begin]); - /* Cutoff allocated pages from me_reclaimed_pglist */ + /* Cutoff allocated pages from tw.reclaimed_pglist */ #if MDBX_PNL_ASCENDING for (unsigned i = range_begin + num; i <= re_len;) re_list[range_begin++] = re_list[i++]; @@ -8666,7 +8732,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { done: /* Adjust cursors pointing to mp */ mc->mc_pg[mc->mc_top] = np; - m2 = txn->mt_cursors[mc->mc_dbi]; + m2 = txn->tw.cursors[mc->mc_dbi]; if (mc->mc_flags & C_SUB) { for (; m2; m2 = m2->mc_next) { m3 = &m2->mc_xcursor->mx_cursor; @@ -8818,43 +8884,42 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { return mdbx_env_sync_internal(env, force, nonblock); } -__cold int mdbx_env_sync(MDBX_env *env) { - return mdbx_env_sync_ex(env, true, false); -} +__cold int mdbx_env_sync(MDBX_env *env) { return __inline_mdbx_env_sync(env); } __cold int mdbx_env_sync_poll(MDBX_env *env) { - return mdbx_env_sync_ex(env, false, true); + return __inline_mdbx_env_sync_poll(env); } /* Back up parent txn's cursors, then grab the originals for tracking */ -static int mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { - MDBX_cursor *mc, *bk; - MDBX_xcursor *mx; - - for (int i = src->mt_numdbs; --i >= 0;) { - dst->mt_cursors[i] = NULL; - if ((mc = src->mt_cursors[i]) != NULL) { - size_t size = sizeof(MDBX_cursor); - if (mc->mc_xcursor) - size += sizeof(MDBX_xcursor); - for (; mc; mc = bk->mc_next) { +static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { + for (int i = 
parent->mt_numdbs; --i >= 0;) { + nested->tw.cursors[i] = NULL; + MDBX_cursor *mc = parent->tw.cursors[i]; + if (mc != NULL) { + size_t size = mc->mc_xcursor ? sizeof(MDBX_cursor) + sizeof(MDBX_xcursor) + : sizeof(MDBX_cursor); + for (MDBX_cursor *bk; mc; mc = bk->mc_next) { + bk = mc; + if (mc->mc_signature != MDBX_MC_LIVE) + continue; bk = mdbx_malloc(size); if (unlikely(!bk)) return MDBX_ENOMEM; *bk = *mc; mc->mc_backup = bk; - mc->mc_db = &dst->mt_dbs[i]; /* Kill pointers into src to reduce abuse: The * user may not use mc until dst ends. But we need a valid * txn pointer here for cursor fixups to keep working. */ - mc->mc_txn = dst; - mc->mc_dbistate = &dst->mt_dbistate[i]; - if ((mx = mc->mc_xcursor) != NULL) { + mc->mc_txn = nested; + mc->mc_db = &nested->mt_dbs[i]; + mc->mc_dbistate = &nested->mt_dbistate[i]; + MDBX_xcursor *mx = mc->mc_xcursor; + if (mx != NULL) { *(MDBX_xcursor *)(bk + 1) = *mx; - mx->mx_cursor.mc_txn = dst; + mx->mx_cursor.mc_txn = nested; } - mc->mc_next = dst->mt_cursors[i]; - dst->mt_cursors[i] = mc; + mc->mc_next = nested->tw.cursors[i]; + nested->tw.cursors[i] = mc; } } } @@ -8867,47 +8932,57 @@ static int mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { * [in] merge true to keep changes to parent cursors, false to revert. * * Returns 0 on success, non-zero on failure. 
*/ -static void mdbx_cursors_eot(MDBX_txn *txn, unsigned merge) { - MDBX_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; - MDBX_xcursor *mx; - int i; - - for (i = txn->mt_numdbs; --i >= 0;) { - for (mc = cursors[i]; mc; mc = next) { - unsigned stage = mc->mc_signature; - mdbx_ensure(txn->mt_env, - stage == MDBX_MC_LIVE || stage == MDBX_MC_WAIT4EOT); +static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { + mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + for (int i = txn->mt_numdbs; --i >= 0;) { + MDBX_cursor *next, *mc = txn->tw.cursors[i]; + if (!mc) + continue; + txn->tw.cursors[i] = NULL; + do { + const unsigned stage = mc->mc_signature; + MDBX_cursor *bk = mc->mc_backup; next = mc->mc_next; - mdbx_tassert(txn, !next || next->mc_signature == MDBX_MC_LIVE || - next->mc_signature == MDBX_MC_WAIT4EOT); - if ((bk = mc->mc_backup) != NULL) { - if (merge) { - /* Commit changes to parent txn */ + mdbx_ensure(txn->mt_env, + stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); + mdbx_cassert(mc, mc->mc_dbi == (unsigned)i); + if (bk) { + MDBX_xcursor *mx = mc->mc_xcursor; + mdbx_cassert(mc, mx == bk->mc_xcursor); + mdbx_tassert(txn, txn->mt_parent != NULL); + mdbx_ensure(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) + mc->mc_signature = stage /* Promote closed state to parent txn */; + else if (merge) { + /* Preserve changes from nested to parent txn */ mc->mc_next = bk->mc_next; mc->mc_backup = bk->mc_backup; mc->mc_txn = bk->mc_txn; + *bk->mc_db = *mc->mc_db; mc->mc_db = bk->mc_db; + *bk->mc_dbistate = *mc->mc_dbistate; mc->mc_dbistate = bk->mc_dbistate; - if ((mx = mc->mc_xcursor) != NULL) + if (mx) { + if (mx != bk->mc_xcursor) { + *bk->mc_xcursor = *mx; + mx = bk->mc_xcursor; + } mx->mx_cursor.mc_txn = bk->mc_txn; + } } else { - /* Abort nested txn */ + /* Restore from backup, i.e. 
rollback/abort nested txn */ *mc = *bk; - if ((mx = mc->mc_xcursor) != NULL) + if (mx) *mx = *(MDBX_xcursor *)(bk + 1); } bk->mc_signature = 0; mdbx_free(bk); - } - if (stage == MDBX_MC_WAIT4EOT) { - mc->mc_signature = 0; - mdbx_free(mc); } else { - mc->mc_signature = MDBX_MC_READY4CLOSE; + mdbx_ensure(txn->mt_env, stage == MDBX_MC_LIVE); + mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; mc->mc_flags = 0 /* reset C_UNTRACK */; } - } - cursors[i] = NULL; + } while ((mc = next) != NULL); } } @@ -9471,7 +9546,7 @@ int mdbx_txn_renew(MDBX_txn *txn) { int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret) { - return mdbx_txn_begin_ex(env, parent, flags, ret, nullptr); + return __inline_mdbx_txn_begin(env, parent, flags, ret); } int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) { @@ -9552,7 +9627,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent) { mdbx_tassert(txn, mdbx_dirtylist_check(parent)); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = parent->mt_dbiseqs; txn->tw.dirtylist = mdbx_malloc(sizeof(MDBX_DP) * (MDBX_DPL_TXNFULL + 1)); txn->tw.reclaimed_pglist = @@ -9596,7 +9671,8 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); /* Copy parent's mt_dbistate, but clear DB_NEW */ for (unsigned i = 0; i < txn->mt_numdbs; i++) - txn->mt_dbistate[i] = parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT); + txn->mt_dbistate[i] = + parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); mdbx_tassert(parent, parent->mt_parent || parent->tw.dirtyroom + parent->tw.dirtylist->length == @@ -9773,6 +9849,7 @@ int mdbx_txn_flags(const MDBX_txn *txn) { /* Export or close DBI handles opened in this txn. 
*/ static void mdbx_dbis_update(MDBX_txn *txn, int keep) { + mdbx_tassert(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); MDBX_dbi n = txn->mt_numdbs; if (n) { bool locked = false; @@ -9874,10 +9951,8 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { if (txn == env->me_txn0) mdbx_txn_valgrind(env, nullptr); #endif - /* Export or close DBI handles created in this txn */ - mdbx_dbis_update(txn, mode & MDBX_END_UPDATE); if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ - mdbx_cursors_eot(txn, 0); + mdbx_cursors_eot(txn, false); if (!(env->me_flags & MDBX_WRITEMAP)) mdbx_dlist_free(txn); @@ -9886,17 +9961,20 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { env->me_txn = txn->mt_parent; if (txn == env->me_txn0) { mdbx_assert(env, txn->mt_parent == NULL); + /* Export or close DBI handles created in this txn */ + mdbx_dbis_update(txn, mode & MDBX_END_UPDATE); mdbx_pnl_shrink(&txn->tw.retired_pages); mdbx_pnl_shrink(&txn->tw.reclaimed_pglist); /* The writer mutex was locked in mdbx_txn_begin. 
*/ mdbx_txn_unlock(env); } else { mdbx_assert(env, txn->mt_parent != NULL); + MDBX_txn *const parent = txn->mt_parent; + mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); + mdbx_assert(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); mdbx_assert(env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); - MDBX_txn *const parent = txn->mt_parent; - env->me_txn->mt_child = NULL; - env->me_txn->mt_flags &= ~MDBX_TXN_HAS_CHILD; mdbx_pnl_free(txn->tw.reclaimed_pglist); mdbx_pnl_free(txn->tw.spill_pages); @@ -9917,6 +9995,8 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { } mdbx_free(txn->tw.dirtylist); + parent->mt_child = NULL; + parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { @@ -10088,7 +10168,7 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; } } - rc = mdbx_cursor_sibling(&cx.outer, 1); + rc = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); } mdbx_tassert(txn, rc == MDBX_NOTFOUND); } @@ -10200,8 +10280,8 @@ static int mdbx_update_gc(MDBX_txn *txn) { goto bailout_notracking; couple.outer.mc_flags |= C_RECLAIMING; - couple.outer.mc_next = txn->mt_cursors[FREE_DBI]; - txn->mt_cursors[FREE_DBI] = &couple.outer; + couple.outer.mc_next = txn->tw.cursors[FREE_DBI]; + txn->tw.cursors[FREE_DBI] = &couple.outer; retry: ++loop; @@ -10233,8 +10313,10 @@ retry_noaccount: mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); - if (txn->tw.lifo_reclaimed) { - if (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { + if (lifo) { + if (cleaned_gc_slot < (txn->tw.lifo_reclaimed + ? 
MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)) { settled = 0; cleaned_gc_slot = 0; reused_gc_slot = 0; @@ -10265,7 +10347,7 @@ retry_noaccount: } } else { /* If using records from GC which we have not yet deleted, - * now delete them and any we reserved for me_reclaimed_pglist. */ + * now delete them and any we reserved for tw.reclaimed_pglist. */ while (cleaned_gc_id <= txn->tw.last_reclaimed) { gc_rid = cleaned_gc_id; settled = 0; @@ -10325,13 +10407,13 @@ retry_noaccount: /* handle loose pages - put ones into the reclaimed- or retired-list */ if (txn->tw.loose_pages) { - /* Return loose page numbers to me_reclaimed_pglist, + /* Return loose page numbers to tw.reclaimed_pglist, * though usually none are left at this point. * The pages themselves remain in dirtylist. */ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { if (txn->tw.loose_count > 0) { /* Put loose page numbers in tw.retired_pages, - * since unable to return them to me_reclaimed_pglist. */ + * since unable to return them to tw.reclaimed_pglist. */ if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, txn->tw.loose_count)) != 0)) goto bailout; @@ -10479,9 +10561,12 @@ retry_noaccount: env->me_maxgc_ov1page) { /* LY: need just a txn-id for save page list. 
*/ - couple.outer.mc_flags &= ~C_RECLAIMING; bool need_cleanup = false; + txnid_t snap_oldest; + retry_rid: + couple.outer.mc_flags &= ~C_RECLAIMING; do { + snap_oldest = mdbx_find_oldest(txn); rc = mdbx_page_alloc(&couple.outer, 0, NULL, MDBX_ALLOC_GC); if (likely(rc == MDBX_SUCCESS)) { mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, @@ -10509,7 +10594,13 @@ retry_noaccount: gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { mdbx_tassert(txn, txn->tw.last_reclaimed == 0); - txn->tw.last_reclaimed = gc_rid = mdbx_find_oldest(txn) - 1; + if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) + /* should retry mdbx_page_alloc(MDBX_ALLOC_GC) + * if the oldest reader changes since the last attempt */ + goto retry_rid; + /* no reclaimable GC entries, + * therefore no entries with ID < mdbx_find_oldest(txn) */ + txn->tw.last_reclaimed = gc_rid = snap_oldest - 1; mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode, gc_rid); } @@ -10842,7 +10933,7 @@ retry_noaccount: cleaned_gc_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); bailout: - txn->mt_cursors[FREE_DBI] = couple.outer.mc_next; + txn->tw.cursors[FREE_DBI] = couple.outer.mc_next; bailout_notracking: MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; @@ -10855,24 +10946,23 @@ static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov, size_t iov_bytes) { MDBX_env *const env = txn->mt_env; mdbx_assert(env, iov_items > 0); + int rc; if (likely(iov_items == 1)) { mdbx_assert(env, iov->iov_len == iov_bytes); - int rc = mdbx_pwrite(env->me_lazy_fd, iov->iov_base, iov_bytes, iov_off); - mdbx_dpage_free(env, (MDBX_page *)iov->iov_base, - bytes2pgno(env, iov_bytes)); - return rc; + rc = mdbx_pwrite(env->me_lazy_fd, iov->iov_base, iov_bytes, iov_off); } else { - int rc = mdbx_pwritev(env->me_lazy_fd, iov, iov_items, iov_off, iov_bytes); - if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_error("Write error: %s", mdbx_strerror(rc)); - txn->mt_flags |= MDBX_TXN_ERROR; - } + rc = 
mdbx_pwritev(env->me_lazy_fd, iov, iov_items, iov_off, iov_bytes); + } - for (unsigned i = 0; i < iov_items; i++) - mdbx_dpage_free(env, (MDBX_page *)iov[i].iov_base, - bytes2pgno(env, iov[i].iov_len)); - return rc; + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_error("Write error: %s", mdbx_strerror(rc)); + txn->mt_flags |= MDBX_TXN_ERROR; } + + for (unsigned i = 0; i < iov_items; i++) + mdbx_dpage_free(env, (MDBX_page *)iov[i].iov_base, + bytes2pgno(env, iov[i].iov_len)); + return rc; } /* Flush (some) dirty pages to the map, after clearing their dirty flag. @@ -11011,12 +11101,18 @@ static __always_inline bool mdbx_txn_dbi_exists(MDBX_txn *txn, MDBX_dbi dbi, return mdbx_txn_import_dbi(txn, dbi); } -int mdbx_txn_commit(MDBX_txn *txn) { +int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } + +int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); + const uint64_t ts_0 = latency ? 
mdbx_osal_monotime() : 0; + uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0; + uint32_t audit_duration = 0; + int rc = check_txn(txn, MDBX_TXN_FINISHED); if (unlikely(rc != MDBX_SUCCESS)) - return rc; + goto provide_latency; if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) { rc = MDBX_RESULT_TRUE; @@ -11027,7 +11123,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { #if MDBX_ENV_CHECKPID if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; - return MDBX_PANIC; + rc = MDBX_PANIC; + goto provide_latency; } #endif /* MDBX_ENV_CHECKPID */ @@ -11038,7 +11135,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { goto done; if (txn->mt_child) { - rc = mdbx_txn_commit(txn->mt_child); + rc = mdbx_txn_commit_ex(txn->mt_child, NULL); mdbx_tassert(txn, txn->mt_child == NULL); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -11051,8 +11148,32 @@ int mdbx_txn_commit(MDBX_txn *txn) { } if (txn->mt_parent) { + mdbx_assert(env, txn != env->me_txn0); MDBX_txn *const parent = txn->mt_parent; - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); + mdbx_assert(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + mdbx_assert(env, mdbx_dirtylist_check(txn)); + + if (txn->tw.dirtylist->length == 0 && + (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0 && + parent->mt_numdbs == txn->mt_numdbs) { + for (int i = txn->mt_numdbs; --i >= 0;) { + mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + if ((txn->mt_dbistate[i] & DBI_STALE) && + !(parent->mt_dbistate[i] & DBI_STALE)) + mdbx_tassert(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], + sizeof(MDBX_db)) == 0); + } + + mdbx_tassert(txn, memcmp(&parent->mt_geo, &txn->mt_geo, + sizeof(parent->mt_geo)) == 0); + mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary, + sizeof(parent->mt_canary)) == 0); + + end_mode = MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE; + goto done; + } /* Preserve space for spill list to avoid parent's 
state corruption * if allocation fails. */ @@ -11081,7 +11202,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY; /* Merge our cursors into parent's and close them */ - mdbx_cursors_eot(txn, 1); + mdbx_cursors_eot(txn, true); + end_mode |= MDBX_END_EOTDONE; /* Update parent's DB table. */ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); @@ -11093,6 +11215,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_dbistate[i] = txn->mt_dbistate[i] | (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH)); } + ts_1 = latency ? mdbx_osal_monotime() : 0; /* Remove refunded pages from parent's dirty & spill lists */ MDBX_DPL dst = mdbx_dpl_sort(parent->tw.dirtylist); @@ -11264,6 +11387,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_flags |= MDBX_TXN_SPILLS; } + ts_2 = latency ? mdbx_osal_monotime() : 0; /* Append our loose page list to parent's */ if (txn->tw.loose_pages) { MDBX_page **lp = &parent->tw.loose_pages; @@ -11285,8 +11409,6 @@ int mdbx_txn_commit(MDBX_txn *txn) { env->me_txn = parent; parent->mt_child = NULL; - txn->mt_signature = 0; - mdbx_free(txn); mdbx_tassert(parent, mdbx_dirtylist_check(parent)); /* Scan parent's loose page for suitable for refund */ @@ -11296,13 +11418,18 @@ int mdbx_txn_commit(MDBX_txn *txn) { break; } } + + ts_4 = ts_3 = latency ? mdbx_osal_monotime() : 0; + txn->mt_signature = 0; + mdbx_free(txn); mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - return MDBX_SUCCESS; + rc = MDBX_SUCCESS; + goto provide_latency; } mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == MDBX_DPL_TXNFULL); - mdbx_cursors_eot(txn, 0); + mdbx_cursors_eot(txn, false); end_mode |= MDBX_END_EOTDONE; if (txn->tw.dirtylist->length == 0 && @@ -11346,17 +11473,23 @@ int mdbx_txn_commit(MDBX_txn *txn) { } } + ts_1 = latency ? mdbx_osal_monotime() : 0; rc = mdbx_update_gc(txn); if (unlikely(rc != MDBX_SUCCESS)) goto fail; + ts_2 = latency ? 
mdbx_osal_monotime() : 0; if (mdbx_audit_enabled()) { rc = mdbx_audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); + const uint64_t audit_end = mdbx_osal_monotime(); + audit_duration = mdbx_osal_monotime_to_16dot16(audit_end - ts_2); + ts_2 = audit_end; if (unlikely(rc != MDBX_SUCCESS)) goto fail; } rc = mdbx_page_flush(txn, 0); + ts_3 = latency ? mdbx_osal_monotime() : 0; if (likely(rc == MDBX_SUCCESS)) { if (txn->mt_dbs[MAIN_DBI].md_flags & DBI_DIRTY) txn->mt_dbs[MAIN_DBI].md_mod_txnid = pp_txnid2chk(txn); @@ -11378,6 +11511,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { rc = mdbx_sync_locked( env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); } + ts_4 = latency ? mdbx_osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { env->me_flags |= MDBX_FATAL_ERROR; goto fail; @@ -11386,19 +11520,34 @@ int mdbx_txn_commit(MDBX_txn *txn) { end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: - return mdbx_txn_end(txn, end_mode); + rc = mdbx_txn_end(txn, end_mode); + +provide_latency: + if (latency) { + latency->audit = audit_duration; + latency->preparation = + ts_1 ? mdbx_osal_monotime_to_16dot16(ts_1 - ts_0) : 0; + latency->gc = + (ts_1 && ts_2) ? mdbx_osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->write = + (ts_2 && ts_3) ? mdbx_osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->sync = + (ts_3 && ts_4) ? mdbx_osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + const uint64_t ts_5 = mdbx_osal_monotime(); + latency->ending = ts_4 ? 
mdbx_osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + latency->whole = mdbx_osal_monotime_to_16dot16(ts_5 - ts_0); + } + return rc; fail: mdbx_txn_abort(txn); - return rc; + goto provide_latency; } -static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, - uint64_t *filesize, - const MDBX_page *const page, - const unsigned meta_number, - MDBX_meta *dest, - const unsigned guess_pagesize) { +static __cold int +mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize, + const MDBX_page *const page, const unsigned meta_number, + MDBX_meta *dest, const unsigned guess_pagesize) { if (meta->mm_magic_and_version != MDBX_DATA_MAGIC && meta->mm_magic_and_version != MDBX_DATA_MAGIC_DEVEL) { mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, @@ -11590,7 +11739,7 @@ static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, /* Read the environment parameters of a DB environment * before mapping it into memory. */ -static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest, +static __cold int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, uint64_t *filesize, const int lck_exclusive) { int rc = mdbx_filesize(env->me_lazy_fd, filesize); @@ -12080,7 +12229,7 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2); } -int __cold mdbx_env_create(MDBX_env **penv) { +__cold int mdbx_env_create(MDBX_env **penv) { MDBX_env *env = mdbx_calloc(1, sizeof(MDBX_env)); if (unlikely(!env)) return MDBX_ENOMEM; @@ -12422,9 +12571,11 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, rc = MDBX_EPERM; goto bailout; } - rc = mdbx_rdt_lock(env); - if (unlikely(rc != MDBX_SUCCESS)) + int err = mdbx_rdt_lock(env); + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; goto bailout; + } /* Check if there are any reading threads that do not use the SRWL */ const size_t CurrentTid = GetCurrentThreadId(); @@ -12481,11 
+12632,11 @@ bailout: return rc; } -int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { - return mdbx_env_set_geometry(env, size, size, size, -1, -1, -1); +__cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) { + return __inline_mdbx_env_set_mapsize(env, size); } -int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { +__cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12500,7 +12651,7 @@ int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { return MDBX_SUCCESS; } -int __cold mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs) { +__cold int mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12512,7 +12663,7 @@ int __cold mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs) { return MDBX_SUCCESS; } -int __cold mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { +__cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12527,7 +12678,7 @@ int __cold mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { return MDBX_SUCCESS; } -int __cold mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { +__cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12540,7 +12691,7 @@ int __cold mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { } /* Further setup required for opening an MDBX environment */ -static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { +static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { uint64_t filesize_before; MDBX_meta meta; int rc = MDBX_RESULT_FALSE; @@ -12711,6 +12862,10 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { mdbx_verbose("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", bootid.x, 
bootid.y, (bootid.x | bootid.y) ? "" : "not-"); + /* calculate readahead hint before mmap with zero redundant pages */ + const bool readahead = + (env->me_flags & MDBX_NORDAHEAD) == 0 && + mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; err = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, env->me_dbgeo.upper, lck_rc ? MMAP_OPTION_TRUNCATE : 0); if (unlikely(err != MDBX_SUCCESS)) @@ -12974,9 +13129,6 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { #endif /* MADV_DONTNEED */ } - const bool readahead = (env->me_flags & MDBX_NORDAHEAD) == 0 && - mdbx_is_readahead_reasonable(env->me_dxb_mmap.current, - 0) == MDBX_RESULT_TRUE; err = mdbx_set_readahead(env, 0, used_bytes, readahead); if (err != MDBX_SUCCESS && lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) return err; @@ -12987,7 +13139,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { /******************************************************************************/ /* Open and/or initialize the lock region for the environment. */ -static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, +static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, mdbx_mode_t mode) { mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE); @@ -13001,7 +13153,9 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, /* ensure the file system is read-only */ err = mdbx_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); - if (err != MDBX_SUCCESS) + if (err != MDBX_SUCCESS && + /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ + !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) return err; /* LY: without-lck mode (e.g. 
exclusive or on read-only filesystem) */ @@ -13394,22 +13548,21 @@ __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, 0); } -__cold int mdbx_env_open(MDBX_env *env, const char *pathname, - MDBX_env_flags_t flags, mdbx_mode_t mode) { - int rc = check_env(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - +typedef struct { + void *buffer_for_free; + char *lck, *dxb; + size_t ent_len; +} MDBX_handle_env_pathname; + +__cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, + const char *pathname, + MDBX_env_flags_t *flags, + const mdbx_mode_t mode) { + int rc; + memset(ctx, 0, sizeof(*ctx)); if (unlikely(!pathname)) return MDBX_EINVAL; - if (flags & ~ENV_USABLE_FLAGS) - return MDBX_EINVAL; - - if (env->me_lazy_fd != INVALID_HANDLE_VALUE || - (env->me_flags & MDBX_ENV_ACTIVE) != 0) - return MDBX_EPERM; - #if defined(_WIN32) || defined(_WIN64) const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) @@ -13417,33 +13570,28 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) return ERROR_INVALID_NAME; -#endif /* Windows */ - - /* pickup previously mdbx_env_set_flags(), - * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ - flags = merge_sync_flags(flags, env->me_flags); -#if defined(_WIN32) || defined(_WIN64) const DWORD dwAttrib = GetFileAttributesW(pathnameW); if (dwAttrib == INVALID_FILE_ATTRIBUTES) { rc = GetLastError(); if (rc != MDBX_ENOFILE) return rc; - if (mode == 0 || (flags & MDBX_RDONLY) != 0) + if (mode == 0 || (*flags & MDBX_RDONLY) != 0) /* can't open existing */ return rc; /* auto-create directory if requested */ - if ((flags & MDBX_NOSUBDIR) == 0 && !CreateDirectoryW(pathnameW, nullptr)) { + if ((*flags & MDBX_NOSUBDIR) == 0 && + !CreateDirectoryW(pathnameW, nullptr)) { rc = GetLastError(); if (rc != ERROR_ALREADY_EXISTS) return rc; 
} } else { /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - flags |= MDBX_NOSUBDIR; + *flags |= MDBX_NOSUBDIR; if (dwAttrib & FILE_ATTRIBUTE_DIRECTORY) - flags -= MDBX_NOSUBDIR; + *flags -= MDBX_NOSUBDIR; } #else struct stat st; @@ -13451,7 +13599,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, rc = errno; if (rc != MDBX_ENOFILE) return rc; - if (mode == 0 || (flags & MDBX_RDONLY) != 0) + if (mode == 0 || (*flags & MDBX_RDONLY) != 0) /* can't open existing */ return rc; @@ -13462,41 +13610,151 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, /* always add read/write/search for owner */ S_IRWXU | ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) | ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0); - if ((flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { + if ((*flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { rc = errno; if (rc != EEXIST) return rc; } } else { /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - flags |= MDBX_NOSUBDIR; + *flags |= MDBX_NOSUBDIR; if (S_ISDIR(st.st_mode)) - flags -= MDBX_NOSUBDIR; + *flags -= MDBX_NOSUBDIR; } #endif - size_t len_full, len = strlen(pathname); - if (flags & MDBX_NOSUBDIR) { - len_full = len + sizeof(MDBX_LOCK_SUFFIX) + len + 1; - } else { - len_full = len + sizeof(MDBX_LOCKNAME) + len + sizeof(MDBX_DATANAME); + static const char dxb_name[] = MDBX_DATANAME; + static const size_t dxb_name_len = sizeof(dxb_name) - 1; + static const char lck_name[] = MDBX_LOCKNAME; + static const char lock_suffix[] = MDBX_LOCK_SUFFIX; + + ctx->ent_len = strlen(pathname); + if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len >= dxb_name_len && + !memcmp(dxb_name, pathname + ctx->ent_len - dxb_name_len, dxb_name_len)) { + *flags -= MDBX_NOSUBDIR; + ctx->ent_len -= dxb_name_len; } - char *lck_pathname = mdbx_malloc(len_full); - if (!lck_pathname) + + const size_t bytes_needed = + ctx->ent_len * 2 + ((*flags & MDBX_NOSUBDIR) + ? 
sizeof(lock_suffix) + 1 + : sizeof(lck_name) + sizeof(dxb_name)); + ctx->buffer_for_free = mdbx_malloc(bytes_needed); + if (!ctx->buffer_for_free) return MDBX_ENOMEM; - char *dxb_pathname; - if (flags & MDBX_NOSUBDIR) { - dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCK_SUFFIX); - sprintf(lck_pathname, "%s" MDBX_LOCK_SUFFIX, pathname); - strcpy(dxb_pathname, pathname); + ctx->lck = ctx->buffer_for_free; + if (*flags & MDBX_NOSUBDIR) { + ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lock_suffix); + sprintf(ctx->lck, "%s%s", pathname, lock_suffix); + strcpy(ctx->dxb, pathname); } else { - dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCKNAME); - sprintf(lck_pathname, "%s" MDBX_LOCKNAME, pathname); - sprintf(dxb_pathname, "%s" MDBX_DATANAME, pathname); + ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lck_name); + sprintf(ctx->lck, "%.*s%s", (int)ctx->ent_len, pathname, lck_name); + sprintf(ctx->dxb, "%.*s%s", (int)ctx->ent_len, pathname, dxb_name); } - rc = MDBX_SUCCESS; + return MDBX_SUCCESS; +} + +__cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { + switch (mode) { + default: + return MDBX_EINVAL; + case MDBX_ENV_JUST_DELETE: + case MDBX_ENV_ENSURE_UNUSED: + case MDBX_ENV_WAIT_FOR_UNUSED: + break; + } + + MDBX_env dummy_env; + memset(&dummy_env, 0, sizeof(dummy_env)); + dummy_env.me_flags = + (mode == MDBX_ENV_ENSURE_UNUSED) ? 
MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; + dummy_env.me_psize = dummy_env.me_os_psize = (unsigned)mdbx_syspagesize(); + dummy_env.me_pathname = (char *)pathname; + + MDBX_handle_env_pathname env_pathname; + STATIC_ASSERT(sizeof(dummy_env.me_flags) == sizeof(MDBX_env_flags_t)); + int rc = MDBX_RESULT_TRUE, + err = mdbx_handle_env_pathname( + &env_pathname, pathname, (MDBX_env_flags_t *)&dummy_env.me_flags, 0); + if (likely(err == MDBX_SUCCESS)) { + mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, + dxb_handle = INVALID_HANDLE_VALUE; + if (mode > MDBX_ENV_JUST_DELETE) { + err = mdbx_openfile(MDBX_OPEN_DELETE, &dummy_env, env_pathname.dxb, + &dxb_handle, 0); + err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; + if (err == MDBX_SUCCESS) { + err = mdbx_openfile(MDBX_OPEN_DELETE, &dummy_env, env_pathname.lck, + &clk_handle, 0); + err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; + } + if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) + err = mdbx_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE) + err = mdbx_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + } + + if (err == MDBX_SUCCESS) { + err = mdbx_removefile(env_pathname.dxb); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } + + if (err == MDBX_SUCCESS) { + err = mdbx_removefile(env_pathname.lck); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } + + if (err == MDBX_SUCCESS && !(dummy_env.me_flags & MDBX_NOSUBDIR)) { + err = mdbx_removedirectory(pathname); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } + + if (dxb_handle != INVALID_HANDLE_VALUE) + mdbx_closefile(dxb_handle); + if (clk_handle != INVALID_HANDLE_VALUE) + mdbx_closefile(clk_handle); + } else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + + mdbx_free(env_pathname.buffer_for_free); + return (err 
== MDBX_SUCCESS) ? rc : err; +} + +__cold int mdbx_env_open(MDBX_env *env, const char *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { + int rc = check_env(env); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (flags & ~ENV_USABLE_FLAGS) + return MDBX_EINVAL; + + if (env->me_lazy_fd != INVALID_HANDLE_VALUE || + (env->me_flags & MDBX_ENV_ACTIVE) != 0) + return MDBX_EPERM; + + /* pickup previously mdbx_env_set_flags(), + * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ + const uint32_t saved_me_flags = env->me_flags; + flags = merge_sync_flags(flags, env->me_flags); + + MDBX_handle_env_pathname env_pathname; + rc = mdbx_handle_env_pathname(&env_pathname, pathname, &flags, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (flags & MDBX_RDONLY) { /* LY: silently ignore irrelevant flags when * we're only getting read access */ @@ -13524,38 +13782,39 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, rc = MDBX_ENOMEM; } - const uint32_t saved_me_flags = env->me_flags; env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - env->me_path = mdbx_strdup(pathname); + env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1); env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); env->me_dbiseqs = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); - if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { + if (!(env->me_dbxs && env->me_pathname && env->me_dbflags && + env->me_dbiseqs)) { rc = MDBX_ENOMEM; goto bailout; } + memcpy(env->me_pathname, env_pathname.dxb, env_pathname.ent_len); env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; rc = mdbx_openfile(F_ISSET(flags, MDBX_RDONLY) ? 
MDBX_OPEN_DXB_READ : MDBX_OPEN_DXB_LAZY, - env, dxb_pathname, &env->me_lazy_fd, mode); + env, env_pathname.dxb, &env->me_lazy_fd, mode); if (rc != MDBX_SUCCESS) goto bailout; mdbx_assert(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { - rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, dxb_pathname, + rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, &env->me_dsync_fd, 0); mdbx_ensure(env, (rc != MDBX_SUCCESS) == (env->me_dsync_fd == INVALID_HANDLE_VALUE)); } #if MDBX_LOCKING == MDBX_LOCKING_SYSV - env->me_sysv_ipc.key = ftok(dxb_pathname, 42); + env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42); if (env->me_sysv_ipc.key == -1) { rc = errno; goto bailout; @@ -13565,6 +13824,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, #if !(defined(_WIN32) || defined(_WIN64)) if (mode == 0) { /* pickup mode for lck-file */ + struct stat st; if (fstat(env->me_lazy_fd, &st)) { rc = errno; goto bailout; @@ -13577,12 +13837,19 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0); #endif /* !Windows */ - const int lck_rc = mdbx_setup_lck(env, lck_pathname, mode); + const int lck_rc = mdbx_setup_lck(env, env_pathname.lck, mode); if (MDBX_IS_ERROR(lck_rc)) { rc = lck_rc; goto bailout; } + /* Set the position in files outside of the data to avoid corruption + * due to erroneous use of file descriptors in the application code. 
*/ + mdbx_fseek(env->me_lfd, UINT64_C(1) << 63); + mdbx_fseek(env->me_lazy_fd, UINT64_C(1) << 63); + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) + mdbx_fseek(env->me_dsync_fd, UINT64_C(1) << 63); + const MDBX_env_flags_t rigorous_flags = MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC | @@ -13667,8 +13934,8 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_txn *txn = mdbx_calloc(1, size); if (txn) { txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); + txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned *)(txn->tw.cursors + env->me_maxdbs); txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; @@ -13712,12 +13979,12 @@ bailout: mdbx_txn_valgrind(env, nullptr); #endif } - mdbx_free(lck_pathname); + mdbx_free(env_pathname.buffer_for_free); return rc; } /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -static int __cold mdbx_env_close0(MDBX_env *env) { +static __cold int mdbx_env_close0(MDBX_env *env) { env->me_stuck_meta = -1; if (!(env->me_flags & MDBX_ENV_ACTIVE)) { mdbx_ensure(env, env->me_lcklist_next == nullptr); @@ -13773,7 +14040,7 @@ static int __cold mdbx_env_close0(MDBX_env *env) { mdbx_memalign_free(env->me_pbuf); mdbx_free(env->me_dbiseqs); mdbx_free(env->me_dbflags); - mdbx_free(env->me_path); + mdbx_free(env->me_pathname); mdbx_free(env->me_dirtylist); if (env->me_txn0) { mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed); @@ -13786,7 +14053,7 @@ static int __cold mdbx_env_close0(MDBX_env *env) { return rc; } -int __cold mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { +__cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { MDBX_page *dp; int rc = MDBX_SUCCESS; @@ -13866,7 +14133,7 @@ int __cold 
mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { } __cold int mdbx_env_close(MDBX_env *env) { - return mdbx_env_close_ex(env, false); + return __inline_mdbx_env_close(env); } /* Compare two items pointing at aligned unsigned int's. */ @@ -13903,7 +14170,7 @@ static int __hot cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { } } -/* Compare two items pointing at unsigneds of unknown alignment. +/* Compare two items pointing at unsigned values with unknown alignment. * * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ static int __hot cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { @@ -14059,7 +14326,7 @@ static MDBX_node *__hot mdbx_node_search(MDBX_cursor *mc, const MDBX_val *key, static void mdbx_cursor_adjust(MDBX_cursor *mc, func) { MDBX_cursor *m2; - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { func(mc, m2); } @@ -14068,13 +14335,11 @@ static void mdbx_cursor_adjust(MDBX_cursor *mc, func) { #endif /* Pop a page off the top of the cursor's stack. */ -static void mdbx_cursor_pop(MDBX_cursor *mc) { +static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { if (mc->mc_snum) { mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p", mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); - - mc->mc_snum--; - if (mc->mc_snum) { + if (--mc->mc_snum) { mc->mc_top--; } else { mc->mc_flags &= ~C_INITIALIZED; @@ -14084,7 +14349,7 @@ static void mdbx_cursor_pop(MDBX_cursor *mc) { /* Push a page onto the top of the cursor's stack. * Set MDBX_TXN_ERROR on failure. 
*/ -static int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { +static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc), (void *)mc); @@ -14113,7 +14378,7 @@ static int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { * * Returns 0 on success, non-zero on failure. */ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, - int *lvl, const txnid_t pp_txnid) { + int *lvl, txnid_t pp_txnid) { MDBX_txn *txn = mc->mc_txn; if (unlikely(pgno >= txn->mt_next_pgno)) { mdbx_error("page #%" PRIaPGNO " beyond next-pgno", pgno); @@ -14138,8 +14403,11 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, * because the dirty list got full. Bring this page * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). */ - if (txn->tw.spill_pages && mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) + if (txn->tw.spill_pages && + mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) { + pp_txnid = txn->mt_txnid; goto spilled; + } p = mdbx_dpl_find(txn->tw.dirtylist, pgno); if (p) goto dirty; @@ -14495,9 +14763,6 @@ int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { - DKBUF; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); - int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14516,21 +14781,7 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDBX_val save_data = *data; - int exact = 0; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_RANGE, &exact); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (exact && (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) != 0) { - *data = save_data; - exact = 0; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_GET_BOTH_RANGE, 
&exact); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - - return exact ? MDBX_SUCCESS : MDBX_RESULT_TRUE; + return mdbx_cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); } int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, @@ -14584,15 +14835,15 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, * Replaces the page at the top of the cursor's stack with the specified * sibling, if one exists. * - * [in] mc The cursor for this operation. - * [in] move_right Non-zero if the right sibling is requested, - * otherwise the left sibling. + * [in] mc The cursor for this operation. + * [in] dir SIBLING_LEFT or SIBLING_RIGHT. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { +static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { int rc; - MDBX_node *indx; + MDBX_node *node; MDBX_page *mp; + assert(dir == SIBLING_LEFT || dir == SIBLING_RIGHT); if (unlikely(mc->mc_snum < 2)) return MDBX_NOTFOUND; /* root has no siblings */ @@ -14601,29 +14852,28 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { mdbx_debug("parent page is page %" PRIaPGNO ", index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); - if (move_right + if ((dir == SIBLING_RIGHT) ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) : (mc->mc_ki[mc->mc_top] == 0)) { - mdbx_debug("no more keys left, moving to %s sibling", - move_right ? "right" : "left"); - if (unlikely((rc = mdbx_cursor_sibling(mc, move_right)) != MDBX_SUCCESS)) { + mdbx_debug("no more keys aside, moving to next %s sibling", + dir ? "right" : "left"); + if (unlikely((rc = mdbx_cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { /* undo cursor_pop before returning */ mc->mc_top++; mc->mc_snum++; return rc; } } else { - if (move_right) - mc->mc_ki[mc->mc_top]++; - else - mc->mc_ki[mc->mc_top]--; - mdbx_debug("just moving to %s index key %u", move_right ? 
"right" : "left", + assert((dir - 1) == -1 || (dir - 1) == 1); + mc->mc_ki[mc->mc_top] += dir - 1; + mdbx_debug("just moving to %s index key %u", + (dir == SIBLING_RIGHT) ? "right" : "left", mc->mc_ki[mc->mc_top]); } mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); - indx = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdbx_page_get(mc, node_pgno(indx), &mp, NULL, + node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, NULL, pp_txnid4chk(mp, mc->mc_txn))) != 0)) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -14633,9 +14883,9 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { rc = mdbx_cursor_push(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (!move_right) - mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; + mc->mc_ki[mc->mc_top] = + (indx_t)((dir == SIBLING_LEFT) ? page_numkeys(mp) - 1 : 0); return MDBX_SUCCESS; } @@ -14685,17 +14935,21 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, goto skip; } - if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp)) { + int ki = mc->mc_ki[mc->mc_top]; + mc->mc_ki[mc->mc_top] = (indx_t)++ki; + const int numkeys = page_numkeys(mp); + if (unlikely(ki >= numkeys)) { mdbx_debug("%s", "=====> move to next sibling page"); - if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDBX_SUCCESS)) { + mc->mc_ki[mc->mc_top] = numkeys - 1; + if (unlikely((rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT)) != + MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; } mp = mc->mc_pg[mc->mc_top]; mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); - } else - mc->mc_ki[mc->mc_top]++; + } skip: mdbx_debug("==> cursor points to page %" PRIaPGNO @@ -14718,18 +14972,14 @@ skip: rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (data) { + rc = 
mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { if (unlikely((rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn))) != MDBX_SUCCESS)) return rc; - - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } } get_key_optional(node, key); @@ -14781,22 +15031,24 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_flags &= ~(C_EOF | C_DEL); - if (mc->mc_ki[mc->mc_top] == 0) { + int ki = mc->mc_ki[mc->mc_top]; + mc->mc_ki[mc->mc_top] = (indx_t)--ki; + if (unlikely(ki < 0)) { + mc->mc_ki[mc->mc_top] = 0; mdbx_debug("%s", "=====> move to prev sibling page"); - if ((rc = mdbx_cursor_sibling(mc, 0)) != MDBX_SUCCESS) { + if ((rc = mdbx_cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) return rc; - } mp = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; mdbx_debug("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); - } else - mc->mc_ki[mc->mc_top]--; - + } mdbx_debug("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + if (unlikely(!IS_LEAF(mp))) + return MDBX_CORRUPTED; + if (IS_LEAF2(mp)) { if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; @@ -14805,25 +15057,20 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_SUCCESS; } - mdbx_cassert(mc, IS_LEAF(mp)); node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (data) { + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { if (unlikely((rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, 
mc->mc_txn))) != MDBX_SUCCESS)) return rc; - - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } } get_key_optional(node, key); @@ -14878,8 +15125,9 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); mp = mc->mc_pg[mc->mc_top]; - if (!page_numkeys(mp)) { + if (unlikely(!page_numkeys(mp))) { mc->mc_ki[mc->mc_top] = 0; + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } if (IS_LEAF2(mp)) { @@ -14895,6 +15143,9 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, * was the one we wanted. */ mc->mc_ki[mc->mc_top] = 0; *exactp = 1; + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto set1; } if (rc > 0) { @@ -14913,6 +15164,9 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1); *exactp = 1; + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto set1; } if (rc < 0) { @@ -14929,6 +15183,9 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (rc == 0) { /* current node was the one we wanted */ *exactp = 1; + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto set1; } } @@ -14946,17 +15203,22 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* There are no other pages */ mdbx_cassert(mc, nkeys <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } } if (!mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = 0; - if (op == MDBX_SET_RANGE && exactp == &stub_exactp) { + if (op == MDBX_SET_RANGE) { rc = 0; goto set1; - } else + } else { + 
mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); return MDBX_NOTFOUND; + } } } else { mc->mc_pg[0] = 0; @@ -14971,21 +15233,28 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, set2: node = mdbx_node_search(mc, &aligned_key, exactp); - if (exactp != &stub_exactp && !*exactp) { + if (!*exactp && op != MDBX_SET_RANGE) { /* MDBX_SET specified and not an exact match. */ + if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } if (node == NULL) { mdbx_debug("%s", "===> inexact leaf not found, goto sibling"); - if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDBX_SUCCESS)) { + if (unlikely((rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT)) != + MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; /* no entries matched */ } mp = mc->mc_pg[mc->mc_top]; mdbx_cassert(mc, IS_LEAF(mp)); - node = page_node(mp, 0); + if (!IS_LEAF2(mp)) + node = page_node(mp, 0); } + mdbx_cassert(mc, + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); set1: mc->mc_flags |= C_INITIALIZED; @@ -15003,19 +15272,16 @@ set1: rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (likely(data)) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { - if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - } else { - int ex2 = 0, *ex2p = (op == MDBX_GET_BOTH) ? 
&ex2 : NULL; - rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_SET_RANGE, ex2p); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - } else if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { + if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_SET_RANGE, NULL); + } + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { + if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { mdbx_cassert(mc, !"Invalid data-size"); @@ -15050,20 +15316,20 @@ set1: return rc; rc = mc->mc_dbx->md_dcmp(&aligned_data, &olddata); if (rc) { + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); if (op != MDBX_GET_BOTH_RANGE || rc > 0) return MDBX_NOTFOUND; + *exactp = 0; rc = 0; } *data = olddata; - } else { - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) - return rc; - } + } else if (unlikely((rc = mdbx_node_read(mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], + mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; } /* The key already matches in all other cases */ @@ -15095,28 +15361,29 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { mc->mc_ki[mc->mc_top] = 0; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + } return MDBX_SUCCESS; } MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], 0); - if 
(likely(data)) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) - return rc; - } + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read( + mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; } + get_key_optional(node, key); return MDBX_SUCCESS; } @@ -15128,12 +15395,10 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (likely((mc->mc_flags & (C_EOF | C_DEL)) != C_EOF)) { - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } if (unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) @@ -15143,28 +15408,28 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { mc->mc_flags |= C_INITIALIZED | C_EOF; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], - key->iov_len); + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], + mc->mc_ki[mc->mc_top], key->iov_len); + } return 
MDBX_SUCCESS; } MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (likely(data)) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) - return rc; - } + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read( + mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; } get_key_optional(node, key); @@ -15177,7 +15442,8 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -15188,12 +15454,13 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, switch (op) { case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return MDBX_EINVAL; + return MDBX_ENODATA; MDBX_page *mp = mc->mc_pg[mc->mc_top]; const unsigned nkeys = page_numkeys(mp); if (mc->mc_ki[mc->mc_top] >= nkeys) { mdbx_cassert(mc, nkeys <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } mdbx_cassert(mc, nkeys > 0); @@ -15214,14 +15481,17 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; + } else { + rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_GET_CURRENT); + if (unlikely(rc)) + return rc; } - rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_GET_CURRENT); } else { rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn)); + if (unlikely(rc)) + return rc; } - if (unlikely(rc)) - return rc; } } break; @@ -15239,8 +15509,13 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_SET_RANGE: if (unlikely(key == NULL)) return MDBX_EINVAL; - rc = mdbx_cursor_set(mc, key, data, op, - op == MDBX_SET_RANGE ? 
NULL : &exact); + rc = mdbx_cursor_set(mc, key, data, op, &exact); + if (mc->mc_flags & C_INITIALIZED) { + mdbx_cassert(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); + } break; case MDBX_GET_MULTIPLE: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) @@ -15248,8 +15523,8 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) return MDBX_INCOMPATIBLE; rc = MDBX_SUCCESS; - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || - (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + if ((mc->mc_xcursor->mx_cursor.mc_flags & (C_INITIALIZED | C_EOF)) != + C_INITIALIZED) break; goto fetchm; case MDBX_NEXT_MULTIPLE: @@ -15283,7 +15558,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (rc == MDBX_SUCCESS) { MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor; if (mx->mc_flags & C_INITIALIZED) { - rc = mdbx_cursor_sibling(mx, 0); + rc = mdbx_cursor_sibling(mx, SIBLING_LEFT); if (rc == MDBX_SUCCESS) goto fetchm; } else { @@ -15313,6 +15588,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_INCOMPATIBLE; if (mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) { mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]); + mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } { @@ -15334,6 +15610,33 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_LAST_DUP: mfunc = mdbx_cursor_last; goto mmove; + case MDBX_SET_LOWERBOUND: { + if (unlikely(key == NULL || data == NULL)) + return MDBX_EINVAL; + MDBX_val save_data = *data; + rc = mdbx_cursor_set(mc, key, data, MDBX_SET_RANGE, &exact); + if (rc == MDBX_SUCCESS && exact && mc->mc_xcursor) { + mc->mc_flags &= ~C_DEL; + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + *data = save_data; + exact = 0; + rc = 
mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_SET_RANGE, &exact); + if (rc == MDBX_NOTFOUND) { + mdbx_cassert(mc, !exact); + rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + } + } else { + int cmp = mc->mc_dbx->md_dcmp(&save_data, data); + exact = (cmp == 0); + if (cmp > 0) + rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + } + } + if (rc == MDBX_SUCCESS && !exact) + rc = MDBX_RESULT_TRUE; + break; + } default: mdbx_debug("unhandled/unimplemented cursor operation %u", op); return MDBX_EINVAL; @@ -15363,6 +15666,7 @@ static int mdbx_cursor_touch(MDBX_cursor *mc) { if (unlikely(rc)) return rc; *mc->mc_dbistate |= DBI_DIRTY; + mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; } mc->mc_top = 0; if (mc->mc_snum) { @@ -15387,12 +15691,14 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; + mdbx_cassert(mc, cursor_is_tracked(mc)); env = mc->mc_txn->mt_env; /* Check this first so counter will always be zero on any early failures. 
*/ @@ -15570,7 +15876,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top]++; /* step forward for appending */ rc = MDBX_NOTFOUND; } else { - if (unlikely(rc != 0 || !(flags & MDBX_APPENDDUP))) + if (unlikely(rc != MDBX_SUCCESS || !(flags & MDBX_APPENDDUP))) /* new-key < last-key * or new-key == last-key without MDBX_APPENDDUP */ return MDBX_EKEYMISMATCH; @@ -15667,6 +15973,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, data->iov_len); } *mc->mc_dbistate |= DBI_DIRTY; + mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) np->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; @@ -16050,18 +16357,17 @@ new_sub:; rc = mdbx_node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, nflags); if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ - MDBX_cursor *m2, *m3; - MDBX_dbi dbi = mc->mc_dbi; - unsigned i = mc->mc_top; - MDBX_page *mp = mc->mc_pg[i]; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + const MDBX_dbi dbi = mc->mc_dbi; + const unsigned i = mc->mc_top; + MDBX_page *const mp = mc->mc_pg[i]; + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; + m2 = m2->mc_next) { + MDBX_cursor *m3 = + (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; - if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { - m3->mc_ki[i]++; - } + if (m3->mc_ki[i] >= mc->mc_ki[i]) + m3->mc_ki[i] += insert_key; if (XCURSOR_INITED(m3)) XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); } @@ -16111,7 +16417,7 @@ new_sub:; MDBX_page *mp = mc->mc_pg[i]; const int nkeys = page_numkeys(mp); - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) @@ -16184,7 +16490,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -16205,6 +16512,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return rc; MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (unlikely(!IS_LEAF(mp))) + return MDBX_CORRUPTED; if (IS_LEAF2(mp)) goto del_key; @@ -16215,9 +16524,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { - if (!F_ISSET(node_flags(node), F_SUBDATA)) { + if (!F_ISSET(node_flags(node), F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); if (unlikely(rc)) return rc; @@ -16235,7 +16543,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { node = page_node(mp, mc->mc_ki[mc->mc_top]); mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); /* fix other sub-DB cursors pointed at fake pages on this page */ - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) 
{ + for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) @@ -16847,7 +17155,7 @@ int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) { if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EINVAL; + return MDBX_EBADSIGN; MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer); couple->mc_userctx = ctx; @@ -16870,26 +17178,9 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(!mc)) return MDBX_EINVAL; - if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE)) { - if (unlikely(mc->mc_signature != MDBX_MC_LIVE || mc->mc_backup)) - return MDBX_EINVAL; - if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_PROBLEM; - if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) - prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; - } - mc->mc_signature = MDBX_MC_READY4CLOSE; - mc->mc_flags = 0; - mc->mc_dbi = UINT_MAX; - } - - assert(!mc->mc_backup && !mc->mc_flags); - if (unlikely(mc->mc_backup || mc->mc_flags)) - return MDBX_PROBLEM; + if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && + mc->mc_signature != MDBX_MC_LIVE)) + return MDBX_EBADSIGN; int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -16901,13 +17192,52 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) return MDBX_EACCESS; + if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ { + mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE); + if (unlikely(mc->mc_dbi != dbi || + /* paranoia */ mc->mc_signature != MDBX_MC_LIVE || + mc->mc_txn != txn)) + return MDBX_EINVAL; + + assert(mc->mc_db == &txn->mt_dbs[dbi]); + 
assert(mc->mc_dbx == &txn->mt_dbxs[dbi]); + assert(mc->mc_dbi == dbi); + assert(mc->mc_dbistate == &txn->mt_dbistate[dbi]); + return likely(mc->mc_dbi == dbi && + /* paranoia */ mc->mc_signature == MDBX_MC_LIVE && + mc->mc_txn == txn) + ? MDBX_SUCCESS + : MDBX_EINVAL /* Disallow change DBI in nested transactions */; + } + + if (mc->mc_signature == MDBX_MC_LIVE) { + if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_PROBLEM; + if (mc->mc_flags & C_UNTRACK) { + mdbx_cassert(mc, !(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY)); + MDBX_cursor **prev = &mc->mc_txn->tw.cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + mdbx_cassert(mc, *prev == mc); + *prev = mc->mc_next; + } + mc->mc_signature = MDBX_MC_READY4CLOSE; + mc->mc_flags = 0; + mc->mc_dbi = UINT_MAX; + mc->mc_next = NULL; + mc->mc_db = NULL; + mc->mc_dbx = NULL; + mc->mc_dbistate = NULL; + } + mdbx_cassert(mc, !(mc->mc_flags & C_UNTRACK)); + rc = mdbx_cursor_init(mc, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (txn->mt_cursors) { - mc->mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = mc; + if (!(txn->mt_flags & MDBX_TXN_RDONLY)) { + mc->mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = mc; mc->mc_flags |= C_UNTRACK; } @@ -16937,43 +17267,39 @@ int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { return likely(mc) ? mdbx_cursor_bind(txn, mc, mc->mc_dbi) : MDBX_EINVAL; } -/* Return the count of duplicate data items for the current key */ -int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { - if (unlikely(mc == NULL)) +int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { + if (unlikely(!src)) return MDBX_EINVAL; + if (unlikely(src->mc_signature != MDBX_MC_LIVE)) + return (src->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; - - int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + int rc = mdbx_cursor_bind(src->mc_txn, dest, src->mc_dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED))) - return MDBX_EINVAL; - - if (!mc->mc_snum) { - *countp = 0; - return MDBX_NOTFOUND; + assert(dest->mc_db == src->mc_db); + assert(dest->mc_dbi == src->mc_dbi); + assert(dest->mc_dbx == src->mc_dbx); + assert(dest->mc_dbistate == src->mc_dbistate); +again: + assert(dest->mc_txn == src->mc_txn); + dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK; + dest->mc_top = src->mc_top; + dest->mc_snum = src->mc_snum; + for (unsigned i = 0; i < src->mc_snum; ++i) { + dest->mc_ki[i] = src->mc_ki[i]; + dest->mc_pg[i] = src->mc_pg[i]; } - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) { - *countp = 0; - return MDBX_NOTFOUND; + if (src->mc_xcursor) { + dest->mc_xcursor->mx_db = src->mc_xcursor->mx_db; + dest->mc_xcursor->mx_dbx = src->mc_xcursor->mx_dbx; + src = &src->mc_xcursor->mx_cursor; + dest = &dest->mc_xcursor->mx_cursor; + goto again; } - *countp = 1; - if (mc->mc_xcursor != NULL) { - MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & - C_INITIALIZED)); - *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) - ? PTRDIFF_MAX - : (size_t)mc->mc_xcursor->mx_db.md_entries; - } - } return MDBX_SUCCESS; } @@ -16985,17 +17311,19 @@ void mdbx_cursor_close(MDBX_cursor *mc) { /* Remove from txn, if tracked. * A read-only txn (!C_UNTRACK) may have been freed already, * so do not peek inside it. Only write txns track cursors. 
*/ - if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + if (mc->mc_flags & C_UNTRACK) { + mdbx_cassert(mc, !(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY)); + MDBX_cursor **prev = &mc->mc_txn->tw.cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; + mdbx_cassert(mc, *prev == mc); + *prev = mc->mc_next; } mc->mc_signature = 0; + mc->mc_next = mc; mdbx_free(mc); } else { - /* cursor closed before nested txn ends */ + /* Cursor closed before nested txn ends */ mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE); mc->mc_signature = MDBX_MC_WAIT4EOT; } @@ -17019,6 +17347,47 @@ MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) { return mc->mc_dbi; } +/* Return the count of duplicate data items for the current key */ +int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED))) + return MDBX_EINVAL; + + if (!mc->mc_snum) { + *countp = 0; + return MDBX_NOTFOUND; + } + + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) { + *countp = 0; + return MDBX_NOTFOUND; + } + + *countp = 1; + if (mc->mc_xcursor != NULL) { + MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & + C_INITIALIZED)); + *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) + ? PTRDIFF_MAX + : (size_t)mc->mc_xcursor->mx_db.md_entries; + } + } + return MDBX_SUCCESS; +} + /* Replace the key for a branch node with a new key. 
* Set MDBX_TXN_ERROR on failure. * [in] mc Cursor pointing to the node to operate on. @@ -17033,6 +17402,7 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { int ptr, i, nkeys, indx; DKBUF; + mdbx_cassert(mc, cursor_is_tracked(mc)); indx = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; node = page_node(mp, indx); @@ -17042,8 +17412,8 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; k2.iov_base = node_key(node); k2.iov_len = node_ks(node); - mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %" PRIaPGNO, indx, - ptr, mdbx_dump_val(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), + mdbx_debug("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, + indx, ptr, mdbx_dump_val(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), mp->mp_pgno); } @@ -17151,7 +17521,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { const unsigned snum = cdst->mc_snum; mdbx_cassert(csrc, snum > 0); MDBX_cursor mn; - mdbx_cursor_copy(cdst, &mn); + cursor_copy_internal(cdst, &mn); mn.mc_xcursor = NULL; /* must find the lowest key below dst */ rc = mdbx_page_search_lowest(&mn); @@ -17190,7 +17560,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - rc = mdbx_update_key(&mn, &key); + WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); if (unlikely(rc)) return rc; } else { @@ -17274,7 +17644,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); if (fromleft) { /* If we're adding on the left, bump others up */ - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (csrc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) continue; @@ -17294,7 +17664,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { } } else { /* Adding on the right, bump others down */ - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; if (m3 == csrc) continue; @@ -17333,7 +17703,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]", psrc->mp_pgno, DKEY(&key)); MDBX_cursor mn; - mdbx_cursor_copy(csrc, &mn); + cursor_copy_internal(csrc, &mn); mn.mc_xcursor = NULL; mdbx_cassert(csrc, mn.mc_snum > 0); mn.mc_snum--; @@ -17368,7 +17738,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]", pdst->mp_pgno, DKEY(&key)); MDBX_cursor mn; - mdbx_cursor_copy(cdst, &mn); + cursor_copy_internal(cdst, &mn); mn.mc_xcursor = NULL; mdbx_cassert(cdst, mn.mc_snum > 0); mn.mc_snum--; @@ -17405,6 +17775,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { int rc; mdbx_cassert(csrc, csrc != cdst); + mdbx_cassert(csrc, cursor_is_tracked(csrc)); + mdbx_cassert(cdst, cursor_is_tracked(cdst)); const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, @@ -17449,7 +17821,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { key.iov_base = node_key(srcnode); if (pagetype & P_BRANCH) { MDBX_cursor mn; - mdbx_cursor_copy(csrc, &mn); + cursor_copy_internal(csrc, &mn); mn.mc_xcursor = NULL; /* must find the lowest key below src */ rc = mdbx_page_search_lowest(&mn); @@ -17534,7 +17906,7 @@ static int 
mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { const MDBX_dbi dbi = csrc->mc_dbi; const unsigned top = csrc->mc_top; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; if (m3 == csrc || top >= m3->mc_snum) continue; @@ -17645,7 +18017,7 @@ bailout: /* Copy the contents of a cursor. * [in] csrc The cursor to copy from. * [out] cdst The cursor to copy to. */ -static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { +static void cursor_copy_internal(const MDBX_cursor *csrc, MDBX_cursor *cdst) { mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= *csrc->mc_txn->mt_env->me_oldest); cdst->mc_txn = csrc->mc_txn; @@ -17666,6 +18038,7 @@ static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { * [in] mc Cursor pointing to the page where rebalancing should begin. * Returns 0 on success, non-zero on failure. */ static int mdbx_rebalance(MDBX_cursor *mc) { + mdbx_cassert(mc, cursor_is_tracked(mc)); mdbx_cassert(mc, mc->mc_snum > 0); mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); @@ -17723,8 +18096,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mc->mc_db->md_overflow_pages == 0 && mc->mc_db->md_leaf_pages == 1); /* Adjust cursors pointing to mp */ - const MDBX_dbi dbi = mc->mc_dbi; - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; @@ -17758,10 +18130,10 @@ static int mdbx_rebalance(MDBX_cursor *mc) { } /* Adjust other cursors pointing to mp */ - MDBX_cursor *m2, *m3; - MDBX_dbi dbi = mc->mc_dbi; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { + MDBX_cursor *m3 = + (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) continue; if (m3->mc_pg[0] == mp) { @@ -17802,7 +18174,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { /* Find neighbors. */ MDBX_cursor mn; - mdbx_cursor_copy(mc, &mn); + cursor_copy_internal(mc, &mn); mn.mc_xcursor = NULL; MDBX_page *left = nullptr, *right = nullptr; @@ -17839,7 +18211,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { /* We want mdbx_rebalance to find mn when doing fixups */ WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); if (likely(rc != MDBX_RESULT_TRUE)) { - mdbx_cursor_copy(&mn, mc); + cursor_copy_internal(&mn, mc); mc->mc_ki[mc->mc_top] = new_ki; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; @@ -17852,7 +18224,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = nkeys; - rc = mdbx_page_merge(&mn, mc); + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = ki_top; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); @@ -17867,7 +18239,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mn.mc_ki[mn.mc_top - 1] = ki_pre_top - 1; mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1); mc->mc_ki[mc->mc_top] = 0; - rc = mdbx_node_move(&mn, mc, true); + WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, true)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = ki_top + 1; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); @@ -17880,7 +18252,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = nkeys; - rc = mdbx_node_move(&mn, mc, false); + WITH_CURSOR_TRACKING(mn, rc 
= mdbx_node_move(&mn, mc, false)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = ki_top; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); @@ -17908,19 +18280,20 @@ static int mdbx_rebalance(MDBX_cursor *mc) { /* We want mdbx_rebalance to find mn when doing fixups */ WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); if (likely(rc != MDBX_RESULT_TRUE)) { - mdbx_cursor_copy(&mn, mc); + cursor_copy_internal(&mn, mc); mc->mc_ki[mc->mc_top] = new_ki; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } - } else if (likely(right)) { + } + if (likely(right)) { /* try merge with right */ mdbx_cassert(mc, page_numkeys(right) >= minkeys); mn.mc_pg[mn.mc_top] = right; mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = nkeys; - rc = mdbx_page_merge(&mn, mc); + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = ki_top; mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); @@ -18058,7 +18431,7 @@ static __cold int mdbx_page_check(MDBX_cursor *const mc, lp->mp_pgno); continue; } - if (unlikely(number_of_ovpages(env, dsize) != lp->mp_pages)) + if (unlikely(number_of_ovpages(env, dsize) > lp->mp_pages)) rc = bad_page(mp, "big-node size (%zu) mismatch n-pages size (%u)\n", dsize, lp->mp_pages); @@ -18278,135 +18651,115 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { MDBX_page *mp; indx_t ki; unsigned nkeys; - MDBX_cursor *m2, *m3; MDBX_dbi dbi = mc->mc_dbi; + mdbx_cassert(mc, cursor_is_tracked(mc)); mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; mdbx_node_del(mc, mc->mc_db->md_xsize); mc->mc_db->md_entries--; - { - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - } - continue; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; + + /* Adjust other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDBX_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } } + rc = mdbx_rebalance(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - if (likely(rc == MDBX_SUCCESS)) { + if (unlikely(!mc->mc_snum)) { /* DB is totally empty now, just bail out. * Other cursors adjustments were already done * by mdbx_rebalance and aren't needed here. 
*/ - if (!mc->mc_snum) { - mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && - mc->mc_db->md_root == P_INVALID); - mc->mc_flags |= C_DEL | C_EOF; - return rc; - } - - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - nkeys = page_numkeys(mp); - mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || - ((mc->mc_flags & C_SUB) && - mc->mc_db->md_entries == 0 && nkeys == 0)); + mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && + mc->mc_db->md_root == P_INVALID); + mc->mc_flags |= C_EOF; + return MDBX_SUCCESS; + } - /* Adjust THIS and other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdbx_cursor_sibling(m3, true); - if (rc == MDBX_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; - continue; - } else if (unlikely(rc != MDBX_SUCCESS)) - break; + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + nkeys = page_numkeys(mp); + mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || + ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && + nkeys == 0)); + + /* Adjust this and other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdbx_cursor_sibling(m3, SIBLING_RIGHT); + if (rc == MDBX_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDBX_SUCCESS; + continue; } - if (m3->mc_ki[mc->mc_top] >= ki || m3->mc_pg[mc->mc_top] != mp) { - if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && - (m3->mc_flags & C_EOF) == 0) { - MDBX_node *node = - page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. */ - if (node_flags(node) & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - break; - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + if (m3->mc_ki[mc->mc_top] >= ki || + /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { + if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { + MDBX_node *node = + page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not inited it must be reinited. + * Else if node points to a subDB, nothing is needed. 
*/ + if (node_flags(node) & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node_flags(node) & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); + } else { + rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = mdbx_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; } } + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; } + m3->mc_flags |= C_DEL; } } - - if (mc->mc_ki[mc->mc_top] >= nkeys) { - rc = mdbx_cursor_sibling(mc, true); - if (rc == MDBX_NOTFOUND) { - mc->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; - } - } - if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && - (mc->mc_flags & C_EOF) == 0) { - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. */ - if (node_flags(node) & F_DUPDATA) { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); - if (likely(rc != MDBX_SUCCESS)) - mc->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - } - } - mc->mc_flags |= C_DEL; } - if (unlikely(rc)) - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - else if (mdbx_audit_enabled()) + mdbx_cassert(mc, rc == MDBX_SUCCESS); + if (mdbx_audit_enabled()) rc = mdbx_cursor_check(mc, 0); + return rc; +bailout: + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; } @@ -18461,10 +18814,10 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, * is larger than the current one, the parent page may * run out of space, triggering a split. We need this * cursor to be consistent until the end of the rebalance. 
*/ - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; rc = mdbx_cursor_del(&cx.outer, flags); - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; } return rc; } @@ -18551,7 +18904,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, mdbx_debug("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); } - mdbx_cursor_copy(mc, &mn); + cursor_copy_internal(mc, &mn); mn.mc_xcursor = NULL; mn.mc_pg[mn.mc_top] = rp; mn.mc_ki[mn.mc_top] = 0; @@ -18744,7 +19097,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, } else { /* find right page's left sibling */ mc->mc_ki[ptop] = mn.mc_ki[ptop]; - rc = mdbx_cursor_sibling(mc, false); + rc = mdbx_cursor_sibling(mc, SIBLING_LEFT); } } } else { @@ -18900,7 +19253,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, MDBX_dbi dbi = mc->mc_dbi; nkeys = page_numkeys(mp); - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + for (m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; if (m3 == mc) continue; @@ -18976,8 +19329,8 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, rc = mdbx_cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; /* LY: support for update (explicit overwrite) */ if (flags & MDBX_CURRENT) { @@ -18998,7 +19351,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, if (likely(rc == MDBX_SUCCESS)) rc = mdbx_cursor_put(&cx.outer, key, data, flags); - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; return rc; } @@ -19008,7 +19361,6 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, #ifndef MDBX_WBUF #define MDBX_WBUF ((size_t)1024 * 1024) #endif -#define MDBX_EOF 0x10 /* mdbx_env_copythr() is done reading */ /* State needed for a double-buffering compacting copy. */ typedef struct mdbx_copy { @@ -19020,53 +19372,67 @@ typedef struct mdbx_copy { size_t mc_wlen[2]; size_t mc_olen[2]; mdbx_filehandle_t mc_fd; - volatile int mc_error; - pgno_t mc_next_pgno; - short mc_toggle; /* Buffer number in provider */ - short mc_new; /* (0-2 buffers to write) | (MDBX_EOF at end) */ /* Error code. Never cleared if set. Both threads can set nonzero * to fail the copy. Not mutex-protected, MDBX expects atomic int. */ + volatile int mc_error; + pgno_t mc_next_pgno; + volatile unsigned mc_head; + volatile unsigned mc_tail; } mdbx_copy; /* Dedicated writer thread for compacting copy. 
*/ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { mdbx_copy *my = arg; - uint8_t *ptr; - int toggle = 0; + +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + sigset_t sigset; + sigemptyset(&sigset); + sigaddset(&sigset, SIGPIPE); + my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); +#endif /* EPIPE */ mdbx_condpair_lock(&my->mc_condpair); while (!my->mc_error) { - while (!my->mc_new && !my->mc_error) { + while (my->mc_tail == my->mc_head && !my->mc_error) { int err = mdbx_condpair_wait(&my->mc_condpair, true); if (err != MDBX_SUCCESS) { my->mc_error = err; goto bailout; } } - if (my->mc_new == 0 + MDBX_EOF) /* 0 buffers, just EOF */ - break; + const unsigned toggle = my->mc_tail & 1; size_t wsize = my->mc_wlen[toggle]; - ptr = my->mc_wbuf[toggle]; + if (wsize == 0) { + my->mc_tail += 1; + break /* EOF */; + } + my->mc_wlen[toggle] = 0; + uint8_t *ptr = my->mc_wbuf[toggle]; again: - if (wsize > 0 && !my->mc_error) { + if (!my->mc_error) { int err = mdbx_write(my->mc_fd, ptr, wsize); if (err != MDBX_SUCCESS) { +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + if (err == EPIPE) { + /* Collect the pending SIGPIPE, + * otherwise at least OS X gives it to the process on thread-exit. */ + int unused; + sigwait(&sigset, &unused); + } +#endif /* EPIPE */ my->mc_error = err; goto bailout; } } /* If there's an overflow page tail, write it too */ - if (my->mc_olen[toggle]) { - wsize = my->mc_olen[toggle]; - ptr = my->mc_over[toggle]; + wsize = my->mc_olen[toggle]; + if (wsize) { my->mc_olen[toggle] = 0; + ptr = my->mc_over[toggle]; goto again; } - my->mc_wlen[toggle] = 0; - toggle ^= 1; - /* Return the empty buffer to provider */ - my->mc_new--; + my->mc_tail += 1; mdbx_condpair_signal(&my->mc_condpair, false); } bailout: @@ -19074,24 +19440,19 @@ bailout: return (THREAD_RESULT)0; } -/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. - * - * [in] my control structure. 
- * [in] adjust (1 to hand off 1 buffer) | (MDBX_EOF when ending). */ -static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { +/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */ +static __cold int mdbx_env_cthr_toggle(mdbx_copy *my) { mdbx_condpair_lock(&my->mc_condpair); - my->mc_new += (short)adjust; + mdbx_assert(my->mc_env, my->mc_head - my->mc_tail < 2 || my->mc_error); + my->mc_head += 1; mdbx_condpair_signal(&my->mc_condpair, true); - while (!my->mc_error && (my->mc_new & 2) /* both buffers in use */) { + while (!my->mc_error && + my->mc_head - my->mc_tail == 2 /* both buffers in use */) { int err = mdbx_condpair_wait(&my->mc_condpair, false); if (err != MDBX_SUCCESS) my->mc_error = err; } mdbx_condpair_unlock(&my->mc_condpair); - - my->mc_toggle ^= (adjust & 1); - /* Both threads reset mc_wlen, to be safe from threading errors */ - my->mc_wlen[my->mc_toggle] = 0; return my->mc_error; } @@ -19099,11 +19460,11 @@ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { * [in] my control structure. * [in,out] pg database root. * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. */ -static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { +static __cold int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { MDBX_cursor_couple couple; MDBX_page *mo, *mp, *leaf; char *buf, *ptr; - int rc, toggle; + int rc; unsigned i; /* Empty DB, nothing to do */ @@ -19139,11 +19500,9 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { /* This is writable space for a leaf page. Usually not needed. 
*/ leaf = (MDBX_page *)ptr; - toggle = my->mc_toggle; while (couple.outer.mc_snum > 0) { - unsigned n; mp = couple.outer.mc_pg[couple.outer.mc_top]; - n = page_numkeys(mp); + unsigned n = page_numkeys(mp); if (IS_LEAF(mp)) { if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { @@ -19166,11 +19525,12 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { pp_txnid4chk(mp, my->mc_txn)); if (unlikely(rc != MDBX_SUCCESS)) goto done; - if (my->mc_wlen[toggle] >= MDBX_WBUF) { - rc = mdbx_env_cthr_toggle(my, 1); + unsigned toggle = my->mc_head & 1; + if (my->mc_wlen[toggle] + my->mc_env->me_psize > MDBX_WBUF) { + rc = mdbx_env_cthr_toggle(my); if (unlikely(rc != MDBX_SUCCESS)) goto done; - toggle = my->mc_toggle; + toggle = my->mc_head & 1; } mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); memcpy(mo, omp, my->mc_env->me_psize); @@ -19180,10 +19540,10 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { if (omp->mp_pages > 1) { my->mc_olen[toggle] = pgno2bytes(my->mc_env, omp->mp_pages - 1); my->mc_over[toggle] = (uint8_t *)omp + my->mc_env->me_psize; - rc = mdbx_env_cthr_toggle(my, 1); + rc = mdbx_env_cthr_toggle(my); if (unlikely(rc != MDBX_SUCCESS)) goto done; - toggle = my->mc_toggle; + toggle = my->mc_head & 1; } } else if (node_flags(node) & F_SUBDATA) { if (node_ds(node) != sizeof(MDBX_db)) { @@ -19201,11 +19561,9 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { MDBX_db db; memcpy(&db, node_data(node), sizeof(MDBX_db)); - my->mc_toggle = (short)toggle; rc = mdbx_env_cwalk(my, &db.md_root, node_flags(node) & F_DUPDATA); if (rc) goto done; - toggle = my->mc_toggle; memcpy(node_data(node), &db, sizeof(MDBX_db)); } } @@ -19234,11 +19592,12 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { continue; } } - if (my->mc_wlen[toggle] >= MDBX_WBUF) { - rc = mdbx_env_cthr_toggle(my, 1); + unsigned toggle = my->mc_head & 1; + if (my->mc_wlen[toggle] + my->mc_wlen[toggle] > 
MDBX_WBUF) { + rc = mdbx_env_cthr_toggle(my); if (unlikely(rc != MDBX_SUCCESS)) goto done; - toggle = my->mc_toggle; + toggle = my->mc_head & 1; } mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); mdbx_page_copy(mo, mp, my->mc_env->me_psize); @@ -19298,7 +19657,7 @@ static __cold void make_sizeable(MDBX_meta *meta) { } /* Copy environment with compaction. */ -static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, +static __cold int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, mdbx_filehandle_t fd, uint8_t *buffer, const bool dest_is_pipe, const int flags) { const size_t meta_bytes = pgno2bytes(env, NUM_METAS); @@ -19375,8 +19734,12 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, } if (rc == MDBX_SUCCESS) rc = mdbx_env_cwalk(&ctx, &root, 0); - mdbx_env_cthr_toggle(&ctx, 1 | MDBX_EOF); + mdbx_env_cthr_toggle(&ctx); + mdbx_env_cthr_toggle(&ctx); thread_err = mdbx_thread_join(thread); + mdbx_assert(env, (ctx.mc_tail == ctx.mc_head && + ctx.mc_wlen[ctx.mc_head & 1] == 0) || + ctx.mc_error); mdbx_condpair_destroy(&ctx.mc_condpair); } if (unlikely(thread_err != MDBX_SUCCESS)) @@ -19435,7 +19798,7 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, } /* Copy environment as-is. 
*/ -static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, +static __cold int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, mdbx_filehandle_t fd, uint8_t *buffer, const bool dest_is_pipe, const int flags) { /* We must start the actual read txn after blocking writers */ @@ -19542,7 +19905,7 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, return rc; } -int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, +__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, unsigned flags) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) @@ -19607,7 +19970,7 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, return rc; } -int __cold mdbx_env_copy(MDBX_env *env, const char *dest_path, +__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) @@ -19669,7 +20032,7 @@ int __cold mdbx_env_copy(MDBX_env *env, const char *dest_path, /******************************************************************************/ -int __cold mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, +__cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, bool onoff) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) @@ -19697,7 +20060,7 @@ int __cold mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, return MDBX_SUCCESS; } -int __cold mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { +__cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19709,7 +20072,7 @@ int __cold mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { return MDBX_SUCCESS; } -int __cold mdbx_env_set_userctx(MDBX_env *env, void *ctx) { +__cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19722,7 +20085,7 @@ void *__cold 
mdbx_env_get_userctx(const MDBX_env *env) { return env ? env->me_userctx : NULL; } -int __cold mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { +__cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19736,7 +20099,7 @@ int __cold mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { #endif } -int __cold mdbx_env_get_path(const MDBX_env *env, const char **arg) { +__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19744,11 +20107,11 @@ int __cold mdbx_env_get_path(const MDBX_env *env, const char **arg) { if (unlikely(!arg)) return MDBX_EINVAL; - *arg = env->me_path; + *arg = env->me_pathname; return MDBX_SUCCESS; } -int __cold mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { +__cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19778,11 +20141,11 @@ static void mdbx_stat0(const MDBX_env *env, const MDBX_db *db, MDBX_stat *dest, dest->ms_mod_txnid = db->md_mod_txnid; } -int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *dest, size_t bytes) { - return mdbx_env_stat_ex(env, NULL, dest, bytes); +__cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, size_t bytes) { + return __inline_mdbx_env_stat(env, stat, bytes); } -int __cold mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, +__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_stat *dest, size_t bytes) { if (unlikely((env == NULL && txn == NULL) || dest == NULL)) return MDBX_EINVAL; @@ -19820,7 +20183,7 @@ int __cold mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, } } -int __cold mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, +__cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, uint32_t *mask) { int rc = check_txn(txn, MDBX_TXN_BLOCKED); 
if (unlikely(rc != MDBX_SUCCESS)) @@ -19871,11 +20234,12 @@ int __cold mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; } -int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { - return mdbx_env_info_ex(env, NULL, arg, bytes); +__cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info, + size_t bytes) { + return __inline_mdbx_env_info(env, info, bytes); } -int __cold mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, +__cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *arg, size_t bytes) { if (unlikely((env == NULL && txn == NULL) || arg == NULL)) return MDBX_EINVAL; @@ -20249,6 +20613,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, goto later_bailout; dbiflags |= DBI_DIRTY | DBI_CREAT; + txn->mt_flags |= MDBX_TXN_DIRTY; } /* Got info, register DBI in this txn */ @@ -20266,6 +20631,8 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, txn->mt_dbistate[slot] = (uint8_t)dbiflags; txn->mt_dbxs[slot].md_name.iov_base = namedup; txn->mt_dbxs[slot].md_name.iov_len = len; + if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) + txn->tw.cursors[slot] = NULL; txn->mt_numdbs += (slot == txn->mt_numdbs); if ((dbiflags & DBI_CREAT) == 0) { env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; @@ -20294,7 +20661,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, return dbi_open(txn, table_name, table_flags, dbi, keycmp, datacmp); } -int __cold mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, +__cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, size_t bytes) { int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -20376,8 +20743,7 @@ int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, } int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { - unsigned state; - return mdbx_dbi_flags_ex(txn, 
dbi, flags, &state); + return __inline_mdbx_dbi_flags(txn, dbi, flags); } /* Add all the DB's pages to the free list. @@ -20404,7 +20770,7 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { if (unlikely(rc)) goto done; - mdbx_cursor_copy(mc, &mx); + cursor_copy_internal(mc, &mx); while (mc->mc_snum > 0) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; unsigned n = page_numkeys(mp); @@ -20446,7 +20812,7 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { break; mdbx_cassert(mc, i <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (indx_t)i; - rc = mdbx_cursor_sibling(mc, 1); + rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT); if (rc) { if (unlikely(rc != MDBX_NOTFOUND)) goto done; @@ -20501,7 +20867,7 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { rc = mdbx_drop0(mc, mc->mc_db->md_flags & MDBX_DUPSORT); /* Invalidate the dropped DB's cursors */ - for (MDBX_cursor *m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + for (MDBX_cursor *m2 = txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) m2->mc_flags &= ~(C_INITIALIZED | C_EOF); if (unlikely(rc)) goto bailout; @@ -20566,7 +20932,7 @@ int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { return MDBX_SUCCESS; } -int __cold mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, +__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, void *ctx) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) @@ -20666,7 +21032,7 @@ static bool __cold mdbx_pid_insert(uint32_t *ids, uint32_t pid) { return true; } -int __cold mdbx_reader_check(MDBX_env *env, int *dead) { +__cold int mdbx_reader_check(MDBX_env *env, int *dead) { if (dead) *dead = 0; return mdbx_cleanup_dead_readers(env, false, dead); @@ -20676,9 +21042,8 @@ int __cold mdbx_reader_check(MDBX_env *env, int *dead) { * MDBX_RESULT_TRUE - done and mutex recovered * MDBX_SUCCESS - done * Otherwise errcode. 
*/ -MDBX_INTERNAL_FUNC int __cold mdbx_cleanup_dead_readers(MDBX_env *env, - int rdt_locked, - int *dead) { +MDBX_INTERNAL_FUNC __cold int +mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20775,7 +21140,7 @@ MDBX_INTERNAL_FUNC int __cold mdbx_cleanup_dead_readers(MDBX_env *env, return rc; } -int __cold mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { +__cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { const int rc = mdbx_runtime_flags | (mdbx_loglevel << 16); if (loglevel != MDBX_LOG_DONTCHANGE) @@ -20893,7 +21258,7 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env, return mdbx_find_oldest(env->me_txn); } -int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { +__cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20913,7 +21278,7 @@ int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { return MDBX_SUCCESS; } -int __cold mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { +__cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20933,7 +21298,7 @@ int __cold mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { return MDBX_SUCCESS; } -int __cold mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { +__cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { int rc = check_env(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20990,7 +21355,7 @@ typedef struct mdbx_walk_ctx { bool mw_dont_check_keys_ordering; } mdbx_walk_ctx_t; -static int __cold mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, +static __cold int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, const char *name, int deep); static MDBX_page_type_t walk_page_type(const MDBX_page 
*mp) { @@ -21011,7 +21376,7 @@ static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { } /* Depth-first tree traversal. */ -static int __cold mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, +static __cold int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, const char *name, int deep, txnid_t parent_txnid) { assert(pgno != P_INVALID); @@ -21249,7 +21614,7 @@ static int __cold mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, return MDBX_SUCCESS; } -static int __cold mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, +static __cold int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, const char *name, int deep) { if (unlikely(db->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ @@ -21272,7 +21637,7 @@ static int __cold mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, return rc; } -int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, +__cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *user, bool dont_check_keys_ordering) { int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -21333,10 +21698,11 @@ int mdbx_cursor_on_first(const MDBX_cursor *mc) { return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_FALSE; + return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; for (unsigned i = 0; i < mc->mc_snum; ++i) { if (mc->mc_ki[i]) @@ -21351,10 +21717,11 @@ int mdbx_cursor_on_last(const MDBX_cursor *mc) { return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_FALSE; + return mc->mc_db->md_entries ? 
MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; for (unsigned i = 0; i < mc->mc_snum; ++i) { unsigned nkeys = page_numkeys(mc->mc_pg[i]); @@ -21370,7 +21737,8 @@ int mdbx_cursor_eof(const MDBX_cursor *mc) { return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; if ((mc->mc_flags & C_INITIALIZED) == 0) return MDBX_RESULT_TRUE; @@ -21401,9 +21769,13 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, r->level = 0; r->root_nkeys = 0; - if (unlikely(y->mc_signature != MDBX_MC_LIVE || - x->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + if (unlikely(x->mc_signature != MDBX_MC_LIVE)) + return (x->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + if (unlikely(y->mc_signature != MDBX_MC_LIVE)) + return (y->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn(x->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -21569,7 +21941,8 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(cursor->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + return (cursor->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; int rc = check_txn(cursor->mc_txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -21579,7 +21952,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, return MDBX_ENODATA; MDBX_cursor_couple next; - mdbx_cursor_copy(cursor, &next.outer); + cursor_copy_internal(cursor, &next.outer); next.outer.mc_xcursor = NULL; if (cursor->mc_db->md_flags & MDBX_DUPSORT) { next.outer.mc_xcursor = &next.inner; @@ -21587,7 +21960,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner; - mdbx_cursor_copy(&mx->mx_cursor, &next.inner.mx_cursor); + cursor_copy_internal(&mx->mx_cursor, &next.inner.mx_cursor); } MDBX_val stub = {0, 0}; @@ -21824,8 +22197,8 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, rc = mdbx_cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; MDBX_val present_key = *key; if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) { @@ -21901,7 +22274,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS); bailout: - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; return rc; } @@ -22040,6 +22413,14 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, /*----------------------------------------------------------------------------*/ +__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_min(void) { + return __inline_mdbx_limits_pgsize_min(); +} + +__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_max(void) { + return __inline_mdbx_limits_pgsize_max(); +} + __cold intptr_t mdbx_limits_dbsize_min(intptr_t 
pagesize) { if (pagesize < 1) pagesize = (intptr_t)mdbx_syspagesize(); @@ -22143,6 +22524,14 @@ uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) { return float2key(ieee754_32bit); } +MDBX_NOTHROW_CONST_FUNCTION uint64_t mdbx_key_from_int64(const int64_t i64) { + return __inline_mdbx_key_from_int64(i64); +} + +MDBX_NOTHROW_CONST_FUNCTION uint32_t mdbx_key_from_int32(const int32_t i32) { + return __inline_mdbx_key_from_int32(i32); +} + #define IEEE754_DOUBLE_MANTISSA_SIZE 52 #define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF #define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF @@ -22409,10 +22798,10 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, rc = mdbx_cursor_set(&cx.outer, key, &old_data, MDBX_SET, NULL); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND && data) { - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; rc = mdbx_cursor_put_attr(&cx.outer, key, data, attr, 0); - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; } return rc; } @@ -22427,11 +22816,11 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, old_data.iov_len) == 0))) return MDBX_SUCCESS; - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; rc = mdbx_cursor_put_attr(&cx.outer, key, data ? 
data : &old_data, attr, MDBX_CURRENT); - txn->mt_cursors[dbi] = cx.outer.mc_next; + txn->tw.cursors[dbi] = cx.outer.mc_next; return rc; } #endif /* MDBX_NEXENTA_ATTRS */ @@ -22805,6 +23194,9 @@ typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 { #ifndef STATUS_INVALID_DEVICE_REQUEST #define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L) #endif +#ifndef STATUS_NOT_SUPPORTED +#define STATUS_NOT_SUPPORTED ((NTSTATUS)0xC00000BBL) +#endif #ifndef FILE_DEVICE_FILE_SYSTEM #define FILE_DEVICE_FILE_SYSTEM 0x00000009 @@ -23206,6 +23598,20 @@ MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { #endif } +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname) { +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + return RemoveDirectoryW(pathnameW) ? MDBX_SUCCESS : GetLastError(); +#else + return rmdir(pathname) ? errno : MDBX_SUCCESS; +#endif +} + MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, const MDBX_env *env, const char *pathname, mdbx_filehandle_t *fd, @@ -23256,6 +23662,12 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, FlagsAndAttributes |= (env->me_psize < env->me_os_psize) ? 
0 : FILE_FLAG_NO_BUFFERING; break; + case MDBX_OPEN_DELETE: + CreationDisposition = OPEN_EXISTING; + ShareMode |= FILE_SHARE_DELETE; + DesiredAccess = + FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES | DELETE | SYNCHRONIZE; + break; } *fd = CreateFileW(pathnameW, DesiredAccess, ShareMode, NULL, @@ -23304,6 +23716,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, flags |= O_FSYNC; #endif break; + case MDBX_OPEN_DELETE: + flags = O_RDWR; + break; } const bool direct_nocache_for_copy = @@ -23729,7 +24144,8 @@ static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { if (!(flags & MDBX_EXCLUSIVE)) return ERROR_REMOTE_STORAGE_MEDIA_ERROR; } else if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED && - rc != STATUS_INVALID_DEVICE_REQUEST) + rc != STATUS_INVALID_DEVICE_REQUEST && + rc != STATUS_NOT_SUPPORTED) return ntstatus2errcode(rc); } @@ -24113,10 +24529,9 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, LARGE_INTEGER SectionSize; int err, rc = MDBX_SUCCESS; - if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current) { + if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current && + /* workaround for Wine */ mdbx_NtExtendSection) { /* growth rw-section */ - if (!mdbx_NtExtendSection) - return MDBX_UNABLE_EXTEND_MAPSIZE /* workaround for Wine */; SectionSize.QuadPart = size; status = mdbx_NtExtendSection(map->section, &SectionSize); if (!NT_SUCCESS(status)) @@ -24173,6 +24588,7 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, return err; } +retry_file_and_section: /* resizing of the file may take a while, * therefore we reserve address space to avoid occupy it by other threads */ ReservedAddress = map->address; @@ -24188,7 +24604,6 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, map->address = NULL; } -retry_file_and_section: err = mdbx_filesize(map->fd, &map->filesize); if (err != MDBX_SUCCESS) goto 
bailout; @@ -24255,7 +24670,7 @@ retry_mapview:; * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */ rc = MDBX_UNABLE_EXTEND_MAPSIZE; size = map->current; - limit = map->limit; + ReservedSize = limit = map->limit; goto retry_file_and_section; } @@ -24997,10 +25412,10 @@ __dll_export const struct MDBX_version_info mdbx_version = { 0, 9, - 1, - 18, - {"2020-10-08T01:50:18+03:00", "5cbfdfdc65b62937f7fc927e55cec6304bf04f7c", "1d31ebdc1c930ceb1b223691ac27e507d86cf8e2", - "v0.9.1-18-g1d31ebdc1c"}, + 2, + 0, + {"2020-11-27T10:09:03+03:00", "481b41a5b377ff247625bd2cec60c0ca84f2f2f5", "092ab094c4f8e7c704030568db62bad5d106755f", + "v0.9.2-0-g092ab09"}, sourcery}; __dll_export @@ -25221,6 +25636,15 @@ MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { mdbx_srwlock_ReleaseShared(&env->me_remap_guard); } +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { + return flock(fd, + wait ? LCK_EXCLUSIVE | LCK_WAITFOR + : LCK_EXCLUSIVE | LCK_DONTWAIT, + 0, LCK_MAXLEN) + ? MDBX_SUCCESS + : GetLastError(); +} + static int suspend_and_append(mdbx_handle_array_t **array, const DWORD ThreadId) { const unsigned limit = (*array)->limit; @@ -25866,13 +26290,17 @@ static void mdbx_winnt_import(void) { #ifndef MDBX_ALLOY uint32_t mdbx_linux_kernel_version; -bool mdbx_RunningOnWSL; +bool mdbx_RunningOnWSL1; #endif /* MDBX_ALLOY */ -static __cold bool probe_for_WSL(const char *tag) { - /* "Official" way of detecting WSL but not WSL2 - * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 */ - return strstr(tag, "Microsoft") || strstr(tag, "WSL"); +static __cold uint8_t probe_for_WSL(const char *tag) { + const char *const WSL = strstr(tag, "WSL"); + if (WSL && WSL[3] >= '2' && WSL[3] <= '9') + return WSL[3] - '0'; + const char *const wsl = strstr(tag, "wsl"); + if (wsl && wsl[3] >= '2' && wsl[3] <= '9') + return wsl[3] - '0'; + return (WSL || wsl || strcasestr(tag, "Microsoft")) ? 
1 : 0; } #endif /* Linux */ @@ -25882,9 +26310,16 @@ mdbx_global_constructor(void) { #if defined(__linux__) || defined(__gnu_linux__) struct utsname buffer; if (uname(&buffer) == 0) { - mdbx_RunningOnWSL = probe_for_WSL(buffer.version) || - probe_for_WSL(buffer.sysname) || - probe_for_WSL(buffer.release); + /* "Official" way of detecting WSL1 but not WSL2 + * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 + * + * WARNING: False negative detection of WSL1 will result in DATA LOSS! + * So, the REQUIREMENTS for this code: + * 1. MUST detect WSL1 without false-negatives. + * 2. DESIRABLE detect WSL2 but without the risk of violating the first. */ + mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 || + probe_for_WSL(buffer.sysname) == 1 || + probe_for_WSL(buffer.release) == 1; int i = 0; char *p = buffer.release; while (*p && i < 4) { @@ -26030,6 +26465,14 @@ static int lck_op(mdbx_filehandle_t fd, int cmd, int lck, off_t offset, } } +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { +#if MDBX_USE_OFDLOCKS + if (unlikely(op_setlk == 0)) + choice_fcntl(); +#endif /* MDBX_USE_OFDLOCKS */ + return lck_op(fd, wait ? 
op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX); +} + MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_pid > 0); @@ -26146,10 +26589,10 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) { int rc = MDBX_SUCCESS; #if defined(__linux__) || defined(__gnu_linux__) - if (unlikely(mdbx_RunningOnWSL)) { + if (unlikely(mdbx_RunningOnWSL1)) { rc = ENOLCK /* No record locks available */; mdbx_error("%s, err %u", - "WSL (Windows Subsystem for Linux) is mad and trouble-full, " + "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " "injecting failure to avoid data loss", rc); return rc; diff --git a/libs/libmdbx/src/mdbx.c++ b/libs/libmdbx/src/mdbx.c++ index ef4cb789fb..e3c3552b39 100644 --- a/libs/libmdbx/src/mdbx.c++ +++ b/libs/libmdbx/src/mdbx.c++ @@ -12,11 +12,16 @@ * <http://www.OpenLDAP.org/license.html>. */ #define MDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 3b5677a6062b714f1e138b0066c5590ee3c9ebf3bf8cfa3bb9503515ea0d1f02_v0_9_1_18_g1d31ebdc1c +#define MDBX_BUILD_SOURCERY 47492323531afee427a3de6ddaeae26eed45bfd1b52d92fd121a5a13a9747dbb_v0_9_2_0_g092ab09 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif +#define LIBMDBX_INTERNALS +#ifdef MDBX_TOOLS +#define MDBX_DEPRECATED +#endif /* MDBX_TOOLS */ + /* *INDENT-OFF* */ /* clang-format off */ @@ -97,11 +102,6 @@ #pragma warning(disable : 4505) /* unreferenced local function has been removed */ #endif /* _MSC_VER (warnings) */ -#if defined(MDBX_TOOLS) -#undef MDBX_DEPRECATED -#define MDBX_DEPRECATED -#endif /* MDBX_TOOLS */ - #include "mdbx.h++" /* * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> @@ -817,7 +817,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) #ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +/* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* 
__ia32__ */ #if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ @@ -993,6 +993,35 @@ typedef union MDBX_srwlock { #ifdef __cplusplus extern void mdbx_osal_jitter(bool tiny); #else + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#include <cstdatomic> +#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include <stdatomic.h> +#elif defined(__GNUC__) || defined(__clang__) +/* LY: nothing required */ +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' */ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include <libkern/OSAtomic.h> +#else +#error FIXME atomic-ops +#endif + /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ @@ -1034,8 +1063,8 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) || __has_extension(cxx_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__ATOMIC_SEQ_CST) __atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__clang__) 
|| defined(__GNUC__) @@ -1088,8 +1117,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #if defined(__linux__) || defined(__gnu_linux__) MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; -MDBX_INTERNAL_VAR bool - mdbx_RunningOnWSL /* Windows Subsystem for Linux is mad and trouble-full */; +MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef mdbx_strdup @@ -1160,7 +1188,8 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4 + MDBX_OPEN_COPY = 4, + MDBX_OPEN_DELETE = 5 }; MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, @@ -1169,7 +1198,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, mdbx_mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 @@ -1429,32 +1460,6 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; #endif /* Windows */ -/*----------------------------------------------------------------------------*/ -/* Atomics */ - -#if !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) && \ - (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ - !(defined(__GNUC__) || defined(__clang__))) -#include <stdatomic.h> -#elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ -#elif defined(_MSC_VER) -#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ -#pragma warning(disable : 4133) /* 'function': incompatible types - from \ - 'size_t' to 'LONGLONG' */ -#pragma warning(disable : 4244) /* 'return': conversion from 
'LONGLONG' to \ - 'std::size_t', possible loss of data */ -#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ - 'long', possible loss of data */ -#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) -#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) -#elif defined(__APPLE__) -#include <libkern/OSAtomic.h> -#else -#error FIXME atomic-ops -#endif - #endif /* !__cplusplus */ /*----------------------------------------------------------------------------*/ @@ -1890,7 +1895,7 @@ typedef struct MDBX_db { pgno_t md_overflow_pages; /* number of overflow pages */ uint64_t md_seq; /* table sequence counter */ uint64_t md_entries; /* number of data items */ - uint64_t md_mod_txnid; /* txnid of last commited modification */ + uint64_t md_mod_txnid; /* txnid of last committed modification */ } MDBX_db; /* database size-related parameters */ @@ -1974,7 +1979,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ + uint64_t mp_txnid; /* txnid that committed this page */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -2222,7 +2227,7 @@ typedef struct MDBX_lockinfo { #if defined(_WIN32) || defined(_WIN64) #define MAX_MAPSIZE32 UINT32_C(0x38000000) #else -#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#define MAX_MAPSIZE32 UINT32_C(0x7f000000) #endif #define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) @@ -2381,8 +2386,6 @@ struct MDBX_txn { MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; - /* In write txns, array of cursors for each DB */ - MDBX_cursor **mt_cursors; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2409,6 +2412,8 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + /* In write txns, array 
of cursors for each DB */ + MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ pgno_t loose_refund_wl /* FIXME: describe */; @@ -2546,7 +2551,7 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck - unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ @@ -2556,7 +2561,7 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_path; /* path to the DB files */ + char *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ @@ -2832,7 +2837,7 @@ static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) /* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_LMDB_ERRCODE + 10) +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) /* Debugging output value of a cursor DBI: Negative in a sub-cursor. 
*/ #define DDBI(mc) \ @@ -3441,15 +3446,15 @@ __cold void error::throw_exception() const { bool slice::is_printable(bool disable_utf8) const noexcept { enum : byte { - LS = 5, // shift for UTF8 sequence length - P_ = 1 << (LS - 1), // printable ASCII flag + LS = 4, // shift for UTF8 sequence length + P_ = 1 << LS, // printable ASCII flag N_ = 0, // non-printable ASCII second_range_mask = P_ - 1, // mask for range flag - r80_BF = P_ | 0, // flag for UTF8 2nd byte range - rA0_BF = P_ | 1, // flag for UTF8 2nd byte range - r80_9F = P_ | 2, // flag for UTF8 2nd byte range - r90_BF = P_ | 3, // flag for UTF8 2nd byte range - r80_8F = P_ | 4, // flag for UTF8 2nd byte range + r80_BF = 0, // flag for UTF8 2nd byte range + rA0_BF = 1, // flag for UTF8 2nd byte range + r80_9F = 2, // flag for UTF8 2nd byte range + r90_BF = 3, // flag for UTF8 2nd byte range + r80_8F = 4, // flag for UTF8 2nd byte range // valid utf-8 byte sequences // http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 @@ -4117,9 +4122,35 @@ bool env::is_pristine() const { bool env::is_empty() const { return get_stat().ms_branch_pages == 0; } -env &env::copy(const path &destination, bool compactify, +#ifdef MDBX_STD_FILESYSTEM_PATH +env &env::copy(const ::std::filesystem::path &destination, bool compactify, + bool force_dynamic_size) { + const path_to_pchar<::std::filesystem::path> utf8(destination); + error::success_or_throw( + ::mdbx_env_copy(handle_, utf8, + (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | + (force_dynamic_size ? MDBX_CP_FORCE_DYNAMIC_SIZE + : MDBX_CP_DEFAULTS))); + return *this; +} +#endif /* MDBX_STD_FILESYSTEM_PATH */ + +#if defined(_WIN32) || defined(_WIN64) +env &env::copy(const ::std::wstring &destination, bool compactify, + bool force_dynamic_size) { + const path_to_pchar<::std::wstring> utf8(destination); + error::success_or_throw( + ::mdbx_env_copy(handle_, utf8, + (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | + (force_dynamic_size ? 
MDBX_CP_FORCE_DYNAMIC_SIZE + : MDBX_CP_DEFAULTS))); + return *this; +} +#endif /* Windows */ + +env &env::copy(const ::std::string &destination, bool compactify, bool force_dynamic_size) { - const path_to_pchar<path> utf8(destination); + const path_to_pchar<::std::string> utf8(destination); error::success_or_throw( ::mdbx_env_copy(handle_, utf8, (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | @@ -4143,6 +4174,29 @@ path env::get_path() const { return pchar_to_path<path>(c_str); } +#ifdef MDBX_STD_FILESYSTEM_PATH +bool env::remove(const ::std::filesystem::path &pathname, + const remove_mode mode) { + const path_to_pchar<::std::filesystem::path> utf8(pathname); + return error::boolean_or_throw( + ::mdbx_env_delete(utf8, MDBX_env_delete_mode_t(mode))); +} +#endif /* MDBX_STD_FILESYSTEM_PATH */ + +#if defined(_WIN32) || defined(_WIN64) +bool env::remove(const ::std::wstring &pathname, const remove_mode mode) { + const path_to_pchar<::std::wstring> utf8(pathname); + return error::boolean_or_throw( + ::mdbx_env_delete(utf8, MDBX_env_delete_mode_t(mode))); +} +#endif /* Windows */ + +bool env::remove(const ::std::string &pathname, const remove_mode mode) { + const path_to_pchar<::std::string> utf8(pathname); + return error::boolean_or_throw( + ::mdbx_env_delete(utf8, MDBX_env_delete_mode_t(mode))); +} + //------------------------------------------------------------------------------ static inline MDBX_env *create_env() { @@ -4179,11 +4233,43 @@ __cold void env_managed::setup(unsigned max_maps, unsigned max_readers) { error::success_or_throw(::mdbx_env_set_maxdbs(handle_, max_maps)); } -__cold env_managed::env_managed(const path &pathname, +#ifdef MDBX_STD_FILESYSTEM_PATH +__cold env_managed::env_managed(const ::std::filesystem::path &pathname, + const operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + const path_to_pchar<::std::filesystem::path> utf8(pathname); + error::success_or_throw( + 
::mdbx_env_open(handle_, utf8, op.make_flags(accede), 0)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + error::throw_exception(MDBX_INCOMPATIBLE); +} + +__cold env_managed::env_managed(const ::std::filesystem::path &pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + const path_to_pchar<::std::filesystem::path> utf8(pathname); + set_geometry(cp.geometry); + error::success_or_throw( + ::mdbx_env_open(handle_, utf8, op.make_flags(accede, cp.use_subdirectory), + cp.file_mode_bits)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + error::throw_exception(MDBX_INCOMPATIBLE); +} +#endif /* MDBX_STD_FILESYSTEM_PATH */ + +#if defined(_WIN32) || defined(_WIN64) +__cold env_managed::env_managed(const ::std::wstring &pathname, const operate_parameters &op, bool accede) : env_managed(create_env()) { setup(op.max_maps, op.max_readers); - const path_to_pchar<path> utf8(pathname); + const path_to_pchar<::std::wstring> utf8(pathname); error::success_or_throw( ::mdbx_env_open(handle_, utf8, op.make_flags(accede), 0)); @@ -4192,12 +4278,42 @@ __cold env_managed::env_managed(const path &pathname, error::throw_exception(MDBX_INCOMPATIBLE); } -__cold env_managed::env_managed(const path &pathname, +__cold env_managed::env_managed(const ::std::wstring &pathname, const env_managed::create_parameters &cp, const env::operate_parameters &op, bool accede) : env_managed(create_env()) { setup(op.max_maps, op.max_readers); - const path_to_pchar<path> utf8(pathname); + const path_to_pchar<::std::wstring> utf8(pathname); + set_geometry(cp.geometry); + error::success_or_throw( + ::mdbx_env_open(handle_, utf8, op.make_flags(accede, cp.use_subdirectory), + cp.file_mode_bits)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + 
error::throw_exception(MDBX_INCOMPATIBLE); +} +#endif /* Windows */ + +__cold env_managed::env_managed(const ::std::string &pathname, + const operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + const path_to_pchar<::std::string> utf8(pathname); + error::success_or_throw( + ::mdbx_env_open(handle_, utf8, op.make_flags(accede), 0)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + error::throw_exception(MDBX_INCOMPATIBLE); +} + +__cold env_managed::env_managed(const ::std::string &pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + const path_to_pchar<::std::string> utf8(pathname); set_geometry(cp.geometry); error::success_or_throw( ::mdbx_env_open(handle_, utf8, op.make_flags(accede, cp.use_subdirectory), @@ -4300,9 +4416,13 @@ __cold ::std::ostream &operator<<(::std::ostream &out, const slice &it) { out << "EMPTY->" << it.data(); else { const slice root(it.head(std::min(it.length(), size_t(64)))); - out << it.length() << "->" - << (root.is_printable() ? root.string() : root.base58_encode()) - << ((root == it) ? "" : "..."); + out << it.length() << "."; + if (root.is_printable()) + (out << "\"").write(root.char_ptr(), root.length()) << "\""; + else + out << root.base58_encode(); + if (root.length() < it.length()) + out << "..."; } return out << "}"; } @@ -4311,6 +4431,11 @@ __cold ::std::ostream &operator<<(::std::ostream &out, const pair &it) { return out << "{" << it.key << " => " << it.value << "}"; } +__cold ::std::ostream &operator<<(::std::ostream &out, const pair_result &it) { + return out << "{" << (it.done ? 
"done: " : "non-done: ") << it.key << " => " + << it.value << "}"; +} + __cold ::std::ostream &operator<<(::std::ostream &out, const ::mdbx::env::geometry::size &it) { switch (it.bytes) { diff --git a/libs/libmdbx/src/mdbx.h b/libs/libmdbx/src/mdbx.h index 4cf29e4502..966cf2e701 100644 --- a/libs/libmdbx/src/mdbx.h +++ b/libs/libmdbx/src/mdbx.h @@ -350,6 +350,17 @@ typedef mode_t mdbx_mode_t; #endif #endif /* __dll_import */ +/** \brief Auxiliary macro for robustly define the both inline version of API + * function and non-inline fallback dll-exported version for applications linked + * with old version of libmdbx, with a strictly ODR-common implementation. */ +#if !defined(LIBMDBX_INTERNALS) +#define LIBMDBX_INLINE_API(TYPE, NAME, ARGS) static __inline TYPE NAME ARGS +#else +#define LIBMDBX_INLINE_API(TYPE, NAME, ARGS) \ + /* proto of exported which uses common impl */ LIBMDBX_API TYPE NAME ARGS; \ + /* definition of common impl */ static __inline TYPE __inline_##NAME ARGS +#endif /* LIBMDBX_INLINE_API */ + /*----------------------------------------------------------------------------*/ #ifndef __cplusplus @@ -440,8 +451,27 @@ typedef mode_t mdbx_mode_t; #endif #endif /* MDBX_PRINTF_ARGS */ +/* Oh, below are some songs and dances since: + * - C++ requires explicit definition of the necessary operators. + * - the proper implementation of DEFINE_ENUM_FLAG_OPERATORS for C++ required + * the constexpr feature which is broken in most old compilers; + * - DEFINE_ENUM_FLAG_OPERATORS may be defined broken as in the Windows SDK. 
*/ #ifndef DEFINE_ENUM_FLAG_OPERATORS -#if defined(__cplusplus) + +#ifdef __cplusplus +#if !defined(__cpp_constexpr) || __cpp_constexpr < 200704L || \ + (defined(__LCC__) && __LCC__ < 124) || \ + (defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ < 407) && \ + !defined(__clang__) && !defined(__LCC__)) || \ + (defined(_MSC_VER) && _MSC_VER < 1910) || \ + (defined(__clang__) && __clang_major__ < 4) +/* The constexpr feature is not available or (may be) broken */ +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 0 +#else +/* C always allows these operators for enums */ +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 1 +#endif /* __cpp_constexpr */ + /// Define operator overloads to enable bit operations on enum values that are /// used to define flags (based on Microsoft's DEFINE_ENUM_FLAG_OPERATORS). #define DEFINE_ENUM_FLAG_OPERATORS(ENUM) \ @@ -453,19 +483,41 @@ typedef mode_t mdbx_mode_t; MDBX_CXX01_CONSTEXPR ENUM operator&(ENUM a, ENUM b) { \ return ENUM(std::size_t(a) & std::size_t(b)); \ } \ + MDBX_CXX01_CONSTEXPR ENUM operator&(ENUM a, size_t b) { \ + return ENUM(std::size_t(a) & b); \ + } \ + MDBX_CXX01_CONSTEXPR ENUM operator&(size_t a, ENUM b) { \ + return ENUM(a & std::size_t(b)); \ + } \ MDBX_CXX14_CONSTEXPR ENUM &operator&=(ENUM &a, ENUM b) { return a = a & b; } \ - MDBX_CXX01_CONSTEXPR ENUM operator~(ENUM a) { \ - return ENUM(~std::size_t(a)); \ + MDBX_CXX14_CONSTEXPR ENUM &operator&=(ENUM &a, size_t b) { \ + return a = a & b; \ + } \ + MDBX_CXX01_CONSTEXPR std::size_t operator~(ENUM a) { \ + return ~std::size_t(a); \ } \ MDBX_CXX01_CONSTEXPR ENUM operator^(ENUM a, ENUM b) { \ return ENUM(std::size_t(a) ^ std::size_t(b)); \ } \ MDBX_CXX14_CONSTEXPR ENUM &operator^=(ENUM &a, ENUM b) { return a = a ^ b; } \ } -#else /* __cplusplus */ -#define DEFINE_ENUM_FLAG_OPERATORS(ENUM) /* nope, C allows these operators */ -#endif /* !__cplusplus */ -#endif /* DEFINE_ENUM_FLAG_OPERATORS */ +#else /* __cplusplus */ +/* nope for C since it always allows these operators for enums 
*/ +#define DEFINE_ENUM_FLAG_OPERATORS(ENUM) +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 1 +#endif /* !__cplusplus */ + +#elif !defined(CONSTEXPR_ENUM_FLAGS_OPERATIONS) + +#ifdef __cplusplus +/* DEFINE_ENUM_FLAG_OPERATORS may be defined broken as in the Windows SDK */ +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 0 +#else +/* C always allows these operators for enums */ +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 1 +#endif + +#endif /* DEFINE_ENUM_FLAG_OPERATORS */ /** @} end of Common Macros */ @@ -1120,7 +1172,7 @@ enum MDBX_env_flags_t { /** Don't sync anything but keep previous steady commits. * - * Like \ref MDBX_UTTERLY_NOSYNC the `MDBX_SAFE_NOSYNC` flag similarly disable + * Like \ref MDBX_UTTERLY_NOSYNC the `MDBX_SAFE_NOSYNC` flag disable similarly * flush system buffers to disk when committing a transaction. But there is a * huge difference in how are recycled the MVCC snapshots corresponding to * previous "steady" transactions (see below). @@ -1202,7 +1254,7 @@ enum MDBX_env_flags_t { * - a system crash immediately after commit the write transaction * high likely lead to database corruption. * - successful completion of mdbx_env_sync(force = true) after one or - * more commited transactions guarantees consistency and durability. + * more committed transactions guarantees consistency and durability. * - BUT by committing two or more transactions you back database into * a weak state, in which a system crash may lead to database corruption! * In case single transaction after mdbx_env_sync, you may lose transaction @@ -1251,10 +1303,10 @@ enum MDBX_txn_flags_t { * will be ready for use with \ref mdbx_txn_renew(). This flag allows to * preallocate memory and assign a reader slot, thus avoiding these operations * at the next start of the transaction. 
*/ -#if defined(__cplusplus) && !defined(__cpp_constexpr) && !defined(DOXYGEN) - MDBX_TXN_RDONLY_PREPARE = uint32_t(MDBX_RDONLY) | uint32_t(MDBX_NOMEMINIT), +#if CONSTEXPR_ENUM_FLAGS_OPERATIONS || defined(DOXYGEN) + MDBX_TXN_RDONLY_PREPARE = MDBX_RDONLY | MDBX_NOMEMINIT, #else - MDBX_TXN_RDONLY_PREPARE = MDBX_RDONLY + MDBX_NOMEMINIT, + MDBX_TXN_RDONLY_PREPARE = uint32_t(MDBX_RDONLY) | uint32_t(MDBX_NOMEMINIT), #endif /** Do not block when starting a write transaction. */ @@ -1295,9 +1347,9 @@ enum MDBX_db_flags_t { /** With \ref MDBX_DUPSORT; sorted dup items have fixed size */ MDBX_DUPFIXED = UINT32_C(0x10), - /** With \ref MDBX_DUPSORT; dups are \ref MDBX_INTEGERKEY -style integers. The - * data values must all be of the same size and must be aligned while passing - * as arguments. */ + /** With \ref MDBX_DUPSORT and with \ref MDBX_DUPFIXED; dups are fixed size + * \ref MDBX_INTEGERKEY -style integers. The data values must all be of the + * same size and must be aligned while passing as arguments. */ MDBX_INTEGERDUP = UINT32_C(0x20), /** With \ref MDBX_DUPSORT; use reverse string comparison */ @@ -1462,7 +1514,20 @@ enum MDBX_cursor_op { /** \ref MDBX_DUPFIXED -only: Position at previous page and return up to * a page of duplicate data items. */ - MDBX_PREV_MULTIPLE + MDBX_PREV_MULTIPLE, + + /** Position at first key-value pair greater than or equal to specified, + * return both key and data, and the return code depends on a exact match. + * + * For non DUPSORT-ed collections this work the same to \ref MDBX_SET_RANGE, + * but returns \ref MDBX_SUCCESS if key found exactly and + * \ref MDBX_RESULT_TRUE if greater key was found. + * + * For DUPSORT-ed a data value is taken into account for duplicates, + * i.e. for a pairs/tuples of a key and an each data value of duplicates. + * Returns \ref MDBX_SUCCESS if key-value pair found exactly and + * \ref MDBX_RESULT_TRUE if the next pair was returned. 
*/ + MDBX_SET_LOWERBOUND }; #ifndef __cplusplus /** \ingroup c_cursors */ @@ -1737,8 +1802,9 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); * \param [in] env An environment handle returned * by \ref mdbx_env_create() * - * \param [in] pathname The directory in which the database files reside. - * This directory must already exist and be writable. + * \param [in] pathname The pathname for the database or the directory in which + * the database files reside. In the case of directory it + * must already exist and be writable. * * \param [in] flags Special options for this environment. This parameter * must be set to 0 or by bitwise OR'ing together one @@ -1800,6 +1866,49 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode); +/** \brief Deletion modes for \ref mdbx_env_delete(). + * \ingroup c_extra + * \see mdbx_env_delete() */ +enum MDBX_env_delete_mode_t { + /** \brief Just delete the environment's files and directory if any. + * \note On POSIX systems, processes already working with the database will + * continue to work without interference until it close the environment. + * \note On Windows, the behavior of `MDB_ENV_JUST_DELETE` is different + * because the system does not support deleting files that are currently + * memory mapped. */ + MDBX_ENV_JUST_DELETE = 0, + /** \brief Make sure that the environment is not being used by other + * processes, or return an error otherwise. */ + MDBX_ENV_ENSURE_UNUSED = 1, + /** \brief Wait until other processes closes the environment before deletion. + */ + MDBX_ENV_WAIT_FOR_UNUSED = 2, +}; +#ifndef __cplusplus +/** \ingroup c_extra */ +typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; +#endif + +/** \brief Delete the environment's files in a proper and multiprocess-safe way. 
+ * \ingroup c_extra + * + * \param [in] pathname The pathname for the database or the directory in which + * the database files reside. + * + * \param [in] mode Special deletion mode for the environment. This + * parameter must be set to one of the values described + * above in the \ref MDBX_env_delete_mode_t section. + * + * \note The \ref MDBX_ENV_JUST_DELETE don't supported on Windows since system + * unable to delete a memory-mapped files. + * + * \returns A non-zero error value on failure and 0 on success, + * some possible errors are: + * \retval MDBX_RESULT_TRUE No corresponding files or directories were found, + * so no deletion was performed. */ +LIBMDBX_API int mdbx_env_delete(const char *pathname, + MDBX_env_delete_mode_t mode); + /** \brief Copy an MDBX environment to the specified path, with options. * \ingroup c_extra * @@ -1867,7 +1976,7 @@ struct MDBX_stat { uint64_t ms_leaf_pages; /**< Number of leaf pages */ uint64_t ms_overflow_pages; /**< Number of overflow pages */ uint64_t ms_entries; /**< Number of data items */ - uint64_t ms_mod_txnid; /**< Transaction ID of commited last modification */ + uint64_t ms_mod_txnid; /**< Transaction ID of committed last modification */ }; #ifndef __cplusplus /** \ingroup c_statinfo */ @@ -1895,11 +2004,15 @@ typedef struct MDBX_stat MDBX_stat; * \returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_stat *stat, size_t bytes); + /** \brief Return statistics about the MDBX environment. * \ingroup c_statinfo * \deprecated Please use mdbx_env_stat_ex() instead. 
*/ -MDBX_DEPRECATED LIBMDBX_API int mdbx_env_stat(MDBX_env *env, MDBX_stat *stat, - size_t bytes); +MDBX_DEPRECATED LIBMDBX_INLINE_API(int, mdbx_env_stat, + (const MDBX_env *env, MDBX_stat *stat, + size_t bytes)) { + return mdbx_env_stat_ex(env, NULL, stat, bytes); +} /** \brief Information about the environment * \ingroup c_statinfo @@ -1985,8 +2098,11 @@ LIBMDBX_API int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, /** \brief Return information about the MDBX environment. * \ingroup c_statinfo * \deprecated Please use mdbx_env_info_ex() instead. */ -MDBX_DEPRECATED LIBMDBX_API int mdbx_env_info(MDBX_env *env, MDBX_envinfo *info, - size_t bytes); +MDBX_DEPRECATED LIBMDBX_INLINE_API(int, mdbx_env_info, + (const MDBX_env *env, MDBX_envinfo *info, + size_t bytes)) { + return mdbx_env_info_ex(env, NULL, info, bytes); +} /** \brief Flush the environment data buffers to disk. * \ingroup c_extra @@ -2028,12 +2144,16 @@ LIBMDBX_API int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock); /** \brief The shortcut to calling \ref mdbx_env_sync_ex() with * the `force=true` and `nonblock=false` arguments. * \ingroup c_extra */ -LIBMDBX_API int mdbx_env_sync(MDBX_env *env); +LIBMDBX_INLINE_API(int, mdbx_env_sync, (MDBX_env * env)) { + return mdbx_env_sync_ex(env, true, false); +} /** \brief The shortcut to calling \ref mdbx_env_sync_ex() with * the `force=false` and `nonblock=true` arguments. * \ingroup c_extra */ -LIBMDBX_API int mdbx_env_sync_poll(MDBX_env *env); +LIBMDBX_INLINE_API(int, mdbx_env_sync_poll, (MDBX_env * env)) { + return mdbx_env_sync_ex(env, false, true); +} /** \brief Sets threshold to force flush the data buffers to disk, even any of * \ref MDBX_SAFE_NOSYNC flag in the environment. @@ -2135,7 +2255,9 @@ LIBMDBX_API int mdbx_env_close_ex(MDBX_env *env, bool dont_sync); /** \brief The shortcut to calling \ref mdbx_env_close_ex() with * the `dont_sync=false` argument. 
* \ingroup c_opening */ -LIBMDBX_API int mdbx_env_close(MDBX_env *env); +LIBMDBX_INLINE_API(int, mdbx_env_close, (MDBX_env * env)) { + return mdbx_env_close_ex(env, false); +} /** \brief Set environment flags. * \ingroup c_settings @@ -2356,9 +2478,11 @@ LIBMDBX_API int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *fd); * means "keep current or use default". * * \param [in] shrink_threshold The shrink threshold in bytes, must be greater - * than zero to allow the database to shrink. + * than zero to allow the database to shrink and + * greater than growth_step to avoid shrinking + * right after grow. * Negative value means "keep current - * or use default". + * or use default". Default is 2*growth_step. * * \param [in] pagesize The database page size for new database * creation or -1 otherwise. Must be power of 2 @@ -2389,8 +2513,10 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, /** \deprecated Please use \ref mdbx_env_set_geometry() instead. * \ingroup c_settings */ -MDBX_DEPRECATED LIBMDBX_API int mdbx_env_set_mapsize(MDBX_env *env, - size_t size); +MDBX_DEPRECATED LIBMDBX_INLINE_API(int, mdbx_env_set_mapsize, + (MDBX_env * env, size_t size)) { + return mdbx_env_set_geometry(env, size, size, size, -1, -1, -1); +} /** \brief Find out whether to use readahead or not, based on the given database * size and the amount of available memory. \ingroup c_extra @@ -2411,13 +2537,15 @@ LIBMDBX_API int mdbx_is_readahead_reasonable(size_t volume, /** \brief Returns the minimal database page size in bytes. * \ingroup c_statinfo */ -MDBX_NOTHROW_CONST_FUNCTION __inline intptr_t mdbx_limits_pgsize_min(void) { +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_INLINE_API(intptr_t, mdbx_limits_pgsize_min, + (void)) { return MDBX_MIN_PAGESIZE; } /** \brief Returns the maximal database page size in bytes. 
* \ingroup c_statinfo */ -MDBX_NOTHROW_CONST_FUNCTION __inline intptr_t mdbx_limits_pgsize_max(void) { +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_INLINE_API(intptr_t, mdbx_limits_pgsize_max, + (void)) { return MDBX_MAX_PAGESIZE; } @@ -2574,12 +2702,13 @@ LIBMDBX_API int mdbx_env_set_userctx(MDBX_env *env, void *ctx); MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API void * mdbx_env_get_userctx(const MDBX_env *env); -/** \brief Create a transaction for use with the environment. +/** \brief Create a transaction with a user provided context pointer + * for use with the environment. * \ingroup c_transactions * * The transaction handle may be discarded using \ref mdbx_txn_abort() * or \ref mdbx_txn_commit(). - * \see mdbx_txn_begin_ex() + * \see mdbx_txn_begin() * * \note A transaction and its cursors must only be used by a single thread, * and a thread may only have a single transaction at a time. If \ref MDBX_NOTLS @@ -2587,7 +2716,8 @@ mdbx_env_get_userctx(const MDBX_env *env); * * \note Cursors may not span transactions. * - * \param [in] env An environment handle returned by \ref mdbx_env_create() + * \param [in] env An environment handle returned by \ref mdbx_env_create(). + * * \param [in] parent If this parameter is non-NULL, the new transaction will * be a nested transaction, with the transaction indicated * by parent as its parent. Transactions may be nested @@ -2595,6 +2725,7 @@ mdbx_env_get_userctx(const MDBX_env *env); * not issue any other operations than mdbx_txn_commit and * \ref mdbx_txn_abort() while it has active child * transactions. + * * \param [in] flags Special options for this transaction. This parameter * must be set to 0 or by bitwise OR'ing together one * or more of the values described here: @@ -2611,6 +2742,10 @@ mdbx_env_get_userctx(const MDBX_env *env); * * \param [out] txn Address where the new MDBX_txn handle will be stored. 
* + * \param [in] context A pointer to application context to be associated with + * created transaction and could be retrieved by + * \ref mdbx_txn_get_userctx() until transaction finished. + * * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_PANIC A fatal error occurred earlier and the @@ -2625,16 +2760,16 @@ mdbx_env_get_userctx(const MDBX_env *env); * \retval MDBX_ENOMEM Out of memory. * \retval MDBX_BUSY The write transaction is already started by the * current thread. */ -LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, - MDBX_txn_flags_t flags, MDBX_txn **txn); +LIBMDBX_API int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, + MDBX_txn_flags_t flags, MDBX_txn **txn, + void *context); -/** \brief Create a transaction with a user provided context pointer - * for use with the environment. +/** \brief Create a transaction for use with the environment. * \ingroup c_transactions * * The transaction handle may be discarded using \ref mdbx_txn_abort() * or \ref mdbx_txn_commit(). - * \see mdbx_txn_begin() + * \see mdbx_txn_begin_ex() * * \note A transaction and its cursors must only be used by a single thread, * and a thread may only have a single transaction at a time. If \ref MDBX_NOTLS @@ -2668,10 +2803,6 @@ LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, * * \param [out] txn Address where the new MDBX_txn handle will be stored. * - * \param [in] context A pointer to application context to be associated with - * created transaction and could be retrieved by - * \ref mdbx_txn_get_userctx() until transaction finished. - * * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_PANIC A fatal error occurred earlier and the @@ -2686,9 +2817,11 @@ LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, * \retval MDBX_ENOMEM Out of memory. * \retval MDBX_BUSY The write transaction is already started by the * current thread. 
*/ -LIBMDBX_API int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, - MDBX_txn_flags_t flags, MDBX_txn **txn, - void *context); +LIBMDBX_INLINE_API(int, mdbx_txn_begin, + (MDBX_env * env, MDBX_txn *parent, MDBX_txn_flags_t flags, + MDBX_txn **txn)) { + return mdbx_txn_begin_ex(env, parent, flags, txn, NULL); +} /** \brief Set application information associated with the \ref MDBX_txn. * \ingroup c_transactions @@ -2722,9 +2855,9 @@ struct MDBX_txn_info { uint64_t txn_id; /** For READ-ONLY transaction: the lag from a recent MVCC-snapshot, i.e. the - number of committed transaction since read transaction started. For WRITE - transaction (provided if `scan_rlt=true`): the lag of the oldest reader - from current transaction (i.e. at least 1 if any reader running). */ + number of committed transaction since read transaction started. + For WRITE transaction (provided if `scan_rlt=true`): the lag of the oldest + reader from current transaction (i.e. at least 1 if any reader running). */ uint64_t txn_reader_lag; /** Used space by this transaction, i.e. corresponding to the last used @@ -2748,7 +2881,8 @@ struct MDBX_txn_info { /** For READ-ONLY transaction: the space available for writer(s) and that must be exhausted for reason to call the Handle-Slow-Readers callback for - this read transaction. For WRITE transaction: the space inside transaction + this read transaction. + For WRITE transaction: the space inside transaction that left to `MDBX_TXN_FULL` error. */ uint64_t txn_space_leftover; @@ -2812,6 +2946,38 @@ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_txn_flags(const MDBX_txn *txn); MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API uint64_t mdbx_txn_id(const MDBX_txn *txn); +/** \brief Latency of commit stages in 1/65536 of seconds units. + * \warning This structure may be changed in future releases. 
+ * \see mdbx_txn_commit_ex() */ +struct MDBX_commit_latency { + /** \brief Duration of preparation (commit child transactions, update + * sub-databases records and cursors destroying). */ + uint32_t preparation; + /** \brief Duration of GC/freeDB handling & updation. */ + uint32_t gc; + /** \brief Duration of internal audit if enabled. */ + uint32_t audit; + /** \brief Duration of writing dirty/modified data pages. */ + uint32_t write; + /** \brief Duration of syncing written data to the dist/storage. */ + uint32_t sync; + /** \brief Duration of transaction ending (releasing resources). */ + uint32_t ending; + /** \brief The total duration of a commit. */ + uint32_t whole; +}; +#ifndef __cplusplus +/** \ingroup c_statinfo */ +typedef struct MDBX_commit_latency MDBX_commit_latency; +#endif + +/** \brief Commit all the operations of a transaction into the database and + * collect latency information. + * \see mdbx_txn_commit() + * \ingroup c_statinfo + * \warning This function may be changed in future releases. */ +LIBMDBX_API int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency); + /** \brief Commit all the operations of a transaction into the database. * \ingroup c_transactions * @@ -2849,7 +3015,9 @@ mdbx_txn_id(const MDBX_txn *txn); * \retval MDBX_ENOSPC No more disk space. * \retval MDBX_EIO A system-level I/O error occurred. * \retval MDBX_ENOMEM Out of memory. */ -LIBMDBX_API int mdbx_txn_commit(MDBX_txn *txn); +LIBMDBX_INLINE_API(int, mdbx_txn_commit, (MDBX_txn * txn)) { + return mdbx_txn_commit_ex(txn, NULL); +} /** \brief Abandon all the operations of the transaction instead of saving them. 
* \ingroup c_transactions @@ -3149,13 +3317,13 @@ mdbx_key_from_float(const float ieee754_32bit); MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit); -MDBX_NOTHROW_CONST_FUNCTION __inline uint64_t -mdbx_key_from_int64(const int64_t i64) { +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_INLINE_API(uint64_t, mdbx_key_from_int64, + (const int64_t i64)) { return UINT64_C(0x8000000000000000) + i64; } -MDBX_NOTHROW_CONST_FUNCTION __inline uint32_t -mdbx_key_from_int32(const int32_t i32) { +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_INLINE_API(uint32_t, mdbx_key_from_int32, + (const int32_t i32)) { return UINT32_C(0x80000000) + i32; } /** @} */ @@ -3247,7 +3415,11 @@ LIBMDBX_API int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, unsigned *state); /** \brief The shortcut to calling \ref mdbx_dbi_flags_ex() with `state=NULL` * for discarding it result. \ingroup c_statinfo */ -LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags); +LIBMDBX_INLINE_API(int, mdbx_dbi_flags, + (MDBX_txn * txn, MDBX_dbi dbi, unsigned *flags)) { + unsigned state; + return mdbx_dbi_flags_ex(txn, dbi, flags, &state); +} /** \brief Close a database handle. Normally unnecessary. * \ingroup c_dbi @@ -3711,6 +3883,18 @@ mdbx_cursor_txn(const MDBX_cursor *cursor); * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). */ LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *cursor); +/** \brief Copy cursor position and state. + * \ingroup c_cursors + * + * \param [in] src A source cursor handle returned + * by \ref mdbx_cursor_create() or \ref mdbx_cursor_open(). + * + * \param [in,out] dest A destination cursor handle returned + * by \ref mdbx_cursor_create() or \ref mdbx_cursor_open(). + * + * \returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest); + /** \brief Retrieve by cursor. 
* \ingroup c_crud * @@ -4251,7 +4435,7 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env); * \param [in] pid A pid of the reader process. * \param [in] tid A thread_id of the reader thread. * \param [in] laggard An oldest read transaction number on which stalled. - * \param [in] gap A lag from the last commited txn. + * \param [in] gap A lag from the last committed txn. * \param [in] space A space that actually become available for reuse after * this reader finished. The callback function can take * this value into account to evaluate the impact that diff --git a/libs/libmdbx/src/mdbx.h++ b/libs/libmdbx/src/mdbx.h++ index 2050f4b925..29d3f6fdca 100644 --- a/libs/libmdbx/src/mdbx.h++ +++ b/libs/libmdbx/src/mdbx.h++ @@ -208,7 +208,8 @@ using filehandle = ::mdbx_filehandle_t; (defined(__cpp_lib_filesystem) && __cpp_lib_filesystem >= 201703L && \ (!defined(__MAC_OS_X_VERSION_MIN_REQUIRED) || \ __MAC_OS_X_VERSION_MIN_REQUIRED >= 101500)) -using path = std::filesystem::path; +#define MDBX_STD_FILESYSTEM_PATH +using path = ::std::filesystem::path; #elif defined(_WIN32) || defined(_WIN64) using path = ::std::wstring; #else @@ -396,8 +397,7 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { template <size_t SIZE> MDBX_CXX14_CONSTEXPR slice(const char (&text)[SIZE]) noexcept : slice(text, SIZE - 1) { - static_assert(SIZE > 0 && text[SIZE - 1] == '\0', - "Must be a null-terminated C-string"); + MDBX_CONSTEXPR_ASSERT(SIZE > 0 && text[SIZE - 1] == '\0'); } /// \brief Create a slice that refers to c_str[0,strlen(c_str)-1]. explicit MDBX_CXX17_CONSTEXPR slice(const char *c_str); @@ -1563,14 +1563,7 @@ enum class value_mode { ///< lexicographic comparison like `std::memcmp()`. ///< In terms of keys, they are not unique, i.e. has ///< duplicates which are sorted by associated data values. 
-#if !defined(__cpp_constexpr) && !defined(DOXYGEN) - multi_reverse = uint32_t(MDBX_DUPSORT) | uint32_t(MDBX_REVERSEDUP), - multi_samelength = uint32_t(MDBX_DUPSORT) | uint32_t(MDBX_DUPFIXED), - multi_ordinal = uint32_t(MDBX_DUPSORT) | uint32_t(MDBX_DUPFIXED) | - uint32_t(MDBX_INTEGERDUP), - multi_reverse_samelength = uint32_t(MDBX_DUPSORT) | - uint32_t(MDBX_REVERSEDUP) | uint32_t(MDBX_DUPFIXED) -#else +#if CONSTEXPR_ENUM_FLAGS_OPERATIONS || defined(DOXYGEN) multi_reverse = MDBX_DUPSORT | MDBX_REVERSEDUP, ///< A more than one data value could be associated with @@ -1617,6 +1610,13 @@ enum class value_mode { ///< In terms of keys, they are not unique, i.e. has duplicates ///< which are sorted by associated data values. ///< \note Not yet implemented and PRs are welcome. +#else + multi_reverse = uint32_t(MDBX_DUPSORT) | uint32_t(MDBX_REVERSEDUP), + multi_samelength = uint32_t(MDBX_DUPSORT) | uint32_t(MDBX_DUPFIXED), + multi_ordinal = uint32_t(MDBX_DUPSORT) | uint32_t(MDBX_DUPFIXED) | + uint32_t(MDBX_INTEGERDUP), + multi_reverse_samelength = uint32_t(MDBX_DUPSORT) | + uint32_t(MDBX_REVERSEDUP) | uint32_t(MDBX_DUPFIXED) #endif }; @@ -1651,9 +1651,9 @@ struct LIBMDBX_API_TYPE map_handle { /// \brief Key-value pairs put mode. enum put_mode { - insert = MDBX_NOOVERWRITE, ///< Insert only unique keys. - upsert = MDBX_UPSERT, ///< Insert or update. - update = MDBX_CURRENT, ///< Update existing, don't insert new. + insert_unique = MDBX_NOOVERWRITE, ///< Insert only unique keys. + upsert = MDBX_UPSERT, ///< Insert or update. + update = MDBX_CURRENT, ///< Update existing, don't insert new. }; /// \brief Unmanaged database environment. @@ -1916,12 +1916,50 @@ public: /// \brief Make a copy (backup) of an existing environment to the specified /// path. 
- env ©(const path &destination, bool compactify, +#ifdef MDBX_STD_FILESYSTEM_PATH + env ©(const ::std::filesystem::path &destination, bool compactify, + bool force_dynamic_size = false); +#endif /* MDBX_STD_FILESYSTEM_PATH */ +#if defined(_WIN32) || defined(_WIN64) + env ©(const ::std::wstring &destination, bool compactify, + bool force_dynamic_size = false); +#endif /* Windows */ + env ©(const ::std::string &destination, bool compactify, bool force_dynamic_size = false); /// \brief Copy an environment to the specified file descriptor. env ©(filehandle fd, bool compactify, bool force_dynamic_size = false); + /// \brief Deletion modes for \ref remove(). + enum remove_mode { + /// \brief Just delete the environment's files and directory if any. + /// \note On POSIX systems, processes already working with the database will + /// continue to work without interference until it close the environment. + /// \note On Windows, the behavior of `just_remove` is different + /// because the system does not support deleting files that are currently + /// memory mapped. + just_remove = MDBX_ENV_JUST_DELETE, + /// \brief Make sure that the environment is not being used by other + /// processes, or return an error otherwise. + ensure_unused = MDBX_ENV_ENSURE_UNUSED, + /// \brief Wait until other processes closes the environment before + /// deletion. + wait_for_unused = MDBX_ENV_WAIT_FOR_UNUSED + }; + + /// \brief Removes the environment's files in a proper and multiprocess-safe + /// way. +#ifdef MDBX_STD_FILESYSTEM_PATH + static bool remove(const ::std::filesystem::path &, + const remove_mode mode = just_remove); +#endif /* MDBX_STD_FILESYSTEM_PATH */ +#if defined(_WIN32) || defined(_WIN64) + static bool remove(const ::std::wstring &, + const remove_mode mode = just_remove); +#endif /* Windows */ + static bool remove(const ::std::string &, + const remove_mode mode = just_remove); + /// \brief Statistics for a database in the MDBX environment. 
using stat = ::MDBX_stat; @@ -2060,7 +2098,7 @@ public: /// \ref MDBX_BAD_DBI (since the DB name is gone). inline void close_map(const map_handle &); - /// \brief Readed information + /// \brief Reader information struct reader_info { int slot; ///< The reader lock table slot number. mdbx_pid_t pid; ///< The reader process ID. @@ -2156,7 +2194,16 @@ public: MDBX_CXX11_CONSTEXPR env_managed() noexcept = default; /// \brief Open existing database. - env_managed(const path &, const operate_parameters &, bool accede = true); +#ifdef MDBX_STD_FILESYSTEM_PATH + env_managed(const ::std::filesystem::path &, const operate_parameters &, + bool accede = true); +#endif /* MDBX_STD_FILESYSTEM_PATH */ +#if defined(_WIN32) || defined(_WIN64) + env_managed(const ::std::wstring &, const operate_parameters &, + bool accede = true); +#endif /* Windows */ + env_managed(const ::std::string &, const operate_parameters &, + bool accede = true); /// \brief Additional parameters for creating a new database. struct create_parameters { @@ -2166,7 +2213,15 @@ public: }; /// \brief Create new or open existing database. - env_managed(const path &, const create_parameters &, +#ifdef MDBX_STD_FILESYSTEM_PATH + env_managed(const ::std::filesystem::path &, const create_parameters &, + const operate_parameters &, bool accede = true); +#endif /* MDBX_STD_FILESYSTEM_PATH */ +#if defined(_WIN32) || defined(_WIN64) + env_managed(const ::std::wstring &, const create_parameters &, + const operate_parameters &, bool accede = true); +#endif /* Windows */ + env_managed(const ::std::string &, const create_parameters &, const operate_parameters &, bool accede = true); /// \brief Explicitly closes the environment and release the memory map. 
@@ -2659,6 +2714,7 @@ public: LIBMDBX_API ::std::ostream &operator<<(::std::ostream &, const slice &); LIBMDBX_API ::std::ostream &operator<<(::std::ostream &, const pair &); +LIBMDBX_API ::std::ostream &operator<<(::std::ostream &, const pair_result &); template <class ALLOCATOR> inline ::std::ostream &operator<<(::std::ostream &out, const buffer<ALLOCATOR> &it) { @@ -3106,7 +3162,7 @@ inline ::mdbx::string<ALLOCATOR> slice::hex_encode(bool uppercase, const ALLOCATOR &allocator) const { ::mdbx::string<ALLOCATOR> result(allocator); if (MDBX_LIKELY(length() > 0)) { - result.reserve(to_hex_bytes()); + result.resize(to_hex_bytes()); result.resize(to_hex(const_cast<char *>(result.data()), result.capacity()) - result.data(), uppercase); @@ -3119,7 +3175,7 @@ inline ::mdbx::string<ALLOCATOR> slice::hex_decode(const ALLOCATOR &allocator) const { ::mdbx::string<ALLOCATOR> result(allocator); if (MDBX_LIKELY(length() > 0)) { - result.reserve(from_hex_bytes()); + result.resize(from_hex_bytes()); result.resize( from_hex(static_cast<byte *>( static_cast<void *>(const_cast<char *>(result.data()))), @@ -3134,7 +3190,7 @@ inline ::mdbx::string<ALLOCATOR> slice::base58_encode(const ALLOCATOR &allocator) const { ::mdbx::string<ALLOCATOR> result(allocator); if (MDBX_LIKELY(length() > 0)) { - result.reserve(to_base58_bytes()); + result.resize(to_base58_bytes()); result.resize( to_base58(const_cast<char *>(result.data()), result.capacity()) - result.data()); @@ -3147,7 +3203,7 @@ inline ::mdbx::string<ALLOCATOR> slice::base58_decode(const ALLOCATOR &allocator) const { ::mdbx::string<ALLOCATOR> result(allocator); if (MDBX_LIKELY(length() > 0)) { - result.reserve(from_base58_bytes()); + result.resize(from_base58_bytes()); result.resize( from_base58(static_cast<byte *>( static_cast<void *>(const_cast<char *>(result.data()))), @@ -3162,7 +3218,7 @@ inline ::mdbx::string<ALLOCATOR> slice::base64_encode(const ALLOCATOR &allocator) const { ::mdbx::string<ALLOCATOR> result(allocator); if 
(MDBX_LIKELY(length() > 0)) { - result.reserve(to_base64_bytes()); + result.resize(to_base64_bytes()); result.resize( to_base64(const_cast<char *>(result.data()), result.capacity()) - result.data()); @@ -3175,7 +3231,7 @@ inline ::mdbx::string<ALLOCATOR> slice::base64_decode(const ALLOCATOR &allocator) const { ::mdbx::string<ALLOCATOR> result(allocator); if (MDBX_LIKELY(length() > 0)) { - result.reserve(from_base64_bytes()); + result.resize(from_base64_bytes()); result.resize( from_base64(static_cast<byte *>( static_cast<void *>(const_cast<char *>(result.data()))), @@ -3818,14 +3874,14 @@ inline void txn::put(map_handle map, const slice &key, slice value, inline void txn::insert(map_handle map, const slice &key, slice value) { error::success_or_throw( put(map, key, &value /* takes the present value in case MDBX_KEYEXIST */, - MDBX_put_flags_t(put_mode::insert))); + MDBX_put_flags_t(put_mode::insert_unique))); } inline value_result txn::try_insert(map_handle map, const slice &key, slice value) { const int err = put(map, key, &value /* takes the present value in case MDBX_KEYEXIST */, - MDBX_put_flags_t(put_mode::insert)); + MDBX_put_flags_t(put_mode::insert_unique)); switch (err) { case MDBX_SUCCESS: return value_result{slice(), true}; @@ -3841,7 +3897,7 @@ inline slice txn::insert_reserve(map_handle map, const slice &key, slice result(nullptr, value_length); error::success_or_throw( put(map, key, &result /* takes the present value in case MDBX_KEYEXIST */, - MDBX_put_flags_t(put_mode::insert) | MDBX_RESERVE)); + MDBX_put_flags_t(put_mode::insert_unique) | MDBX_RESERVE)); return result; } @@ -3850,7 +3906,7 @@ inline value_result txn::try_insert_reserve(map_handle map, const slice &key, slice result(nullptr, value_length); const int err = put(map, key, &result /* takes the present value in case MDBX_KEYEXIST */, - MDBX_put_flags_t(put_mode::insert) | MDBX_RESERVE); + MDBX_put_flags_t(put_mode::insert_unique) | MDBX_RESERVE); switch (err) { case MDBX_SUCCESS: return 
value_result{result, true}; @@ -4287,13 +4343,13 @@ inline MDBX_error_t cursor::put(const slice &key, slice *value, inline void cursor::insert(const slice &key, slice value) { error::success_or_throw( put(key, &value /* takes the present value in case MDBX_KEYEXIST */, - MDBX_put_flags_t(put_mode::insert))); + MDBX_put_flags_t(put_mode::insert_unique))); } inline value_result cursor::try_insert(const slice &key, slice value) { const int err = put(key, &value /* takes the present value in case MDBX_KEYEXIST */, - MDBX_put_flags_t(put_mode::insert)); + MDBX_put_flags_t(put_mode::insert_unique)); switch (err) { case MDBX_SUCCESS: return value_result{slice(), true}; @@ -4308,7 +4364,7 @@ inline slice cursor::insert_reserve(const slice &key, size_t value_length) { slice result(nullptr, value_length); error::success_or_throw( put(key, &result /* takes the present value in case MDBX_KEYEXIST */, - MDBX_put_flags_t(put_mode::insert) | MDBX_RESERVE)); + MDBX_put_flags_t(put_mode::insert_unique) | MDBX_RESERVE)); return result; } @@ -4317,7 +4373,7 @@ inline value_result cursor::try_insert_reserve(const slice &key, slice result(nullptr, value_length); const int err = put(key, &result /* takes the present value in case MDBX_KEYEXIST */, - MDBX_put_flags_t(put_mode::insert) | MDBX_RESERVE); + MDBX_put_flags_t(put_mode::insert_unique) | MDBX_RESERVE); switch (err) { case MDBX_SUCCESS: return value_result{result, true}; diff --git a/libs/libmdbx/src/mdbx_chk.c b/libs/libmdbx/src/mdbx_chk.c index 120e4b4048..194bde6f9a 100644 --- a/libs/libmdbx/src/mdbx_chk.c +++ b/libs/libmdbx/src/mdbx_chk.c @@ -34,11 +34,16 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. 
*/ -#define MDBX_BUILD_SOURCERY 3b5677a6062b714f1e138b0066c5590ee3c9ebf3bf8cfa3bb9503515ea0d1f02_v0_9_1_18_g1d31ebdc1c +#define MDBX_BUILD_SOURCERY 47492323531afee427a3de6ddaeae26eed45bfd1b52d92fd121a5a13a9747dbb_v0_9_2_0_g092ab09 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif +#define LIBMDBX_INTERNALS +#ifdef MDBX_TOOLS +#define MDBX_DEPRECATED +#endif /* MDBX_TOOLS */ + /* *INDENT-OFF* */ /* clang-format off */ @@ -119,11 +124,6 @@ #pragma warning(disable : 4505) /* unreferenced local function has been removed */ #endif /* _MSC_VER (warnings) */ -#if defined(MDBX_TOOLS) -#undef MDBX_DEPRECATED -#define MDBX_DEPRECATED -#endif /* MDBX_TOOLS */ - #include "mdbx.h" /* * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> @@ -839,7 +839,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) #ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +/* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ #if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ @@ -1015,6 +1015,35 @@ typedef union MDBX_srwlock { #ifdef __cplusplus extern void mdbx_osal_jitter(bool tiny); #else + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#include <cstdatomic> +#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include <stdatomic.h> +#elif defined(__GNUC__) || defined(__clang__) +/* LY: nothing required */ +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' 
*/ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include <libkern/OSAtomic.h> +#else +#error FIXME atomic-ops +#endif + /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ @@ -1056,8 +1085,8 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) || __has_extension(cxx_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__ATOMIC_SEQ_CST) __atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__clang__) || defined(__GNUC__) @@ -1110,8 +1139,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #if defined(__linux__) || defined(__gnu_linux__) MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; -MDBX_INTERNAL_VAR bool - mdbx_RunningOnWSL /* Windows Subsystem for Linux is mad and trouble-full */; +MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef mdbx_strdup @@ -1182,7 +1210,8 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4 + MDBX_OPEN_COPY = 4, + MDBX_OPEN_DELETE = 5 }; MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, @@ -1191,7 +1220,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, mdbx_mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); 
MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 @@ -1451,32 +1482,6 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; #endif /* Windows */ -/*----------------------------------------------------------------------------*/ -/* Atomics */ - -#if !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) && \ - (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ - !(defined(__GNUC__) || defined(__clang__))) -#include <stdatomic.h> -#elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ -#elif defined(_MSC_VER) -#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ -#pragma warning(disable : 4133) /* 'function': incompatible types - from \ - 'size_t' to 'LONGLONG' */ -#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ - 'std::size_t', possible loss of data */ -#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ - 'long', possible loss of data */ -#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) -#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) -#elif defined(__APPLE__) -#include <libkern/OSAtomic.h> -#else -#error FIXME atomic-ops -#endif - #endif /* !__cplusplus */ /*----------------------------------------------------------------------------*/ @@ -1912,7 +1917,7 @@ typedef struct MDBX_db { pgno_t md_overflow_pages; /* number of overflow pages */ uint64_t md_seq; /* table sequence counter */ uint64_t md_entries; /* number of data items */ - uint64_t md_mod_txnid; /* txnid of last commited modification */ + uint64_t md_mod_txnid; /* txnid of last committed modification */ } MDBX_db; /* database size-related parameters */ 
@@ -1996,7 +2001,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ + uint64_t mp_txnid; /* txnid that committed this page */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -2244,7 +2249,7 @@ typedef struct MDBX_lockinfo { #if defined(_WIN32) || defined(_WIN64) #define MAX_MAPSIZE32 UINT32_C(0x38000000) #else -#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#define MAX_MAPSIZE32 UINT32_C(0x7f000000) #endif #define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) @@ -2403,8 +2408,6 @@ struct MDBX_txn { MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; - /* In write txns, array of cursors for each DB */ - MDBX_cursor **mt_cursors; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2431,6 +2434,8 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + /* In write txns, array of cursors for each DB */ + MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ pgno_t loose_refund_wl /* FIXME: describe */; @@ -2568,7 +2573,7 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck - unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ @@ -2578,7 +2583,7 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_path; /* path to the DB files */ + char *me_pathname; /* path to the DB files 
*/ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ @@ -2854,7 +2859,7 @@ static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) /* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_LMDB_ERRCODE + 10) +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ diff --git a/libs/libmdbx/src/mdbx_copy.c b/libs/libmdbx/src/mdbx_copy.c index 71f6a05527..eb3c49d1d9 100644 --- a/libs/libmdbx/src/mdbx_copy.c +++ b/libs/libmdbx/src/mdbx_copy.c @@ -34,11 +34,16 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY 3b5677a6062b714f1e138b0066c5590ee3c9ebf3bf8cfa3bb9503515ea0d1f02_v0_9_1_18_g1d31ebdc1c +#define MDBX_BUILD_SOURCERY 47492323531afee427a3de6ddaeae26eed45bfd1b52d92fd121a5a13a9747dbb_v0_9_2_0_g092ab09 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif +#define LIBMDBX_INTERNALS +#ifdef MDBX_TOOLS +#define MDBX_DEPRECATED +#endif /* MDBX_TOOLS */ + /* *INDENT-OFF* */ /* clang-format off */ @@ -119,11 +124,6 @@ #pragma warning(disable : 4505) /* unreferenced local function has been removed */ #endif /* _MSC_VER (warnings) */ -#if defined(MDBX_TOOLS) -#undef MDBX_DEPRECATED -#define MDBX_DEPRECATED -#endif /* MDBX_TOOLS */ - #include "mdbx.h" /* * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> @@ -839,7 +839,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) #ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +/* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ #if !defined(__amd64__) && (defined(__x86_64) 
|| defined(__x86_64__) || \ @@ -1015,6 +1015,35 @@ typedef union MDBX_srwlock { #ifdef __cplusplus extern void mdbx_osal_jitter(bool tiny); #else + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#include <cstdatomic> +#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include <stdatomic.h> +#elif defined(__GNUC__) || defined(__clang__) +/* LY: nothing required */ +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' */ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include <libkern/OSAtomic.h> +#else +#error FIXME atomic-ops +#endif + /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ @@ -1056,8 +1085,8 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) || __has_extension(cxx_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__ATOMIC_SEQ_CST) __atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__clang__) || defined(__GNUC__) @@ -1110,8 +1139,7 @@ 
MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #if defined(__linux__) || defined(__gnu_linux__) MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; -MDBX_INTERNAL_VAR bool - mdbx_RunningOnWSL /* Windows Subsystem for Linux is mad and trouble-full */; +MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef mdbx_strdup @@ -1182,7 +1210,8 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4 + MDBX_OPEN_COPY = 4, + MDBX_OPEN_DELETE = 5 }; MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, @@ -1191,7 +1220,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, mdbx_mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 @@ -1451,32 +1482,6 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; #endif /* Windows */ -/*----------------------------------------------------------------------------*/ -/* Atomics */ - -#if !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) && \ - (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ - !(defined(__GNUC__) || defined(__clang__))) -#include <stdatomic.h> -#elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ -#elif defined(_MSC_VER) -#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ -#pragma warning(disable : 4133) /* 'function': incompatible types - from \ - 'size_t' to 'LONGLONG' */ -#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ - 'std::size_t', possible loss 
of data */ -#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ - 'long', possible loss of data */ -#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) -#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) -#elif defined(__APPLE__) -#include <libkern/OSAtomic.h> -#else -#error FIXME atomic-ops -#endif - #endif /* !__cplusplus */ /*----------------------------------------------------------------------------*/ @@ -1912,7 +1917,7 @@ typedef struct MDBX_db { pgno_t md_overflow_pages; /* number of overflow pages */ uint64_t md_seq; /* table sequence counter */ uint64_t md_entries; /* number of data items */ - uint64_t md_mod_txnid; /* txnid of last commited modification */ + uint64_t md_mod_txnid; /* txnid of last committed modification */ } MDBX_db; /* database size-related parameters */ @@ -1996,7 +2001,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ + uint64_t mp_txnid; /* txnid that committed this page */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -2244,7 +2249,7 @@ typedef struct MDBX_lockinfo { #if defined(_WIN32) || defined(_WIN64) #define MAX_MAPSIZE32 UINT32_C(0x38000000) #else -#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#define MAX_MAPSIZE32 UINT32_C(0x7f000000) #endif #define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) @@ -2403,8 +2408,6 @@ struct MDBX_txn { MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; - /* In write txns, array of cursors for each DB */ - MDBX_cursor **mt_cursors; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2431,6 +2434,8 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + /* In write txns, array of cursors for each DB */ + MDBX_cursor 
**cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ pgno_t loose_refund_wl /* FIXME: describe */; @@ -2568,7 +2573,7 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck - unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ @@ -2578,7 +2583,7 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_path; /* path to the DB files */ + char *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ @@ -2854,7 +2859,7 @@ static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) /* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_LMDB_ERRCODE + 10) +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ diff --git a/libs/libmdbx/src/mdbx_dump.c b/libs/libmdbx/src/mdbx_dump.c index f338498c7c..96cf69c47a 100644 --- a/libs/libmdbx/src/mdbx_dump.c +++ b/libs/libmdbx/src/mdbx_dump.c @@ -34,11 +34,16 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. 
*/ -#define MDBX_BUILD_SOURCERY 3b5677a6062b714f1e138b0066c5590ee3c9ebf3bf8cfa3bb9503515ea0d1f02_v0_9_1_18_g1d31ebdc1c +#define MDBX_BUILD_SOURCERY 47492323531afee427a3de6ddaeae26eed45bfd1b52d92fd121a5a13a9747dbb_v0_9_2_0_g092ab09 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif +#define LIBMDBX_INTERNALS +#ifdef MDBX_TOOLS +#define MDBX_DEPRECATED +#endif /* MDBX_TOOLS */ + /* *INDENT-OFF* */ /* clang-format off */ @@ -119,11 +124,6 @@ #pragma warning(disable : 4505) /* unreferenced local function has been removed */ #endif /* _MSC_VER (warnings) */ -#if defined(MDBX_TOOLS) -#undef MDBX_DEPRECATED -#define MDBX_DEPRECATED -#endif /* MDBX_TOOLS */ - #include "mdbx.h" /* * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> @@ -839,7 +839,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) #ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +/* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ #if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ @@ -1015,6 +1015,35 @@ typedef union MDBX_srwlock { #ifdef __cplusplus extern void mdbx_osal_jitter(bool tiny); #else + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#include <cstdatomic> +#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include <stdatomic.h> +#elif defined(__GNUC__) || defined(__clang__) +/* LY: nothing required */ +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' 
*/ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include <libkern/OSAtomic.h> +#else +#error FIXME atomic-ops +#endif + /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ @@ -1056,8 +1085,8 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) || __has_extension(cxx_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__ATOMIC_SEQ_CST) __atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__clang__) || defined(__GNUC__) @@ -1110,8 +1139,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #if defined(__linux__) || defined(__gnu_linux__) MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; -MDBX_INTERNAL_VAR bool - mdbx_RunningOnWSL /* Windows Subsystem for Linux is mad and trouble-full */; +MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef mdbx_strdup @@ -1182,7 +1210,8 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4 + MDBX_OPEN_COPY = 4, + MDBX_OPEN_DELETE = 5 }; MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, @@ -1191,7 +1220,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, mdbx_mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); 
MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 @@ -1451,32 +1482,6 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; #endif /* Windows */ -/*----------------------------------------------------------------------------*/ -/* Atomics */ - -#if !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) && \ - (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ - !(defined(__GNUC__) || defined(__clang__))) -#include <stdatomic.h> -#elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ -#elif defined(_MSC_VER) -#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ -#pragma warning(disable : 4133) /* 'function': incompatible types - from \ - 'size_t' to 'LONGLONG' */ -#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ - 'std::size_t', possible loss of data */ -#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ - 'long', possible loss of data */ -#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) -#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) -#elif defined(__APPLE__) -#include <libkern/OSAtomic.h> -#else -#error FIXME atomic-ops -#endif - #endif /* !__cplusplus */ /*----------------------------------------------------------------------------*/ @@ -1912,7 +1917,7 @@ typedef struct MDBX_db { pgno_t md_overflow_pages; /* number of overflow pages */ uint64_t md_seq; /* table sequence counter */ uint64_t md_entries; /* number of data items */ - uint64_t md_mod_txnid; /* txnid of last commited modification */ + uint64_t md_mod_txnid; /* txnid of last committed modification */ } MDBX_db; /* database size-related parameters */ 
@@ -1996,7 +2001,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ + uint64_t mp_txnid; /* txnid that committed this page */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -2244,7 +2249,7 @@ typedef struct MDBX_lockinfo { #if defined(_WIN32) || defined(_WIN64) #define MAX_MAPSIZE32 UINT32_C(0x38000000) #else -#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#define MAX_MAPSIZE32 UINT32_C(0x7f000000) #endif #define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) @@ -2403,8 +2408,6 @@ struct MDBX_txn { MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; - /* In write txns, array of cursors for each DB */ - MDBX_cursor **mt_cursors; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2431,6 +2434,8 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + /* In write txns, array of cursors for each DB */ + MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ pgno_t loose_refund_wl /* FIXME: describe */; @@ -2568,7 +2573,7 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck - unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ @@ -2578,7 +2583,7 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_path; /* path to the DB files */ + char *me_pathname; /* path to the DB files 
*/ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ @@ -2854,7 +2859,7 @@ static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) /* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_LMDB_ERRCODE + 10) +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ @@ -3320,6 +3325,8 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) { rc = MDBX_SUCCESS; if (unlikely(rc != MDBX_SUCCESS)) error("mdbx_cursor_get", rc); + + mdbx_cursor_close(cursor); return rc; } @@ -3354,7 +3361,7 @@ int main(int argc, char *argv[]) { MDBX_dbi dbi; prog = argv[0]; char *envname; - char *subname = nullptr; + char *subname = nullptr, *buf4free = nullptr; unsigned envflags = 0; bool alldbs = false, list = false; @@ -3500,7 +3507,13 @@ int main(int argc, char *argv[]) { if (memchr(key.iov_base, '\0', key.iov_len)) continue; - subname = mdbx_malloc(key.iov_len + 1); + subname = mdbx_realloc(buf4free, key.iov_len + 1); + if (!subname) { + rc = MDBX_ENOMEM; + break; + } + + buf4free = subname; memcpy(subname, key.iov_base, key.iov_len); subname[key.iov_len] = '\0'; @@ -3553,7 +3566,6 @@ int main(int argc, char *argv[]) { break; } } - mdbx_free(subname); } mdbx_cursor_close(cursor); cursor = nullptr; @@ -3587,6 +3599,7 @@ txn_abort: mdbx_txn_abort(txn); env_close: mdbx_env_close(env); + free(buf4free); return rc ? EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/libs/libmdbx/src/mdbx_load.c b/libs/libmdbx/src/mdbx_load.c index 6832e0f73a..3a49a60ec7 100644 --- a/libs/libmdbx/src/mdbx_load.c +++ b/libs/libmdbx/src/mdbx_load.c @@ -34,11 +34,16 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. 
*/ -#define MDBX_BUILD_SOURCERY 3b5677a6062b714f1e138b0066c5590ee3c9ebf3bf8cfa3bb9503515ea0d1f02_v0_9_1_18_g1d31ebdc1c +#define MDBX_BUILD_SOURCERY 47492323531afee427a3de6ddaeae26eed45bfd1b52d92fd121a5a13a9747dbb_v0_9_2_0_g092ab09 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif +#define LIBMDBX_INTERNALS +#ifdef MDBX_TOOLS +#define MDBX_DEPRECATED +#endif /* MDBX_TOOLS */ + /* *INDENT-OFF* */ /* clang-format off */ @@ -119,11 +124,6 @@ #pragma warning(disable : 4505) /* unreferenced local function has been removed */ #endif /* _MSC_VER (warnings) */ -#if defined(MDBX_TOOLS) -#undef MDBX_DEPRECATED -#define MDBX_DEPRECATED -#endif /* MDBX_TOOLS */ - #include "mdbx.h" /* * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> @@ -839,7 +839,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) #ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +/* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ #if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ @@ -1015,6 +1015,35 @@ typedef union MDBX_srwlock { #ifdef __cplusplus extern void mdbx_osal_jitter(bool tiny); #else + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#include <cstdatomic> +#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include <stdatomic.h> +#elif defined(__GNUC__) || defined(__clang__) +/* LY: nothing required */ +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' 
*/ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include <libkern/OSAtomic.h> +#else +#error FIXME atomic-ops +#endif + /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ @@ -1056,8 +1085,8 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) || __has_extension(cxx_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__ATOMIC_SEQ_CST) __atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__clang__) || defined(__GNUC__) @@ -1110,8 +1139,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #if defined(__linux__) || defined(__gnu_linux__) MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; -MDBX_INTERNAL_VAR bool - mdbx_RunningOnWSL /* Windows Subsystem for Linux is mad and trouble-full */; +MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef mdbx_strdup @@ -1182,7 +1210,8 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4 + MDBX_OPEN_COPY = 4, + MDBX_OPEN_DELETE = 5 }; MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, @@ -1191,7 +1220,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, mdbx_mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); 
MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 @@ -1451,32 +1482,6 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; #endif /* Windows */ -/*----------------------------------------------------------------------------*/ -/* Atomics */ - -#if !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) && \ - (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ - !(defined(__GNUC__) || defined(__clang__))) -#include <stdatomic.h> -#elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ -#elif defined(_MSC_VER) -#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ -#pragma warning(disable : 4133) /* 'function': incompatible types - from \ - 'size_t' to 'LONGLONG' */ -#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ - 'std::size_t', possible loss of data */ -#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ - 'long', possible loss of data */ -#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) -#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) -#elif defined(__APPLE__) -#include <libkern/OSAtomic.h> -#else -#error FIXME atomic-ops -#endif - #endif /* !__cplusplus */ /*----------------------------------------------------------------------------*/ @@ -1912,7 +1917,7 @@ typedef struct MDBX_db { pgno_t md_overflow_pages; /* number of overflow pages */ uint64_t md_seq; /* table sequence counter */ uint64_t md_entries; /* number of data items */ - uint64_t md_mod_txnid; /* txnid of last commited modification */ + uint64_t md_mod_txnid; /* txnid of last committed modification */ } MDBX_db; /* database size-related parameters */ 
@@ -1996,7 +2001,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ + uint64_t mp_txnid; /* txnid that committed this page */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -2244,7 +2249,7 @@ typedef struct MDBX_lockinfo { #if defined(_WIN32) || defined(_WIN64) #define MAX_MAPSIZE32 UINT32_C(0x38000000) #else -#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#define MAX_MAPSIZE32 UINT32_C(0x7f000000) #endif #define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) @@ -2403,8 +2408,6 @@ struct MDBX_txn { MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; - /* In write txns, array of cursors for each DB */ - MDBX_cursor **mt_cursors; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2431,6 +2434,8 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + /* In write txns, array of cursors for each DB */ + MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ pgno_t loose_refund_wl /* FIXME: describe */; @@ -2568,7 +2573,7 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck - unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ @@ -2578,7 +2583,7 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_path; /* path to the DB files */ + char *me_pathname; /* path to the DB files 
*/ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ @@ -2854,7 +2859,7 @@ static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) /* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_LMDB_ERRCODE + 10) +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ @@ -3229,7 +3234,6 @@ static MDBX_envinfo envinfo; static int mode = GLOBAL; static MDBX_val kbuf, dbuf; -static MDBX_val k0buf; #define STRLENOF(s) (sizeof(s) - 1) @@ -3561,7 +3565,7 @@ static int readline(MDBX_val *out, MDBX_val *buf) { static void usage(void) { fprintf(stderr, "usage: %s [-V] [-q] [-a] [-f file] [-s name] [-N] [-T] [-r] [-n]" - "dbpath\n" + " dbpath\n" " -V\t\tprint version and exit\n" " -q\t\tbe quiet\n" " -a\t\tappend records in input order (required for custom " @@ -3592,11 +3596,9 @@ int main(int argc, char *argv[]) { MDBX_cursor *mc = nullptr; MDBX_dbi dbi; char *envname = nullptr; - int envflags = MDBX_UTTERLY_NOSYNC, putflags = 0; - bool append = false; + int envflags = MDBX_UTTERLY_NOSYNC, putflags = MDBX_UPSERT; bool quiet = false; bool rescue = false; - MDBX_val prevk; prog = argv[0]; if (argc < 2) @@ -3619,7 +3621,7 @@ int main(int argc, char *argv[]) { mdbx_build.options); return EXIT_SUCCESS; case 'a': - append = true; + putflags |= MDBX_APPEND; break; case 'f': if (freopen(optarg, "r", stdin) == nullptr) { @@ -3635,7 +3637,7 @@ int main(int argc, char *argv[]) { subname = mdbx_strdup(optarg); break; case 'N': - putflags = MDBX_NOOVERWRITE | MDBX_NODUPDATA; + putflags |= MDBX_NOOVERWRITE | MDBX_NODUPDATA; break; case 'T': mode |= NOHDR | PRINT; @@ -3676,6 +3678,11 @@ int main(int argc, char *argv[]) { dbuf.iov_len = 4096; dbuf.iov_base = mdbx_malloc(dbuf.iov_len); + if (!dbuf.iov_base) 
{ + rc = MDBX_ENOMEM; + error("value-buffer", rc); + goto env_close; + } /* read first header for mapsize= */ if (!(mode & NOHDR)) { @@ -3703,7 +3710,7 @@ int main(int argc, char *argv[]) { } } - if (envinfo.mi_mapsize) { + if (envinfo.mi_geo.current | envinfo.mi_mapsize) { if (envinfo.mi_geo.current) { rc = mdbx_env_set_geometry( env, (intptr_t)envinfo.mi_geo.lower, (intptr_t)envinfo.mi_geo.current, @@ -3736,17 +3743,19 @@ int main(int argc, char *argv[]) { goto env_close; } - kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, MDBX_DUPSORT); - if (kbuf.iov_len >= INTPTR_MAX / 4) { + kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, 0) + 1; + if (kbuf.iov_len >= INTPTR_MAX / 2) { fprintf(stderr, "mdbx_env_get_maxkeysize() failed, returns %zu\n", kbuf.iov_len); goto env_close; } - kbuf.iov_len = (kbuf.iov_len + 1) * 2; - kbuf.iov_base = malloc(kbuf.iov_len * 2); - k0buf.iov_len = kbuf.iov_len; - k0buf.iov_base = (char *)kbuf.iov_base + kbuf.iov_len; - prevk.iov_base = k0buf.iov_base; + + kbuf.iov_base = malloc(kbuf.iov_len); + if (!kbuf.iov_base) { + rc = MDBX_ENOMEM; + error("key-buffer", rc); + goto env_close; + } while (rc == MDBX_SUCCESS) { if (user_break) { @@ -3772,9 +3781,10 @@ int main(int argc, char *argv[]) { } const char *const dbi_name = subname ? subname : "@MAIN"; - rc = mdbx_dbi_open_ex(txn, subname, dbi_flags | MDBX_CREATE, &dbi, - append ? equal_or_greater : nullptr, - append ? equal_or_greater : nullptr); + rc = + mdbx_dbi_open_ex(txn, subname, dbi_flags | MDBX_CREATE, &dbi, + (putflags & MDBX_APPEND) ? equal_or_greater : nullptr, + (putflags & MDBX_APPEND) ? 
equal_or_greater : nullptr); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_dbi_open_ex", rc); goto txn_abort; @@ -3789,7 +3799,7 @@ int main(int argc, char *argv[]) { if (present_sequence > sequence) { fprintf(stderr, "present sequence for '%s' value (%" PRIu64 - ") is greated than loaded (%" PRIu64 ")\n", + ") is greater than loaded (%" PRIu64 ")\n", dbi_name, present_sequence, sequence); rc = MDBX_RESULT_TRUE; goto txn_abort; @@ -3802,19 +3812,17 @@ int main(int argc, char *argv[]) { } } + if (putflags & MDBX_APPEND) + putflags = (dbi_flags & MDBX_DUPSORT) ? putflags | MDBX_APPENDDUP + : putflags & ~MDBX_APPENDDUP; + rc = mdbx_cursor_open(txn, dbi, &mc); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_cursor_open", rc); goto txn_abort; } - /* if (append) { - mc->mc_flags |= C_SKIPORD; - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; - } */ int batch = 0; - prevk.iov_len = 0; while (rc == MDBX_SUCCESS) { MDBX_val key, data; rc = readline(&key, &kbuf); @@ -3829,18 +3837,7 @@ int main(int argc, char *argv[]) { goto txn_abort; } - int appflag = 0; - if (append) { - appflag = MDBX_APPEND; - if (dbi_flags & MDBX_DUPSORT) { - if (prevk.iov_len == key.iov_len && - memcmp(prevk.iov_base, key.iov_base, key.iov_len) == 0) - appflag = MDBX_APPEND | MDBX_APPENDDUP; - else - memcpy(prevk.iov_base, key.iov_base, prevk.iov_len = key.iov_len); - } - } - rc = mdbx_cursor_put(mc, &key, &data, putflags | appflag); + rc = mdbx_cursor_put(mc, &key, &data, putflags); if (rc == MDBX_KEYEXIST && putflags) continue; if (rc == MDBX_BAD_VALSIZE && rescue) { @@ -3861,9 +3858,7 @@ int main(int argc, char *argv[]) { goto txn_abort; } - if (batch == 10000 || txn_info.txn_space_dirty > MEGABYTE * 16) { - mdbx_cursor_close(mc); - mc = nullptr; + if (batch == 10000 || txn_info.txn_space_dirty > MEGABYTE * 256) { rc = mdbx_txn_commit(txn); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_txn_commit", rc); @@ -3876,16 +3871,11 @@ int main(int argc, char *argv[]) { 
error("mdbx_txn_begin", rc); goto env_close; } - rc = mdbx_cursor_open(txn, dbi, &mc); + rc = mdbx_cursor_bind(txn, mc, dbi); if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_cursor_open", rc); + error("mdbx_cursor_bind", rc); goto txn_abort; } - /* if (append) { - mc->mc_flags |= C_SKIPORD; - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; - } */ } } @@ -3897,15 +3887,22 @@ int main(int argc, char *argv[]) { error("mdbx_txn_commit", rc); goto env_close; } - rc = mdbx_dbi_close(env, dbi); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_dbi_close", rc); - goto env_close; + if (subname) { + assert(dbi != MAIN_DBI); + rc = mdbx_dbi_close(env, dbi); + if (unlikely(rc != MDBX_SUCCESS)) { + error("mdbx_dbi_close", rc); + goto env_close; + } + } else { + assert(dbi == MAIN_DBI); } /* try read next header */ if (!(mode & NOHDR)) rc = readhdr(); + else if (ferror(stdin) || feof(stdin)) + break; } switch (rc) { @@ -3926,6 +3923,8 @@ txn_abort: mdbx_txn_abort(txn); env_close: mdbx_env_close(env); + free(kbuf.iov_base); + free(dbuf.iov_base); return rc ? EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/libs/libmdbx/src/mdbx_stat.c b/libs/libmdbx/src/mdbx_stat.c index 37ebc55a00..ce47bd75fd 100644 --- a/libs/libmdbx/src/mdbx_stat.c +++ b/libs/libmdbx/src/mdbx_stat.c @@ -34,11 +34,16 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. 
*/ -#define MDBX_BUILD_SOURCERY 3b5677a6062b714f1e138b0066c5590ee3c9ebf3bf8cfa3bb9503515ea0d1f02_v0_9_1_18_g1d31ebdc1c +#define MDBX_BUILD_SOURCERY 47492323531afee427a3de6ddaeae26eed45bfd1b52d92fd121a5a13a9747dbb_v0_9_2_0_g092ab09 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif +#define LIBMDBX_INTERNALS +#ifdef MDBX_TOOLS +#define MDBX_DEPRECATED +#endif /* MDBX_TOOLS */ + /* *INDENT-OFF* */ /* clang-format off */ @@ -119,11 +124,6 @@ #pragma warning(disable : 4505) /* unreferenced local function has been removed */ #endif /* _MSC_VER (warnings) */ -#if defined(MDBX_TOOLS) -#undef MDBX_DEPRECATED -#define MDBX_DEPRECATED -#endif /* MDBX_TOOLS */ - #include "mdbx.h" /* * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> @@ -839,7 +839,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) #ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ +/* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ #if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ @@ -1015,6 +1015,35 @@ typedef union MDBX_srwlock { #ifdef __cplusplus extern void mdbx_osal_jitter(bool tiny); #else + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#include <cstdatomic> +#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include <stdatomic.h> +#elif defined(__GNUC__) || defined(__clang__) +/* LY: nothing required */ +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' 
*/ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include <libkern/OSAtomic.h> +#else +#error FIXME atomic-ops +#endif + /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ @@ -1056,8 +1085,8 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) || __has_extension(cxx_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__ATOMIC_SEQ_CST) __atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__clang__) || defined(__GNUC__) @@ -1110,8 +1139,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #if defined(__linux__) || defined(__gnu_linux__) MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; -MDBX_INTERNAL_VAR bool - mdbx_RunningOnWSL /* Windows Subsystem for Linux is mad and trouble-full */; +MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef mdbx_strdup @@ -1182,7 +1210,8 @@ enum mdbx_openfile_purpose { MDBX_OPEN_DXB_LAZY = 1, MDBX_OPEN_DXB_DSYNC = 2, MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4 + MDBX_OPEN_COPY = 4, + MDBX_OPEN_DELETE = 5 }; MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, @@ -1191,7 +1220,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, mdbx_mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); 
MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 @@ -1451,32 +1482,6 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; #endif /* Windows */ -/*----------------------------------------------------------------------------*/ -/* Atomics */ - -#if !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ - !defined(__STDC_NO_ATOMICS__) && \ - (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ - !(defined(__GNUC__) || defined(__clang__))) -#include <stdatomic.h> -#elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ -#elif defined(_MSC_VER) -#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ -#pragma warning(disable : 4133) /* 'function': incompatible types - from \ - 'size_t' to 'LONGLONG' */ -#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ - 'std::size_t', possible loss of data */ -#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ - 'long', possible loss of data */ -#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) -#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) -#elif defined(__APPLE__) -#include <libkern/OSAtomic.h> -#else -#error FIXME atomic-ops -#endif - #endif /* !__cplusplus */ /*----------------------------------------------------------------------------*/ @@ -1912,7 +1917,7 @@ typedef struct MDBX_db { pgno_t md_overflow_pages; /* number of overflow pages */ uint64_t md_seq; /* table sequence counter */ uint64_t md_entries; /* number of data items */ - uint64_t md_mod_txnid; /* txnid of last commited modification */ + uint64_t md_mod_txnid; /* txnid of last committed modification */ } MDBX_db; /* database size-related parameters */ 
@@ -1996,7 +2001,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ + uint64_t mp_txnid; /* txnid that committed this page */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -2244,7 +2249,7 @@ typedef struct MDBX_lockinfo { #if defined(_WIN32) || defined(_WIN64) #define MAX_MAPSIZE32 UINT32_C(0x38000000) #else -#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#define MAX_MAPSIZE32 UINT32_C(0x7f000000) #endif #define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) @@ -2403,8 +2408,6 @@ struct MDBX_txn { MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; - /* In write txns, array of cursors for each DB */ - MDBX_cursor **mt_cursors; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -2431,6 +2434,8 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + /* In write txns, array of cursors for each DB */ + MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ pgno_t loose_refund_wl /* FIXME: describe */; @@ -2568,7 +2573,7 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck - unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ @@ -2578,7 +2583,7 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_path; /* path to the DB files */ + char *me_pathname; /* path to the DB files 
*/ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ @@ -2854,7 +2859,7 @@ static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) /* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_LMDB_ERRCODE + 10) +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) /* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ |