diff options
author | George Hazan <ghazan@miranda.im> | 2021-01-23 12:25:21 +0300 |
---|---|---|
committer | George Hazan <ghazan@miranda.im> | 2021-01-23 12:25:28 +0300 |
commit | 19dfb93282192821ad6d555fc80bcd9732f0b0dc (patch) | |
tree | d75fd4ab4719d22e4afb0844d31ff1ceadd9f356 /libs/libmdbx | |
parent | 114662b245aac5d6c15db73d7dd75dfc429dc88a (diff) |
libmdbx: upgrade to 0.9.2.110
Diffstat (limited to 'libs/libmdbx')
-rw-r--r-- | libs/libmdbx/src/CMakeLists.txt | 1 | ||||
-rw-r--r-- | libs/libmdbx/src/ChangeLog.md | 36 | ||||
-rw-r--r-- | libs/libmdbx/src/GNUmakefile | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/VERSION | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/cmake/compiler.cmake | 12 | ||||
-rw-r--r-- | libs/libmdbx/src/config.h.in | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.c | 3224 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.c++ | 130 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.h | 229 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_chk.c | 150 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_copy.c | 130 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_dump.c | 141 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_load.c | 165 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_stat.c | 139 |
14 files changed, 2836 insertions, 1527 deletions
diff --git a/libs/libmdbx/src/CMakeLists.txt b/libs/libmdbx/src/CMakeLists.txt index 789fc4685a..ff21fb5e0a 100644 --- a/libs/libmdbx/src/CMakeLists.txt +++ b/libs/libmdbx/src/CMakeLists.txt @@ -438,7 +438,6 @@ add_mdbx_option(MDBX_BUILD_TOOLS "Build MDBX tools (mdbx_chk/stat/dump/load/copy CMAKE_DEPENDENT_OPTION(MDBX_INSTALL_MANPAGES "Install man-pages for MDBX tools (mdbx_chk/stat/dump/load/copy)" ON MDBX_BUILD_TOOLS OFF) add_mdbx_option(MDBX_TXN_CHECKOWNER "Checking transaction matches the calling thread inside libmdbx's API" ON) add_mdbx_option(MDBX_ENV_CHECKPID "Paranoid checking PID inside libmdbx's API" AUTO) -add_mdbx_option(MDBX_HUGE_TRANSACTIONS "Support for huge write-transactions" OFF) mark_as_advanced(MDBX_ENV_CHECKPID) if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") add_mdbx_option(MDBX_DISABLE_GNU_SOURCE "Don't use GNU/Linux libc extensions" OFF) diff --git a/libs/libmdbx/src/ChangeLog.md b/libs/libmdbx/src/ChangeLog.md index 66c5de3705..1d79b89061 100644 --- a/libs/libmdbx/src/ChangeLog.md +++ b/libs/libmdbx/src/ChangeLog.md @@ -1,7 +1,7 @@ ChangeLog --------- -## v0.9.3 (in development) +## v0.9.3 (in development) scheduled at 2021-02-23 TODO: - Engage new terminology (https://github.com/erthink/libmdbx/issues/137). @@ -13,18 +13,48 @@ TODO: - Packages for ROSA Linux, ALT Linux, Fedora/RHEL, Debian/Ubuntu. Acknowledgements: + - Mahlon E. Smith (http://www.martini.nu/) for FreeBSD port of libmdbx. - 장세연 (http://www.castis.com) for bug fixing and PR. -Added features: +Removed options and features: + + - Drop `MDBX_HUGE_TRANSACTIONS` build-option (now no longer required). + +New features: + - Package for FreeBSD is available now by Mahlon E. Smith. + - New API functions to get/set various options (https://github.com/erthink/libmdbx/issues/128). + - the maximum number of named databases for the environment; + - the maximum number of threads/reader slots; + - threshold (since the last unsteady commit) to force flush the data buffers to disk; + - relative period (since the last unsteady commit) to force flush the data buffers to disk; + - limit to grow a list of reclaimed/recycled page's numbers for finding a sequence of contiguous pages for large data items; + - limit to grow a cache of dirty pages for reuse in the current transaction; + - limit of a pre-allocated memory items for dirty pages; + - limit of dirty pages for a write transaction; + - initial allocation size for dirty pages list of a write transaction; + - maximal part of the dirty pages may be spilled when necessary; + - minimal part of the dirty pages should be spilled when necessary; + - how much of the parent transaction dirty pages will be spilled while start each child transaction; + - Unlimited/Dynamic size of retired and dirty page lists (https://github.com/erthink/libmdbx/issues/123). + - Added `-p` option (purge subDB before loading) to `mdbx_load` tool. + - Reworked spilling of large transaction and committing of nested transactions. + - Added `MDBX_ENABLE_REFUND` and `MDBX_PNL_ASCENDING` internal/advanced build options. Fixes: + - Fixed missing cleanup (null assigned) in the C++ commit/abort (https://github.com/erthink/libmdbx/pull/143). - Fixed `mdbx_realloc()` for case of nullptr and `MDBX_AVOID_CRT=ON` for Windows. + - Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://github.com/erthink/libmdbx/issues/146). + - Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://github.com/erthink/libmdbx/issues/153). + - Fixed minor/potential memory leak during page flushing and unspilling. + - Fixed handling states of cursors's and subDBs's for nested transactions. + - Fixed page leak in extra rare case the list of retired pages changed during update GC on transaction commit. + - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://github.com/erthink/libmdbx/issues/153). -## v0.9.2 scheduled at 2020-11-27 +## v0.9.2 at 2020-11-27 Acknowledgements: diff --git a/libs/libmdbx/src/GNUmakefile b/libs/libmdbx/src/GNUmakefile index b14000f5f3..da781606ee 100644 --- a/libs/libmdbx/src/GNUmakefile +++ b/libs/libmdbx/src/GNUmakefile @@ -72,7 +72,7 @@ strip: all strip libmdbx.$(SO_SUFFIX) $(TOOLS) clean: - rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *~ tmp.db/* \ + rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *.$(SO_SUFFIX) *.dSYM *~ tmp.db/* \ *.gcov *.log *.err src/*.o test/*.o mdbx_example dist \ config.h src/config.h src/version.c *.tar* diff --git a/libs/libmdbx/src/VERSION b/libs/libmdbx/src/VERSION index 418c133552..b57017e6e7 100644 --- a/libs/libmdbx/src/VERSION +++ b/libs/libmdbx/src/VERSION @@ -1 +1 @@ -0.9.2.12 +0.9.2.110 diff --git a/libs/libmdbx/src/cmake/compiler.cmake b/libs/libmdbx/src/cmake/compiler.cmake index ab72b897d1..22324a64f8 100644 --- a/libs/libmdbx/src/cmake/compiler.cmake +++ b/libs/libmdbx/src/cmake/compiler.cmake @@ -533,7 +533,7 @@ macro(setup_compile_flags) add_compile_flags("C;CXX" "-fexceptions") endif() if(CC_HAS_FCXX_EXCEPTIONS) - add_compile_flags("CXX" "-fcxx-exceptions -frtti") + add_compile_flags("CXX" "-fcxx-exceptions" "-frtti") endif() if(MSVC) # checks for /EHa or /clr options exists, @@ -583,13 +583,13 @@ macro(setup_compile_flags) endif() if(CC_HAS_WNO_UNKNOWN_PRAGMAS AND NOT HAVE_OPENMP) - add_compile_flags("C;CXX" -Wno-unknown-pragmas) + add_compile_flags("C;CXX" "-Wno-unknown-pragmas") endif() if(CC_HAS_SECTIONS) - add_compile_flags("C;CXX" -ffunction-sections -fdata-sections) + add_compile_flags("C;CXX" "-ffunction-sections" "-fdata-sections") elseif(MSVC) - add_compile_flags("C;CXX" /Gy) + add_compile_flags("C;CXX" "/Gy") endif() # We must set -fno-omit-frame-pointer here, since we rely @@ -649,11 +649,11 @@ macro(setup_compile_flags) endif() if(ENABLE_ASAN) - add_compile_flags("C;CXX" -fsanitize=address) + add_compile_flags("C;CXX" "-fsanitize=address") endif() if(ENABLE_UBSAN) - add_compile_flags("C;CXX" -fsanitize=undefined) + add_compile_flags("C;CXX" "-fsanitize=undefined" "-fsanitize-undefined-trap-on-error") endif() if(ENABLE_GCOV) diff --git a/libs/libmdbx/src/config.h.in b/libs/libmdbx/src/config.h.in index 5048d4e021..bb3eea07b4 100644 --- a/libs/libmdbx/src/config.h.in +++ b/libs/libmdbx/src/config.h.in @@ -9,10 +9,10 @@ #cmakedefine ENABLE_GPROF #cmakedefine ENABLE_GCOV #cmakedefine ENABLE_ASAN +#cmakedefine ENABLE_UBSAN #cmakedefine MDBX_FORCE_ASSERTIONS /* Common */ -#cmakedefine01 MDBX_HUGE_TRANSACTIONS #cmakedefine01 MDBX_TXN_CHECKOWNER #cmakedefine MDBX_ENV_CHECKPID_AUTO #ifndef MDBX_ENV_CHECKPID_AUTO diff --git a/libs/libmdbx/src/mdbx.c b/libs/libmdbx/src/mdbx.c index 7901b0b90d..895c263eef 100644 --- a/libs/libmdbx/src/mdbx.c +++ b/libs/libmdbx/src/mdbx.c @@ -12,7 +12,7 @@ * <http://www.OpenLDAP.org/license.html>. */ #define MDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4 +#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -339,7 +339,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define likely(cond) __builtin_expect(!!(cond), 1) # else -# define likely(x) (x) +# define likely(x) (!!(x)) # endif #endif /* likely */ @@ -347,7 +347,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define unlikely(cond) __builtin_expect(!!(cond), 0) # else -# define unlikely(x) (x) +# define unlikely(x) (!!(x)) # endif #endif /* unlikely */ @@ -1542,11 +1542,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* DOXYGEN */ -/** Enables support for huge write-transactions */ -#ifndef MDBX_HUGE_TRANSACTIONS -#define MDBX_HUGE_TRANSACTIONS 0 -#endif /* MDBX_HUGE_TRANSACTIONS */ - /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ #define MDBX_OSX_WANNA_DURABILITY 0 /** Using fsync() with chance of data lost on power failure */ @@ -1596,6 +1591,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ +/** Controls online database auto-compactification during write-transactions. */ +#ifndef MDBX_ENABLE_REFUND +#define MDBX_ENABLE_REFUND 1 +#endif +#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1) +#error MDBX_ENABLE_REFUND must be defined as 0 or 1 +#endif /* MDBX_ENABLE_REFUND */ + +/** Controls sort order of internal page number lists. + * The database format depend on this option and libmdbx builded with different + * option value are incompatible. */ +#ifndef MDBX_PNL_ASCENDING +#define MDBX_PNL_ASCENDING 0 +#endif +#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1) +#error MDBX_PNL_ASCENDING must be defined as 0 or 1 +#endif /* MDBX_PNL_ASCENDING */ + //------------------------------------------------------------------------------ /** Win32 File Locking API for \ref MDBX_LOCKING */ @@ -1917,10 +1930,10 @@ typedef struct mdbx_geo_t { typedef struct MDBX_meta { /* Stamp identifying this as an MDBX file. * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ - uint64_t mm_magic_and_version; + uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_a; + uint32_t mm_txnid_a[2]; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -1940,17 +1953,18 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile uint64_t mm_datasync_sign; +#define META_IS_STEADY(meta) \ + SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign)) + uint32_t mm_datasync_sign[2]; /* txnid that committed this page, the second of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_b; + uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. * This value in couple with mr_snapshot_pages_retired allows fast estimation * of "how much reader is restraining GC recycling". */ - uint64_t mm_pages_retired; + uint32_t mm_pages_retired[2]; /* The analogue /proc/sys/kernel/random/boot_id or similar to determine * whether the system was rebooted after the last use of the database files. @@ -2199,7 +2213,8 @@ typedef struct MDBX_lockinfo { (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) -#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) +#define MDBX_DATA_MAGIC \ + ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) #define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) @@ -2237,19 +2252,20 @@ typedef struct MDBX_lockinfo { #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) +#define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 #define MAX_MAPSIZE MAX_MAPSIZE32 +#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) #endif /* MDBX_WORDBITS */ /*----------------------------------------------------------------------------*/ -/* Two kind lists of pages (aka PNL) */ -/* An PNL is an Page Number List, a sorted array of IDs. The first element of - * the array is a counter for how many actual page-numbers are in the list. - * PNLs are sorted in descending order, this allow cut off a page with lowest - * pgno (at the tail) just truncating the list */ -#define MDBX_PNL_ASCENDING 0 +/* An PNL is an Page Number List, a sorted array of IDs. + * The first element of the array is a counter for how many actual page-numbers + * are in the list. By default PNLs are sorted in descending order, this allow + * cut off a page with lowest pgno (at the tail) just truncating the list. The + * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */ typedef pgno_t *MDBX_PNL; #if MDBX_PNL_ASCENDING @@ -2264,37 +2280,27 @@ typedef pgno_t *MDBX_PNL; typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ -typedef union MDBX_DP { - __anonymous_struct_extension__ struct { - pgno_t pgno; - MDBX_page *ptr; - }; - __anonymous_struct_extension__ struct { - unsigned sorted; - unsigned length; - }; -} MDBX_DP; - -/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. - * The first element's length member is a count of how many actual - * elements are in the array. */ -typedef MDBX_DP *MDBX_DPL; +typedef struct MDBX_dp { + pgno_t pgno; + MDBX_page *ptr; +} MDBX_dp; + +/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ +typedef struct MDBX_dpl { + unsigned sorted; + unsigned length; + unsigned allocated; +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (!defined(__cplusplus) && defined(_MSC_VER)) + MDBX_dp items[] /* dynamic size with holes at zero and after the last */; +#endif +} MDBX_dpl; /* PNL sizes */ #define MDBX_PNL_GRANULATE 1024 #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#if MDBX_HUGE_TRANSACTIONS -#define MDBX_PNL_MAX \ - ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2) -#else -#define MDBX_PNL_MAX \ - ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4) -#endif /* MDBX_HUGE_TRANSACTIONS */ - #define MDBX_TXL_GRANULATE 32 #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) @@ -2418,14 +2424,16 @@ struct MDBX_txn { MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ +#if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; +#endif /* MDBX_ENABLE_REFUND */ /* dirtylist room: Dirty array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. */ unsigned dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_DPL dirtylist; + MDBX_dpl *dirtylist; /* The list of reclaimed txns from GC */ MDBX_TXL lifo_reclaimed; /* The list of pages that became unused during this transaction. */ @@ -2435,26 +2443,19 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; - /* Number of retired to parent pages (tw.retired2parent_pages) */ - unsigned retired2parent_count; - /* The list of parent's txn dirty pages that retired (became unused) - * in this transaction, linked through `mp_next`. */ - MDBX_page *retired2parent_pages; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; + unsigned spill_least_removed; } tw; }; }; -/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. */ #if MDBX_WORDBITS >= 64 -#define CURSOR_STACK 28 +#define CURSOR_STACK 32 #else -#define CURSOR_STACK 20 +#define CURSOR_STACK 24 #endif struct MDBX_xcursor; @@ -2584,11 +2585,9 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; - /* MDBX_DP of pages written during a write txn. */ - MDBX_DPL me_dirtylist; /* Number of freelist items that can fit in a single overflow page */ unsigned me_maxgc_ov1page; unsigned me_branch_nodemax; /* max size of a branch-node */ @@ -2601,6 +2600,17 @@ struct MDBX_env { volatile pgno_t *me_discarded_tail; volatile uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + unsigned me_dp_reserve_len; + struct { + unsigned dp_reserve_limit; + unsigned rp_augment_limit; + unsigned dp_limit; + unsigned dp_initial; + uint8_t dp_loose_limit; + uint8_t spill_max_denominator; + uint8_t spill_min_denominator; + uint8_t spill_parent4child_denominator; + } me_options; struct { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; @@ -3431,10 +3441,11 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, const unsigned page_ln2 = log2n(pagesize); const size_t hard = 0x7FF00000ul; const size_t hard_pages = hard >> page_ln2; - const size_t limit = (hard_pages < MDBX_DPL_TXNFULL / 3) - ? hard - : ((size_t)MDBX_DPL_TXNFULL / 3 << page_ln2); - return (limit < MAX_MAPSIZE) ? limit / 2 : MAX_MAPSIZE / 2; + STATIC_ASSERT(MDBX_PGL_LIMIT <= MAX_PAGENO); + const size_t pages_limit = MDBX_PGL_LIMIT / 4; + const size_t limit = + (hard_pages < pages_limit) ? hard : (pages_limit << page_ln2); + return (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2; } /* Calculate the size of a leaf node. @@ -3615,7 +3626,7 @@ __cold static int MDBX_PRINTF_ARGS(2, 3) if (prev != mp) { prev = mp; mdbx_debug_log(MDBX_LOG_ERROR, "badpage", 0, - "corrupted page #%u, mod-txnid %" PRIaTXN " \n", + "corrupted page #%u, mod-txnid %" PRIaTXN "\n", mp->mp_pgno, mp->mp_txnid); } @@ -4032,11 +4043,13 @@ static __always_inline uint64_t safe64_read(const mdbx_safe64_t *ptr) { return v; } +#if 0 /* unused for now */ static __always_inline void safe64_update(mdbx_safe64_t *ptr, const uint64_t v) { safe64_reset(ptr, true); safe64_write(ptr, v); } +#endif /* unused for now */ /*----------------------------------------------------------------------------*/ /* rthc (tls keys and destructors) */ @@ -5594,14 +5607,14 @@ static int lcklist_detach_locked(MDBX_env *env) { * LY: Binary search */ #define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ - static __always_inline TYPE_LIST *NAME(TYPE_LIST *first, unsigned length, \ - const TYPE_ARG item) { \ - TYPE_LIST *const begin = first, *const end = begin + length; \ + static __always_inline const TYPE_LIST *NAME( \ + const TYPE_LIST *first, unsigned length, const TYPE_ARG item) { \ + const TYPE_LIST *const begin = first, *const end = begin + length; \ \ while (length > 3) { \ const unsigned whole = length; \ length >>= 1; \ - TYPE_LIST *const middle = first + length; \ + const TYPE_LIST *const middle = first + length; \ if (CMP(*middle, item)) { \ first = middle + 1; \ length = whole - length - 1; \ @@ -5631,9 +5644,9 @@ static int lcklist_detach_locked(MDBX_env *env) { } \ \ if (mdbx_audit_enabled()) { \ - for (TYPE_LIST *scan = begin; scan < first; ++scan) \ + for (const TYPE_LIST *scan = begin; scan < first; ++scan) \ assert(CMP(*scan, item)); \ - for (TYPE_LIST *scan = first; scan < end; ++scan) \ + for (const TYPE_LIST *scan = first; scan < end; ++scan) \ assert(!CMP(*scan, item)); \ (void)begin, (void)end; \ } \ @@ -5644,7 +5657,7 @@ static int lcklist_detach_locked(MDBX_env *env) { /*----------------------------------------------------------------------------*/ static __always_inline size_t pnl2bytes(const size_t size) { - assert(size > 0 && size <= MDBX_PNL_MAX * 2); + assert(size > 0 && size <= MDBX_PGL_LIMIT); size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2), MDBX_PNL_GRANULATE * sizeof(pgno_t)) - @@ -5654,7 +5667,7 @@ static __always_inline size_t pnl2bytes(const size_t size) { static __always_inline pgno_t bytes2pnl(const size_t bytes) { size_t size = bytes / sizeof(pgno_t); - assert(size > 2 && size <= MDBX_PNL_MAX * 2); + assert(size > 2 && size <= MDBX_PGL_LIMIT); return (pgno_t)size - 2; } @@ -5681,7 +5694,7 @@ static void mdbx_pnl_free(MDBX_PNL pl) { /* Shrink the PNL to the default size if it has grown larger */ static void mdbx_pnl_shrink(MDBX_PNL *ppl) { assert(bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) == MDBX_PNL_INITIAL); - assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PNL_MAX && + assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); MDBX_PNL_SIZE(*ppl) = 0; if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > @@ -5701,17 +5714,19 @@ static void mdbx_pnl_shrink(MDBX_PNL *ppl) { /* Grow the PNL to the size growed to at least given size */ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl); - assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PNL_MAX && + assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); if (likely(allocated >= wanna)) return MDBX_SUCCESS; - if (unlikely(wanna > /* paranoia */ MDBX_PNL_MAX)) + if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) { + mdbx_error("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT); return MDBX_TXN_FULL; + } - const size_t size = (wanna + wanna - allocated < MDBX_PNL_MAX) + const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT) ? wanna + wanna - allocated - : MDBX_PNL_MAX; + : MDBX_PGL_LIMIT; size_t bytes = pnl2bytes(size); MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); if (likely(pl)) { @@ -5729,9 +5744,9 @@ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { /* Make room for num additional elements in an PNL */ static __always_inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl, size_t num) { - assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PNL_MAX && + assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); - assert(num <= MDBX_PNL_MAX); + assert(num <= MDBX_PGL_LIMIT); const size_t wanna = MDBX_PNL_SIZE(*ppl) + num; return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) ? MDBX_SUCCESS @@ -5748,42 +5763,9 @@ static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) { MDBX_PNL_LAST(pl) = pgno; } -/* Append an pgno onto an unsorted PNL */ -static __hot int __must_check_result mdbx_pnl_append(MDBX_PNL *ppl, - pgno_t pgno) { - /* Too big? */ - if (unlikely(MDBX_PNL_SIZE(*ppl) == MDBX_PNL_ALLOCLEN(*ppl))) { - int rc = mdbx_pnl_need(ppl, MDBX_PNL_GRANULATE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - mdbx_pnl_xappend(*ppl, pgno); - return MDBX_SUCCESS; -} - -/* Append an PNL onto an unsorted PNL */ -static int __must_check_result mdbx_pnl_append_list(MDBX_PNL *ppl, - MDBX_PNL append) { - const unsigned len = MDBX_PNL_SIZE(append); - if (likely(len)) { - int rc = mdbx_pnl_need(ppl, MDBX_PNL_SIZE(append)); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - const MDBX_PNL pnl = *ppl; - unsigned w = MDBX_PNL_SIZE(pnl), r = 1; - do - pnl[++w] = append[r]; - while (++r <= len); - MDBX_PNL_SIZE(pnl) = w; - } - return MDBX_SUCCESS; -} - /* Append an pgno range onto an unsorted PNL */ -static __hot int __must_check_result mdbx_pnl_append_range(MDBX_PNL *ppl, - pgno_t pgno, - unsigned n) { +__always_inline static int __must_check_result +mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) { assert(n > 0); int rc = mdbx_pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) @@ -5792,16 +5774,18 @@ static __hot int __must_check_result mdbx_pnl_append_range(MDBX_PNL *ppl, const MDBX_PNL pnl = *ppl; #if MDBX_PNL_ASCENDING unsigned w = MDBX_PNL_SIZE(pnl); - do - pnl[++w] = pgno++; - while (--n); + do { + pnl[++w] = pgno; + pgno += spilled ? 2 : 1; + } while (--n); MDBX_PNL_SIZE(pnl) = w; #else unsigned w = MDBX_PNL_SIZE(pnl) + n; MDBX_PNL_SIZE(pnl) = w; - do - pnl[w--] = --n + pgno; - while (n); + do { + pnl[w--] = pgno; + pgno += spilled ? 2 : 1; + } while (--n); #endif return MDBX_SUCCESS; @@ -5829,12 +5813,11 @@ static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, } static bool mdbx_pnl_check(const MDBX_PNL pl, const pgno_t limit) { - assert(limit >= MIN_PAGENO && limit <= MAX_PAGENO + 1); if (likely(MDBX_PNL_SIZE(pl))) { assert(MDBX_PNL_LEAST(pl) >= MIN_PAGENO); assert(MDBX_PNL_MOST(pl) < limit); - assert(MDBX_PNL_SIZE(pl) <= MDBX_PNL_MAX); - if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PNL_MAX * 3 / 2)) + assert(MDBX_PNL_SIZE(pl) <= MDBX_PGL_LIMIT); + if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT)) return false; if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO)) return false; @@ -5880,6 +5863,55 @@ static void __hot mdbx_pnl_xmerge(MDBX_PNL dst, const MDBX_PNL src) { assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); } +static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { + mdbx_tassert(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) && + txn->tw.spill_least_removed > 0); + txn->tw.spill_least_removed = + (idx < txn->tw.spill_least_removed) ? idx : txn->tw.spill_least_removed; + txn->tw.spill_pages[idx] |= 1; + MDBX_PNL_SIZE(txn->tw.spill_pages) -= + (idx == MDBX_PNL_SIZE(txn->tw.spill_pages)); + + while (unlikely(npages > 1)) { + const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1; + if (MDBX_PNL_ASCENDING) { + if (++idx > MDBX_PNL_SIZE(txn->tw.spill_pages) || + (txn->tw.spill_pages[idx] >> 1) != pgno) + return; + } else { + if (--idx < 1 || (txn->tw.spill_pages[idx] >> 1) != pgno) + return; + txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed) + ? idx + : txn->tw.spill_least_removed; + } + txn->tw.spill_pages[idx] |= 1; + MDBX_PNL_SIZE(txn->tw.spill_pages) -= + (idx == MDBX_PNL_SIZE(txn->tw.spill_pages)); + --npages; + } +} + +static MDBX_PNL mdbx_spill_purge(MDBX_txn *txn) { + mdbx_tassert(txn, txn->tw.spill_least_removed > 0); + const MDBX_PNL sl = txn->tw.spill_pages; + if (txn->tw.spill_least_removed != INT_MAX) { + unsigned len = MDBX_PNL_SIZE(sl), r, w; + for (w = r = txn->tw.spill_least_removed; r <= len; ++r) { + sl[w] = sl[r]; + w += 1 - (sl[r] & 1); + } + for (size_t i = 1; i < w; ++i) + mdbx_tassert(txn, (sl[i] & 1) == 0); + MDBX_PNL_SIZE(sl) = w - 1; + txn->tw.spill_least_removed = INT_MAX; + } else { + for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i) + mdbx_tassert(txn, (sl[i] & 1) == 0); + } + return sl; +} + SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED) static __hot void mdbx_pnl_sort(MDBX_PNL pnl) { pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); @@ -5890,22 +5922,50 @@ static __hot void mdbx_pnl_sort(MDBX_PNL pnl) { * Returns The index of the first item greater than or equal to pgno. */ SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) -static __hot unsigned mdbx_pnl_search(MDBX_PNL pnl, pgno_t id) { +static __hot unsigned mdbx_pnl_search(const MDBX_PNL pnl, pgno_t pgno) { assert(mdbx_pnl_check4assert(pnl, MAX_PAGENO + 1)); - pgno_t *begin = MDBX_PNL_BEGIN(pnl); - pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), id); - pgno_t *end = begin + MDBX_PNL_SIZE(pnl); + const pgno_t *begin = MDBX_PNL_BEGIN(pnl); + const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno); + const pgno_t *end = begin + MDBX_PNL_SIZE(pnl); assert(it >= begin && it <= end); if (it != begin) - assert(MDBX_PNL_ORDERED(it[-1], id)); + assert(MDBX_PNL_ORDERED(it[-1], pgno)); if (it != end) - assert(!MDBX_PNL_ORDERED(it[0], id)); + assert(!MDBX_PNL_ORDERED(it[0], pgno)); return (unsigned)(it - begin + 1); } -static __hot unsigned mdbx_pnl_exist(MDBX_PNL pnl, pgno_t id) { - unsigned n = mdbx_pnl_search(pnl, id); - return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == id) ? n : 0; +static __inline unsigned mdbx_pnl_exist(const MDBX_PNL pnl, pgno_t pgno) { + unsigned n = mdbx_pnl_search(pnl, pgno); + return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ? n : 0; +} + +static __inline unsigned mdbx_pnl_intersect(const MDBX_PNL pnl, pgno_t pgno, + unsigned npages) { + const unsigned len = MDBX_PNL_SIZE(pnl); + if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { + mdbx_debug_extra("PNL len %u [", len); + for (unsigned i = 1; i <= len; ++i) + mdbx_debug_extra_print(" %" PRIaPGNO, pnl[i]); + mdbx_debug_extra_print("%s\n", "]"); + } + const pgno_t range_last = pgno + npages - 1; +#if MDBX_PNL_ASCENDING + const unsigned n = mdbx_pnl_search(pnl, pgno); + assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || pgno <= pnl[n])); + const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= range_last; +#else + const unsigned n = mdbx_pnl_search(pnl, range_last); + assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || range_last >= pnl[n])); + const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= pgno; +#endif + if (mdbx_assert_enabled()) { + bool check = false; + for (unsigned i = 0; i < npages; ++i) + check |= mdbx_pnl_exist(pnl, pgno + i) != 0; + assert(check == rc); + } + return rc; } /*----------------------------------------------------------------------------*/ @@ -5952,8 +6012,10 @@ static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { if (likely(allocated >= wanna)) return MDBX_SUCCESS; - if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) + if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) { + mdbx_error("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); return MDBX_TXN_FULL; + } const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX) ? wanna + wanna - allocated @@ -5976,7 +6038,7 @@ static __always_inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl, size_t num) { assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); - assert(num <= MDBX_PNL_MAX); + assert(num <= MDBX_PGL_LIMIT); const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num; return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) ? MDBX_SUCCESS @@ -6007,14 +6069,69 @@ static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { /*----------------------------------------------------------------------------*/ +static __always_inline size_t dpl2bytes(const ptrdiff_t size) { + assert(size > 2 && size <= MDBX_PGL_LIMIT); + size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) + + (size + 2) * sizeof(MDBX_dp), + MDBX_PNL_GRANULATE * sizeof(void *) * 2) - + MDBX_ASSUME_MALLOC_OVERHEAD; + return bytes; +} + +static __always_inline unsigned bytes2dpl(const ptrdiff_t bytes) { + size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp); + assert(size > 4 && size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + return (unsigned)size - 2; +} + +static __always_inline void mdbx_dpl_clear(MDBX_dpl *dl) { + dl->sorted = dl->length = 0; +} + +static void mdbx_dpl_free(MDBX_txn *txn) { + if (likely(txn->tw.dirtylist)) { + mdbx_free(txn->tw.dirtylist); + txn->tw.dirtylist = NULL; + } +} + +static MDBX_dpl *mdbx_dpl_reserve(MDBX_txn *txn, size_t size) { + mdbx_tassert(txn, + txn->tw.dirtylist == NULL || txn->tw.dirtylist->length <= size); + size_t bytes = dpl2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT); + MDBX_dpl *const dl = mdbx_realloc(txn->tw.dirtylist, bytes); + if (likely(dl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(dl); +#endif /* malloc_usable_size */ + dl->allocated = bytes2dpl(bytes); + dl->items[0].pgno = 0; + dl->items[0].ptr = nullptr; + mdbx_tassert(txn, txn->tw.dirtylist == NULL || dl->length <= dl->allocated); + txn->tw.dirtylist = dl; + } + return dl; +} + +static int mdbx_dpl_alloc(MDBX_txn *txn) { + mdbx_tassert(txn, + (txn->mt_flags & MDBX_TXN_RDONLY) == 0 && !txn->tw.dirtylist); + MDBX_dpl *const dl = + mdbx_dpl_reserve(txn, txn->mt_env->me_options.dp_initial); + if (unlikely(!dl)) + return MDBX_ENOMEM; + mdbx_dpl_clear(dl); + return MDBX_SUCCESS; +} + #define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) -SORT_IMPL(dp_sort, false, MDBX_DP, DP_SORT_CMP) -static __always_inline MDBX_DPL mdbx_dpl_sort(MDBX_DPL dl) { - assert(dl->length <= MDBX_DPL_TXNFULL); +SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP) +static __always_inline MDBX_dpl *mdbx_dpl_sort(MDBX_dpl *dl) { + assert(dl->length <= MDBX_PGL_LIMIT); assert(dl->sorted <= dl->length); if (dl->sorted != dl->length) { dl->sorted = dl->length; - dp_sort(dl + 1, dl + dl->length + 1); + dp_sort(dl->items + 1, dl->items + dl->length + 1); } return dl; } @@ -6022,11 +6139,11 @@ static __always_inline MDBX_DPL mdbx_dpl_sort(MDBX_DPL dl) { /* Returns the index of the first dirty-page whose pgno * member is greater than or equal to id. */ #define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) -SEARCH_IMPL(dp_bsearch, MDBX_DP, pgno_t, DP_SEARCH_CMP) +SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) -static unsigned __hot mdbx_dpl_search(MDBX_DPL dl, pgno_t pgno) { +static unsigned __hot mdbx_dpl_search(MDBX_dpl *dl, pgno_t pgno) { if (mdbx_audit_enabled()) { - for (const MDBX_DP *ptr = dl + dl->sorted; --ptr > dl;) { + for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) { assert(ptr[0].pgno < ptr[1].pgno); assert(ptr[0].pgno >= NUM_METAS); } @@ -6036,21 +6153,15 @@ static unsigned __hot mdbx_dpl_search(MDBX_DPL dl, pgno_t pgno) { default: /* sort a whole */ dl->sorted = dl->length; - dp_sort(dl + 1, dl + dl->length + 1); - __fallthrough; /* fall through */ + dp_sort(dl->items + 1, dl->items + dl->length + 1); + break; case 0: /* whole sorted cases */ - if (mdbx_audit_enabled()) { - for (const MDBX_DP *ptr = dl + dl->length; --ptr > dl;) { - assert(ptr[0].pgno < ptr[1].pgno); - assert(ptr[0].pgno >= NUM_METAS); - } - } - return (unsigned)(dp_bsearch(dl + 1, dl->length, pgno) - dl); + break; #define LINEAR_SEARCH_CASE(N) \ case N: \ - if (dl[dl->length - N + 1].pgno == pgno) \ + if (dl->items[dl->length - N + 1].pgno == pgno) \ return dl->length - N + 1; \ __fallthrough @@ -6071,70 +6182,120 @@ static unsigned __hot mdbx_dpl_search(MDBX_DPL dl, pgno_t pgno) { LINEAR_SEARCH_CASE(3); /* fall through */ LINEAR_SEARCH_CASE(2); /* fall through */ case 1: - if (dl[dl->length].pgno == pgno) + if (dl->items[dl->length].pgno == pgno) return dl->length; /* continue bsearch on the sorted part */ - return (unsigned)(dp_bsearch(dl + 1, dl->sorted, pgno) - dl); + break; + } + return (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); +} + +static __inline bool mdbx_dpl_intersect(MDBX_dpl *dl, pgno_t pgno, + unsigned npages) { + assert(dl->sorted == dl->length); + unsigned const n = mdbx_dpl_search(dl, pgno); + assert(n >= 1 && n <= dl->length + 1); + assert(n > dl->length || pgno <= dl->items[n].pgno); + assert(n == 1 || pgno > dl->items[n - 1].pgno); + const MDBX_page *const prev = dl->items[n - 1].ptr; + const bool rc = + (/* intersection with founded */ n <= dl->length && + pgno + npages > dl->items[n].pgno) || + (/* intersection with prev */ n > 1 && unlikely(IS_OVERFLOW(prev)) && + prev->mp_pgno + prev->mp_pages > pgno); + if (mdbx_assert_enabled()) { + bool check = false; + for (unsigned i = 1; i <= dl->length; ++i) { + const MDBX_page *const dp = dl->items[i].ptr; + if (!(dp->mp_pgno /* begin */ >= /* end */ pgno + npages || + dp->mp_pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1) /* end */ <= + /* begin */ pgno)) + check |= true; + } + assert(check == rc); } + return rc; } -static __always_inline MDBX_page *mdbx_dpl_find(MDBX_DPL dl, pgno_t pgno) { - const unsigned i = mdbx_dpl_search(dl, pgno); +static __always_inline unsigned mdbx_dpl_exist(MDBX_dpl *dl, pgno_t pgno) { + unsigned i = mdbx_dpl_search(dl, pgno); assert((int)i > 0); - return (i <= dl->length && dl[i].pgno == pgno) ? dl[i].ptr : nullptr; + return (i <= dl->length && dl->items[i].pgno == pgno) ? i : 0; } -static __hot MDBX_page *mdbx_dpl_remove(MDBX_DPL dl, pgno_t prno) { - unsigned i = mdbx_dpl_search(dl, prno); +static __always_inline MDBX_page *mdbx_dpl_find(MDBX_dpl *dl, pgno_t pgno) { + const unsigned i = mdbx_dpl_search(dl, pgno); assert((int)i > 0); - MDBX_page *mp = nullptr; - if (i <= dl->length && dl[i].pgno == prno) { - dl->sorted -= dl->sorted >= i; - mp = dl[i].ptr; - while (i < dl->length) { - dl[i] = dl[i + 1]; - ++i; - } - dl->length -= 1; + return (i <= dl->length && dl->items[i].pgno == pgno) ? dl->items[i].ptr + : nullptr; +} + +static __maybe_unused const MDBX_page *debug_dpl_find(const MDBX_dpl *dl, + const pgno_t pgno) { + for (unsigned i = dl->length; i > dl->sorted; --i) + if (dl->items[i].pgno == pgno) + return dl->items[i].ptr; + + if (dl->sorted) { + const unsigned i = + (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); + if (i <= dl->sorted && dl->items[i].pgno == pgno) + return dl->items[i].ptr; } - return mp; + return nullptr; +} + +static void mdbx_dpl_remove(MDBX_dpl *dl, unsigned i) { + assert((int)i > 0 && i <= dl->length); + dl->sorted -= dl->sorted >= i; + dl->length -= 1; + if (dl->length >= i) + memmove(dl->items + i, dl->items + i + 1, + (dl->length - i + 1) * sizeof(dl->items[0])); } static __always_inline int __must_check_result -mdbx_dpl_append(MDBX_DPL dl, pgno_t pgno, MDBX_page *page) { - assert(dl->length <= MDBX_DPL_TXNFULL); +mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page) { + MDBX_dpl *dl = txn->tw.dirtylist; + assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); if (mdbx_audit_enabled()) { for (unsigned i = dl->length; i > 0; --i) { - assert(dl[i].pgno != pgno); - if (unlikely(dl[i].pgno == pgno)) + assert(dl->items[i].pgno != pgno); + if (unlikely(dl->items[i].pgno == pgno)) return MDBX_PROBLEM; } } - if (unlikely(dl->length == MDBX_DPL_TXNFULL)) - return MDBX_TXN_FULL; + if (unlikely(dl->length == dl->allocated)) { + if (unlikely(dl->allocated >= MDBX_PGL_LIMIT)) { + mdbx_error("DPL is full (MDBX_PGL_LIMIT %u)", MDBX_PGL_LIMIT); + return MDBX_TXN_FULL; + } + const size_t size = (dl->allocated < MDBX_PNL_INITIAL * 42) + ? dl->allocated + dl->allocated + : dl->allocated + dl->allocated / 2; + dl = mdbx_dpl_reserve(txn, size); + if (unlikely(!dl)) + return MDBX_ENOMEM; + mdbx_tassert(txn, dl->length < dl->allocated); + } /* append page */ const unsigned n = dl->length + 1; - if (n == 1 || (dl->sorted >= dl->length && dl[n - 1].pgno < pgno)) + if (n == 1 || (dl->sorted >= dl->length && dl->items[n - 1].pgno < pgno)) dl->sorted = n; dl->length = n; - dl[n].pgno = pgno; - dl[n].ptr = page; + dl->items[n].pgno = pgno; + dl->items[n].ptr = page; return MDBX_SUCCESS; } -static __always_inline void mdbx_dpl_clear(MDBX_DPL dl) { - dl->sorted = dl->length = 0; -} - /*----------------------------------------------------------------------------*/ uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; uint8_t mdbx_loglevel = MDBX_LOG_FATAL; MDBX_debug_func *mdbx_debug_logger; -static bool mdbx_refund(MDBX_txn *txn); static __must_check_result int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp); static __must_check_result int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp); static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, @@ -6155,7 +6316,7 @@ static int mdbx_cursor_touch(MDBX_cursor *mc); enum { /* mdbx_txn_end operation number, for logging */ MDBX_END_COMMITTED, - MDBX_END_EMPTY_COMMIT, + MDBX_END_PURE_COMMIT, MDBX_END_ABORT, MDBX_END_RESET, MDBX_END_RESET_TMP, @@ -6167,7 +6328,7 @@ enum { #define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ #define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */ #define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ -static int mdbx_txn_end(MDBX_txn *txn, unsigned mode); +static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode); static int __must_check_result mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **mp, int *lvl, @@ -6233,10 +6394,6 @@ static int __must_check_result mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp); static int __must_check_result mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, bool dont_filter_gc); -static __maybe_unused __always_inline int __must_check_result -mdbx_audit(MDBX_txn *txn) { - return mdbx_audit_ex(txn, 0, (txn->mt_flags & MDBX_TXN_RDONLY) != 0); -} static int __must_check_result mdbx_page_check(MDBX_cursor *const mc, const MDBX_page *const mp, @@ -6572,7 +6729,7 @@ static __maybe_unused void mdbx_page_list(MDBX_page *mp) { return; case P_META: mdbx_verbose("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, - page_meta(mp)->mm_txnid_a.inconsistent); + unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); return; default: mdbx_verbose("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); @@ -6678,13 +6835,15 @@ int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, * Set MDBX_TXN_ERROR on failure. */ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { MDBX_env *env = txn->mt_env; - MDBX_page *np = env->me_dpages; + MDBX_page *np = env->me_dp_reserve; size_t size = env->me_psize; if (likely(num == 1 && np)) { + mdbx_assert(env, env->me_dp_reserve_len > 0); ASAN_UNPOISON_MEMORY_REGION(np, size); VALGRIND_MEMPOOL_ALLOC(env, np, size); VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); - env->me_dpages = np->mp_next; + env->me_dp_reserve = np->mp_next; + env->me_dp_reserve_len -= 1; } else { size = pgno2bytes(env, num); np = mdbx_malloc(size); @@ -6713,15 +6872,20 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { return np; } -/* Free a dirty page */ -static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned pages) { -#if MDBX_DEBUG - dp->mp_pgno = MAX_PAGENO + 1; -#endif - if (pages == 1) { - dp->mp_next = env->me_dpages; +/* Free a shadow dirty page */ +static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { + VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); + ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); + if (MDBX_DEBUG || unlikely(env->me_flags & MDBX_PAGEPERTURB)) + memset(dp, -1, pgno2bytes(env, npages)); + if (npages == 1 && + env->me_dp_reserve_len < env->me_options.dp_reserve_limit) { + ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next), + pgno2bytes(env, npages) - sizeof(dp->mp_next)); + dp->mp_next = env->me_dp_reserve; VALGRIND_MEMPOOL_FREE(env, dp); - env->me_dpages = dp; + env->me_dp_reserve = dp; + env->me_dp_reserve_len += 1; } else { /* large pages just get freed directly */ VALGRIND_MEMPOOL_FREE(env, dp); @@ -6732,11 +6896,11 @@ static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned pages) { /* Return all dirty pages to dpage list */ static void mdbx_dlist_free(MDBX_txn *txn) { MDBX_env *env = txn->mt_env; - const MDBX_DPL dl = txn->tw.dirtylist; + MDBX_dpl *const dl = txn->tw.dirtylist; const size_t n = dl->length; for (size_t i = 1; i <= n; i++) { - MDBX_page *dp = dl[i].ptr; + MDBX_page *dp = dl->items[i].ptr; mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? dp->mp_pages : 1); } @@ -6756,13 +6920,15 @@ static __cold __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) { if (!mdbx_audit_enabled()) return true; + const MDBX_dpl *const dl = txn->tw.dirtylist; unsigned loose = 0; - for (unsigned i = txn->tw.dirtylist->length; i > 0; --i) { - const MDBX_page *const dp = txn->tw.dirtylist[i].ptr; + for (unsigned i = dl->length; i > 0; --i) { + const MDBX_page *const dp = dl->items[i].ptr; if (!dp) continue; - mdbx_tassert(txn, dp->mp_pgno == txn->tw.dirtylist[i].pgno); - if (unlikely(dp->mp_pgno != txn->tw.dirtylist[i].pgno)) + + mdbx_tassert(txn, dp->mp_pgno == dl->items[i].pgno); + if (unlikely(dp->mp_pgno != dl->items[i].pgno)) return false; mdbx_tassert(txn, dp->mp_flags & P_DIRTY); @@ -6780,9 +6946,9 @@ static __cold __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) { if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num)) return false; - if (i < txn->tw.dirtylist->sorted) { - mdbx_tassert(txn, txn->tw.dirtylist[i + 1].pgno >= dp->mp_pgno + num); - if (unlikely(txn->tw.dirtylist[i + 1].pgno < dp->mp_pgno + num)) + if (i < dl->sorted) { + mdbx_tassert(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); + if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num)) return false; } @@ -6805,19 +6971,17 @@ static __cold __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) { if (unlikely(loose != txn->tw.loose_count)) return false; - if (txn->tw.dirtylist->length - txn->tw.dirtylist->sorted < 16) { - for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { - const MDBX_page *const dp = - mdbx_dpl_find(txn->tw.dirtylist, txn->tw.retired_pages[i]); - mdbx_tassert(txn, !dp); - if (unlikely(dp)) - return false; - } + for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { + const MDBX_page *const dp = debug_dpl_find(dl, txn->tw.retired_pages[i]); + mdbx_tassert(txn, !dp); + if (unlikely(dp)) + return false; } return true; } +#if MDBX_ENABLE_REFUND static void mdbx_refund_reclaimed(MDBX_txn *txn) { /* Scanning in descend order */ pgno_t next_pgno = txn->mt_next_pgno; @@ -6842,19 +7006,16 @@ static void mdbx_refund_reclaimed(MDBX_txn *txn) { mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); txn->mt_next_pgno = next_pgno; - mdbx_tassert( - txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - 1)); } static void mdbx_refund_loose(MDBX_txn *txn) { - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); mdbx_tassert(txn, txn->tw.loose_pages != nullptr); mdbx_tassert(txn, txn->tw.loose_count > 0); - const MDBX_DPL dl = txn->tw.dirtylist; + MDBX_dpl *const dl = txn->tw.dirtylist; mdbx_tassert(txn, dl->length >= txn->tw.loose_count); - mdbx_tassert(txn, txn->tw.spill_pages == nullptr || - dl->length >= MDBX_PNL_SIZE(txn->tw.spill_pages)); pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; MDBX_PNL suitable = onstack; @@ -6871,15 +7032,15 @@ static void mdbx_refund_loose(MDBX_txn *txn) { mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); pgno_t most = MIN_PAGENO; unsigned w = 0; - for (const MDBX_page *dp = txn->tw.loose_pages; dp; dp = dp->mp_next) { - mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY)); - mdbx_tassert(txn, txn->mt_next_pgno > dp->mp_pgno); - if (likely(txn->mt_next_pgno - txn->tw.loose_count <= dp->mp_pgno)) { + for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) { + mdbx_tassert(txn, lp->mp_flags == (P_LOOSE | P_DIRTY)); + mdbx_tassert(txn, txn->mt_next_pgno > lp->mp_pgno); + if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) { mdbx_tassert(txn, w < ((suitable == onstack) ? bytes2pnl(sizeof(onstack)) : MDBX_PNL_ALLOCLEN(suitable))); - suitable[++w] = dp->mp_pgno; - most = (dp->mp_pgno > most) ? dp->mp_pgno : most; + suitable[++w] = lp->mp_pgno; + most = (lp->mp_pgno > most) ? lp->mp_pgno : most; } } @@ -6905,6 +7066,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) { refunded, most, txn->mt_next_pgno); txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); txn->mt_next_pgno = most; /* Filter-out dirty list */ @@ -6912,37 +7074,38 @@ static void mdbx_refund_loose(MDBX_txn *txn) { w = 0; if (dl->sorted) { do { - if (dl[++r].pgno < most) { + if (dl->items[++r].pgno < most) { if (++w != r) - dl[w] = dl[r]; + dl->items[w] = dl->items[r]; } } while (r < dl->sorted); dl->sorted = w; } while (r < dl->length) { - if (dl[++r].pgno < most) { + if (dl->items[++r].pgno < most) { if (++w != r) - dl[w] = dl[r]; + dl->items[w] = dl->items[r]; } } dl->length = w; - mdbx_tassert(txn, txn->mt_parent || - txn->tw.dirtyroom + txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); + mdbx_tassert(txn, txn->mt_parent || txn->tw.dirtyroom + dl->length == + txn->mt_env->me_options.dp_limit); goto unlink_loose; } } else { /* Dirtylist is mostly sorted, just refund loose pages at the end. */ mdbx_dpl_sort(dl); - mdbx_tassert(txn, dl->length < 2 || dl[1].pgno < dl[dl->length].pgno); + mdbx_tassert(txn, dl->length < 2 || + dl->items[1].pgno < dl->items[dl->length].pgno); mdbx_tassert(txn, dl->sorted == dl->length); /* Scan dirtylist tail-forward and cutoff suitable pages. */ - while (dl->length && dl[dl->length].pgno == txn->mt_next_pgno - 1 && - dl[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) { - MDBX_page *dp = dl[dl->length].ptr; + while (dl->length && dl->items[dl->length].pgno == txn->mt_next_pgno - 1 && + dl->items[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) { + MDBX_page *dp = dl->items[dl->length].ptr; mdbx_debug("refund-sorted page %" PRIaPGNO, dp->mp_pgno); - mdbx_tassert(txn, dp->mp_pgno == dl[dl->length].pgno); + mdbx_tassert(txn, dp->mp_pgno == dl->items[dl->length].pgno); + txn->mt_next_pgno -= 1; dl->length -= 1; } @@ -6951,10 +7114,9 @@ static void mdbx_refund_loose(MDBX_txn *txn) { dl->sorted = dl->length; txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; - txn->mt_next_pgno -= refunded; - mdbx_tassert(txn, txn->mt_parent || - txn->tw.dirtyroom + txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); + mdbx_tassert(txn, txn->mt_parent || txn->tw.dirtyroom + dl->length == + txn->mt_env->me_options.dp_limit); + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); /* Filter-out loose chain & dispose refunded pages. */ unlink_loose: @@ -6973,9 +7135,8 @@ static void mdbx_refund_loose(MDBX_txn *txn) { } mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - mdbx_tassert(txn, txn->mt_parent || - txn->tw.dirtyroom + txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); + mdbx_tassert(txn, txn->mt_parent || txn->tw.dirtyroom + dl->length == + txn->mt_env->me_options.dp_limit); if (suitable != onstack) mdbx_pnl_free(suitable); txn->tw.loose_refund_wl = txn->mt_next_pgno; @@ -7002,11 +7163,27 @@ static bool mdbx_refund(MDBX_txn *txn) { break; } - return before != txn->mt_next_pgno; + if (before == txn->mt_next_pgno) + return false; + + if (txn->tw.spill_pages) + /* Squash deleted pagenums if we refunded any */ + mdbx_spill_purge(txn); + + return true; +} +#else /* MDBX_ENABLE_REFUND */ +static __inline bool mdbx_refund(MDBX_txn *txn) { + (void)txn; + /* No online auto-compactification. */ + return false; } +#endif /* MDBX_ENABLE_REFUND */ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno, unsigned npages) { + mdbx_debug("kill%s %u page %" PRIaPGNO, IS_DIRTY(mp) ? " dirty" : "", npages, + pgno); mdbx_assert(env, pgno >= NUM_METAS && npages); if (IS_DIRTY(mp) || (env->me_flags & MDBX_WRITEMAP)) { const size_t bytes = pgno2bytes(env, npages); @@ -7033,6 +7210,30 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno, } } +/* Remove page from dirty list */ +static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, + MDBX_page *const mp, + const unsigned npages) { + mdbx_tassert(txn, di && di <= txn->tw.dirtylist->length && + txn->tw.dirtylist->items[di].ptr == mp); + mdbx_dpl_remove(txn->tw.dirtylist, di); + txn->tw.dirtyroom++; + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); + mdbx_tassert(txn, txn->mt_parent || + txn->tw.dirtyroom + txn->tw.dirtylist->length == + txn->mt_env->me_options.dp_limit); + mp->mp_txnid = INVALID_TXNID; + mp->mp_flags = 0xFFFF; + VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); + if (txn->mt_flags & MDBX_WRITEMAP) { + VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + ASAN_POISON_MEMORY_REGION(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + } else + mdbx_dpage_free(txn->mt_env, mp, npages); +} + /* Retire, loosen or free a single page. * * Saves single pages to a list for future reuse @@ -7043,95 +7244,212 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno, * * If the page wasn't dirtied in this txn, just add it * to this txn's free list. */ - static int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp) { const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; const pgno_t pgno = mp->mp_pgno; + const bool is_dirty = IS_DIRTY(mp); - mp->mp_txnid = INVALID_TXNID; - if (txn->mt_parent) { - mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0); - mdbx_tassert(txn, mp != pgno2page(txn->mt_env, pgno)); - /* If txn has a parent, make sure the page is in our dirty list. */ - MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno); - /* TODO: use extended flag-mask to track parent's dirty-pages */ - if (dp == nullptr) { - mp->mp_next = txn->tw.retired2parent_pages; - txn->tw.retired2parent_pages = mp; - txn->tw.retired2parent_count += npages; - return MDBX_SUCCESS; - } - if (unlikely(mp != dp)) { /* bad cursor? */ - mdbx_error( - "wrong page 0x%p #%" PRIaPGNO " in the dirtylist, expecting %p", - __Wpedantic_format_voidptr(dp), pgno, __Wpedantic_format_voidptr(mp)); - txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PROBLEM; - } - /* ok, it's ours */ + if (is_dirty) { + mdbx_tassert(txn, !txn->tw.spill_pages || + !mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)); + mdbx_tassert(txn, debug_dpl_find(txn->tw.dirtylist, pgno) == mp || + txn->mt_parent); + } else { + mdbx_tassert(txn, !debug_dpl_find(txn->tw.dirtylist, pgno)); } - mdbx_debug("loosen page %" PRIaPGNO, pgno); - const bool is_dirty = IS_DIRTY(mp); - if (MDBX_DEBUG != 0 || - unlikely((txn->mt_env->me_flags & MDBX_PAGEPERTURB) != 0)) { - mdbx_kill_page(txn->mt_env, mp, pgno, npages); - VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); - } - VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - ASAN_POISON_MEMORY_REGION(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - - if (unlikely(npages > - 1 /* overflow pages doesn't comes to the loose-list */)) { - if (is_dirty) { - /* Remove from dirty list */ - MDBX_page *dp = mdbx_dpl_remove(txn->tw.dirtylist, pgno); - if (unlikely(dp != mp)) { - mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist", + const unsigned di = is_dirty ? mdbx_dpl_exist(txn->tw.dirtylist, pgno) : 0; + const unsigned si = (!is_dirty && txn->tw.spill_pages) + ? mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1) + : 0; + + if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) { + const char *kind; + if (di) { + mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == mp); + if (unlikely(txn->tw.dirtylist->items[di].ptr != mp)) { + wrong_dirty: + mdbx_error("wrong dirty page 0x%p #%" PRIaPGNO, __Wpedantic_format_voidptr(mp), pgno); txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_PROBLEM; } - txn->tw.dirtyroom++; - mdbx_tassert(txn, txn->mt_parent || - txn->tw.dirtyroom + txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(txn->mt_env, mp, npages); + /* Страница испачкана в этой транзакции, но до этого могла быть + * аллоцирована, испачкана и пролита в одной из родительских транзакций. + * Её МОЖНО вытолкнуть в нераспределенный хвост. */ + kind = "dirty"; + /* Remove from dirty list */ + mdbx_page_wash(txn, di, mp, npages); + } else if (is_dirty) { + /* The page MUST BE from parent transaction. */ + if (mdbx_audit_enabled()) { + const MDBX_page *parent_dp = nullptr; + for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp; + parent = parent->mt_parent) { + mdbx_tassert(txn, + !parent->tw.spill_pages || + !mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)); + parent_dp = debug_dpl_find(parent->tw.dirtylist, pgno); + } + mdbx_tassert(txn, parent_dp == mp); + } + kind = "parent-dirty"; + } else if (si) { + /* Страница пролита в этой транзакции, следовательно она аллоцирована + * и запачкана в этой или одной из родительских транзакций. + * Её МОЖНО вытолкнуть в нераспределенный хвост. */ + kind = "spilled"; + mdbx_spill_remove(txn, si, npages); + } else { + for (MDBX_txn *parent = txn->mt_parent; + parent && (parent->mt_flags & MDBX_TXN_SPILLS); + parent = parent->mt_parent) + if (parent->tw.spill_pages && + mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)) { + /* Страница аллоцирована, запачкана и пролита в одной из родительских + * транзакций. Её МОЖНО вытолкнуть в нераспределенный хвост. */ + kind = "parent-spilled"; + goto refund; + } + /* Страница используется (входит в MVCC-снимки доступные читателям). + * Её НЕЛЬЗЯ вытолкнуть в нераспределенных хвост. */ + goto retire; } + refund: + mdbx_debug("refunded %u %s page %" PRIaPGNO, npages, kind, pgno); + txn->mt_next_pgno = pgno; + mdbx_refund(txn); + return MDBX_SUCCESS; + } - if (unlikely(pgno + npages == txn->mt_next_pgno)) { - txn->mt_next_pgno = pgno; - mdbx_refund(txn); - return MDBX_SUCCESS; + if (is_dirty) { + if (di) { + /* Dirty page from this transaction */ + mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == mp); + if (unlikely(txn->tw.dirtylist->items[di].ptr != mp)) + goto wrong_dirty; + + /* If suitable we can reuse it through loose list */ + if (likely( + npages == 1 && + txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && + (!MDBX_ENABLE_REFUND || + /* skip pages near to the end in favor of compactification */ + txn->mt_next_pgno > + pgno + txn->mt_env->me_options.dp_loose_limit || + txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) { + mdbx_debug("loosen dirty page %" PRIaPGNO, pgno); + mp->mp_flags = P_LOOSE | P_DIRTY; + mp->mp_next = txn->tw.loose_pages; + txn->tw.loose_pages = mp; + txn->tw.loose_count++; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) + ? pgno + 2 + : txn->tw.loose_refund_wl; +#endif /* MDBX_ENABLE_REFUND */ + if (MDBX_DEBUG || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); + VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), + txn->mt_env->me_psize - PAGEHDRSZ); + ASAN_POISON_MEMORY_REGION(page_data(mp), + txn->mt_env->me_psize - PAGEHDRSZ); + return MDBX_SUCCESS; + } + +#if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__) + if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) +#endif + { + /* Страница могла быть изменена в одной из родительских транзакций, + * в том числе, позже выгружена и затем снова загружена и изменена. + * В обоих случаях её нельзя затирать на диске и помечать недоступной + * в asan и/или valgrind */ + for (MDBX_txn *parent = txn->mt_parent; + parent && (parent->mt_flags & MDBX_TXN_SPILLS); + parent = parent->mt_parent) { + if (parent->tw.spill_pages && + mdbx_pnl_intersect(parent->tw.spill_pages, pgno << 1, + npages << 1)) + goto skip_invalidate; + if (mdbx_dpl_intersect(parent->tw.dirtylist, pgno, npages)) + goto skip_invalidate; + } + +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + if (MDBX_DEBUG || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) +#endif + mdbx_kill_page(txn->mt_env, mp, pgno, npages); + if (!(txn->mt_flags & MDBX_WRITEMAP)) { + VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), + pgno2bytes(txn->mt_env, npages) - + PAGEHDRSZ); + ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)), + pgno2bytes(txn->mt_env, npages) - + PAGEHDRSZ); + } + } + skip_invalidate: + /* Remove from dirty list */ + mdbx_page_wash(txn, di, mp, npages); + } else { + /* Dirty page MUST BE a clone from (one of) parent transaction(s). */ + if (mdbx_audit_enabled()) { + const MDBX_page *parent_dp = nullptr; + /* Check parent(s)'s dirty lists. */ + for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp; + parent = parent->mt_parent) { + mdbx_tassert(txn, + !parent->tw.spill_pages || + !mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)); + parent_dp = debug_dpl_find(parent->tw.dirtylist, pgno); + } + mdbx_tassert(txn, parent_dp == mp); + } } + reclaim: + mdbx_debug("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); int rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); - return MDBX_SUCCESS; + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + return rc; } - mp->mp_pgno = pgno; - mp->mp_flags = P_LOOSE | P_DIRTY; - mp->mp_next = txn->tw.loose_pages; - txn->tw.loose_pages = mp; - txn->tw.loose_count++; - if (unlikely(txn->mt_next_pgno == pgno + 1)) - mdbx_refund(txn); + if (si) { + /* Page ws spilled in this txn */ + mdbx_spill_remove(txn, si, npages); + /* Страница могла быть выделена и затем пролита в этой транзакции, + * тогда её необходимо поместить в reclaimed-список. + * Либо она могла быть выделена в одной из родительских транзакций и затем + * пролита в этой транзакции, тогда её необходимо поместить в retired-список + * для последующей фильтрации при коммите. */ + for (MDBX_txn *parent = txn->mt_parent; parent; + parent = parent->mt_parent) { + if (mdbx_dpl_exist(parent->tw.dirtylist, pgno)) + goto retire; + } + /* Страница точно была выделена в этой транзакции + * и теперь может быть использована повторно. */ + goto reclaim; + } else { + /* Страница может входить в доступный читателям MVCC-снимок, либо же она + * могла быть выделена, а затем пролита в одной из родительских транзакций. + * Поэтому пока помещаем её в retired-список, который будет фильтроваться + * относительно dirty- и spilled-списков родительских транзакций при коммите + * дочерних транзакций, либо же будет записан в GC в неизменном виде. */ + } - return MDBX_SUCCESS; +retire: + mdbx_debug("retire %u page %" PRIaPGNO, npages, pgno); + int rc = mdbx_pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + return rc; } static int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { - const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; - const pgno_t pgno = mp->mp_pgno; - MDBX_txn *const txn = mc->mc_txn; - if (unlikely(mc->mc_flags & C_SUB)) { MDBX_db *outer = mdbx_outer_db(mc); mdbx_cassert(mc, !IS_BRANCH(mp) || outer->md_branch_pages > 0); @@ -7144,36 +7462,10 @@ static int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { mc->mc_db->md_branch_pages -= IS_BRANCH(mp); mdbx_cassert(mc, !IS_LEAF(mp) || mc->mc_db->md_leaf_pages > 0); mc->mc_db->md_leaf_pages -= IS_LEAF(mp); - mdbx_cassert(mc, !IS_OVERFLOW(mp) || mc->mc_db->md_overflow_pages >= npages); - mc->mc_db->md_overflow_pages -= IS_OVERFLOW(mp) ? npages : 0; - - if (IS_DIRTY(mp)) { - int rc = mdbx_page_loose(txn, mp); - if (unlikely(rc != MDBX_SUCCESS)) - mc->mc_flags &= ~(C_INITIALIZED | C_EOF); - return rc; - } - - if (txn->tw.spill_pages) { - const unsigned i = mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1); - if (i) { - /* This page is no longer spilled */ - mdbx_tassert(txn, i == MDBX_PNL_SIZE(txn->tw.spill_pages) || - txn->tw.spill_pages[i + 1] >= (pgno + npages) << 1); - txn->tw.spill_pages[i] |= 1; - if (i == MDBX_PNL_SIZE(txn->tw.spill_pages)) - MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1; - int rc = mdbx_page_loose(txn, mp); - if (unlikely(rc != MDBX_SUCCESS)) - mc->mc_flags &= ~(C_INITIALIZED | C_EOF); - return rc; - } - } - - mdbx_tassert(txn, mp == pgno2page(txn->mt_env, pgno)); - int rc = mdbx_pnl_append_range(&txn->tw.retired_pages, pgno, npages); - mdbx_tassert(txn, mdbx_dpl_find(txn->tw.dirtylist, pgno) == nullptr); - return rc; + mdbx_cassert(mc, !IS_OVERFLOW(mp) || + mc->mc_db->md_overflow_pages >= mp->mp_pages); + mc->mc_db->md_overflow_pages -= IS_OVERFLOW(mp) ? mp->mp_pages : 0; + return mdbx_page_loose(mc->mc_txn, mp); } static __must_check_result __always_inline int @@ -7195,66 +7487,69 @@ mdbx_retire_pgno(MDBX_cursor *mc, const pgno_t pgno) { return rc; } -/* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. +/* Toggle P_KEEP in dirty, non-overflow, non-sub pages watched by txn. * * [in] mc A cursor handle for the current operation. * [in] pflags Flags of the pages to update: * - P_DIRTY to set P_KEEP, - * - P_DIRTY|P_KEEP to clear it. - * [in] all No shortcuts. Needed except after a full mdbx_page_flush(). - * - * Returns 0 on success, non-zero on failure. */ -static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) { - const unsigned Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP; - MDBX_txn *txn = mc->mc_txn; - MDBX_cursor *m3, *m0 = mc; - MDBX_xcursor *mx; - MDBX_page *mp; - unsigned i, j; - int rc = MDBX_SUCCESS; - - /* Mark pages seen by cursors: First m0, then tracked cursors */ - for (i = txn->mt_numdbs;;) { - if (mc->mc_flags & C_INITIALIZED) { - for (m3 = mc;; m3 = &mx->mx_cursor) { - mp = NULL; - for (j = 0; j < m3->mc_snum; j++) { - mp = m3->mc_pg[j]; - if ((mp->mp_flags & Mask) == pflags) - mp->mp_flags ^= P_KEEP; - } - mx = m3->mc_xcursor; - /* Proceed to mx if it is at a sub-database */ - if (!(mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) - break; - if (!(mp && IS_LEAF(mp))) - break; - if (!(node_flags(page_node(mp, m3->mc_ki[j - 1])) & F_SUBDATA)) - break; + * - P_DIRTY|P_KEEP to clear it. */ +static void mdbx_cursor_xkeep(MDBX_cursor *mc, unsigned pflags) { + const unsigned mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP; + if (mc->mc_flags & C_INITIALIZED) { + MDBX_cursor *m3 = mc; + for (;;) { + MDBX_page *mp = NULL; + for (unsigned j = 0; j < m3->mc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & mask) == pflags) + mp->mp_flags ^= P_KEEP; } + if (!(mp && IS_LEAF(mp))) + break; + /* Proceed to mx if it is at a sub-database */ + MDBX_xcursor *mx = m3->mc_xcursor; + if (!(mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + const unsigned nkeys = page_numkeys(mp); + unsigned ki = m3->mc_ki[m3->mc_top]; + mdbx_cassert(mc, nkeys > 0 && + (ki < nkeys || + (ki == nkeys && (mx->mx_cursor.mc_flags & C_EOF)))); + ki -= ki >= nkeys; + if (!(node_flags(page_node(mp, ki)) & F_SUBDATA)) + break; + m3 = &mx->mx_cursor; } - mc = mc->mc_next; - for (; !mc || mc == m0; mc = txn->tw.cursors[--i]) - if (i == 0) - goto mark_done; } +} + +/* Mark pages seen by cursors: First m0, then tracked cursors + * [in] all No shortcuts. Needed except after a full mdbx_page_flush(). */ +static void mdbx_txn_xkeep(MDBX_txn *txn, MDBX_cursor *m0, + const unsigned pflags, const bool all) { + if (m0) + mdbx_cursor_xkeep(m0, pflags); + + for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i) + if (txn->mt_dbistate[i] & DBI_DIRTY) + for (MDBX_cursor *mc = txn->tw.cursors[i]; mc; mc = mc->mc_next) + if (mc != m0) + mdbx_cursor_xkeep(mc, pflags); -mark_done: if (all) { /* Mark dirty root pages */ - for (i = 0; i < txn->mt_numdbs; i++) { + const unsigned mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP; + for (unsigned i = 0; i < txn->mt_numdbs; i++) { if (txn->mt_dbistate[i] & DBI_DIRTY) { pgno_t pgno = txn->mt_dbs[i].md_root; if (pgno == P_INVALID) continue; MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno); - if (dp && (dp->mp_flags & Mask) == pflags) + if (dp && (dp->mp_flags & mask) == pflags) dp->mp_flags ^= P_KEEP; } } } - - return rc; } /* Spill pages from the dirty list back to disk. @@ -7283,110 +7578,137 @@ mark_done: * we can't spill a page in a child txn if it was already spilled in a * parent txn. That would alter the parent txns' data even though * the child hasn't committed yet, and we'd have no way to undo it if - * the child aborted. - * - * [in] mc cursor A cursor handle identifying the transaction and - * database for which we are checking space. - * [in] key For a put operation, the key being stored. - * [in] data For a put operation, the data being stored. - * - * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key, - const MDBX_val *data) { - if (mc->mc_flags & C_SUB) + * the child aborted. */ +static int mdbx_txn_spill(MDBX_txn *txn, MDBX_cursor *m0, unsigned need) { +#ifndef MDBX_DEBUG_SPILLING + if (likely(txn->tw.dirtyroom > need)) return MDBX_SUCCESS; - - MDBX_txn *txn = mc->mc_txn; - MDBX_DPL dl = txn->tw.dirtylist; - - /* Estimate how much space this op will take */ - pgno_t i = mc->mc_db->md_depth; - /* Named DBs also dirty the main DB */ - if (mc->mc_dbi >= CORE_DBS) - i += txn->mt_dbs[MAIN_DBI].md_depth; - /* For puts, roughly factor in the key+data size */ - if (key) - i += bytes2pgno(txn->mt_env, node_size(key, data) + txn->mt_env->me_psize); - i += i; /* double it for good measure */ - pgno_t need = i; - - if (txn->tw.dirtyroom > i) + unsigned spill = need - txn->tw.dirtyroom; +#else + /* spill at least one page if defined MDBX_DEBUG_SPILLING */ + unsigned spill = (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1; +#endif /* MDBX_DEBUG_SPILLING */ + + const unsigned dirty = txn->tw.dirtylist->length; + const unsigned spill_min = + txn->mt_env->me_options.spill_min_denominator + ? dirty / txn->mt_env->me_options.spill_min_denominator + : 0; + const unsigned spill_max = + dirty - (txn->mt_env->me_options.spill_max_denominator + ? dirty / txn->mt_env->me_options.spill_max_denominator + : 0); + spill = (spill > spill_min) ? spill : spill_min; + spill = (spill < spill_max) ? spill : spill_max; + if (!spill) return MDBX_SUCCESS; - /* Less aggressive spill - we originally spilled the entire dirty list, - * with a few exceptions for cursor pages and DB root pages. But this - * turns out to be a lot of wasted effort because in a large txn many - * of those pages will need to be used again. So now we spill only 1/8th - * of the dirty pages. Testing revealed this to be a good tradeoff, - * better than 1/2, 1/4, or 1/10. */ - if (need < MDBX_DPL_TXNFULL / 8) - need = MDBX_DPL_TXNFULL / 8; + mdbx_notice("spilling %u dirty-entries (have %u dirty-room, need %u)", spill, + txn->tw.dirtyroom, need); + mdbx_tassert(txn, txn->tw.dirtylist->length >= spill); + int rc; if (!txn->tw.spill_pages) { - txn->tw.spill_pages = mdbx_pnl_alloc(need); - if (unlikely(!txn->tw.spill_pages)) - return MDBX_ENOMEM; + txn->tw.spill_least_removed = INT_MAX; + txn->tw.spill_pages = mdbx_pnl_alloc(spill); + if (unlikely(!txn->tw.spill_pages)) { + rc = MDBX_ENOMEM; + goto bailout; + } } else { /* purge deleted slots */ - MDBX_PNL sl = txn->tw.spill_pages; - pgno_t num = MDBX_PNL_SIZE(sl), j = 0; - for (i = 1; i <= num; i++) { - if ((sl[i] & 1) == 0) - sl[++j] = sl[i]; - } - MDBX_PNL_SIZE(sl) = j; + mdbx_spill_purge(txn); + rc = mdbx_pnl_reserve(&txn->tw.spill_pages, spill); + (void)rc /* ignore since the resulting list may be shorter + and mdbx_pnl_append() will increase pnl on demand */ + ; } /* Preserve pages which may soon be dirtied again */ - int rc = mdbx_pages_xkeep(mc, P_DIRTY, true); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + mdbx_txn_xkeep(txn, m0, P_DIRTY, true); + MDBX_dpl *const dl = mdbx_dpl_sort(txn->tw.dirtylist); /* Save the page IDs of all the pages we're flushing */ /* flush from the tail forward, this saves a lot of shifting later on. */ - for (i = dl->length; i && need; i--) { - pgno_t pn = dl[i].pgno << 1; - MDBX_page *dp = dl[i].ptr; - if (dp->mp_flags & (P_LOOSE | P_KEEP)) + const unsigned dl_len_before = dl->length; + unsigned spilled = 0; + unsigned keep = dl_len_before; + for (; keep && spill; keep--) { + const pgno_t pgno = dl->items[keep].pgno; + MDBX_page *dp = dl->items[keep].ptr; + const unsigned npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1; + if (dp->mp_flags & (P_LOOSE | P_KEEP)) { + mdbx_debug("skip %s %u page %" PRIaPGNO, + (dp->mp_flags & P_LOOSE) ? "loose" : "keep", npages, + dp->mp_pgno); + skip: continue; + } /* Can't spill twice, - * make sure it's not already in a parent's spill list. */ - if (txn->mt_parent) { - MDBX_txn *parent; - for (parent = txn->mt_parent; parent; parent = parent->mt_parent) { + * make sure it's not already in a parent's spill list(s). */ + MDBX_txn *parent = txn->mt_parent; + if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) { + do if (parent->tw.spill_pages && - mdbx_pnl_exist(parent->tw.spill_pages, pn)) { + mdbx_pnl_intersect(parent->tw.spill_pages, pgno << 1, + npages << 1)) { + mdbx_debug("skip parent-spilled %u page %" PRIaPGNO, npages, pgno); dp->mp_flags |= P_KEEP; - break; + goto skip; } - } - if (parent) - continue; + while ((parent = parent->mt_parent) != nullptr); } - rc = mdbx_pnl_append(&txn->tw.spill_pages, pn); + mdbx_debug("spill %u page %" PRIaPGNO, npages, dp->mp_pgno); + rc = mdbx_pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - need--; + spill--; + spilled += 1; } mdbx_pnl_sort(txn->tw.spill_pages); /* Flush the spilled part of dirty list */ - rc = mdbx_page_flush(txn, i); + rc = mdbx_page_flush(txn, keep); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Reset any dirty pages we kept that page_flush didn't see */ - rc = mdbx_pages_xkeep(mc, P_DIRTY | P_KEEP, i != 0); + mdbx_tassert(txn, dl_len_before - spilled == dl->length); + mdbx_txn_xkeep(txn, m0, P_DIRTY | P_KEEP, keep > 0); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); bailout: txn->mt_flags |= rc ? MDBX_TXN_ERROR : MDBX_TXN_SPILLS; return rc; } +static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key, + const MDBX_val *data) { + if (mc->mc_flags & C_SUB) + return MDBX_SUCCESS; + MDBX_txn *txn = mc->mc_txn; + if (txn->mt_flags & MDBX_WRITEMAP) + return MDBX_SUCCESS; + + /* Estimate how much space this operation will take: */ + /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ + unsigned need = CURSOR_STACK + 3; + /* 2) GC/FreeDB for any payload */ + if (mc->mc_dbi > FREE_DBI) { + need += txn->mt_dbs[FREE_DBI].md_depth + 3; + /* 3) Named DBs also dirty the main DB */ + if (mc->mc_dbi > MAIN_DBI) + need += txn->mt_dbs[MAIN_DBI].md_depth + 3; + } + /* 4) Factor the key+data which to be put in */ + need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1; + return mdbx_txn_spill(txn, mc, need); +} + /*----------------------------------------------------------------------------*/ static __always_inline bool meta_bootid_match(const MDBX_meta *meta) { - return meta->mm_bootid.x == bootid.x && meta->mm_bootid.y == bootid.y && + return memcmp(&meta->mm_bootid, &bootid, 16) == 0 && (bootid.x | bootid.y) != 0; } @@ -7402,9 +7724,8 @@ static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta, static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta, const bool allow_volatile) { - mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - txnid_t a = safe64_read(&meta->mm_txnid_a); - txnid_t b = safe64_read(&meta->mm_txnid_b); + txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); + txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); if (allow_volatile) return (a == b) ? a : 0; mdbx_assert(env, a == b); @@ -7423,33 +7744,35 @@ static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env, static __inline void mdbx_meta_update_begin(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { - mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - mdbx_assert(env, meta->mm_txnid_a.inconsistent < txnid && - meta->mm_txnid_b.inconsistent < txnid); + mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && + unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; - safe64_update(&meta->mm_txnid_a, txnid); + unaligned_poke_u64(4, meta->mm_txnid_b, 0); + unaligned_poke_u64(4, meta->mm_txnid_a, txnid); } static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { - mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - mdbx_assert(env, meta->mm_txnid_a.inconsistent == txnid); - mdbx_assert(env, meta->mm_txnid_b.inconsistent < txnid); + mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid); + mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; mdbx_jitter4testing(true); - meta->mm_bootid = bootid; - safe64_update(&meta->mm_txnid_b, txnid); + memcpy(&meta->mm_bootid, &bootid, 16); + unaligned_poke_u64(4, meta->mm_txnid_b, txnid); } static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { - mdbx_assert(env, meta < METAPAGE(env, 0) || meta > METAPAGE_END(env)); + mdbx_assert(env, !env->me_map || meta < METAPAGE(env, 0) || + meta >= METAPAGE_END(env)); (void)env; /* update inconsistent since this function used ONLY for filling meta-image * for writing, but not the actual meta-page */ - meta->mm_bootid = bootid; - meta->mm_txnid_a.inconsistent = txnid; - meta->mm_txnid_b.inconsistent = txnid; + memcpy(&meta->mm_bootid, &bootid, 16); + unaligned_poke_u64(4, meta->mm_txnid_a, txnid); + unaligned_poke_u64(4, meta->mm_txnid_b, txnid); } static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { @@ -7579,8 +7902,10 @@ static txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) { static const char *mdbx_durable_str(const MDBX_meta *const meta) { if (META_IS_STEADY(meta)) - return (meta->mm_datasync_sign == mdbx_meta_sign(meta)) ? "Steady" - : "Tainted"; + return (unaligned_peek_u64(4, meta->mm_datasync_sign) == + mdbx_meta_sign(meta)) + ? "Steady" + : "Tainted"; return "Weak"; } @@ -7662,15 +7987,22 @@ static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { mp->mp_txnid = INVALID_TXNID; mp->mp_flags |= P_DIRTY; - const int rc = mdbx_dpl_append(txn->tw.dirtylist, mp->mp_pgno, mp); + if (unlikely(txn->tw.dirtyroom == 0)) { + mdbx_error("Dirtyroom is depleted, DPL length %u", + txn->tw.dirtylist->length); + return MDBX_TXN_FULL; + } + const int rc = mdbx_dpl_append(txn, mp->mp_pgno, mp); if (unlikely(rc != MDBX_SUCCESS)) { txn->mt_flags |= MDBX_TXN_ERROR; return rc; } txn->tw.dirtyroom--; + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); mdbx_tassert(txn, txn->mt_parent || txn->tw.dirtyroom + txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); + txn->mt_env->me_options.dp_limit); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); return MDBX_SUCCESS; } @@ -8000,7 +8332,7 @@ static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, mdbx_warning("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, data_page(meta)->mp_pgno); if (env->me_flags & MDBX_WRITEMAP) - meta->mm_datasync_sign = wipe; + unaligned_poke_u64(4, meta->mm_datasync_sign, wipe); else return mdbx_pwrite(env->me_lazy_fd, &wipe, sizeof(meta->mm_datasync_sign), (uint8_t *)&meta->mm_datasync_sign - env->me_map); @@ -8096,32 +8428,40 @@ static __inline txnid_t pp_txnid2chk(const MDBX_txn *txn) { __hot static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, MDBX_page **const mp, int flags) { int rc; - MDBX_txn *txn = mc->mc_txn; - MDBX_env *env = txn->mt_env; + MDBX_txn *const txn = mc->mc_txn; + MDBX_env *const env = txn->mt_env; MDBX_page *np; + const unsigned coalesce_threshold = + env->me_maxgc_ov1page - env->me_maxgc_ov1page / 4; if (likely(flags & MDBX_ALLOC_GC)) { flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); - if (unlikely(mc->mc_flags & C_RECLAIMING)) { - /* If mc is updating the GC, then the retired-list cannot play - * catch-up with itself by growing while trying to save it. */ - flags &= ~MDBX_ALLOC_GC; - } else if (unlikely(txn->mt_dbs[FREE_DBI].md_entries == 0)) { - /* avoid (recursive) search inside empty tree and while tree is updating, - * https://github.com/erthink/libmdbx/issues/31 */ - flags &= ~MDBX_ALLOC_GC; - } + if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > coalesce_threshold) + flags &= ~MDBX_COALESCE; + if (unlikely( + /* If mc is updating the GC, then the retired-list cannot play + catch-up with itself by growing while trying to save it. */ + (mc->mc_flags & C_RECLAIMING) || + /* avoid (recursive) search inside empty tree and while tree is + updating, https://github.com/erthink/libmdbx/issues/31 */ + txn->mt_dbs[FREE_DBI].md_entries == 0 || + /* If our dirty list is already full, we can't touch GC */ + (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth && + !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)))) + flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); } if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) { /* If there are any loose pages, just use them */ mdbx_assert(env, mp && num); if (likely(txn->tw.loose_pages)) { +#if MDBX_ENABLE_REFUND if (txn->tw.loose_refund_wl > txn->mt_next_pgno) { mdbx_refund(txn); if (unlikely(!txn->tw.loose_pages)) - goto skip_cache; + goto no_loose; } +#endif /* MDBX_ENABLE_REFUND */ np = txn->tw.loose_pages; txn->tw.loose_pages = np->mp_next; @@ -8137,10 +8477,13 @@ __hot static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, return MDBX_SUCCESS; } } -skip_cache: +#if MDBX_ENABLE_REFUND +no_loose: +#endif /* MDBX_ENABLE_REFUND */ - mdbx_tassert( - txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list); txnid_t oldest = 0, last = 0; @@ -8156,8 +8499,8 @@ skip_cache: * Prefer pages with lower pgno. */ mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); - if (likely(flags & MDBX_ALLOC_CACHE) && re_len > wanna_range && - (!(flags & MDBX_COALESCE) || op == MDBX_FIRST)) { + if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE && + re_len > wanna_range) { mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); range_begin = MDBX_PNL_ASCENDING ? 1 : re_len; @@ -8187,11 +8530,6 @@ skip_cache: } if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ - if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) && - !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)) { - /* If our dirty list is already full, we can't touch GC */ - flags &= ~MDBX_ALLOC_GC; - } if (unlikely(!(flags & MDBX_ALLOC_GC))) break /* reclaiming is prohibited for now */; @@ -8302,22 +8640,16 @@ skip_cache: goto fail; } const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); - /* TODO: provide a user-configurable threshold */ - const unsigned threshold_2_stop_gc_reclaiming = MDBX_PNL_MAX / 4; - if (unlikely(gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > - threshold_2_stop_gc_reclaiming) && + if (flags != MDBX_ALLOC_GC && + unlikely(gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > + env->me_options.rp_augment_limit) && (pgno_add(txn->mt_next_pgno, num) <= txn->mt_geo.upper || gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >= - MDBX_PNL_MAX / 16 * 15)) { + MDBX_PGL_LIMIT / 16 * 15)) { /* Stop reclaiming to avoid overflow the page list. * This is a rare case while search for a continuously multi-page region * in a large database. https://github.com/erthink/libmdbx/issues/123 */ - flags &= ~MDBX_ALLOC_GC; - if (unlikely((flags & MDBX_ALLOC_ALL) == 0)) { - /* Oh, we can't do anything */ - rc = MDBX_TXN_FULL; - goto fail; - } + flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); break; } rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); @@ -8337,10 +8669,9 @@ skip_cache: mdbx_debug_extra("PNL read txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL", last, txn->mt_dbs[FREE_DBI].md_root, gc_len); - unsigned i; - for (i = gc_len; i; i--) + for (unsigned i = gc_len; i; i--) mdbx_debug_extra_print(" %" PRIaPGNO, gc_pnl[i]); - mdbx_debug_extra_print("%s", "\n"); + mdbx_debug_extra_print("%s\n", "."); } /* Merge in descending sorted order */ @@ -8351,56 +8682,32 @@ skip_cache: rc = MDBX_CORRUPTED; goto fail; } + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); re_len = MDBX_PNL_SIZE(re_list); mdbx_tassert(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); - if (re_len && unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { + if (MDBX_ENABLE_REFUND && re_len && + unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { /* Refund suitable pages into "unallocated" space */ mdbx_refund(txn); re_list = txn->tw.reclaimed_pglist; re_len = MDBX_PNL_SIZE(re_list); } - if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { - /* Done for a kick-reclaim mode, actually no page needed */ + /* Done for a kick-reclaim mode, actually no page needed */ + if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) return MDBX_SUCCESS; - } /* Don't try to coalesce too much. */ - if (unlikely(re_len > MDBX_DPL_TXNFULL / 42)) - break; - if (re_len /* current size */ >= env->me_maxgc_ov1page || + if (re_len /* current size */ > coalesce_threshold || (re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >= - env->me_maxgc_ov1page / 2)) + coalesce_threshold / 2)) flags &= ~MDBX_COALESCE; } - if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE && - re_len > wanna_range) { - range_begin = MDBX_PNL_ASCENDING ? 1 : re_len; - pgno = MDBX_PNL_LEAST(re_list); - if (likely(wanna_range == 0)) - goto done; -#if MDBX_PNL_ASCENDING - mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); - while (true) { - unsigned range_end = range_begin + wanna_range; - if (re_list[range_end] - pgno == wanna_range) - goto done; - if (range_end == re_len) - break; - pgno = re_list[++range_begin]; - } -#else - mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len); - while (true) { - if (re_list[range_begin - wanna_range] - pgno == wanna_range) - goto done; - if (range_begin == wanna_range) - break; - pgno = re_list[--range_begin]; - } -#endif /* MDBX_PNL sort-order */ + if (F_ISSET(flags, MDBX_COALESCE | MDBX_ALLOC_CACHE)) { + flags -= MDBX_COALESCE; + continue; } /* There is no suitable pages in the GC and to be able to allocate @@ -8512,8 +8819,9 @@ skip_cache: } fail: - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (likely(mp)) { *mp = nullptr; txn->mt_flags |= MDBX_TXN_ERROR; @@ -8525,10 +8833,12 @@ skip_cache: done: if (unlikely(mp == nullptr)) return MDBX_SUCCESS; + if (unlikely(txn->tw.dirtyroom < 1)) + return MDBX_TXN_FULL; mdbx_ensure(env, pgno >= NUM_METAS); if (env->me_flags & MDBX_WRITEMAP) { np = pgno2page(env, pgno); - /* LY: reset no-access flag from mdbx_loose_page() */ + /* LY: reset no-access flag from mdbx_page_loose() */ VALGRIND_MAKE_MEM_UNDEFINED(np, pgno2bytes(env, num)); ASAN_UNPOISON_MEMORY_REGION(np, pgno2bytes(env, num)); } else { @@ -8552,8 +8862,9 @@ done: for (unsigned i = range_begin - num; i < re_len;) re_list[++i] = re_list[++range_begin]; #endif - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); } else { txn->mt_next_pgno = pgno + num; mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno); @@ -8566,14 +8877,18 @@ done: np->mp_pgno = pgno; np->mp_leaf2_ksize = 0; np->mp_flags = 0; - np->mp_pages = num; + if ((mdbx_assert_enabled() || mdbx_audit_enabled()) && num > 1) { + np->mp_pages = num; + np->mp_flags = P_OVERFLOW; + } rc = mdbx_page_dirty(txn, np); if (unlikely(rc != MDBX_SUCCESS)) goto fail; *mp = np; - mdbx_tassert( - txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); return MDBX_SUCCESS; } @@ -8609,44 +8924,46 @@ __hot static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, size_t psize) { * ret is unchanged if mp wasn't spilled. */ static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { - MDBX_env *env = txn->mt_env; - pgno_t pgno = mp->mp_pgno, pn = pgno << 1; - - for (const MDBX_txn *tx2 = txn; tx2; tx2 = tx2->mt_parent) { - if (!tx2->tw.spill_pages) + mdbx_tassert(txn, !IS_DIRTY(mp)); + const pgno_t spilled_pgno = mp->mp_pgno << 1; + const MDBX_txn *scan = txn; + do { + if ((scan->mt_flags & MDBX_TXN_SPILLS) == 0) + break; + if (!scan->tw.spill_pages) continue; - unsigned i = mdbx_pnl_exist(tx2->tw.spill_pages, pn); - if (!i) + const unsigned si = mdbx_pnl_exist(scan->tw.spill_pages, spilled_pgno); + if (!si) continue; - if (txn->tw.dirtyroom == 0) - return MDBX_TXN_FULL; - unsigned num = IS_OVERFLOW(mp) ? mp->mp_pages : 1; + const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; MDBX_page *np = mp; - if ((env->me_flags & MDBX_WRITEMAP) == 0) { - np = mdbx_page_malloc(txn, num); + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { + np = mdbx_page_malloc(txn, npages); if (unlikely(!np)) return MDBX_ENOMEM; - if (unlikely(num > 1)) - memcpy(np, mp, pgno2bytes(env, num)); + if (likely(npages == 1)) + mdbx_page_copy(np, mp, txn->mt_env->me_psize); else - mdbx_page_copy(np, mp, env->me_psize); + memcpy(np, mp, pgno2bytes(txn->mt_env, npages)); } mdbx_debug("unspill page %" PRIaPGNO, mp->mp_pgno); - if (tx2 == txn) { + if (scan == txn) { /* If in current txn, this page is no longer spilled. * If it happens to be the last page, truncate the spill list. * Otherwise mark it as deleted by setting the LSB. */ - txn->tw.spill_pages[i] |= 1; - if (i == MDBX_PNL_SIZE(txn->tw.spill_pages)) - MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1; + mdbx_spill_remove(txn, si, npages); } /* otherwise, if belonging to a parent txn, the * page remains spilled until child commits */ int rc = mdbx_page_dirty(txn, np); - if (likely(rc == MDBX_SUCCESS)) - *ret = np; - return rc; - } + if (unlikely(rc != MDBX_SUCCESS)) { + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(txn->mt_env, np, npages); + return rc; + } + *ret = np; + break; + } while ((scan = scan->mt_parent) != nullptr); return MDBX_SUCCESS; } @@ -8657,14 +8974,31 @@ static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, * * Returns 0 on success, non-zero on failure. */ __hot static int mdbx_page_touch(MDBX_cursor *mc) { - MDBX_page *mp = mc->mc_pg[mc->mc_top], *np; + MDBX_page *const mp = mc->mc_pg[mc->mc_top], *np; MDBX_txn *txn = mc->mc_txn; MDBX_cursor *m2, *m3; pgno_t pgno; int rc; - mdbx_cassert(mc, !IS_OVERFLOW(mp)); - if (!F_ISSET(mp->mp_flags, P_DIRTY)) { + if (mdbx_assert_enabled()) { + if (mc->mc_dbi >= CORE_DBS) { + if (mc->mc_flags & C_SUB) { + MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); + MDBX_cursor_couple *couple = + container_of(mx, MDBX_cursor_couple, inner); + mdbx_cassert(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); + mdbx_cassert(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); + mdbx_cassert(mc, *couple->outer.mc_dbistate & DBI_DIRTY); + } else { + mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY); + } + mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + } + mdbx_cassert(mc, !IS_OVERFLOW(mp)); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + } + + if (!IS_DIRTY(mp)) { if (txn->mt_flags & MDBX_TXN_SPILLS) { np = NULL; rc = mdbx_page_unspill(txn, mp, &np); @@ -8677,12 +9011,12 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, 1)) || (rc = mdbx_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL)))) goto fail; + pgno = np->mp_pgno; mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), mp->mp_pgno, pgno); mdbx_cassert(mc, mp->mp_pgno != pgno); mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); - mdbx_tassert(txn, mdbx_dpl_find(txn->tw.dirtylist, mp->mp_pgno) == nullptr); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; @@ -8710,18 +9044,25 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { } mdbx_debug("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); - mdbx_cassert(mc, txn->tw.dirtylist->length <= MDBX_DPL_TXNFULL); + mdbx_cassert(mc, txn->tw.dirtylist->length <= + MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); /* No - copy it */ np = mdbx_page_malloc(txn, 1); if (unlikely(!np)) { rc = MDBX_ENOMEM; goto fail; } - rc = mdbx_dpl_append(txn->tw.dirtylist, pgno, np); + /* insert a clone of parent's dirty page, so don't touch dirtyroom */ + rc = mdbx_dpl_append(txn, pgno, np); if (unlikely(rc)) { mdbx_dpage_free(txn->mt_env, np, 1); goto fail; } + + np->mp_pgno = pgno; + np->mp_txnid = INVALID_TXNID; + np->mp_flags |= P_DIRTY; + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); } else { return MDBX_SUCCESS; } @@ -8956,13 +9297,11 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) mc->mc_signature = stage /* Promote closed state to parent txn */; else if (merge) { - /* Preserve changes from nested to parent txn */ + /* Restore pointers to parent txn */ mc->mc_next = bk->mc_next; mc->mc_backup = bk->mc_backup; mc->mc_txn = bk->mc_txn; - *bk->mc_db = *mc->mc_db; mc->mc_db = bk->mc_db; - *bk->mc_dbistate = *mc->mc_dbistate; mc->mc_dbistate = bk->mc_dbistate; if (mx) { if (mx != bk->mc_xcursor) { @@ -9283,7 +9622,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { if (likely(r)) { safe64_reset(&r->mr_txnid, false); r->mr_snapshot_pages_used = meta->mm_geo.next; - r->mr_snapshot_pages_retired = meta->mm_pages_retired; + r->mr_snapshot_pages_retired = + unaligned_peek_u64(4, meta->mm_pages_retired); safe64_write(&r->mr_txnid, snap); mdbx_jitter4testing(false); mdbx_assert(env, r->mr_pid == mdbx_getpid()); @@ -9321,7 +9661,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { txn->mt_canary = meta->mm_canary; if (likely(r)) { r->mr_snapshot_pages_used = meta->mm_geo.next; - r->mr_snapshot_pages_retired = meta->mm_pages_retired; + r->mr_snapshot_pages_retired = + unaligned_peek_u64(4, meta->mm_pages_retired); r->mr_txnid.inconsistent = txn->mt_txnid; mdbx_jitter4testing(false); mdbx_assert(env, r->mr_pid == mdbx_getpid()); @@ -9388,8 +9729,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { const txnid_t snap = mdbx_meta_txnid_stable(env, meta); txn->mt_txnid = safe64_txnid_next(snap); if (unlikely(txn->mt_txnid > MAX_TXNID)) { - mdbx_error("%s", "txnid overflow!"); rc = MDBX_TXN_FULL; + mdbx_error("txnid overflow, raise %d", rc); goto bailout; } @@ -9397,11 +9738,14 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { txn->mt_child = NULL; txn->tw.loose_pages = NULL; txn->tw.loose_count = 0; - txn->tw.dirtyroom = MDBX_DPL_TXNFULL; - txn->tw.dirtylist = env->me_dirtylist; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; mdbx_dpl_clear(txn->tw.dirtylist); MDBX_PNL_SIZE(txn->tw.retired_pages) = 0; txn->tw.spill_pages = NULL; + txn->tw.spill_least_removed = 0; txn->tw.last_reclaimed = 0; if (txn->tw.lifo_reclaimed) MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0; @@ -9412,7 +9756,6 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); /* Moved to here to avoid a data race in read TXNs */ txn->mt_geo = meta->mm_geo; - txn->tw.loose_refund_wl = txn->mt_next_pgno; } /* Setup db info */ @@ -9599,6 +9942,16 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (unlikely(rc != MDBX_SUCCESS)) return rc; + if (env->me_options.spill_parent4child_denominator) { + /* Spill dirty-pages of parent to provide dirtyroom for child txn */ + rc = mdbx_txn_spill(parent, nullptr, + parent->tw.dirtylist->length / + env->me_options.spill_parent4child_denominator); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS); /* Child txns save MDBX_pgstate and use own copy of cursors */ size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); @@ -9628,26 +9981,59 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_env = env; if (parent) { - mdbx_tassert(txn, mdbx_dirtylist_check(parent)); + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = parent->mt_dbiseqs; - txn->tw.dirtylist = mdbx_malloc(sizeof(MDBX_DP) * (MDBX_DPL_TXNFULL + 1)); - txn->tw.reclaimed_pglist = - mdbx_pnl_alloc(MDBX_PNL_ALLOCLEN(parent->tw.reclaimed_pglist)); - if (!txn->tw.dirtylist || !txn->tw.reclaimed_pglist) { + rc = mdbx_dpl_alloc(txn); + if (likely(rc == MDBX_SUCCESS)) { + txn->tw.reclaimed_pglist = + mdbx_pnl_alloc(MDBX_PNL_ALLOCLEN(parent->tw.reclaimed_pglist)); + if (unlikely(!txn->tw.reclaimed_pglist)) + rc = MDBX_ENOMEM; + } + if (unlikely(rc != MDBX_SUCCESS)) { + nested_failed: mdbx_pnl_free(txn->tw.reclaimed_pglist); - mdbx_free(txn->tw.dirtylist); + mdbx_dpl_free(txn); mdbx_free(txn); - return MDBX_ENOMEM; + return rc; } - mdbx_dpl_clear(txn->tw.dirtylist); + txn->tw.dirtyroom = parent->tw.dirtyroom; + + /* Move loose pages to reclaimed list */ + if (parent->tw.loose_count) { + do { + MDBX_page *lp = parent->tw.loose_pages; + const unsigned di = mdbx_dpl_exist(parent->tw.dirtylist, lp->mp_pgno); + mdbx_tassert(parent, di && parent->tw.dirtylist->items[di].ptr == lp); + mdbx_tassert(parent, lp->mp_flags == (P_LOOSE | P_DIRTY)); + rc = + mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto nested_failed; + parent->tw.loose_pages = lp->mp_next; + /* Remove from dirty list */ + mdbx_page_wash(parent, di, lp, 1); + } while (parent->tw.loose_pages); + parent->tw.loose_count = 0; +#if MDBX_ENABLE_REFUND + parent->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + } + + mdbx_dpl_sort(parent->tw.dirtylist); + if (parent->tw.spill_pages) + mdbx_spill_purge(parent); + memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); mdbx_assert(env, mdbx_pnl_check4assert( txn->tw.reclaimed_pglist, (txn->mt_next_pgno /* LY: intentional assignment here, only for assertion */ - = parent->mt_next_pgno))); + = parent->mt_next_pgno) - + MDBX_ENABLE_REFUND)); txn->tw.last_reclaimed = parent->tw.last_reclaimed; if (parent->tw.lifo_reclaimed) { @@ -9661,9 +10047,10 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.retired_pages); txn->mt_txnid = parent->mt_txnid; - txn->tw.dirtyroom = parent->tw.dirtyroom; txn->mt_geo = parent->mt_geo; - txn->tw.loose_refund_wl = parent->tw.loose_refund_wl; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ txn->mt_canary = parent->mt_canary; parent->mt_flags |= MDBX_TXN_HAS_CHILD; parent->mt_child = txn; @@ -9678,7 +10065,8 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, mdbx_tassert(parent, parent->mt_parent || parent->tw.dirtyroom + parent->tw.dirtylist->length == - MDBX_DPL_TXNFULL); + env->me_options.dp_limit); + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); env->me_txn = txn; rc = mdbx_cursor_shadow(parent, txn); if (unlikely(rc != MDBX_SUCCESS)) @@ -9699,10 +10087,12 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, mdbx_assert(env, (txn->mt_flags & ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); - else - mdbx_assert(env, - (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | - MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC)) == 0); + else { + mdbx_assert(env, (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | + MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | + MDBX_TXN_SPILLS)) == 0); + assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed); + } txn->mt_signature = MDBX_MT_SIGNATURE; txn->mt_userctx = context; *ret = txn; @@ -9743,7 +10133,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { /* fetch info from volatile head */ head_meta = mdbx_meta_head(env); head_txnid = mdbx_meta_txnid_fluid(env, head_meta); - head_retired = head_meta->mm_pages_retired; + head_retired = unaligned_peek_u64(4, head_meta->mm_pages_retired); info->txn_space_limit_soft = pgno2bytes(env, head_meta->mm_geo.now); info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper); info->txn_space_leftover = @@ -9802,7 +10192,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { : MDBX_PNL_SIZE(txn->tw.retired_pages)); info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); info->txn_space_dirty = - pgno2bytes(env, MDBX_DPL_TXNFULL - txn->tw.dirtyroom); + pgno2bytes(env, txn->mt_env->me_options.dp_limit - txn->tw.dirtyroom); info->txn_reader_lag = INT64_MAX; MDBX_lockinfo *const lck = env->me_lck; if (scan_rlt && lck) { @@ -9849,13 +10239,50 @@ int mdbx_txn_flags(const MDBX_txn *txn) { return txn->mt_flags; } +/* Check for misused dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +static void dbi_import_locked(MDBX_txn *txn) { + MDBX_env *const env = txn->mt_env; + const unsigned n = env->me_numdbs; + for (unsigned i = CORE_DBS; i < n; ++i) { + if (i >= txn->mt_numdbs) { + txn->mt_dbistate[i] = 0; + if (!(txn->mt_flags & MDBX_TXN_RDONLY)) + txn->tw.cursors[i] = NULL; + } + if ((env->me_dbflags[i] & DB_VALID) && + !(txn->mt_dbistate[i] & DBI_USRVALID)) { + txn->mt_dbiseqs[i] = env->me_dbiseqs[i]; + txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; + txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; + mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); + } + } + txn->mt_numdbs = n; +} + +/* Import DBI which opened after txn started into context */ +static __cold bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) { + if (dbi < CORE_DBS || dbi >= txn->mt_env->me_numdbs) + return false; + + mdbx_ensure(txn->mt_env, mdbx_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == + MDBX_SUCCESS); + dbi_import_locked(txn); + mdbx_ensure(txn->mt_env, mdbx_fastmutex_release(&txn->mt_env->me_dbi_lock) == + MDBX_SUCCESS); + return txn->mt_dbistate[dbi] & DBI_USRVALID; +} + /* Export or close DBI handles opened in this txn. */ -static void mdbx_dbis_update(MDBX_txn *txn, int keep) { +static void dbi_update(MDBX_txn *txn, int keep) { mdbx_tassert(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); MDBX_dbi n = txn->mt_numdbs; if (n) { bool locked = false; - MDBX_env *env = txn->mt_env; + MDBX_env *const env = txn->mt_env; for (unsigned i = n; --i >= CORE_DBS;) { if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0)) @@ -9865,11 +10292,10 @@ static void mdbx_dbis_update(MDBX_txn *txn, int keep) { mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); locked = true; } + if (env->me_numdbs <= i || txn->mt_dbiseqs[i] != env->me_dbiseqs[i]) + continue /* dbi explicitly closed and/or then re-opened by other txn */; if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; - mdbx_compiler_barrier(); - if (env->me_numdbs <= i) - env->me_numdbs = i + 1; } else { char *ptr = env->me_dbxs[i].md_name.iov_base; if (ptr) { @@ -9883,17 +10309,95 @@ static void mdbx_dbis_update(MDBX_txn *txn, int keep) { } } + n = env->me_numdbs; + if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) { + if (!locked) { + mdbx_ensure(env, + mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); + locked = true; + } + + n = env->me_numdbs; + while (n > CORE_DBS && !(env->me_dbflags[n - 1] & DB_VALID)) + --n; + env->me_numdbs = n; + } + if (unlikely(locked)) mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } } +/* Filter-out pgno list from transaction's dirty-page list */ +static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, + const bool spilled) { + if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) { + mdbx_tassert(txn, mdbx_pnl_check4assert(pl, txn->mt_next_pgno << spilled)); + MDBX_dpl *dl = mdbx_dpl_sort(txn->tw.dirtylist); + + /* Scanning in ascend order */ + const int step = MDBX_PNL_ASCENDING ? 1 : -1; + const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl); + const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(pl) + 1 : 0; + mdbx_tassert(txn, pl[begin] <= pl[end - step]); + + unsigned r = mdbx_dpl_search(dl, pl[begin] >> spilled); + mdbx_tassert(txn, dl->sorted == dl->length); + for (int i = begin; r <= dl->length;) { /* scan loop */ + assert(i != end); + mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + pgno_t pl_pgno = pl[i] >> spilled; + pgno_t dp_pgno = dl->items[r].pgno; + if (likely(dp_pgno != pl_pgno)) { + const bool cmp = dp_pgno < pl_pgno; + r += cmp; + i += cmp ? 0 : step; + if (likely(i != end)) + continue; + return; + } + + /* update loop */ + unsigned w = r; + remove_dl: + if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) { + MDBX_page *dp = dl->items[r].ptr; + mdbx_dpage_free(txn->mt_env, dp, IS_OVERFLOW(dp) ? dp->mp_pages : 1); + } + ++r; + next_i: + i += step; + if (unlikely(i == end)) { + while (r <= dl->length) + dl->items[w++] = dl->items[r++]; + } else { + while (r <= dl->length) { + assert(i != end); + mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + pl_pgno = pl[i] >> spilled; + dp_pgno = dl->items[r].pgno; + if (dp_pgno < pl_pgno) + dl->items[w++] = dl->items[r++]; + else if (dp_pgno > pl_pgno) + goto next_i; + else + goto remove_dl; + } + } + dl->sorted = dl->length = w - 1; + txn->tw.dirtyroom += r - w; + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); + return; + } + } +} + /* End a transaction, except successful commit of a nested transaction. * May be called twice for readonly txns: First reset it, then abort. * [in] txn the transaction handle to end * [in] mode why and how to end the transaction */ -static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { +static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { MDBX_env *env = txn->mt_env; static const char *const names[] = MDBX_END_NAMES; @@ -9955,18 +10459,20 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { #endif if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ mdbx_cursors_eot(txn, false); - if (!(env->me_flags & MDBX_WRITEMAP)) - mdbx_dlist_free(txn); txn->mt_flags = MDBX_TXN_FINISHED; txn->mt_owner = 0; env->me_txn = txn->mt_parent; + mdbx_pnl_free(txn->tw.spill_pages); + txn->tw.spill_pages = nullptr; if (txn == env->me_txn0) { mdbx_assert(env, txn->mt_parent == NULL); /* Export or close DBI handles created in this txn */ - mdbx_dbis_update(txn, mode & MDBX_END_UPDATE); + dbi_update(txn, mode & MDBX_END_UPDATE); mdbx_pnl_shrink(&txn->tw.retired_pages); mdbx_pnl_shrink(&txn->tw.reclaimed_pglist); + if (!(env->me_flags & MDBX_WRITEMAP)) + mdbx_dlist_free(txn); /* The writer mutex was locked in mdbx_txn_begin. */ mdbx_txn_unlock(env); } else { @@ -9975,10 +10481,9 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); mdbx_assert(env, parent->mt_child == txn && (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - mdbx_assert(env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); - mdbx_pnl_free(txn->tw.reclaimed_pglist); - mdbx_pnl_free(txn->tw.spill_pages); + mdbx_assert( + env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (txn->tw.lifo_reclaimed) { mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= @@ -9996,9 +10501,14 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { parent->tw.retired_pages = txn->tw.retired_pages; } - mdbx_free(txn->tw.dirtylist); - parent->mt_child = NULL; + parent->mt_child = nullptr; parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + if (!(env->me_flags & MDBX_WRITEMAP)) + mdbx_dlist_free(txn); + mdbx_dpl_free(txn); + mdbx_pnl_free(txn->tw.reclaimed_pglist); if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { @@ -10080,6 +10590,7 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (txn->mt_child) mdbx_txn_abort(txn->mt_child); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); } @@ -10091,10 +10602,7 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, pgno_t pending = 0; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + - (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored) + - txn->tw.retired2parent_count; - for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) - pending += parent->tw.loose_count; + (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored); } MDBX_cursor_couple cx; @@ -10158,10 +10666,8 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base, node_ks(node)) == 0) { txn->mt_dbistate[k] |= DBI_AUDITED; - if (txn->mt_dbistate[k] & DBI_DIRTY) { - mdbx_tassert(txn, (txn->mt_dbistate[k] & DBI_STALE) == 0); + if (!(txn->mt_dbistate[k] & MDBX_DBI_STALE)) db = txn->mt_dbs + k; - } break; } } @@ -10179,10 +10685,14 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) != DBI_VALID) continue; - if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) { - count += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + - txn->mt_dbs[i].md_overflow_pages; - } else { + for (MDBX_txn *t = txn; t; t = t->mt_parent) + if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) { + count += t->mt_dbs[i].md_branch_pages + t->mt_dbs[i].md_leaf_pages + + t->mt_dbs[i].md_overflow_pages; + txn->mt_dbistate[i] |= DBI_AUDITED; + break; + } + if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { mdbx_warning("audit %s@%" PRIaTXN ": unable account dbi %d / \"%*s\", state 0x%02x", txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, @@ -10197,12 +10707,11 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) mdbx_error("audit @%" PRIaTXN ": %u(pending) = %u(loose-count) + " - "%u(reclaimed-list) + %u(retired-pending) - %u(retired-stored) " - "+ %u(retired2parent)", + "%u(reclaimed-list) + %u(retired-pending) - %u(retired-stored)", txn->mt_txnid, pending, txn->tw.loose_count, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, - retired_stored, txn->tw.retired2parent_count); + retired_stored); mdbx_error("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO "(free) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO "(next-pgno)", @@ -10289,11 +10798,12 @@ retry: ++loop; retry_noaccount: mdbx_trace("%s", " >> restart"); - mdbx_tassert( - txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, mdbx_dirtylist_check(txn)); mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); + txn->mt_env->me_options.dp_limit); if (unlikely(/* paranoia */ loop > ((MDBX_DEBUG > 0) ? 9 : 99))) { mdbx_error("too more loops %u, bailout", loop); rc = MDBX_PROBLEM; @@ -10313,8 +10823,9 @@ retry_noaccount: MDBX_val key, data; mdbx_trace("%s", " >> continue"); - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (lifo) { if (cleaned_gc_slot < (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) @@ -10385,11 +10896,12 @@ retry_noaccount: } } - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, mdbx_dirtylist_check(txn)); mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); + txn->mt_env->me_options.dp_limit); if (mdbx_audit_enabled()) { rc = mdbx_audit_ex(txn, retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) @@ -10398,8 +10910,9 @@ retry_noaccount: /* return suitable into unallocated space */ if (mdbx_refund(txn)) { - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + mdbx_tassert( + txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (mdbx_audit_enabled()) { rc = mdbx_audit_ex(txn, retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) @@ -10447,16 +10960,17 @@ retry_noaccount: } /* filter-out list of dirty-pages from loose-pages */ - const MDBX_DPL dl = txn->tw.dirtylist; + MDBX_dpl *const dl = txn->tw.dirtylist; unsigned w = 0; for (unsigned r = w; ++r <= dl->length;) { - MDBX_page *dp = dl[r].ptr; + MDBX_page *dp = dl->items[r].ptr; mdbx_tassert(txn, (dp->mp_flags & P_DIRTY)); - mdbx_tassert(txn, dl[r].pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1) <= - txn->mt_next_pgno); + mdbx_tassert(txn, + dl->items[r].pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1) <= + txn->mt_next_pgno); if ((dp->mp_flags & P_LOOSE) == 0) { if (++w != r) - dl[w] = dl[r]; + dl->items[w] = dl->items[r]; } else { mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY)); if ((env->me_flags & MDBX_WRITEMAP) == 0) @@ -10469,8 +10983,12 @@ retry_noaccount: dl->length = w; dl->sorted = 0; txn->tw.dirtyroom += txn->tw.loose_count; + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); txn->tw.loose_pages = NULL; txn->tw.loose_count = 0; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ } const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); @@ -10512,7 +11030,7 @@ retry_noaccount: txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); - mdbx_debug_extra_print("%s", "\n"); + mdbx_debug_extra_print("%s\n", "."); } if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { mdbx_trace("%s.reclaimed-list changed %u -> %u, retry", dbg_prefix_mode, @@ -10525,8 +11043,9 @@ retry_noaccount: } /* handle reclaimed and lost pages - merge and store both into gc */ - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, txn->tw.loose_count == 0); mdbx_trace("%s", " >> reserving"); @@ -10759,8 +11278,9 @@ retry_noaccount: mdbx_prep_backlog(txn, &couple.outer, data.iov_len); rc = mdbx_cursor_put(&couple.outer, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -10791,8 +11311,9 @@ retry_noaccount: ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot : reused_gc_slot; rc = MDBX_SUCCESS; - mdbx_tassert( - txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); mdbx_tassert(txn, mdbx_dirtylist_check(txn)); if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) { MDBX_val key, data; @@ -10885,6 +11406,13 @@ retry_noaccount: mdbx_notice("%s", "** restart: reclaimed-slots changed"); goto retry; } + if (unlikely(retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages))) { + mdbx_tassert(txn, + retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); + mdbx_notice("** restart: retired-list growth (%u -> %u)", + retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); + goto retry; + } pgno_t *dst = data.iov_base; *dst++ = chunk; @@ -10956,9 +11484,11 @@ static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov, rc = mdbx_pwritev(env->me_lazy_fd, iov, iov_items, iov_off, iov_bytes); } - if (unlikely(rc != MDBX_SUCCESS)) { + if (unlikely(rc != MDBX_SUCCESS)) mdbx_error("Write error: %s", mdbx_strerror(rc)); - txn->mt_flags |= MDBX_TXN_ERROR; + else { + VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + iov_off, iov_bytes); + ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + iov_off, iov_bytes); } for (unsigned i = 0; i < iov_items; i++) @@ -10973,30 +11503,30 @@ static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov, * Returns 0 on success, non-zero on failure. */ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { struct iovec iov[MDBX_COMMIT_PAGES]; - const MDBX_DPL dl = (keep || txn->tw.loose_count > 1) - ? mdbx_dpl_sort(txn->tw.dirtylist) - : txn->tw.dirtylist; + MDBX_dpl *const dl = mdbx_dpl_sort(txn->tw.dirtylist); MDBX_env *const env = txn->mt_env; pgno_t flush_begin = MAX_PAGENO; pgno_t flush_end = MIN_PAGENO; + int rc = MDBX_SUCCESS; unsigned iov_items = 0; size_t iov_bytes = 0; size_t iov_off = 0; + unsigned r, w; for (r = w = keep; ++r <= dl->length;) { - MDBX_page *dp = dl[r].ptr; + MDBX_page *dp = dl->items[r].ptr; mdbx_tassert(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); mdbx_tassert(txn, dp->mp_flags & P_DIRTY); /* Don't flush this page yet */ if (dp->mp_flags & P_KEEP) { - dp->mp_flags &= ~P_KEEP; - dl[++w] = dl[r]; + dp->mp_flags -= P_KEEP; + dl->items[++w] = dl->items[r]; continue; } if (dp->mp_flags & P_LOOSE) { - dl[++w] = dl[r]; + dl->items[++w] = dl->items[r]; continue; } @@ -11005,7 +11535,7 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { flush_end = (flush_end > dp->mp_pgno + npages) ? flush_end : dp->mp_pgno + npages; *env->me_unsynced_pages += npages; - dp->mp_flags &= ~P_DIRTY; + dp->mp_flags -= P_DIRTY; dp->mp_txnid = pp_txnid2chk(txn); if ((env->me_flags & MDBX_WRITEMAP) == 0) { @@ -11013,9 +11543,7 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { if (iov_off + iov_bytes != pgno2bytes(env, dp->mp_pgno) || iov_items == ARRAY_LENGTH(iov) || iov_bytes + size > MAX_WRITE) { if (iov_items) { - int rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes); #if defined(__linux__) || defined(__gnu_linux__) if (mdbx_linux_kernel_version >= 0x02060b00) /* Linux kernels older than version 2.6.11 ignore the addr and nbytes @@ -11026,6 +11554,12 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { env->me_os_psize); iov_items = 0; iov_bytes = 0; + if (unlikely(rc != MDBX_SUCCESS)) { + do + dl->items[++w] = dl->items[r]; + while (++r <= dl->length); + break; + } } iov_off = pgno2bytes(env, dp->mp_pgno); } @@ -11036,10 +11570,20 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { } } - if (iov_items) { - int rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + mdbx_tassert(txn, dl->sorted == dl->length && r == dl->length + 1); + txn->tw.dirtyroom += dl->length - w; + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); + dl->sorted = dl->length = w; + mdbx_tassert(txn, txn->mt_parent || + txn->tw.dirtyroom + txn->tw.dirtylist->length == + txn->mt_env->me_options.dp_limit); + + if (iov_items) + rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes); + + if (unlikely(rc != MDBX_SUCCESS)) { + txn->mt_flags |= MDBX_TXN_ERROR; + return rc; } #if defined(__linux__) || defined(__gnu_linux__) @@ -11056,55 +11600,358 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { /* TODO: use flush_begin & flush_end for msync() & sync_file_range(). */ (void)flush_begin; (void)flush_end; - - txn->tw.dirtyroom += r - 1 - w; - dl->length = w; - mdbx_tassert(txn, txn->mt_parent || - txn->tw.dirtyroom + txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); return MDBX_SUCCESS; } -/* Check for misused dbi handles */ -#define TXN_DBI_CHANGED(txn, dbi) \ - ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) - -/* Import DBI which opened after txn started into context */ -static __cold bool mdbx_txn_import_dbi(MDBX_txn *txn, MDBX_dbi dbi) { - MDBX_env *env = txn->mt_env; - if (dbi < CORE_DBS || dbi >= env->me_numdbs) - return false; - - mdbx_ensure(env, mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); - const unsigned snap_numdbs = env->me_numdbs; - mdbx_compiler_barrier(); - for (unsigned i = CORE_DBS; i < snap_numdbs; ++i) { - if (i >= txn->mt_numdbs) - txn->mt_dbistate[i] = 0; - if (!(txn->mt_dbistate[i] & DBI_USRVALID) && - (env->me_dbflags[i] & DB_VALID)) { - txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; - txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; - mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); - } - } - txn->mt_numdbs = snap_numdbs; - - mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - return txn->mt_dbistate[dbi] & DBI_USRVALID; -} - /* Check txn and dbi arguments to a function */ static __always_inline bool mdbx_txn_dbi_exists(MDBX_txn *txn, MDBX_dbi dbi, unsigned validity) { if (likely(dbi < txn->mt_numdbs && (txn->mt_dbistate[dbi] & validity))) return true; - return mdbx_txn_import_dbi(txn, dbi); + return dbi_import(txn, dbi); } int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } +/* Merge child txn into parent */ +static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, + const unsigned parent_retired_len) { + MDBX_dpl *const src = mdbx_dpl_sort(txn->tw.dirtylist); + + /* Remove refunded pages from parent's dirty list */ + MDBX_dpl *const dst = mdbx_dpl_sort(parent->tw.dirtylist); + while (MDBX_ENABLE_REFUND && dst->length && + dst->items[dst->length].pgno >= parent->mt_next_pgno) { + if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) { + MDBX_page *dp = dst->items[dst->length].ptr; + mdbx_dpage_free(txn->mt_env, dp, IS_OVERFLOW(dp) ? dp->mp_pages : 1); + } + dst->length -= 1; + } + parent->tw.dirtyroom += dst->sorted - dst->length; + assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); + dst->sorted = dst->length; + mdbx_tassert(parent, + parent->mt_parent || + parent->tw.dirtyroom + parent->tw.dirtylist->length == + parent->mt_env->me_options.dp_limit); + + /* Remove reclaimed pages from parent's dirty list */ + const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist; + mdbx_dpl_sift(parent, reclaimed_list, false); + + /* Move retired pages from parent's dirty & spilled list to reclaimed */ + unsigned r, w, d, s, l; + for (r = w = parent_retired_len; + ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) { + const pgno_t pgno = parent->tw.retired_pages[r]; + const unsigned di = mdbx_dpl_exist(parent->tw.dirtylist, pgno); + const unsigned si = (!di && unlikely(parent->tw.spill_pages)) + ? mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1) + : 0; + unsigned npages; + const char *kind; + if (di) { + MDBX_page *dp = parent->tw.dirtylist->items[di].ptr; + mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | + P_OVERFLOW | P_DIRTY)) == 0); + npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1; + mdbx_page_wash(parent, di, dp, npages); + kind = "dirty"; + l = 1; + if (unlikely(npages > l)) { + /* OVERFLOW-страница могла быть переиспользована по частям. Тогда + * в retired-списке может быть только начало последовательности, + * а остаток растащен по dirty, spilled и reclaimed спискам. Поэтому + * переносим в reclaimed с проверкой на обрыв последовательности. + * В любом случае, все осколки будут учтены и отфильтрованы, т.е. если + * страница была разбита на части, то важно удалить dirty-элемент, + * а все осколки будут учтены отдельно. */ + + /* Список retired страниц не сортирован, но для ускорения сортировки + * дополняется в соответствии с MDBX_PNL_ASCENDING */ +#if MDBX_PNL_ASCENDING + const unsigned len = MDBX_PNL_SIZE(parent->tw.retired_pages); + while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) { + ++r; + if (++l == npages) + break; + } +#else + while (w > parent_retired_len && + parent->tw.retired_pages[w - 1] == pgno + l) { + --w; + if (++l == npages) + break; + } +#endif + } + } else if (unlikely(si)) { + l = npages = 1; + mdbx_spill_remove(parent, si, 1); + kind = "spilled"; + } else { + parent->tw.retired_pages[++w] = pgno; + continue; + } + + mdbx_debug("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, + kind, pgno); + int err = mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); + mdbx_ensure(txn->mt_env, err == MDBX_SUCCESS); + } + MDBX_PNL_SIZE(parent->tw.retired_pages) = w; + + /* Filter-out parent spill list */ + if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) { + const MDBX_PNL sl = mdbx_spill_purge(parent); + unsigned len = MDBX_PNL_SIZE(sl); + if (len) { + /* Remove refunded pages from parent's spill list */ + if (MDBX_ENABLE_REFUND && + MDBX_PNL_MOST(sl) >= (parent->mt_next_pgno << 1)) { +#if MDBX_PNL_ASCENDING + unsigned i = MDBX_PNL_SIZE(sl); + assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl)); + do { + if ((sl[i] & 1) == 0) + mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + i -= 1; + } while (i && sl[i] >= (parent->mt_next_pgno << 1)); + MDBX_PNL_SIZE(sl) = i; +#else + assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl)); + unsigned i = 0; + do { + ++i; + if ((sl[i] & 1) == 0) + mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1)); + MDBX_PNL_SIZE(sl) = len -= i; + memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); +#endif + } + mdbx_tassert(txn, mdbx_pnl_check4assert(sl, parent->mt_next_pgno << 1)); + + /* Remove reclaimed pages from parent's spill list */ + s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list); + /* Scanning from end to begin */ + while (s && r) { + if (sl[s] & 1) { + --s; + continue; + } + const pgno_t spilled_pgno = sl[s] >> 1; + const pgno_t reclaimed_pgno = reclaimed_list[r]; + if (reclaimed_pgno != spilled_pgno) { + const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno); + s -= !cmp; + r -= cmp; + } else { + mdbx_debug("remove reclaimed parent's spilled page %" PRIaPGNO, + reclaimed_pgno); + mdbx_spill_remove(parent, s, 1); + --s; + --r; + } + } + + /* Remove anything in our dirty list from parent's spill list */ + /* Scanning spill list in descend order */ + const int step = MDBX_PNL_ASCENDING ? -1 : 1; + s = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sl) : 1; + d = src->length; + while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_SIZE(sl))) { + if (sl[s] & 1) { + s += step; + continue; + } + const pgno_t spilled_pgno = sl[s] >> 1; + const pgno_t dirty_pgno_form = src->items[d].pgno; + MDBX_page *dp = src->items[d].ptr; + const unsigned npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1; + const pgno_t dirty_pgno_to = dirty_pgno_form + npages; + if (dirty_pgno_form > spilled_pgno) { + --d; + continue; + } + if (dirty_pgno_to <= spilled_pgno) { + s += step; + continue; + } + + mdbx_debug("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, + dirty_pgno_form); + mdbx_spill_remove(parent, s, 1); + s += step; + } + + /* Squash deleted pagenums if we deleted any */ + mdbx_spill_purge(parent); + } + } + + /* Remove anything in our spill list from parent's dirty list */ + if (txn->tw.spill_pages) { + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.spill_pages, + parent->mt_next_pgno << 1)); + mdbx_dpl_sift(parent, txn->tw.spill_pages, true); + mdbx_tassert(parent, + parent->mt_parent || + parent->tw.dirtyroom + parent->tw.dirtylist->length == + parent->mt_env->me_options.dp_limit); + } + + /* Find length of merging our dirty list with parent's and release + * filter-out pages */ + for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) { + const MDBX_page *sp = src->items[s].ptr; + mdbx_tassert(parent, + (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_DIRTY | P_LOOSE)) == 0); + const unsigned s_npages = IS_OVERFLOW(sp) ? sp->mp_pages : 1; + const pgno_t s_pgno = src->items[s].pgno; + + MDBX_page *dp = dst->items[d].ptr; + mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | + P_OVERFLOW | P_DIRTY)) == 0); + const unsigned d_npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1; + const pgno_t d_pgno = dst->items[d].pgno; + + if (d_pgno >= s_pgno + s_npages) { + --d; + ++l; + } else if (d_pgno + d_npages <= s_pgno) { + --s; + ++l; + } else { + dst->items[d--].ptr = nullptr; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(txn->mt_env, dp, d_npages); + } + } + assert(dst->sorted == dst->length); + mdbx_tassert(parent, dst->allocated >= l + d + s); + dst->sorted = l + d + s; /* the merged length */ + + /* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */ + if (dst->sorted >= dst->length) { + /* from end to begin with dst extending */ + for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) { + if (unlikely(l <= d)) { + /* squash to get a gap of free space for merge */ + for (r = w = 1; r <= d; ++r) + if (dst->items[r].ptr) { + if (w != r) { + dst->items[w] = dst->items[r]; + dst->items[r].ptr = nullptr; + } + ++w; + } + mdbx_notice("squash to begin for extending-merge %u -> %u", d, w - 1); + d = w - 1; + continue; + } + assert(l > d); + if (dst->items[d].ptr) { + dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno) + ? dst->items[d--] + : src->items[s--]; + } else + --d; + } + if (s > 0) { + assert(l == s); + while (d > 0) { + assert(dst->items[d].ptr == nullptr); + --d; + } + do { + assert(l > 0); + dst->items[l--] = src->items[s--]; + } while (s > 0); + } else { + assert(l == d); + while (l > 0) { + assert(dst->items[l].ptr != nullptr); + --l; + } + } + } else { + /* from begin to end with dst shrinking (a lot of new overflow pages) */ + for (l = s = d = 1; s <= src->length && d <= dst->length;) { + if (unlikely(l >= d)) { + /* squash to get a gap of free space for merge */ + for (r = w = dst->length; r >= d; --r) + if (dst->items[r].ptr) { + if (w != r) { + dst->items[w] = dst->items[r]; + dst->items[r].ptr = nullptr; + } + --w; + } + mdbx_notice("squash to end for shrinking-merge %u -> %u", d, w + 1); + d = w + 1; + continue; + } + assert(l < d); + if (dst->items[d].ptr) { + dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno) + ? dst->items[d++] + : src->items[s++]; + } else + ++d; + } + if (s <= src->length) { + assert(dst->sorted - l == src->length - s); + while (d <= dst->length) { + assert(dst->items[d].ptr == nullptr); + --d; + } + do { + assert(l <= dst->sorted); + dst->items[l++] = src->items[s++]; + } while (s <= src->length); + } else { + assert(dst->sorted - l == dst->length - d); + while (l <= dst->sorted) { + assert(l <= d && d <= dst->length && dst->items[d].ptr); + dst->items[l++] = dst->items[d++]; + } + } + } + parent->tw.dirtyroom -= dst->sorted - dst->length; + assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); + dst->length = dst->sorted; + mdbx_tassert(parent, + parent->mt_parent || + parent->tw.dirtyroom + parent->tw.dirtylist->length == + parent->mt_env->me_options.dp_limit); + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + mdbx_dpl_free(txn); + + if (txn->tw.spill_pages) { + if (parent->tw.spill_pages) { + /* Must not fail since space was preserved above. */ + mdbx_pnl_xmerge(parent->tw.spill_pages, txn->tw.spill_pages); + mdbx_pnl_free(txn->tw.spill_pages); + } else { + parent->tw.spill_pages = txn->tw.spill_pages; + parent->tw.spill_least_removed = txn->tw.spill_least_removed; + } + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + } + + parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; + if (parent->tw.spill_pages) { + assert(mdbx_pnl_check4assert(parent->tw.spill_pages, + parent->mt_next_pgno << 1)); + if (MDBX_PNL_SIZE(parent->tw.spill_pages)) + parent->mt_flags |= MDBX_TXN_SPILLS; + } +} + int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); @@ -11132,7 +11979,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* mdbx_txn_end() mode for a commit which writes nothing */ unsigned end_mode = - MDBX_END_EMPTY_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; + MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) goto done; @@ -11150,6 +11997,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { } if (txn->mt_parent) { + mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0); mdbx_assert(env, txn != env->me_txn0); MDBX_txn *const parent = txn->mt_parent; mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); @@ -11157,8 +12005,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); mdbx_assert(env, mdbx_dirtylist_check(txn)); - if (txn->tw.dirtylist->length == 0 && - (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0 && + if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) && parent->mt_numdbs == txn->mt_numdbs) { for (int i = txn->mt_numdbs; --i >= 0;) { mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); @@ -11172,20 +12019,47 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { sizeof(parent->mt_geo)) == 0); mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary, sizeof(parent->mt_canary)) == 0); + mdbx_tassert(txn, !txn->tw.spill_pages || + MDBX_PNL_SIZE(txn->tw.spill_pages) == 0); + mdbx_tassert(txn, txn->tw.loose_count == 0); - end_mode = MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE; + /* fast completion of pure nested transaction */ + end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE; goto done; } /* Preserve space for spill list to avoid parent's state corruption * if allocation fails. */ - if (txn->tw.spill_pages && parent->tw.spill_pages) { - rc = mdbx_pnl_need(&parent->tw.spill_pages, - MDBX_PNL_SIZE(txn->tw.spill_pages)); + const unsigned parent_retired_len = + (unsigned)(uintptr_t)parent->tw.retired_pages; + mdbx_tassert(txn, + parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages)); + const unsigned retired_delta = + MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len; + if (retired_delta) { + rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, retired_delta); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } + if (txn->tw.spill_pages) { + if (parent->tw.spill_pages) { + rc = mdbx_pnl_need(&parent->tw.spill_pages, + MDBX_PNL_SIZE(txn->tw.spill_pages)); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + mdbx_spill_purge(txn); + } + + if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > + parent->tw.dirtylist->allocated && + !mdbx_dpl_reserve(parent, txn->tw.dirtylist->length + + parent->tw.dirtylist->length))) { + rc = MDBX_ENOMEM; + goto fail; + } + //------------------------------------------------------------------------- parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; @@ -11203,234 +12077,64 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->mt_canary = txn->mt_canary; parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY; + /* Move loose pages to parent */ +#if MDBX_ENABLE_REFUND + parent->tw.loose_refund_wl = txn->tw.loose_refund_wl; +#endif /* MDBX_ENABLE_REFUND */ + parent->tw.loose_count = txn->tw.loose_count; + parent->tw.loose_pages = txn->tw.loose_pages; + /* Merge our cursors into parent's and close them */ mdbx_cursors_eot(txn, true); end_mode |= MDBX_END_EOTDONE; - /* Update parent's DB table. */ + /* Update parent's DBs array */ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); parent->mt_numdbs = txn->mt_numdbs; parent->mt_dbistate[FREE_DBI] = txn->mt_dbistate[FREE_DBI]; parent->mt_dbistate[MAIN_DBI] = txn->mt_dbistate[MAIN_DBI]; for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { - /* preserve parent's DB_NEW status */ - parent->mt_dbistate[i] = txn->mt_dbistate[i] | (parent->mt_dbistate[i] & - (DBI_CREAT | DBI_FRESH)); - } - ts_1 = latency ? mdbx_osal_monotime() : 0; - - /* Remove refunded pages from parent's dirty & spill lists */ - MDBX_DPL dst = mdbx_dpl_sort(parent->tw.dirtylist); - while (dst->length && dst[dst->length].pgno >= parent->mt_next_pgno) { - MDBX_page *mp = dst[dst->length].ptr; - if (mp && (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(txn->mt_env, mp, IS_OVERFLOW(mp) ? mp->mp_pages : 1); - dst->length -= 1; - } - parent->tw.dirtyroom += dst->sorted - dst->length; - dst->sorted = dst->length; - mdbx_tassert(parent, - parent->mt_parent || - parent->tw.dirtyroom + parent->tw.dirtylist->length == - MDBX_DPL_TXNFULL); - - if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0 && - MDBX_PNL_MOST(parent->tw.spill_pages) >= parent->mt_next_pgno << 1) { - const MDBX_PNL ps = parent->tw.spill_pages; -#if MDBX_PNL_ASCENDING - unsigned i = MDBX_PNL_SIZE(ps); - assert(MDBX_PNL_MOST(ps) == MDBX_PNL_LAST(ps)); - do - i -= 1; - while (i && ps[i] >= parent->mt_next_pgno << 1); - MDBX_PNL_SIZE(ps) = i; -#else - assert(MDBX_PNL_MOST(ps) == MDBX_PNL_FIRST(ps)); - unsigned i = 1, len = MDBX_PNL_SIZE(ps); - while (i < len && ps[i + 1] >= parent->mt_next_pgno << 1) - ++i; - MDBX_PNL_SIZE(ps) = len -= i; - for (unsigned k = 1; k <= len; ++k) - ps[k] = ps[k + i]; -#endif - } - - /* Remove anything in our dirty list from parent's spill list */ - MDBX_DPL src = mdbx_dpl_sort(txn->tw.dirtylist); - if (likely(src->length > 0) && parent->tw.spill_pages && - MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) { - MDBX_PNL sp = parent->tw.spill_pages; - assert(mdbx_pnl_check4assert(sp, txn->mt_next_pgno)); - - const unsigned len = MDBX_PNL_SIZE(parent->tw.spill_pages); - MDBX_PNL_SIZE(sp) = ~(pgno_t)0; - - /* Mark our dirty pages as deleted in parent spill list */ - unsigned r, w, i = 1; - w = r = len; - do { - pgno_t pn = src[i].pgno << 1; - while (pn > sp[r]) - r--; - if (pn == sp[r]) { - sp[r] = 1; - w = --r; - } - } while (++i <= src->length); - - /* Squash deleted pagenums if we deleted any */ - for (r = w; ++r <= len;) - if ((sp[r] & 1) == 0) - sp[++w] = sp[r]; - MDBX_PNL_SIZE(sp) = w; - assert(mdbx_pnl_check4assert(sp, txn->mt_next_pgno << 1)); - } - - /* Remove anything in our spill list from parent's dirty list */ - if (txn->tw.spill_pages && MDBX_PNL_SIZE(txn->tw.spill_pages) > 0) { - const MDBX_PNL sp = txn->tw.spill_pages; - mdbx_pnl_sort(sp); - /* Scanning in ascend order */ - const int step = MDBX_PNL_ASCENDING ? 1 : -1; - const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(sp); - const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sp) + 1 : 0; - mdbx_tassert(txn, sp[begin] <= sp[end - step]); - - unsigned r, w = r = mdbx_dpl_search(dst, sp[begin] >> 1); - mdbx_tassert(txn, dst->sorted == dst->length); - for (int i = begin; r <= dst->length;) { - mdbx_tassert(txn, (sp[i] & 1) == 0); - const pgno_t pgno = sp[i] >> 1; - if (dst[r].pgno < pgno) { - dst[w++] = dst[r++]; - } else if (dst[r].pgno > pgno) { - i += step; - if (i == end) - while (r <= dst->length) - dst[w++] = dst[r++]; - } else { - MDBX_page *dp = dst[r++].ptr; - if ((env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? dp->mp_pages : 1); - } - } - mdbx_tassert(txn, r == dst->length + 1); - dst->length = w; - parent->tw.dirtyroom += r - w; - } - assert(dst->sorted == dst->length); - mdbx_tassert(parent, - parent->mt_parent || - parent->tw.dirtyroom + parent->tw.dirtylist->length == - MDBX_DPL_TXNFULL); - - unsigned d, s, l; - /* Find length of merging our dirty list with parent's */ - for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0; ++l) { - const pgno_t s_pgno = src[s].pgno; - const pgno_t d_pgno = dst[d].pgno; - d -= d_pgno >= s_pgno; - s -= d_pgno <= s_pgno; - } - assert(dst->sorted == dst->length); - dst->sorted = l += d + s; - assert(dst->sorted >= dst->length); - parent->tw.dirtyroom -= dst->sorted - dst->length; - - /* Merge our dirty list into parent's */ - for (d = dst->length, s = src->length; d > 0 && s > 0; --l) { - if (dst[d].pgno > src[s].pgno) - dst[l] = dst[d--]; - else if (dst[d].pgno < src[s].pgno) - dst[l] = src[s--]; - else { - MDBX_page *dp = dst[d--].ptr; - if (dp && (env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? dp->mp_pgno : 1); - dst[l] = src[s--]; - } - } - if (s) { - do - dst[l--] = src[s--]; - while (s > 0); - } else if (d) { - do - dst[l--] = dst[d--]; - while (d > 0); - } - assert(l == 0); - dst->length = dst->sorted; - mdbx_free(txn->tw.dirtylist); - txn->tw.dirtylist = nullptr; - mdbx_tassert(parent, - parent->mt_parent || - parent->tw.dirtyroom + parent->tw.dirtylist->length == - MDBX_DPL_TXNFULL); - - if (txn->tw.spill_pages) { - if (parent->tw.spill_pages) { - /* Must not fail since space was preserved above. */ - rc = mdbx_pnl_append_list(&parent->tw.spill_pages, txn->tw.spill_pages); - mdbx_assert(env, rc == MDBX_SUCCESS); - (void)rc; - mdbx_pnl_free(txn->tw.spill_pages); - mdbx_pnl_sort(parent->tw.spill_pages); - } else { - parent->tw.spill_pages = txn->tw.spill_pages; - } - } - - parent->mt_flags &= ~(MDBX_TXN_SPILLS | MDBX_TXN_HAS_CHILD); - if (parent->tw.spill_pages) { - assert(mdbx_pnl_check4assert(parent->tw.spill_pages, - parent->mt_next_pgno << 1)); - if (MDBX_PNL_SIZE(parent->tw.spill_pages)) - parent->mt_flags |= MDBX_TXN_SPILLS; + /* preserve parent's status */ + const uint8_t state = + txn->mt_dbistate[i] | + (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); + mdbx_debug("db %u dbi-state %s 0x%02x -> 0x%02x", i, + (parent->mt_dbistate[i] != state) ? "update" : "still", + parent->mt_dbistate[i], state); + parent->mt_dbistate[i] = state; } + ts_1 = latency ? mdbx_osal_monotime() : 0; + mdbx_txn_merge(parent, txn, parent_retired_len); ts_2 = latency ? mdbx_osal_monotime() : 0; - /* Append our loose page list to parent's */ - if (txn->tw.loose_pages) { - MDBX_page **lp = &parent->tw.loose_pages; - while (*lp) - lp = &(*lp)->mp_next; - *lp = txn->tw.loose_pages; - parent->tw.loose_count += txn->tw.loose_count; - } - if (txn->tw.retired2parent_pages) { - MDBX_page *mp = txn->tw.retired2parent_pages; - do { - MDBX_page *next = mp->mp_next; - rc = mdbx_page_loose(parent, mp); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - mp = next; - } while (mp); - } - env->me_txn = parent; parent->mt_child = NULL; mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - /* Scan parent's loose page for suitable for refund */ - for (MDBX_page *mp = parent->tw.loose_pages; mp; mp = mp->mp_next) { - if (mp->mp_pgno == parent->mt_next_pgno - 1) { - mdbx_refund(parent); - break; - } +#if MDBX_ENABLE_REFUND + mdbx_refund(parent); + if (mdbx_assert_enabled()) { + /* Check parent's loose pages not suitable for refund */ + for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next) + mdbx_tassert(parent, lp->mp_pgno < parent->tw.loose_refund_wl && + lp->mp_pgno + 1 < parent->mt_next_pgno); + /* Check parent's reclaimed pages not suitable for refund */ + if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)) + mdbx_tassert(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < + parent->mt_next_pgno); } +#endif /* MDBX_ENABLE_REFUND */ ts_4 = ts_3 = latency ? mdbx_osal_monotime() : 0; txn->mt_signature = 0; mdbx_free(txn); - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); rc = MDBX_SUCCESS; goto provide_latency; } mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); + txn->mt_env->me_options.dp_limit); mdbx_cursors_eot(txn, false); end_mode |= MDBX_END_EOTDONE; @@ -11463,6 +12167,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } MDBX_db *db = &txn->mt_dbs[i]; + mdbx_debug("update main's entry for sub-db %u, mod_txnid %" PRIaTXN + " -> %" PRIaTXN, + i, pp_txnid2chk(txn), db->md_mod_txnid); db->md_mod_txnid = pp_txnid2chk(txn); data.iov_base = db; WITH_CURSOR_TRACKING(couple.outer, @@ -11497,12 +12204,13 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { txn->mt_dbs[MAIN_DBI].md_mod_txnid = pp_txnid2chk(txn); MDBX_meta meta, *head = mdbx_meta_head(env); - meta.mm_magic_and_version = head->mm_magic_and_version; + memcpy(meta.mm_magic_and_version, head->mm_magic_and_version, 8); meta.mm_extra_flags = head->mm_extra_flags; meta.mm_validator_id = head->mm_validator_id; meta.mm_extra_pagehdr = head->mm_extra_pagehdr; - meta.mm_pages_retired = - head->mm_pages_retired + MDBX_PNL_SIZE(txn->tw.retired_pages); + unaligned_poke_u64(4, meta.mm_pages_retired, + unaligned_peek_u64(4, head->mm_pages_retired) + + MDBX_PNL_SIZE(txn->tw.retired_pages)); meta.mm_geo = txn->mt_geo; meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; @@ -11550,13 +12258,14 @@ static __cold int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize, const MDBX_page *const page, const unsigned meta_number, MDBX_meta *dest, const unsigned guess_pagesize) { - if (meta->mm_magic_and_version != MDBX_DATA_MAGIC && - meta->mm_magic_and_version != MDBX_DATA_MAGIC_DEVEL) { + const uint64_t magic_and_version = + unaligned_peek_u64(4, &meta->mm_magic_and_version); + if (magic_and_version != MDBX_DATA_MAGIC && + magic_and_version != MDBX_DATA_MAGIC_DEVEL) { mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, - meta->mm_magic_and_version); - return ((meta->mm_magic_and_version >> 8) != MDBX_MAGIC) - ? MDBX_INVALID - : MDBX_VERSION_MISMATCH; + magic_and_version); + return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID + : MDBX_VERSION_MISMATCH; } if (page->mp_pgno != meta_number) { @@ -11583,16 +12292,19 @@ mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize, mdbx_verbose("meta[%u] took pagesize %u", meta_number, meta->mm_psize); } - if (safe64_read(&meta->mm_txnid_a) != safe64_read(&meta->mm_txnid_b)) { + if (unaligned_peek_u64(4, &meta->mm_txnid_a) != + unaligned_peek_u64(4, &meta->mm_txnid_b)) { mdbx_warning("meta[%u] not completely updated, skip it", meta_number); return MDBX_RESULT_TRUE; } /* LY: check signature as a checksum */ - if (META_IS_STEADY(meta) && meta->mm_datasync_sign != mdbx_meta_sign(meta)) { + if (META_IS_STEADY(meta) && + unaligned_peek_u64(4, &meta->mm_datasync_sign) != mdbx_meta_sign(meta)) { mdbx_warning("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64 "), skip it", - meta_number, meta->mm_datasync_sign, mdbx_meta_sign(meta)); + meta_number, unaligned_peek_u64(4, &meta->mm_datasync_sign), + mdbx_meta_sign(meta)); return MDBX_RESULT_TRUE; } @@ -11603,7 +12315,7 @@ mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize, meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, meta->mm_geo.next, meta->mm_geo.now, meta->mm_geo.upper, meta->mm_geo.grow, meta->mm_geo.shrink, - meta->mm_txnid_a.inconsistent, mdbx_durable_str(meta)); + unaligned_peek_u64(4, meta->mm_txnid_a), mdbx_durable_str(meta)); /* LY: check min-pages value */ if (meta->mm_geo.lower < MIN_PAGENO || meta->mm_geo.lower > MAX_PAGENO) { @@ -11731,7 +12443,7 @@ mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize, return MDBX_CORRUPTED; } - if (safe64_read(&meta->mm_txnid_a) == 0) { + if (unaligned_peek_u64(4, &meta->mm_txnid_a) == 0) { mdbx_warning("meta[%u] has zero txnid, skip it", meta_number); return MDBX_RESULT_TRUE; } @@ -11749,7 +12461,7 @@ static __cold int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, return rc; memset(dest, 0, sizeof(MDBX_meta)); - dest->mm_datasync_sign = MDBX_DATASIGN_WEAK; + unaligned_poke_u64(4, dest->mm_datasync_sign, MDBX_DATASIGN_WEAK); rc = MDBX_CORRUPTED; /* Read twice all meta pages so we can find the latest one. */ @@ -11850,7 +12562,7 @@ static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, model->mp_pgno = num; model->mp_flags = P_META; MDBX_meta *const model_meta = page_meta(model); - model_meta->mm_magic_and_version = MDBX_DATA_MAGIC; + unaligned_poke_u64(4, model_meta->mm_magic_and_version, MDBX_DATA_MAGIC); model_meta->mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); model_meta->mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); @@ -11877,7 +12589,8 @@ static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID; model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; mdbx_meta_set_txnid(env, model_meta, MIN_TXNID + num); - model_meta->mm_datasync_sign = mdbx_meta_sign(model_meta); + unaligned_poke_u64(4, model_meta->mm_datasync_sign, + mdbx_meta_sign(model_meta)); return (MDBX_page *)((uint8_t *)model + env->me_psize); } @@ -11996,12 +12709,12 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, shrink = pending->mm_geo.now - bottom; pending->mm_geo.now = bottom; if (unlikely(mdbx_meta_txnid_stable(env, head) == - pending->mm_txnid_a.inconsistent)) { + unaligned_peek_u64(4, pending->mm_txnid_a))) { const txnid_t txnid = - safe64_txnid_next(pending->mm_txnid_a.inconsistent); + safe64_txnid_next(unaligned_peek_u64(4, pending->mm_txnid_a)); if (unlikely(txnid > MAX_TXNID)) { - mdbx_error("%s", "txnid overflow!"); rc = MDBX_TXN_FULL; + mdbx_error("txnid overflow, raise %d", rc); goto fail; } mdbx_meta_set_txnid(env, pending, txnid); @@ -12037,16 +12750,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* Steady or Weak */ if (rc == MDBX_RESULT_FALSE /* carry steady */) { - pending->mm_datasync_sign = mdbx_meta_sign(pending); *env->me_unsynced_pages = 0; *env->me_sync_timestamp = mdbx_osal_monotime(); + unaligned_poke_u64(4, pending->mm_datasync_sign, mdbx_meta_sign(pending)); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); - pending->mm_datasync_sign = MDBX_DATASIGN_WEAK; + unaligned_poke_u64(4, pending->mm_datasync_sign, MDBX_DATASIGN_WEAK); } MDBX_meta *target = nullptr; - if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a.inconsistent) { + if (mdbx_meta_txnid_stable(env, head) == + unaligned_peek_u64(4, pending->mm_txnid_a)) { mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, sizeof(head->mm_dbs)) == 0); mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, @@ -12077,7 +12791,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, pending->mm_geo.grow, pending->mm_geo.shrink, - pending->mm_txnid_a.inconsistent, mdbx_durable_str(pending)); + unaligned_peek_u64(4, pending->mm_txnid_a), + mdbx_durable_str(pending)); mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, @@ -12100,14 +12815,16 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, !mdbx_meta_eq(env, pending, meta2)); mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - mdbx_ensure(env, target == head || mdbx_meta_txnid_stable(env, target) < - pending->mm_txnid_a.inconsistent); + mdbx_ensure(env, + target == head || mdbx_meta_txnid_stable(env, target) < + unaligned_peek_u64(4, pending->mm_txnid_a)); if (flags & MDBX_WRITEMAP) { mdbx_jitter4testing(true); if (likely(target != head)) { /* LY: 'invalidate' the meta. */ - mdbx_meta_update_begin(env, target, pending->mm_txnid_a.inconsistent); - target->mm_datasync_sign = MDBX_DATASIGN_WEAK; + mdbx_meta_update_begin(env, target, + unaligned_peek_u64(4, pending->mm_txnid_a)); + unaligned_poke_u64(4, target->mm_datasync_sign, MDBX_DATASIGN_WEAK); #ifndef NDEBUG /* debug: provoke failure to catch a violators, but don't touch mm_psize * and mm_flags to allow readers catch actual pagesize. */ @@ -12122,18 +12839,19 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; target->mm_canary = pending->mm_canary; - target->mm_pages_retired = pending->mm_pages_retired; + memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); mdbx_jitter4testing(true); mdbx_flush_incoherent_cpu_writeback(); /* LY: 'commit' the meta */ - mdbx_meta_update_end(env, target, pending->mm_txnid_b.inconsistent); + mdbx_meta_update_end(env, target, + unaligned_peek_u64(4, pending->mm_txnid_b)); mdbx_jitter4testing(true); } else { /* dangerous case (target == head), only mm_datasync_sign could * me updated, check assertions once again */ mdbx_ensure(env, mdbx_meta_txnid_stable(env, head) == - pending->mm_txnid_a.inconsistent && + unaligned_peek_u64(4, pending->mm_txnid_a) && !META_IS_STEADY(head) && META_IS_STEADY(pending)); mdbx_ensure(env, memcmp(&head->mm_geo, &pending->mm_geo, sizeof(head->mm_geo)) == 0); @@ -12142,7 +12860,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_ensure(env, memcmp(&head->mm_canary, &pending->mm_canary, sizeof(head->mm_canary)) == 0); } - target->mm_datasync_sign = pending->mm_datasync_sign; + memcpy(target->mm_datasync_sign, pending->mm_datasync_sign, 8); mdbx_flush_incoherent_cpu_writeback(); mdbx_jitter4testing(true); /* sync meta-pages */ @@ -12179,7 +12897,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (flags & MDBX_NOMETASYNC) *env->me_unsynced_pages += 1; else - *env->me_meta_sync_txnid = pending->mm_txnid_a.low; + *env->me_meta_sync_txnid = + (uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a); /* LY: shrink datafile if needed */ if (unlikely(shrink)) { @@ -12211,10 +12930,10 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { env->me_psize = (unsigned)pagesize; STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4); - STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_DPL_TXNFULL); + STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT / 4); const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - mdbx_ensure(env, - maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_DPL_TXNFULL); + mdbx_ensure(env, maxgc_ov1page > 42 && + maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; STATIC_ASSERT(LEAF_NODEMAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); @@ -12229,6 +12948,12 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { env->me_psize2log = (uint8_t)log2n(pagesize); mdbx_assert(env, pgno2bytes(env, 1) == pagesize); mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2); + + const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); + if (env->me_options.dp_limit > max_pgno - NUM_METAS) + env->me_options.dp_limit = max_pgno - NUM_METAS; + if (env->me_options.dp_initial > env->me_options.dp_limit) + env->me_options.dp_initial = env->me_options.dp_limit; } __cold int mdbx_env_create(MDBX_env **penv) { @@ -12244,6 +12969,19 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_pid = mdbx_getpid(); env->me_stuck_meta = -1; + env->me_options.dp_reserve_limit = 1024; + env->me_options.rp_augment_limit = 256 * 1024; + env->me_options.dp_limit = 64 * 1024; + if (env->me_options.dp_limit > MAX_PAGENO - NUM_METAS) + env->me_options.dp_limit = MAX_PAGENO - NUM_METAS; + env->me_options.dp_initial = MDBX_PNL_INITIAL; + if (env->me_options.dp_initial > env->me_options.dp_limit) + env->me_options.dp_initial = env->me_options.dp_limit; + env->me_options.spill_max_denominator = 8; + env->me_options.spill_min_denominator = 8; + env->me_options.spill_parent4child_denominator = 0; + env->me_options.dp_loose_limit = 64; + int rc; const size_t os_psize = mdbx_syspagesize(); if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { @@ -12616,8 +13354,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, const txnid_t txnid = safe64_txnid_next(mdbx_meta_txnid_stable(env, head)); if (unlikely(txnid > MAX_TXNID)) { - mdbx_error("%s", "txnid overflow!"); rc = MDBX_TXN_FULL; + mdbx_error("txnid overflow, raise %d", rc); goto bailout; } mdbx_meta_set_txnid(env, &meta, txnid); @@ -12639,57 +13377,19 @@ __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) { } __cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { - int rc = check_env(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(dbs > MDBX_MAX_DBI)) - return MDBX_EINVAL; - - if (unlikely(env->me_map)) - return MDBX_EPERM; - - env->me_maxdbs = dbs + CORE_DBS; - return MDBX_SUCCESS; + return __inline_mdbx_env_set_maxdbs(env, dbs); } -__cold int mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs) { - int rc = check_env(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(!dbs)) - return MDBX_EINVAL; - - *dbs = env->me_maxdbs; - return MDBX_SUCCESS; +__cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) { + return __inline_mdbx_env_get_maxdbs(env, dbs); } __cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { - int rc = check_env(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(readers < 1 || readers > MDBX_READERS_LIMIT)) - return MDBX_EINVAL; - - if (unlikely(env->me_map)) - return MDBX_EPERM; - - env->me_maxreaders = readers; - return MDBX_SUCCESS; + return __inline_mdbx_env_set_maxreaders(env, readers); } __cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { - int rc = check_env(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(!readers)) - return MDBX_EINVAL; - - *readers = env->me_maxreaders; - return MDBX_SUCCESS; + return __inline_mdbx_env_get_maxreaders(env, readers); } /* Further setup required for opening an MDBX environment */ @@ -12741,7 +13441,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, meta.mm_geo.upper, meta.mm_geo.grow, meta.mm_geo.shrink, - meta.mm_txnid_a.inconsistent, mdbx_durable_str(&meta)); + unaligned_peek_u64(4, meta.mm_txnid_a), mdbx_durable_str(&meta)); mdbx_setup_pagesize(env, meta.mm_psize); const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next); @@ -12808,7 +13508,8 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, meta.mm_geo.upper, meta.mm_geo.grow, meta.mm_geo.shrink, - meta.mm_txnid_a.inconsistent, mdbx_durable_str(&meta)); + unaligned_peek_u64(4, meta.mm_txnid_a), + mdbx_durable_str(&meta)); } else { /* fetch back 'now/current' size, since it was ignored during comparison * and may differ. */ @@ -12983,9 +13684,9 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { if (env->me_flags & MDBX_WRITEMAP) { /* It is possible to update txnid without safe64_write(), * since DB opened exclusive for now */ - head->mm_txnid_a.inconsistent = undo_txnid; - head->mm_datasync_sign = MDBX_DATASIGN_WEAK; - head->mm_txnid_b.inconsistent = undo_txnid; + unaligned_poke_u64(4, head->mm_txnid_a, undo_txnid); + unaligned_poke_u64(4, head->mm_datasync_sign, MDBX_DATASIGN_WEAK); + unaligned_poke_u64(4, head->mm_txnid_b, undo_txnid); const size_t offset = (uint8_t *)data_page(head) - env->me_dxb_mmap.dxb; const size_t paged_offset = floor_powerof2(offset, env->me_os_psize); const size_t paged_length = ceil_powerof2( @@ -12994,7 +13695,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { } else { MDBX_meta rollback = *head; mdbx_meta_set_txnid(env, &rollback, undo_txnid); - rollback.mm_datasync_sign = MDBX_DATASIGN_WEAK; + unaligned_poke_u64(4, rollback.mm_datasync_sign, MDBX_DATASIGN_WEAK); err = mdbx_pwrite(env->me_lazy_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); } @@ -13056,7 +13757,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { const txnid_t txnid = mdbx_meta_txnid_stable(env, head); const txnid_t next_txnid = safe64_txnid_next(txnid); if (unlikely(txnid > MAX_TXNID)) { - mdbx_error("%s", "txnid overflow!"); + mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); return MDBX_TXN_FULL; } mdbx_notice("updating meta.geo: " @@ -13498,7 +14199,7 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta) { page->mp_flags = P_META; MDBX_meta *meta = page_meta(page); - meta->mm_magic_and_version = MDBX_DATA_MAGIC; + unaligned_poke_u64(4, meta->mm_magic_and_version, MDBX_DATA_MAGIC); meta->mm_psize = env->me_psize; txnid_t txnid = mdbx_meta_txnid_stable(env, meta); const txnid_t txnid0 = mdbx_meta_txnid_stable(env, METAPAGE(env, 0)); @@ -13513,11 +14214,11 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta) { if (!META_IS_STEADY(meta) || mdbx_recent_committed_txnid(env) != txnid) { if (unlikely(txnid > MAX_TXNID)) { - mdbx_error("%s", "txnid overflow!"); + mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); return MDBX_TXN_FULL; } mdbx_meta_set_txnid(env, meta, txnid); - meta->mm_datasync_sign = mdbx_meta_sign(meta); + unaligned_poke_u64(4, meta->mm_datasync_sign, mdbx_meta_sign(meta)); } if (env->me_flags & MDBX_WRITEMAP) { @@ -13779,15 +14480,9 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, } } #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ - env->me_dirtylist = mdbx_calloc(MDBX_DPL_TXNFULL + 1, sizeof(MDBX_DP)); - if (!env->me_dirtylist) - rc = MDBX_ENOMEM; } env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1); env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); @@ -13943,10 +14638,13 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDBX_TXN_FINISHED; env->me_txn0 = txn; - txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL); - if (!txn->tw.retired_pages || !txn->tw.reclaimed_pglist) - rc = MDBX_ENOMEM; + rc = mdbx_dpl_alloc(txn); + if (likely(rc == MDBX_SUCCESS)) { + txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist)) + rc = MDBX_ENOMEM; + } } else rc = MDBX_ENOMEM; } @@ -13958,7 +14656,8 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; mdbx_debug("opened database version %u, pagesize %u", - (uint8_t)meta->mm_magic_and_version, env->me_psize); + (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version), + env->me_psize); mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN, data_page(meta)->mp_pgno, mdbx_meta_txnid_fluid(env, meta)); mdbx_debug("depth: %u", db->md_depth); @@ -14043,8 +14742,8 @@ static __cold int mdbx_env_close0(MDBX_env *env) { mdbx_free(env->me_dbiseqs); mdbx_free(env->me_dbflags); mdbx_free(env->me_pathname); - mdbx_free(env->me_dirtylist); if (env->me_txn0) { + mdbx_dpl_free(env->me_txn0); mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed); mdbx_pnl_free(env->me_txn0->tw.retired_pages); mdbx_pnl_free(env->me_txn0->tw.spill_pages); @@ -14120,10 +14819,10 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { mdbx_ensure(env, mdbx_ipclock_destroy(&env->me_lckless_stub.wlock) == 0); #endif /* MDBX_LOCKING */ - while ((dp = env->me_dpages) != NULL) { + while ((dp = env->me_dp_reserve) != NULL) { ASAN_UNPOISON_MEMORY_REGION(&dp->mp_next, sizeof(dp->mp_next)); VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dpages = dp->mp_next; + env->me_dp_reserve = dp->mp_next; mdbx_free(dp); } VALGRIND_DESTROY_MEMPOOL(env); @@ -14394,9 +15093,10 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, MDBX_page *p = nullptr; mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); mdbx_assert(env, pp_txnid >= MIN_TXNID && pp_txnid <= txn->mt_txnid); - const uint16_t illegal_bits = (txn->mt_flags & MDBX_TXN_RDONLY) - ? P_LOOSE | P_SUBP | P_META | P_DIRTY - : P_LOOSE | P_SUBP | P_META; + const uint16_t illegal_bits = + (txn->mt_flags & MDBX_TXN_RDONLY) + ? ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW) + : ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_DIRTY); int level; if (unlikely((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0)) { level = 1; @@ -14405,7 +15105,7 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, * because the dirty list got full. Bring this page * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). */ - if (txn->tw.spill_pages && + if (unlikely(txn->mt_flags & MDBX_TXN_SPILLS) && txn->tw.spill_pages && mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) { pp_txnid = txn->mt_txnid; goto spilled; @@ -15664,11 +16364,11 @@ static int mdbx_cursor_touch(MDBX_cursor *mc) { rc = mdbx_cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; + *mc->mc_dbistate |= DBI_DIRTY; + mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; rc = mdbx_page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); if (unlikely(rc)) return rc; - *mc->mc_dbistate |= DBI_DIRTY; - mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; } mc->mc_top = 0; if (mc->mc_snum) { @@ -15700,6 +16400,9 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(rc != MDBX_SUCCESS)) return rc; + if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + return MDBX_BAD_DBI; + mdbx_cassert(mc, cursor_is_tracked(mc)); env = mc->mc_txn->mt_env; @@ -15943,7 +16646,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else { rdata = data; } - if (unlikely(rc2 = mdbx_page_spill(mc, key, rdata))) + if (unlikely(rc2 = mdbx_cursor_spill(mc, key, rdata))) return rc2; } @@ -15974,8 +16677,6 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mc->mc_xcursor->mx_dbx.md_klen_max = data->iov_len); } - *mc->mc_dbistate |= DBI_DIRTY; - mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) np->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; @@ -16091,7 +16792,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(!np)) return MDBX_ENOMEM; /* Note - this page is already counted in parent's dirtyroom */ - rc2 = mdbx_dpl_append(mc->mc_txn->tw.dirtylist, pg, np); + rc2 = mdbx_dpl_append(mc->mc_txn, pg, np); if (unlikely(rc2 != MDBX_SUCCESS)) { rc = rc2; mdbx_dpage_free(env, np, ovpages); @@ -16111,6 +16812,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, whole - off); memcpy(np, omp, PAGEHDRSZ); /* Copy header of page */ omp = np; + mdbx_cassert(mc, mdbx_dirtylist_check(mc->mc_txn)); } node_set_ds(node, data->iov_len); if (F_ISSET(flags, MDBX_RESERVE)) @@ -16499,6 +17201,9 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(rc != MDBX_SUCCESS)) return rc; + if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + return MDBX_BAD_DBI; + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_ENODATA; @@ -16506,7 +17211,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return MDBX_NOTFOUND; if (unlikely(!(flags & MDBX_NOSPILL) && - (rc = mdbx_page_spill(mc, NULL, NULL)))) + (rc = mdbx_cursor_spill(mc, NULL, NULL)))) return rc; rc = mdbx_cursor_touch(mc); @@ -16619,13 +17324,15 @@ static int mdbx_page_new(MDBX_cursor *mc, unsigned flags, unsigned num, if (unlikely((rc = mdbx_page_alloc(mc, num, &np, MDBX_ALLOC_ALL)))) return rc; *mp = np; - mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, - mc->mc_txn->mt_env->me_psize); + mdbx_debug("db %u allocated new page %" PRIaPGNO ", num %u", mc->mc_dbi, + np->mp_pgno, num); np->mp_flags = (uint16_t)(flags | P_DIRTY); np->mp_txnid = INVALID_TXNID; np->mp_lower = 0; np->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); + *mc->mc_dbistate |= DBI_DIRTY; + mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; mc->mc_db->md_branch_pages += IS_BRANCH(np); mc->mc_db->md_leaf_pages += IS_LEAF(np); if (unlikely(IS_OVERFLOW(np))) { @@ -17135,6 +17842,9 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, /* Initialize a cursor for a given transaction and database. */ static int mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); + if (unlikely(TXN_DBI_CHANGED(txn, dbi))) + return MDBX_BAD_DBI; + return mdbx_couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], &txn->mt_dbistate[dbi]); @@ -17306,19 +18016,21 @@ again: } void mdbx_cursor_close(MDBX_cursor *mc) { - if (mc) { + if (likely(mc)) { mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_LIVE || mc->mc_signature == MDBX_MC_READY4CLOSE); + MDBX_txn *const txn = mc->mc_txn; if (!mc->mc_backup) { + mc->mc_txn = NULL; /* Remove from txn, if tracked. * A read-only txn (!C_UNTRACK) may have been freed already, * so do not peek inside it. Only write txns track cursors. */ if (mc->mc_flags & C_UNTRACK) { - mdbx_cassert(mc, !(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY)); - MDBX_cursor **prev = &mc->mc_txn->tw.cursors[mc->mc_dbi]; + mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); + MDBX_cursor **prev = &txn->tw.cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; - mdbx_cassert(mc, *prev == mc); + mdbx_tassert(txn, *prev == mc); *prev = mc->mc_next; } mc->mc_signature = 0; @@ -17326,7 +18038,8 @@ void mdbx_cursor_close(MDBX_cursor *mc) { mdbx_free(mc); } else { /* Cursor closed before nested txn ends */ - mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE); + mdbx_tassert(txn, mc->mc_signature == MDBX_MC_LIVE); + mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); mc->mc_signature = MDBX_MC_WAIT4EOT; } } @@ -18029,6 +18742,7 @@ static void cursor_copy_internal(const MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; + cdst->mc_dbistate = csrc->mc_dbistate; for (unsigned i = 0; i < csrc->mc_snum; i++) { cdst->mc_pg[i] = csrc->mc_pg[i]; @@ -18316,7 +19030,7 @@ static __cold int mdbx_page_check(MDBX_cursor *const mc, if (IS_OVERFLOW(mp)) { if (unlikely(mp->mp_pages < 1 && mp->mp_pages >= MAX_PAGENO / 2)) return bad_page(mp, "invalid overflow n-pages (%u)\n", mp->mp_pages); - if (unlikely(mp->mp_pgno > mc->mc_txn->mt_next_pgno - mp->mp_pages)) + if (unlikely(mp->mp_pgno + mp->mp_pages > mc->mc_txn->mt_next_pgno)) return bad_page(mp, "overflow page beyond (%u) next-pgno\n", mp->mp_pgno + mp->mp_pages); return MDBX_SUCCESS; @@ -18583,7 +19297,7 @@ static __cold int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { mdbx_tassert(mc->mc_txn, mc->mc_txn->mt_parent || mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == - MDBX_DPL_TXNFULL); + mc->mc_txn->mt_env->me_options.dp_limit); mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1); if (unlikely(mc->mc_top != mc->mc_snum - 1)) return MDBX_CURSOR_FULL; @@ -19642,7 +20356,7 @@ static __cold void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) { /* Update signature */ assert(meta->mm_geo.now >= meta->mm_geo.next); - meta->mm_datasync_sign = mdbx_meta_sign(meta); + unaligned_poke_u64(4, meta->mm_datasync_sign, mdbx_meta_sign(meta)); } /* Make resizeable */ @@ -19831,7 +20545,7 @@ static __cold int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) make_sizeable(headcopy); /* Update signature to steady */ - headcopy->mm_datasync_sign = mdbx_meta_sign(headcopy); + unaligned_poke_u64(4, headcopy->mm_datasync_sign, mdbx_meta_sign(headcopy)); /* Copy the data */ const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno); @@ -20276,18 +20990,15 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, const MDBX_meta *const recent_meta = mdbx_meta_head(env); arg->mi_recent_txnid = mdbx_meta_txnid_fluid(env, recent_meta); arg->mi_meta0_txnid = mdbx_meta_txnid_fluid(env, meta0); - arg->mi_meta0_sign = meta0->mm_datasync_sign; + arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_datasync_sign); arg->mi_meta1_txnid = mdbx_meta_txnid_fluid(env, meta1); - arg->mi_meta1_sign = meta1->mm_datasync_sign; + arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_datasync_sign); arg->mi_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2); - arg->mi_meta2_sign = meta2->mm_datasync_sign; + arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_datasync_sign); if (likely(bytes > size_before_bootid)) { - arg->mi_bootid.meta0.x = meta0->mm_bootid.x; - arg->mi_bootid.meta1.x = meta0->mm_bootid.x; - arg->mi_bootid.meta2.x = meta0->mm_bootid.x; - arg->mi_bootid.meta0.y = meta0->mm_bootid.y; - arg->mi_bootid.meta1.y = meta0->mm_bootid.y; - arg->mi_bootid.meta2.y = meta0->mm_bootid.y; + memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16); + memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16); + memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16); } const MDBX_meta *txn_meta = recent_meta; @@ -20314,11 +21025,14 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_mapsize = env->me_dxb_mmap.limit; mdbx_compiler_barrier(); if (likely(arg->mi_meta0_txnid == mdbx_meta_txnid_fluid(env, meta0) && - arg->mi_meta0_sign == meta0->mm_datasync_sign && + arg->mi_meta0_sign == + unaligned_peek_u64(4, meta0->mm_datasync_sign) && arg->mi_meta1_txnid == mdbx_meta_txnid_fluid(env, meta1) && - arg->mi_meta1_sign == meta1->mm_datasync_sign && + arg->mi_meta1_sign == + unaligned_peek_u64(4, meta1->mm_datasync_sign) && arg->mi_meta2_txnid == mdbx_meta_txnid_fluid(env, meta2) && - arg->mi_meta2_sign == meta2->mm_datasync_sign && + arg->mi_meta2_sign == + unaligned_peek_u64(4, meta2->mm_datasync_sign) && recent_meta == mdbx_meta_head(env) && arg->mi_recent_txnid == mdbx_meta_txnid_fluid(env, recent_meta))) break; @@ -20562,18 +21276,8 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, goto early_bailout; } - if (txn->mt_numdbs < env->me_numdbs) { - /* Import handles from env */ - for (unsigned i = txn->mt_numdbs; i < env->me_numdbs; ++i) { - txn->mt_dbistate[i] = 0; - if (env->me_dbflags[i] & DB_VALID) { - txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; - txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; - mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); - } - } - txn->mt_numdbs = env->me_numdbs; - } + /* Import handles from env */ + dbi_import_locked(txn); /* Rescan after mutex acquisition & import handles */ for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { @@ -20633,18 +21337,16 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, txn->mt_dbistate[slot] = (uint8_t)dbiflags; txn->mt_dbxs[slot].md_name.iov_base = namedup; txn->mt_dbxs[slot].md_name.iov_len = len; - if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) - txn->tw.cursors[slot] = NULL; - txn->mt_numdbs += (slot == txn->mt_numdbs); - if ((dbiflags & DBI_CREAT) == 0) { + txn->mt_dbiseqs[slot] = ++env->me_dbiseqs[slot]; + if (!(dbiflags & DBI_CREAT)) env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; + if (txn->mt_numdbs == slot) { mdbx_compiler_barrier(); - if (env->me_numdbs <= slot) - env->me_numdbs = slot + 1; - } else { - env->me_dbiseqs[slot]++; + txn->mt_numdbs = env->me_numdbs = slot + 1; + if (!(txn->mt_flags & MDBX_TXN_RDONLY)) + txn->tw.cursors[slot] = NULL; } - txn->mt_dbiseqs[slot] = env->me_dbiseqs[slot]; + mdbx_assert(env, env->me_numdbs > slot); *dbi = slot; } @@ -20702,10 +21404,15 @@ static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { return MDBX_BAD_DBI; env->me_dbflags[dbi] = 0; + env->me_dbiseqs[dbi]++; env->me_dbxs[dbi].md_name.iov_len = 0; mdbx_compiler_barrier(); env->me_dbxs[dbi].md_name.iov_base = NULL; mdbx_free(ptr); + + if (env->me_numdbs == dbi + 1) + env->me_numdbs = dbi; + return MDBX_SUCCESS; } @@ -20719,7 +21426,9 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); if (likely(rc == MDBX_SUCCESS)) { - rc = mdbx_dbi_close_locked(env, dbi); + rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID)) + ? mdbx_dbi_close_locked(env, dbi) + : MDBX_BAD_DBI; mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } return rc; @@ -20885,7 +21594,6 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { txn->mt_flags |= MDBX_TXN_ERROR; goto bailout; } - env->me_dbiseqs[dbi]++; mdbx_dbi_close_locked(env, dbi); mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); @@ -20974,11 +21682,14 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, if (txnid) { retry_header:; const MDBX_meta *const recent_meta = mdbx_meta_head(env); - const uint64_t head_pages_retired = recent_meta->mm_pages_retired; + const uint64_t head_pages_retired = + unaligned_peek_u64(4, recent_meta->mm_pages_retired); const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, recent_meta); mdbx_compiler_barrier(); - if (unlikely(recent_meta != mdbx_meta_head(env) || - head_pages_retired != recent_meta->mm_pages_retired) || + if (unlikely( + recent_meta != mdbx_meta_head(env) || + head_pages_retired != + unaligned_peek_u64(4, recent_meta->mm_pages_retired)) || head_txnid != mdbx_meta_txnid_fluid(env, recent_meta)) goto retry_header; @@ -21228,7 +21939,8 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env, const MDBX_meta *head_meta = mdbx_meta_head(env); const txnid_t gap = (mdbx_meta_txnid_stable(env, head_meta) - laggard) / MDBX_TXNID_STEP; - const uint64_t head_retired = head_meta->mm_pages_retired; + const uint64_t head_retired = + unaligned_peek_u64(4, head_meta->mm_pages_retired); const size_t space = (oldest_retired > head_retired) ? pgno2bytes(env, (pgno_t)(oldest_retired - head_retired)) @@ -21261,43 +21973,11 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env, } __cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { - int rc = check_env(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; - - if (unlikely(!env->me_map)) - return MDBX_EPERM; - - *env->me_autosync_threshold = bytes2pgno(env, threshold + env->me_psize - 1); - if (threshold) { - rc = mdbx_env_sync_poll(env); - if (unlikely(MDBX_IS_ERROR(rc))) - return rc; - } - return MDBX_SUCCESS; + return __inline_mdbx_env_set_syncbytes(env, threshold); } __cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { - int rc = check_env(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; - - if (unlikely(!env->me_map)) - return MDBX_EPERM; - - *env->me_autosync_period = mdbx_osal_16dot16_to_monotime(seconds_16dot16); - if (seconds_16dot16) { - rc = mdbx_env_sync_poll(env); - if (unlikely(MDBX_IS_ERROR(rc))) - return rc; - } - return MDBX_SUCCESS; + return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16); } __cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { @@ -22457,7 +23137,7 @@ __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { return -1; STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX); - const uint64_t limit = pagesize * (uint64_t)(MDBX_DPL_TXNFULL - 1); + const uint64_t limit = pagesize * (uint64_t)(MDBX_PGL_LIMIT - 1); return (limit < (intptr_t)MAX_MAPSIZE) ? (intptr_t)limit : (intptr_t)MAX_MAPSIZE; } @@ -22701,6 +23381,229 @@ __cold MDBX_cmp_func *mdbx_get_datacmp(unsigned flags) { return get_default_datacmp(flags); } +__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, + const uint64_t value) { + int err = check_env(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + const bool lock_needed = (env->me_map && env->me_txn0 && + env->me_txn0->mt_owner != mdbx_thread_self()); + bool should_unlock = false; + switch (option) { + case MDBX_opt_sync_bytes: + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(!env->me_autosync_threshold)) + return MDBX_EPERM; + if (sizeof(value) > sizeof(size_t) && unlikely(value != (size_t)value)) + return MDBX_TOO_LARGE; + if ((*env->me_autosync_threshold = + bytes2pgno(env, (size_t)value + env->me_psize - 1)) != 0) { + err = mdbx_env_sync_poll(env); + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } + break; + + case MDBX_opt_sync_period: + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(!env->me_autosync_period)) + return MDBX_EPERM; + if (unlikely(value > UINT32_MAX)) + return MDBX_TOO_LARGE; + if ((*env->me_autosync_period = + mdbx_osal_16dot16_to_monotime((uint32_t)value)) != 0) { + err = mdbx_env_sync_poll(env); + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } + break; + + case MDBX_opt_max_db: + if (unlikely(value > MDBX_MAX_DBI)) + return MDBX_EINVAL; + if (unlikely(env->me_map)) + return MDBX_EPERM; + env->me_maxdbs = (unsigned)value + CORE_DBS; + break; + + case MDBX_opt_max_readers: + if (unlikely(value < 1 || value > MDBX_READERS_LIMIT)) + return MDBX_EINVAL; + if (unlikely(env->me_map)) + return MDBX_EPERM; + env->me_maxreaders = (unsigned)value; + break; + + case MDBX_opt_dp_reserve_limit: + if (unlikely(value > INT_MAX)) + return MDBX_EINVAL; + if (env->me_options.dp_reserve_limit != (unsigned)value) { + if (lock_needed) { + err = mdbx_txn_lock(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + should_unlock = true; + } + env->me_options.dp_reserve_limit = (unsigned)value; + while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) { + mdbx_assert(env, env->me_dp_reserve != NULL); + MDBX_page *mp = env->me_dp_reserve; + ASAN_UNPOISON_MEMORY_REGION(&mp->mp_next, sizeof(mp->mp_next)); + VALGRIND_MAKE_MEM_DEFINED(&mp->mp_next, sizeof(mp->mp_next)); + env->me_dp_reserve = mp->mp_next; + mdbx_free(mp); + env->me_dp_reserve_len -= 1; + } + } + break; + + case MDBX_opt_rp_augment_limit: + if (unlikely(value > MDBX_PGL_LIMIT)) + return MDBX_EINVAL; + env->me_options.rp_augment_limit = (unsigned)value; + break; + + case MDBX_opt_txn_dp_limit: + case MDBX_opt_txn_dp_initial: + if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4 || + value > bytes2pgno(env, env->me_dbgeo.upper) - NUM_METAS)) + return MDBX_EINVAL; + if (unlikely(env->me_txn0 == NULL)) + return MDBX_EACCESS; + if (lock_needed) { + err = mdbx_txn_lock(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + should_unlock = true; + } + if (env->me_txn) + err = MDBX_EPERM /* unable change during transaction */; + else { + mdbx_dpl_clear(env->me_txn0->tw.dirtylist); + const unsigned value32 = (unsigned)value; + if (option == MDBX_opt_txn_dp_initial && + env->me_options.dp_initial != value32) { + if (env->me_options.dp_limit < value32) + env->me_options.dp_limit = value32; + if (env->me_txn0->tw.dirtylist->allocated < value32 && + !mdbx_dpl_reserve(env->me_txn0, value32)) + err = MDBX_ENOMEM; + else + env->me_options.dp_initial = value32; + } + if (option == MDBX_opt_txn_dp_limit && + env->me_options.dp_limit != value32) { + if (env->me_txn0->tw.dirtylist->allocated > value32 && + !mdbx_dpl_reserve(env->me_txn0, value32)) + err = MDBX_ENOMEM; + else { + if (env->me_options.dp_initial > value32) + env->me_options.dp_initial = value32; + env->me_options.dp_limit = value32; + } + } + } + break; + + case MDBX_opt_spill_max_denominator: + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->me_options.spill_max_denominator = (uint8_t)value; + break; + case MDBX_opt_spill_min_denominator: + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->me_options.spill_min_denominator = (uint8_t)value; + break; + case MDBX_opt_spill_parent4child_denominator: + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->me_options.spill_parent4child_denominator = (uint8_t)value; + break; + + case MDBX_opt_loose_limit: + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->me_options.dp_loose_limit = (uint8_t)value; + break; + + default: + return MDBX_EINVAL; + } + + if (should_unlock) + mdbx_txn_unlock(env); + return err; +} + +__cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, + uint64_t *value) { + int err = check_env(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (unlikely(!value)) + return MDBX_EINVAL; + + switch (option) { + case MDBX_opt_sync_bytes: + if (unlikely(!env->me_autosync_threshold)) + return MDBX_EPERM; + *value = *env->me_autosync_threshold; + break; + + case MDBX_opt_sync_period: + if (unlikely(!env->me_autosync_period)) + return MDBX_EPERM; + *value = mdbx_osal_monotime_to_16dot16(*env->me_autosync_period); + break; + + case MDBX_opt_max_db: + *value = env->me_maxdbs - CORE_DBS; + break; + + case MDBX_opt_max_readers: + *value = env->me_maxreaders; + break; + + case MDBX_opt_dp_reserve_limit: + *value = env->me_options.dp_reserve_limit; + break; + + case MDBX_opt_rp_augment_limit: + *value = env->me_options.rp_augment_limit; + break; + + case MDBX_opt_txn_dp_limit: + *value = env->me_options.dp_limit; + break; + case MDBX_opt_txn_dp_initial: + *value = env->me_options.dp_initial; + break; + + case MDBX_opt_spill_max_denominator: + *value = env->me_options.spill_max_denominator; + break; + case MDBX_opt_spill_min_denominator: + *value = env->me_options.spill_min_denominator; + break; + case MDBX_opt_spill_parent4child_denominator: + *value = env->me_options.spill_parent4child_denominator; + break; + + case MDBX_opt_loose_limit: + *value = env->me_options.dp_loose_limit; + break; + + default: + return MDBX_EINVAL; + } + + return MDBX_SUCCESS; +} + /*** Attribute support functions for Nexenta **********************************/ #ifdef MDBX_NEXENTA_ATTRS @@ -22962,14 +23865,12 @@ __dll_export #else #error "FIXME: Unsupported byte order" #endif /* __BYTE_ORDER__ */ -#if MDBX_HUGE_TRANSACTIONS - " MDBX_HUGE_TRANSACTIONS=YES" -#endif /* MDBX_HUGE_TRANSACTIONS */ " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG + " MDBX_ENABLE_REFUND=" STRINGIFY(MDBX_ENABLE_REFUND) #ifdef __SANITIZE_ADDRESS__ " SANITIZE_ADDRESS=YES" #endif /* __SANITIZE_ADDRESS__ */ @@ -23608,6 +24509,10 @@ MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { #endif } +#if !(defined(_WIN32) || defined(_WIN64)) +static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); } +#endif /*! Windows */ + MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname) { #if defined(_WIN32) || defined(_WIN64) const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); @@ -23746,6 +24651,29 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, flags |= O_CLOEXEC; #endif /* O_CLOEXEC */ + /* Safeguard for https://github.com/erthink/libmdbx/issues/144 */ +#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 + int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; + static const char dev_null[] = "/dev/null"; + if (!is_valid_fd(STDIN_FILENO)) { + mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", + STDIN_FILENO, dev_null); + stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY); + } + if (!is_valid_fd(STDOUT_FILENO)) { + mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", + "OUT", STDOUT_FILENO, dev_null); + stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY); + } + if (!is_valid_fd(STDERR_FILENO)) { + mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", + "ERR", STDERR_FILENO, dev_null); + stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY); + } +#else +#error "Unexpected or unsupported UNIX or POSIX system" +#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */ + *fd = open(pathname, flags, unix_mode_bits); #if defined(O_DIRECT) if (*fd < 0 && (flags & O_DIRECT) && @@ -23754,6 +24682,45 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, *fd = open(pathname, flags, unix_mode_bits); } #endif /* O_DIRECT */ + + /* Safeguard for https://github.com/erthink/libmdbx/issues/144 */ +#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 + if (*fd == STDIN_FILENO) { + mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", + STDIN_FILENO); + assert(stub_fd0 == -1); + *fd = dup(stub_fd0 = *fd); + } + if (*fd == STDOUT_FILENO) { + mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", + STDOUT_FILENO); + assert(stub_fd1 == -1); + *fd = dup(stub_fd1 = *fd); + } + if (*fd == STDERR_FILENO) { + mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", + STDERR_FILENO); + assert(stub_fd2 == -1); + *fd = dup(stub_fd2 = *fd); + } + if (stub_fd0 != -1) + close(stub_fd0); + if (stub_fd1 != -1) + close(stub_fd1); + if (stub_fd2 != -1) + close(stub_fd2); + if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) { + mdbx_error( + "Rejecting the use of a FD in the range " + "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", + STDIN_FILENO, STDERR_FILENO); + close(*fd); + return EBADF; + } +#else +#error "Unexpected or unsupported UNIX or POSIX system" +#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */ + if (*fd < 0) return errno; @@ -23777,6 +24744,7 @@ MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) return CloseHandle(fd) ? MDBX_SUCCESS : GetLastError(); #else + assert(fd > STDERR_FILENO); return (close(fd) == 0) ? MDBX_SUCCESS : errno; #endif } @@ -25423,9 +26391,9 @@ __dll_export 0, 9, 2, - 12, - {"2020-12-05T09:45:03+03:00", "455c7d633b3e490fe4f1e8c9999adccdf358f699", "3e7459b428373404e3c8ce238bd4061fa9d18f79", - "v0.9.2-12-g3e7459b4"}, + 110, + {"2021-01-23T01:13:20+03:00", "8a2017c7d169c0ee1b9b21cc67ea939d7abe5f08", "4e13d1239ab6580ed04cab377b746e7f8c214a9b", + "v0.9.2-110-g4e13d12"}, sourcery}; __dll_export diff --git a/libs/libmdbx/src/mdbx.c++ b/libs/libmdbx/src/mdbx.c++ index 4f54ad8efb..049c93d588 100644 --- a/libs/libmdbx/src/mdbx.c++ +++ b/libs/libmdbx/src/mdbx.c++ @@ -12,7 +12,7 @@ * <http://www.OpenLDAP.org/license.html>. */ #define MDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4 +#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -339,7 +339,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define likely(cond) __builtin_expect(!!(cond), 1) # else -# define likely(x) (x) +# define likely(x) (!!(x)) # endif #endif /* likely */ @@ -347,7 +347,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define unlikely(cond) __builtin_expect(!!(cond), 0) # else -# define unlikely(x) (x) +# define unlikely(x) (!!(x)) # endif #endif /* unlikely */ @@ -1542,11 +1542,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* DOXYGEN */ -/** Enables support for huge write-transactions */ -#ifndef MDBX_HUGE_TRANSACTIONS -#define MDBX_HUGE_TRANSACTIONS 0 -#endif /* MDBX_HUGE_TRANSACTIONS */ - /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ #define MDBX_OSX_WANNA_DURABILITY 0 /** Using fsync() with chance of data lost on power failure */ @@ -1596,6 +1591,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ +/** Controls online database auto-compactification during write-transactions. */ +#ifndef MDBX_ENABLE_REFUND +#define MDBX_ENABLE_REFUND 1 +#endif +#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1) +#error MDBX_ENABLE_REFUND must be defined as 0 or 1 +#endif /* MDBX_ENABLE_REFUND */ + +/** Controls sort order of internal page number lists. + * The database format depend on this option and libmdbx builded with different + * option value are incompatible. */ +#ifndef MDBX_PNL_ASCENDING +#define MDBX_PNL_ASCENDING 0 +#endif +#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1) +#error MDBX_PNL_ASCENDING must be defined as 0 or 1 +#endif /* MDBX_PNL_ASCENDING */ + //------------------------------------------------------------------------------ /** Win32 File Locking API for \ref MDBX_LOCKING */ @@ -1917,10 +1930,10 @@ typedef struct mdbx_geo_t { typedef struct MDBX_meta { /* Stamp identifying this as an MDBX file. * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ - uint64_t mm_magic_and_version; + uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_a; + uint32_t mm_txnid_a[2]; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -1940,17 +1953,18 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile uint64_t mm_datasync_sign; +#define META_IS_STEADY(meta) \ + SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign)) + uint32_t mm_datasync_sign[2]; /* txnid that committed this page, the second of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_b; + uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. * This value in couple with mr_snapshot_pages_retired allows fast estimation * of "how much reader is restraining GC recycling". */ - uint64_t mm_pages_retired; + uint32_t mm_pages_retired[2]; /* The analogue /proc/sys/kernel/random/boot_id or similar to determine * whether the system was rebooted after the last use of the database files. @@ -2199,7 +2213,8 @@ typedef struct MDBX_lockinfo { (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) -#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) +#define MDBX_DATA_MAGIC \ + ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) #define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) @@ -2237,19 +2252,20 @@ typedef struct MDBX_lockinfo { #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) +#define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 #define MAX_MAPSIZE MAX_MAPSIZE32 +#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) #endif /* MDBX_WORDBITS */ /*----------------------------------------------------------------------------*/ -/* Two kind lists of pages (aka PNL) */ -/* An PNL is an Page Number List, a sorted array of IDs. The first element of - * the array is a counter for how many actual page-numbers are in the list. - * PNLs are sorted in descending order, this allow cut off a page with lowest - * pgno (at the tail) just truncating the list */ -#define MDBX_PNL_ASCENDING 0 +/* An PNL is an Page Number List, a sorted array of IDs. + * The first element of the array is a counter for how many actual page-numbers + * are in the list. By default PNLs are sorted in descending order, this allow + * cut off a page with lowest pgno (at the tail) just truncating the list. The + * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */ typedef pgno_t *MDBX_PNL; #if MDBX_PNL_ASCENDING @@ -2264,37 +2280,27 @@ typedef pgno_t *MDBX_PNL; typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ -typedef union MDBX_DP { - __anonymous_struct_extension__ struct { - pgno_t pgno; - MDBX_page *ptr; - }; - __anonymous_struct_extension__ struct { - unsigned sorted; - unsigned length; - }; -} MDBX_DP; - -/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. - * The first element's length member is a count of how many actual - * elements are in the array. */ -typedef MDBX_DP *MDBX_DPL; +typedef struct MDBX_dp { + pgno_t pgno; + MDBX_page *ptr; +} MDBX_dp; + +/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ +typedef struct MDBX_dpl { + unsigned sorted; + unsigned length; + unsigned allocated; +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (!defined(__cplusplus) && defined(_MSC_VER)) + MDBX_dp items[] /* dynamic size with holes at zero and after the last */; +#endif +} MDBX_dpl; /* PNL sizes */ #define MDBX_PNL_GRANULATE 1024 #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#if MDBX_HUGE_TRANSACTIONS -#define MDBX_PNL_MAX \ - ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2) -#else -#define MDBX_PNL_MAX \ - ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4) -#endif /* MDBX_HUGE_TRANSACTIONS */ - #define MDBX_TXL_GRANULATE 32 #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) @@ -2418,14 +2424,16 @@ struct MDBX_txn { MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ +#if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; +#endif /* MDBX_ENABLE_REFUND */ /* dirtylist room: Dirty array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. */ unsigned dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_DPL dirtylist; + MDBX_dpl *dirtylist; /* The list of reclaimed txns from GC */ MDBX_TXL lifo_reclaimed; /* The list of pages that became unused during this transaction. */ @@ -2435,26 +2443,19 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; - /* Number of retired to parent pages (tw.retired2parent_pages) */ - unsigned retired2parent_count; - /* The list of parent's txn dirty pages that retired (became unused) - * in this transaction, linked through `mp_next`. */ - MDBX_page *retired2parent_pages; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; + unsigned spill_least_removed; } tw; }; }; -/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. */ #if MDBX_WORDBITS >= 64 -#define CURSOR_STACK 28 +#define CURSOR_STACK 32 #else -#define CURSOR_STACK 20 +#define CURSOR_STACK 24 #endif struct MDBX_xcursor; @@ -2584,11 +2585,9 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; - /* MDBX_DP of pages written during a write txn. */ - MDBX_DPL me_dirtylist; /* Number of freelist items that can fit in a single overflow page */ unsigned me_maxgc_ov1page; unsigned me_branch_nodemax; /* max size of a branch-node */ @@ -2601,6 +2600,17 @@ struct MDBX_env { volatile pgno_t *me_discarded_tail; volatile uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + unsigned me_dp_reserve_len; + struct { + unsigned dp_reserve_limit; + unsigned rp_augment_limit; + unsigned dp_limit; + unsigned dp_initial; + uint8_t dp_loose_limit; + uint8_t spill_max_denominator; + uint8_t spill_min_denominator; + uint8_t spill_parent4child_denominator; + } me_options; struct { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; diff --git a/libs/libmdbx/src/mdbx.h b/libs/libmdbx/src/mdbx.h index 2a6a2d7893..d5c5ec99ce 100644 --- a/libs/libmdbx/src/mdbx.h +++ b/libs/libmdbx/src/mdbx.h @@ -823,7 +823,7 @@ DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags_t) * \param [in] env An environment handle returned by \ref mdbx_env_create(). * \param [in] msg The assertion message, not including newline. */ typedef void MDBX_debug_func(MDBX_log_level_t loglevel, const char *function, - int line, const char *msg, + int line, const char *fmt, va_list args) MDBX_CXX17_NOEXCEPT; /** \brief The "don't change `logger`" value for mdbx_setup_debug() */ @@ -1793,6 +1793,176 @@ LIBMDBX_API const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, * \returns a non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); +/** \brief MDBX environment options. */ +enum MDBX_option_t { + /** \brief Controls the maximum number of named databases for the environment. + * + * \details By default only unnamed key-value database could used and + * appropriate value should set by `MDBX_opt_max_db` to using any more named + * subDB(s). To reduce overhead, use the minimum sufficient value. This option + * may only set after \ref mdbx_env_create() and before \ref mdbx_env_open(). + * + * \see mdbx_env_set_maxdbs() \see mdbx_env_get_maxdbs() */ + MDBX_opt_max_db, + + /** \brief Defines the maximum number of threads/reader slots + * for all processes interacting with the database. + * + * \details This defines the number of slots in the lock table that is used to + * track readers in the the environment. The default is about 100 for 4K + * system page size. Starting a read-only transaction normally ties a lock + * table slot to the current thread until the environment closes or the thread + * exits. If \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin() instead ties the + * slot to the \ref MDBX_txn object until it or the \ref MDBX_env object is + * destroyed. This option may only set after \ref mdbx_env_create() and before + * \ref mdbx_env_open(), and has an effect only when the database is opened by + * the first process interacts with the database. + * + * \see mdbx_env_set_maxreaders() \see mdbx_env_get_maxreaders() */ + MDBX_opt_max_readers, + + /** \brief Controls interprocess/shared threshold to force flush the data + * buffers to disk, if \ref MDBX_SAFE_NOSYNC is used. + * + * \see mdbx_env_set_syncbytes() \see mdbx_env_get_syncbytes() */ + MDBX_opt_sync_bytes, + + /** \brief Controls interprocess/shared relative period since the last + * unsteady commit to force flush the data buffers to disk, + * if \ref MDBX_SAFE_NOSYNC is used. + * \see mdbx_env_set_syncperiod() \see mdbx_env_get_syncperiod() */ + MDBX_opt_sync_period, + + /** \brief Controls the in-process limit to grow a list of reclaimed/recycled + * page's numbers for finding a sequence of contiguous pages for large data + * items. + * + * \details A long values requires allocation of contiguous database pages. + * To find such sequences, it may be necessary to accumulate very large lists, + * especially when placing very long values (more than a megabyte) in a large + * databases (several tens of gigabytes), which is much expensive in extreme + * cases. This threshold allows you to avoid such costs by allocating new + * pages at the end of the database (with its possible growth on disk), + * instead of further accumulating/reclaiming Garbage Collection records. + * + * On the other hand, too small threshold will lead to unreasonable database + * growth, or/and to the inability of put long values. + * + * The `MDBX_opt_rp_augment_limit` controls described limit for the current + * process. Default is 262144, it is usually enough for most cases. */ + MDBX_opt_rp_augment_limit, + + /** \brief Controls the in-process limit to grow a cache of dirty + * pages for reuse in the current transaction. + * + * \details A 'dirty page' refers to a page that has been updated in memory + * only, the changes to a dirty page are not yet stored on disk. + * To reduce overhead, it is reasonable to release not all such pages + * immediately, but to leave some ones in cache for reuse in the current + * transaction. + * + * The `MDBX_opt_loose_limit` allows you to set a limit for such cache inside + * the current process. Should be in the range 0..255, default is 64. */ + MDBX_opt_loose_limit, + + /** \brief Controls the in-process limit of a pre-allocated memory items + * for dirty pages. + * + * \details A 'dirty page' refers to a page that has been updated in memory + * only, the changes to a dirty page are not yet stored on disk. + * Without \ref MDBX_WRITEMAP dirty pages are allocated from memory and + * released when a transaction is committed. To reduce overhead, it is + * reasonable to release not all ones, but to leave some allocations in + * reserve for reuse in the next transaction(s). + * + * The `MDBX_opt_dp_reserve_limit` allows you to set a limit for such reserve + * inside the current process. Default is 1024. */ + MDBX_opt_dp_reserve_limit, + + /** \brief Controls the in-process limit of dirty pages + * for a write transaction. + * + * \details A 'dirty page' refers to a page that has been updated in memory + * only, the changes to a dirty page are not yet stored on disk. + * Without \ref MDBX_WRITEMAP dirty pages are allocated from memory and will + * be busy until are written to disk. Therefore for a large transactions is + * reasonable to limit dirty pages collecting above an some threshold but + * spill to disk instead. + * + * The `MDBX_opt_txn_dp_limit` controls described threshold for the current + * process. Default is 65536, it is usually enough for most cases. */ + MDBX_opt_txn_dp_limit, + + /** \brief Controls the in-process initial allocation size for dirty pages + * list of a write transaction. Default is 1024. */ + MDBX_opt_txn_dp_initial, + + /** \brief Controls the in-process how maximal part of the dirty pages may be + * spilled when necessary. + * + * \details The `MDBX_opt_spill_max_denominator` defines the denominator for + * limiting from the top for part of the current dirty pages may be spilled + * when the free room for a new dirty pages (i.e. distance to the + * `MDBX_opt_txn_dp_limit` threshold) is not enough to perform requested + * operation. + * Exactly `max_pages_to_spill = dirty_pages - dirty_pages / N`, + * where `N` is the value set by `MDBX_opt_spill_max_denominator`. + * + * Should be in the range 0..255, where zero means no limit, i.e. all dirty + * pages could be spilled. Default is 8, i.e. no more than 7/8 of the current + * dirty pages may be spilled when reached the condition described above. */ + MDBX_opt_spill_max_denominator, + + /** \brief Controls the in-process how minimal part of the dirty pages should + * be spilled when necessary. + * + * \details The `MDBX_opt_spill_min_denominator` defines the denominator for + * limiting from the bottom for part of the current dirty pages should be + * spilled when the free room for a new dirty pages (i.e. distance to the + * `MDBX_opt_txn_dp_limit` threshold) is not enough to perform requested + * operation. + * Exactly `min_pages_to_spill = dirty_pages / N`, + * where `N` is the value set by `MDBX_opt_spill_min_denominator`. + * + * Should be in the range 0..255, where zero means no restriction at the + * bottom. Default is 8, i.e. at least the 1/8 of the current dirty pages + * should be spilled when reached the condition described above. */ + MDBX_opt_spill_min_denominator, + + /** \brief Controls the in-process how much of the parent transaction dirty + * pages will be spilled while start each child transaction. + * + * \details The `MDBX_opt_spill_parent4child_denominator` defines the + * denominator to determine how much of parent transaction dirty pages will be + * spilled explicitly while start each child transaction. + * Exactly `pages_to_spill = dirty_pages / N`, + * where `N` is the value set by `MDBX_opt_spill_parent4child_denominator`. + * + * For a stack of nested transactions each dirty page could be spilled only + * once, and parent's dirty pages couldn't be spilled while child + * transaction(s) are running. Therefore a child transaction could reach + * \ref MDBX_TXN_FULL when parent(s) transaction has spilled too less (and + * child reach the limit of dirty pages), either when parent(s) has spilled + * too more (since child can't spill already spilled pages). So there is no + * universal golden ratio. + * + * Should be in the range 0..255, where zero means no explicit spilling will + * be performed during starting nested transactions. + * Default is 0, i.e. by default no spilling performed during starting nested + * transactions, that correspond historically behaviour. */ + MDBX_opt_spill_parent4child_denominator, +}; +#ifndef __cplusplus +/** \ingroup c_settings */ +typedef enum MDBX_option_t MDBX_option_t; +#endif + +LIBMDBX_API int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, + const uint64_t value); +LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, + const MDBX_option_t option, + uint64_t *value); + /** \brief Open an environment instance. * \ingroup c_opening * @@ -2178,7 +2348,10 @@ LIBMDBX_INLINE_API(int, mdbx_env_sync_poll, (MDBX_env * env)) { * a synchronous flush would be made. * * \returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold); +LIBMDBX_INLINE_API(int, mdbx_env_set_syncbytes, + (MDBX_env * env, size_t threshold)) { + return mdbx_env_set_option(env, MDBX_opt_sync_bytes, threshold); +} /** \brief Sets relative period since the last unsteady commit to force flush * the data buffers to disk, even of \ref MDBX_SAFE_NOSYNC flag in the @@ -2210,8 +2383,10 @@ LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold); * the last unsteady commit. * * \returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_set_syncperiod(MDBX_env *env, - unsigned seconds_16dot16); +LIBMDBX_INLINE_API(int, mdbx_env_set_syncperiod, + (MDBX_env * env, unsigned seconds_16dot16)) { + return mdbx_env_set_option(env, MDBX_opt_sync_period, seconds_16dot16); +} /** \brief Close the environment and release the memory map. * \ingroup c_opening @@ -2582,17 +2757,18 @@ mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags); MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t mdbx_limits_txnsize_max(intptr_t pagesize); -/** \brief Set the maximum number of threads/reader slots for the environment. - * \ingroup c_settings +/** \brief Set the maximum number of threads/reader slots for for all processes + * interacts with the database. \ingroup c_settings * - * This defines the number of slots in the lock table that is used to track - * readers in the the environment. The default is 119 for 4K system page size. - * Starting a read-only transaction normally ties a lock table slot to the - * current thread until the environment closes or the thread exits. If + * \details This defines the number of slots in the lock table that is used to + * track readers in the the environment. The default is about 100 for 4K system + * page size. Starting a read-only transaction normally ties a lock table slot + * to the current thread until the environment closes or the thread exits. If * \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin() instead ties the slot to the * \ref MDBX_txn object until it or the \ref MDBX_env object is destroyed. * This function may only be called after \ref mdbx_env_create() and before - * \ref mdbx_env_open(). + * \ref mdbx_env_open(), and has an effect only when the database is opened by + * the first process interacts with the database. * \see mdbx_env_get_maxreaders() * * \param [in] env An environment handle returned @@ -2603,7 +2779,10 @@ mdbx_limits_txnsize_max(intptr_t pagesize); * some possible errors are: * \retval MDBX_EINVAL An invalid parameter was specified. * \retval MDBX_EPERM The environment is already open. */ -LIBMDBX_API int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers); +LIBMDBX_INLINE_API(int, mdbx_env_set_maxreaders, + (MDBX_env * env, unsigned readers)) { + return mdbx_env_set_option(env, MDBX_opt_max_readers, readers); +} /** \brief Get the maximum number of threads/reader slots for the environment. * \ingroup c_statinfo @@ -2616,7 +2795,16 @@ LIBMDBX_API int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers); * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers); +LIBMDBX_INLINE_API(int, mdbx_env_get_maxreaders, + (const MDBX_env *env, unsigned *readers)) { + int rc = MDBX_EINVAL; + if (readers) { + uint64_t proxy = 0; + rc = mdbx_env_get_option(env, MDBX_opt_max_readers, &proxy); + *readers = (unsigned)proxy; + } + return rc; +} /** \brief Set the maximum number of named databases for the environment. * \ingroup c_settings @@ -2639,7 +2827,9 @@ LIBMDBX_API int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers); * some possible errors are: * \retval MDBX_EINVAL An invalid parameter was specified. * \retval MDBX_EPERM The environment is already open. */ -LIBMDBX_API int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs); +LIBMDBX_INLINE_API(int, mdbx_env_set_maxdbs, (MDBX_env * env, MDBX_dbi dbs)) { + return mdbx_env_set_option(env, MDBX_opt_max_db, dbs); +} /** \brief Get the maximum number of named databases for the environment. * \ingroup c_statinfo @@ -2651,7 +2841,16 @@ LIBMDBX_API int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs); * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs); +LIBMDBX_INLINE_API(int, mdbx_env_get_maxdbs, + (const MDBX_env *env, MDBX_dbi *dbs)) { + int rc = MDBX_EINVAL; + if (dbs) { + uint64_t proxy = 0; + rc = mdbx_env_get_option(env, MDBX_opt_max_db, &proxy); + *dbs = (MDBX_dbi)proxy; + } + return rc; +} /** \brief Get the maximum size of keys can write. * \ingroup c_statinfo diff --git a/libs/libmdbx/src/mdbx_chk.c b/libs/libmdbx/src/mdbx_chk.c index dd505dd96c..906c187347 100644 --- a/libs/libmdbx/src/mdbx_chk.c +++ b/libs/libmdbx/src/mdbx_chk.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4 +#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -361,7 +361,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define likely(cond) __builtin_expect(!!(cond), 1) # else -# define likely(x) (x) +# define likely(x) (!!(x)) # endif #endif /* likely */ @@ -369,7 +369,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define unlikely(cond) __builtin_expect(!!(cond), 0) # else -# define unlikely(x) (x) +# define unlikely(x) (!!(x)) # endif #endif /* unlikely */ @@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* DOXYGEN */ -/** Enables support for huge write-transactions */ -#ifndef MDBX_HUGE_TRANSACTIONS -#define MDBX_HUGE_TRANSACTIONS 0 -#endif /* MDBX_HUGE_TRANSACTIONS */ - /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ #define MDBX_OSX_WANNA_DURABILITY 0 /** Using fsync() with chance of data lost on power failure */ @@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ +/** Controls online database auto-compactification during write-transactions. */ +#ifndef MDBX_ENABLE_REFUND +#define MDBX_ENABLE_REFUND 1 +#endif +#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1) +#error MDBX_ENABLE_REFUND must be defined as 0 or 1 +#endif /* MDBX_ENABLE_REFUND */ + +/** Controls sort order of internal page number lists. + * The database format depend on this option and libmdbx builded with different + * option value are incompatible. */ +#ifndef MDBX_PNL_ASCENDING +#define MDBX_PNL_ASCENDING 0 +#endif +#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1) +#error MDBX_PNL_ASCENDING must be defined as 0 or 1 +#endif /* MDBX_PNL_ASCENDING */ + //------------------------------------------------------------------------------ /** Win32 File Locking API for \ref MDBX_LOCKING */ @@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t { typedef struct MDBX_meta { /* Stamp identifying this as an MDBX file. * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ - uint64_t mm_magic_and_version; + uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_a; + uint32_t mm_txnid_a[2]; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -1962,17 +1975,18 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile uint64_t mm_datasync_sign; +#define META_IS_STEADY(meta) \ + SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign)) + uint32_t mm_datasync_sign[2]; /* txnid that committed this page, the second of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_b; + uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. * This value in couple with mr_snapshot_pages_retired allows fast estimation * of "how much reader is restraining GC recycling". */ - uint64_t mm_pages_retired; + uint32_t mm_pages_retired[2]; /* The analogue /proc/sys/kernel/random/boot_id or similar to determine * whether the system was rebooted after the last use of the database files. @@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo { (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) -#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) +#define MDBX_DATA_MAGIC \ + ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) #define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) @@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo { #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) +#define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 #define MAX_MAPSIZE MAX_MAPSIZE32 +#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) #endif /* MDBX_WORDBITS */ /*----------------------------------------------------------------------------*/ -/* Two kind lists of pages (aka PNL) */ -/* An PNL is an Page Number List, a sorted array of IDs. The first element of - * the array is a counter for how many actual page-numbers are in the list. - * PNLs are sorted in descending order, this allow cut off a page with lowest - * pgno (at the tail) just truncating the list */ -#define MDBX_PNL_ASCENDING 0 +/* An PNL is an Page Number List, a sorted array of IDs. + * The first element of the array is a counter for how many actual page-numbers + * are in the list. By default PNLs are sorted in descending order, this allow + * cut off a page with lowest pgno (at the tail) just truncating the list. The + * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */ typedef pgno_t *MDBX_PNL; #if MDBX_PNL_ASCENDING @@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL; typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ -typedef union MDBX_DP { - __anonymous_struct_extension__ struct { - pgno_t pgno; - MDBX_page *ptr; - }; - __anonymous_struct_extension__ struct { - unsigned sorted; - unsigned length; - }; -} MDBX_DP; - -/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. - * The first element's length member is a count of how many actual - * elements are in the array. */ -typedef MDBX_DP *MDBX_DPL; +typedef struct MDBX_dp { + pgno_t pgno; + MDBX_page *ptr; +} MDBX_dp; + +/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ +typedef struct MDBX_dpl { + unsigned sorted; + unsigned length; + unsigned allocated; +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (!defined(__cplusplus) && defined(_MSC_VER)) + MDBX_dp items[] /* dynamic size with holes at zero and after the last */; +#endif +} MDBX_dpl; /* PNL sizes */ #define MDBX_PNL_GRANULATE 1024 #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#if MDBX_HUGE_TRANSACTIONS -#define MDBX_PNL_MAX \ - ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2) -#else -#define MDBX_PNL_MAX \ - ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4) -#endif /* MDBX_HUGE_TRANSACTIONS */ - #define MDBX_TXL_GRANULATE 32 #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) @@ -2440,14 +2446,16 @@ struct MDBX_txn { MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ +#if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; +#endif /* MDBX_ENABLE_REFUND */ /* dirtylist room: Dirty array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. */ unsigned dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_DPL dirtylist; + MDBX_dpl *dirtylist; /* The list of reclaimed txns from GC */ MDBX_TXL lifo_reclaimed; /* The list of pages that became unused during this transaction. */ @@ -2457,26 +2465,19 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; - /* Number of retired to parent pages (tw.retired2parent_pages) */ - unsigned retired2parent_count; - /* The list of parent's txn dirty pages that retired (became unused) - * in this transaction, linked through `mp_next`. */ - MDBX_page *retired2parent_pages; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; + unsigned spill_least_removed; } tw; }; }; -/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. */ #if MDBX_WORDBITS >= 64 -#define CURSOR_STACK 28 +#define CURSOR_STACK 32 #else -#define CURSOR_STACK 20 +#define CURSOR_STACK 24 #endif struct MDBX_xcursor; @@ -2606,11 +2607,9 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; - /* MDBX_DP of pages written during a write txn. */ - MDBX_DPL me_dirtylist; /* Number of freelist items that can fit in a single overflow page */ unsigned me_maxgc_ov1page; unsigned me_branch_nodemax; /* max size of a branch-node */ @@ -2623,6 +2622,17 @@ struct MDBX_env { volatile pgno_t *me_discarded_tail; volatile uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + unsigned me_dp_reserve_len; + struct { + unsigned dp_reserve_limit; + unsigned rp_augment_limit; + unsigned dp_limit; + unsigned dp_initial; + uint8_t dp_loose_limit; + uint8_t spill_max_denominator; + uint8_t spill_min_denominator; + uint8_t spill_parent4child_denominator; + } me_options; struct { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; @@ -3596,7 +3606,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR, data->iov_len); size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; - if (number < 1 || number > MDBX_PNL_MAX) + if (number < 1 || number > MDBX_PGL_LIMIT) problem_add("entry", txnid, "wrong idl length", "%" PRIuPTR, number); else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { problem_add("entry", txnid, "trimmed idl", @@ -4176,7 +4186,21 @@ int main(int argc, char *argv[]) { if (argc < 2) usage(prog); - for (int i; (i = getopt(argc, argv, "012TVvqnwctdis:")) != EOF;) { + for (int i; (i = getopt(argc, argv, + "0" + "1" + "2" + "T" + "V" + "v" + "q" + "n" + "w" + "c" + "t" + "d" + "i" + "s:")) != EOF;) { switch (i) { case 'V': printf("mdbx_chk version %d.%d.%d.%d\n" @@ -4697,7 +4721,7 @@ int main(int argc, char *argv[]) { walk.pgcount, alloc_pages - gc_pages); } if (unused_pages != gc_pages) { - error("gc pages mismatch (%" PRIu64 "(walked) != %" PRIu64 "(GC))\n", + error("gc pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", unused_pages, gc_pages); } } else if (verbose) { diff --git a/libs/libmdbx/src/mdbx_copy.c b/libs/libmdbx/src/mdbx_copy.c index cde9923308..0c831b4528 100644 --- a/libs/libmdbx/src/mdbx_copy.c +++ b/libs/libmdbx/src/mdbx_copy.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4 +#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -361,7 +361,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define likely(cond) __builtin_expect(!!(cond), 1) # else -# define likely(x) (x) +# define likely(x) (!!(x)) # endif #endif /* likely */ @@ -369,7 +369,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define unlikely(cond) __builtin_expect(!!(cond), 0) # else -# define unlikely(x) (x) +# define unlikely(x) (!!(x)) # endif #endif /* unlikely */ @@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* DOXYGEN */ -/** Enables support for huge write-transactions */ -#ifndef MDBX_HUGE_TRANSACTIONS -#define MDBX_HUGE_TRANSACTIONS 0 -#endif /* MDBX_HUGE_TRANSACTIONS */ - /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ #define MDBX_OSX_WANNA_DURABILITY 0 /** Using fsync() with chance of data lost on power failure */ @@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ +/** Controls online database auto-compactification during write-transactions. */ +#ifndef MDBX_ENABLE_REFUND +#define MDBX_ENABLE_REFUND 1 +#endif +#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1) +#error MDBX_ENABLE_REFUND must be defined as 0 or 1 +#endif /* MDBX_ENABLE_REFUND */ + +/** Controls sort order of internal page number lists. + * The database format depend on this option and libmdbx builded with different + * option value are incompatible. */ +#ifndef MDBX_PNL_ASCENDING +#define MDBX_PNL_ASCENDING 0 +#endif +#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1) +#error MDBX_PNL_ASCENDING must be defined as 0 or 1 +#endif /* MDBX_PNL_ASCENDING */ + //------------------------------------------------------------------------------ /** Win32 File Locking API for \ref MDBX_LOCKING */ @@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t { typedef struct MDBX_meta { /* Stamp identifying this as an MDBX file. * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ - uint64_t mm_magic_and_version; + uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_a; + uint32_t mm_txnid_a[2]; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -1962,17 +1975,18 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile uint64_t mm_datasync_sign; +#define META_IS_STEADY(meta) \ + SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign)) + uint32_t mm_datasync_sign[2]; /* txnid that committed this page, the second of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_b; + uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. * This value in couple with mr_snapshot_pages_retired allows fast estimation * of "how much reader is restraining GC recycling". */ - uint64_t mm_pages_retired; + uint32_t mm_pages_retired[2]; /* The analogue /proc/sys/kernel/random/boot_id or similar to determine * whether the system was rebooted after the last use of the database files. @@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo { (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) -#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) +#define MDBX_DATA_MAGIC \ + ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) #define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) @@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo { #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) +#define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 #define MAX_MAPSIZE MAX_MAPSIZE32 +#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) #endif /* MDBX_WORDBITS */ /*----------------------------------------------------------------------------*/ -/* Two kind lists of pages (aka PNL) */ -/* An PNL is an Page Number List, a sorted array of IDs. The first element of - * the array is a counter for how many actual page-numbers are in the list. - * PNLs are sorted in descending order, this allow cut off a page with lowest - * pgno (at the tail) just truncating the list */ -#define MDBX_PNL_ASCENDING 0 +/* An PNL is an Page Number List, a sorted array of IDs. + * The first element of the array is a counter for how many actual page-numbers + * are in the list. By default PNLs are sorted in descending order, this allow + * cut off a page with lowest pgno (at the tail) just truncating the list. The + * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */ typedef pgno_t *MDBX_PNL; #if MDBX_PNL_ASCENDING @@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL; typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ -typedef union MDBX_DP { - __anonymous_struct_extension__ struct { - pgno_t pgno; - MDBX_page *ptr; - }; - __anonymous_struct_extension__ struct { - unsigned sorted; - unsigned length; - }; -} MDBX_DP; - -/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. - * The first element's length member is a count of how many actual - * elements are in the array. */ -typedef MDBX_DP *MDBX_DPL; +typedef struct MDBX_dp { + pgno_t pgno; + MDBX_page *ptr; +} MDBX_dp; + +/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ +typedef struct MDBX_dpl { + unsigned sorted; + unsigned length; + unsigned allocated; +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (!defined(__cplusplus) && defined(_MSC_VER)) + MDBX_dp items[] /* dynamic size with holes at zero and after the last */; +#endif +} MDBX_dpl; /* PNL sizes */ #define MDBX_PNL_GRANULATE 1024 #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#if MDBX_HUGE_TRANSACTIONS -#define MDBX_PNL_MAX \ - ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2) -#else -#define MDBX_PNL_MAX \ - ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4) -#endif /* MDBX_HUGE_TRANSACTIONS */ - #define MDBX_TXL_GRANULATE 32 #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) @@ -2440,14 +2446,16 @@ struct MDBX_txn { MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ +#if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; +#endif /* MDBX_ENABLE_REFUND */ /* dirtylist room: Dirty array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. */ unsigned dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_DPL dirtylist; + MDBX_dpl *dirtylist; /* The list of reclaimed txns from GC */ MDBX_TXL lifo_reclaimed; /* The list of pages that became unused during this transaction. */ @@ -2457,26 +2465,19 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; - /* Number of retired to parent pages (tw.retired2parent_pages) */ - unsigned retired2parent_count; - /* The list of parent's txn dirty pages that retired (became unused) - * in this transaction, linked through `mp_next`. */ - MDBX_page *retired2parent_pages; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; + unsigned spill_least_removed; } tw; }; }; -/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. */ #if MDBX_WORDBITS >= 64 -#define CURSOR_STACK 28 +#define CURSOR_STACK 32 #else -#define CURSOR_STACK 20 +#define CURSOR_STACK 24 #endif struct MDBX_xcursor; @@ -2606,11 +2607,9 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; - /* MDBX_DP of pages written during a write txn. */ - MDBX_DPL me_dirtylist; /* Number of freelist items that can fit in a single overflow page */ unsigned me_maxgc_ov1page; unsigned me_branch_nodemax; /* max size of a branch-node */ @@ -2623,6 +2622,17 @@ struct MDBX_env { volatile pgno_t *me_discarded_tail; volatile uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + unsigned me_dp_reserve_len; + struct { + unsigned dp_reserve_limit; + unsigned rp_augment_limit; + unsigned dp_limit; + unsigned dp_initial; + uint8_t dp_loose_limit; + uint8_t spill_max_denominator; + uint8_t spill_min_denominator; + uint8_t spill_parent4child_denominator; + } me_options; struct { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; diff --git a/libs/libmdbx/src/mdbx_dump.c b/libs/libmdbx/src/mdbx_dump.c index c51247a121..6b7a5aaa40 100644 --- a/libs/libmdbx/src/mdbx_dump.c +++ b/libs/libmdbx/src/mdbx_dump.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4 +#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -361,7 +361,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define likely(cond) __builtin_expect(!!(cond), 1) # else -# define likely(x) (x) +# define likely(x) (!!(x)) # endif #endif /* likely */ @@ -369,7 +369,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define unlikely(cond) __builtin_expect(!!(cond), 0) # else -# define unlikely(x) (x) +# define unlikely(x) (!!(x)) # endif #endif /* unlikely */ @@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* DOXYGEN */ -/** Enables support for huge write-transactions */ -#ifndef MDBX_HUGE_TRANSACTIONS -#define MDBX_HUGE_TRANSACTIONS 0 -#endif /* MDBX_HUGE_TRANSACTIONS */ - /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ #define MDBX_OSX_WANNA_DURABILITY 0 /** Using fsync() with chance of data lost on power failure */ @@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ +/** Controls online database auto-compactification during write-transactions. */ +#ifndef MDBX_ENABLE_REFUND +#define MDBX_ENABLE_REFUND 1 +#endif +#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1) +#error MDBX_ENABLE_REFUND must be defined as 0 or 1 +#endif /* MDBX_ENABLE_REFUND */ + +/** Controls sort order of internal page number lists. + * The database format depend on this option and libmdbx builded with different + * option value are incompatible. */ +#ifndef MDBX_PNL_ASCENDING +#define MDBX_PNL_ASCENDING 0 +#endif +#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1) +#error MDBX_PNL_ASCENDING must be defined as 0 or 1 +#endif /* MDBX_PNL_ASCENDING */ + //------------------------------------------------------------------------------ /** Win32 File Locking API for \ref MDBX_LOCKING */ @@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t { typedef struct MDBX_meta { /* Stamp identifying this as an MDBX file. * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ - uint64_t mm_magic_and_version; + uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_a; + uint32_t mm_txnid_a[2]; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -1962,17 +1975,18 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile uint64_t mm_datasync_sign; +#define META_IS_STEADY(meta) \ + SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign)) + uint32_t mm_datasync_sign[2]; /* txnid that committed this page, the second of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_b; + uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. * This value in couple with mr_snapshot_pages_retired allows fast estimation * of "how much reader is restraining GC recycling". */ - uint64_t mm_pages_retired; + uint32_t mm_pages_retired[2]; /* The analogue /proc/sys/kernel/random/boot_id or similar to determine * whether the system was rebooted after the last use of the database files. @@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo { (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) -#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) +#define MDBX_DATA_MAGIC \ + ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) #define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) @@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo { #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) +#define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 #define MAX_MAPSIZE MAX_MAPSIZE32 +#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) #endif /* MDBX_WORDBITS */ /*----------------------------------------------------------------------------*/ -/* Two kind lists of pages (aka PNL) */ -/* An PNL is an Page Number List, a sorted array of IDs. The first element of - * the array is a counter for how many actual page-numbers are in the list. - * PNLs are sorted in descending order, this allow cut off a page with lowest - * pgno (at the tail) just truncating the list */ -#define MDBX_PNL_ASCENDING 0 +/* An PNL is an Page Number List, a sorted array of IDs. + * The first element of the array is a counter for how many actual page-numbers + * are in the list. By default PNLs are sorted in descending order, this allow + * cut off a page with lowest pgno (at the tail) just truncating the list. The + * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */ typedef pgno_t *MDBX_PNL; #if MDBX_PNL_ASCENDING @@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL; typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ -typedef union MDBX_DP { - __anonymous_struct_extension__ struct { - pgno_t pgno; - MDBX_page *ptr; - }; - __anonymous_struct_extension__ struct { - unsigned sorted; - unsigned length; - }; -} MDBX_DP; - -/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. - * The first element's length member is a count of how many actual - * elements are in the array. */ -typedef MDBX_DP *MDBX_DPL; +typedef struct MDBX_dp { + pgno_t pgno; + MDBX_page *ptr; +} MDBX_dp; + +/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ +typedef struct MDBX_dpl { + unsigned sorted; + unsigned length; + unsigned allocated; +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (!defined(__cplusplus) && defined(_MSC_VER)) + MDBX_dp items[] /* dynamic size with holes at zero and after the last */; +#endif +} MDBX_dpl; /* PNL sizes */ #define MDBX_PNL_GRANULATE 1024 #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#if MDBX_HUGE_TRANSACTIONS -#define MDBX_PNL_MAX \ - ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2) -#else -#define MDBX_PNL_MAX \ - ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4) -#endif /* MDBX_HUGE_TRANSACTIONS */ - #define MDBX_TXL_GRANULATE 32 #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) @@ -2440,14 +2446,16 @@ struct MDBX_txn { MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ +#if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; +#endif /* MDBX_ENABLE_REFUND */ /* dirtylist room: Dirty array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. */ unsigned dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_DPL dirtylist; + MDBX_dpl *dirtylist; /* The list of reclaimed txns from GC */ MDBX_TXL lifo_reclaimed; /* The list of pages that became unused during this transaction. */ @@ -2457,26 +2465,19 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; - /* Number of retired to parent pages (tw.retired2parent_pages) */ - unsigned retired2parent_count; - /* The list of parent's txn dirty pages that retired (became unused) - * in this transaction, linked through `mp_next`. */ - MDBX_page *retired2parent_pages; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; + unsigned spill_least_removed; } tw; }; }; -/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. */ #if MDBX_WORDBITS >= 64 -#define CURSOR_STACK 28 +#define CURSOR_STACK 32 #else -#define CURSOR_STACK 20 +#define CURSOR_STACK 24 #endif struct MDBX_xcursor; @@ -2606,11 +2607,9 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; - /* MDBX_DP of pages written during a write txn. */ - MDBX_DPL me_dirtylist; /* Number of freelist items that can fit in a single overflow page */ unsigned me_maxgc_ov1page; unsigned me_branch_nodemax; /* max size of a branch-node */ @@ -2623,6 +2622,17 @@ struct MDBX_env { volatile pgno_t *me_discarded_tail; volatile uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + unsigned me_dp_reserve_len; + struct { + unsigned dp_reserve_limit; + unsigned rp_augment_limit; + unsigned dp_limit; + unsigned dp_initial; + uint8_t dp_loose_limit; + uint8_t spill_max_denominator; + uint8_t spill_min_denominator; + uint8_t spill_parent4child_denominator; + } me_options; struct { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; @@ -3370,7 +3380,16 @@ int main(int argc, char *argv[]) { if (argc < 2) usage(); - while ((i = getopt(argc, argv, "af:lnps:Vrq")) != EOF) { + while ((i = getopt(argc, argv, + "a" + "f:" + "l" + "n" + "p" + "s:" + "V" + "r" + "q")) != EOF) { switch (i) { case 'V': printf("mdbx_dump version %d.%d.%d.%d\n" diff --git a/libs/libmdbx/src/mdbx_load.c b/libs/libmdbx/src/mdbx_load.c index d1ac7e9238..ad9ddc90e6 100644 --- a/libs/libmdbx/src/mdbx_load.c +++ b/libs/libmdbx/src/mdbx_load.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4 +#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -361,7 +361,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define likely(cond) __builtin_expect(!!(cond), 1) # else -# define likely(x) (x) +# define likely(x) (!!(x)) # endif #endif /* likely */ @@ -369,7 +369,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define unlikely(cond) __builtin_expect(!!(cond), 0) # else -# define unlikely(x) (x) +# define unlikely(x) (!!(x)) # endif #endif /* unlikely */ @@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* DOXYGEN */ -/** Enables support for huge write-transactions */ -#ifndef MDBX_HUGE_TRANSACTIONS -#define MDBX_HUGE_TRANSACTIONS 0 -#endif /* MDBX_HUGE_TRANSACTIONS */ - /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ #define MDBX_OSX_WANNA_DURABILITY 0 /** Using fsync() with chance of data lost on power failure */ @@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ +/** Controls online database auto-compactification during write-transactions. */ +#ifndef MDBX_ENABLE_REFUND +#define MDBX_ENABLE_REFUND 1 +#endif +#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1) +#error MDBX_ENABLE_REFUND must be defined as 0 or 1 +#endif /* MDBX_ENABLE_REFUND */ + +/** Controls sort order of internal page number lists. + * The database format depend on this option and libmdbx builded with different + * option value are incompatible. */ +#ifndef MDBX_PNL_ASCENDING +#define MDBX_PNL_ASCENDING 0 +#endif +#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1) +#error MDBX_PNL_ASCENDING must be defined as 0 or 1 +#endif /* MDBX_PNL_ASCENDING */ + //------------------------------------------------------------------------------ /** Win32 File Locking API for \ref MDBX_LOCKING */ @@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t { typedef struct MDBX_meta { /* Stamp identifying this as an MDBX file. * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ - uint64_t mm_magic_and_version; + uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_a; + uint32_t mm_txnid_a[2]; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -1962,17 +1975,18 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile uint64_t mm_datasync_sign; +#define META_IS_STEADY(meta) \ + SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign)) + uint32_t mm_datasync_sign[2]; /* txnid that committed this page, the second of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_b; + uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. * This value in couple with mr_snapshot_pages_retired allows fast estimation * of "how much reader is restraining GC recycling". */ - uint64_t mm_pages_retired; + uint32_t mm_pages_retired[2]; /* The analogue /proc/sys/kernel/random/boot_id or similar to determine * whether the system was rebooted after the last use of the database files. @@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo { (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) -#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) +#define MDBX_DATA_MAGIC \ + ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) #define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) @@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo { #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) +#define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 #define MAX_MAPSIZE MAX_MAPSIZE32 +#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) #endif /* MDBX_WORDBITS */ /*----------------------------------------------------------------------------*/ -/* Two kind lists of pages (aka PNL) */ -/* An PNL is an Page Number List, a sorted array of IDs. The first element of - * the array is a counter for how many actual page-numbers are in the list. - * PNLs are sorted in descending order, this allow cut off a page with lowest - * pgno (at the tail) just truncating the list */ -#define MDBX_PNL_ASCENDING 0 +/* An PNL is an Page Number List, a sorted array of IDs. + * The first element of the array is a counter for how many actual page-numbers + * are in the list. By default PNLs are sorted in descending order, this allow + * cut off a page with lowest pgno (at the tail) just truncating the list. The + * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */ typedef pgno_t *MDBX_PNL; #if MDBX_PNL_ASCENDING @@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL; typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ -typedef union MDBX_DP { - __anonymous_struct_extension__ struct { - pgno_t pgno; - MDBX_page *ptr; - }; - __anonymous_struct_extension__ struct { - unsigned sorted; - unsigned length; - }; -} MDBX_DP; - -/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. - * The first element's length member is a count of how many actual - * elements are in the array. */ -typedef MDBX_DP *MDBX_DPL; +typedef struct MDBX_dp { + pgno_t pgno; + MDBX_page *ptr; +} MDBX_dp; + +/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ +typedef struct MDBX_dpl { + unsigned sorted; + unsigned length; + unsigned allocated; +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (!defined(__cplusplus) && defined(_MSC_VER)) + MDBX_dp items[] /* dynamic size with holes at zero and after the last */; +#endif +} MDBX_dpl; /* PNL sizes */ #define MDBX_PNL_GRANULATE 1024 #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#if MDBX_HUGE_TRANSACTIONS -#define MDBX_PNL_MAX \ - ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2) -#else -#define MDBX_PNL_MAX \ - ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4) -#endif /* MDBX_HUGE_TRANSACTIONS */ - #define MDBX_TXL_GRANULATE 32 #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) @@ -2440,14 +2446,16 @@ struct MDBX_txn { MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ +#if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; +#endif /* MDBX_ENABLE_REFUND */ /* dirtylist room: Dirty array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. */ unsigned dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_DPL dirtylist; + MDBX_dpl *dirtylist; /* The list of reclaimed txns from GC */ MDBX_TXL lifo_reclaimed; /* The list of pages that became unused during this transaction. */ @@ -2457,26 +2465,19 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; - /* Number of retired to parent pages (tw.retired2parent_pages) */ - unsigned retired2parent_count; - /* The list of parent's txn dirty pages that retired (became unused) - * in this transaction, linked through `mp_next`. */ - MDBX_page *retired2parent_pages; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; + unsigned spill_least_removed; } tw; }; }; -/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. */ #if MDBX_WORDBITS >= 64 -#define CURSOR_STACK 28 +#define CURSOR_STACK 32 #else -#define CURSOR_STACK 20 +#define CURSOR_STACK 24 #endif struct MDBX_xcursor; @@ -2606,11 +2607,9 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; - /* MDBX_DP of pages written during a write txn. */ - MDBX_DPL me_dirtylist; /* Number of freelist items that can fit in a single overflow page */ unsigned me_maxgc_ov1page; unsigned me_branch_nodemax; /* max size of a branch-node */ @@ -2623,6 +2622,17 @@ struct MDBX_env { volatile pgno_t *me_discarded_tail; volatile uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + unsigned me_dp_reserve_len; + struct { + unsigned dp_reserve_limit; + unsigned rp_augment_limit; + unsigned dp_limit; + unsigned dp_initial; + uint8_t dp_loose_limit; + uint8_t spill_max_denominator; + uint8_t spill_min_denominator; + uint8_t spill_parent4child_denominator; + } me_options; struct { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; @@ -3566,16 +3576,17 @@ static int readline(MDBX_val *out, MDBX_val *buf) { static void usage(void) { fprintf(stderr, - "usage: %s [-V] [-q] [-a] [-f file] [-s name] [-N] [-T] [-r] [-n]" - " dbpath\n" + "usage: %s " + "[-V] [-q] [-a] [-f file] [-s name] [-N] [-p] [-T] [-r] [-n] dbpath\n" " -V\t\tprint version and exit\n" " -q\t\tbe quiet\n" " -a\t\tappend records in input order (required for custom " "comparators)\n" " -f file\tread from file instead of stdin\n" " -s name\tload into named subDB\n" - " -N\t\tdon't overwrite existing records when loading (), just skip " - "them\n" + " -N\t\tdon't overwrite existing records when loading, just skip " + "ones\n" + " -p\t\tpurge subDB before loading\n" " -T\t\tread plaintext\n" " -r\t\trescue mode (ignore errors to load corrupted DB dump)\n" " -n\t\tdon't use subdirectory for newly created database " @@ -3598,15 +3609,26 @@ int main(int argc, char *argv[]) { MDBX_cursor *mc = nullptr; MDBX_dbi dbi; char *envname = nullptr; - int envflags = MDBX_UTTERLY_NOSYNC, putflags = MDBX_UPSERT; + int envflags = MDBX_SAFE_NOSYNC | MDBX_ACCEDE, putflags = MDBX_UPSERT; bool quiet = false; bool rescue = false; + bool purge = false; prog = argv[0]; if (argc < 2) usage(); - while ((i = getopt(argc, argv, "af:ns:NTVrq")) != EOF) { + while ((i = getopt(argc, argv, + "a" + "f:" + "n" + "s:" + "N" + "p" + "T" + "V" + "r" + "q")) != EOF) { switch (i) { case 'V': printf("mdbx_load version %d.%d.%d.%d\n" @@ -3641,6 +3663,9 @@ int main(int argc, char *argv[]) { case 'N': putflags |= MDBX_NOOVERWRITE | MDBX_NODUPDATA; break; + case 'p': + purge = true; + break; case 'T': mode |= NOHDR | PRINT; break; @@ -3814,6 +3839,14 @@ int main(int argc, char *argv[]) { } } + if (purge) { + rc = mdbx_drop(txn, dbi, false); + if (unlikely(rc != MDBX_SUCCESS)) { + error("mdbx_drop", rc); + goto txn_abort; + } + } + if (putflags & MDBX_APPEND) putflags = (dbi_flags & MDBX_DUPSORT) ? putflags | MDBX_APPENDDUP : putflags & ~MDBX_APPENDDUP; diff --git a/libs/libmdbx/src/mdbx_stat.c b/libs/libmdbx/src/mdbx_stat.c index aa5d501728..d19c672ff0 100644 --- a/libs/libmdbx/src/mdbx_stat.c +++ b/libs/libmdbx/src/mdbx_stat.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4 +#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -361,7 +361,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define likely(cond) __builtin_expect(!!(cond), 1) # else -# define likely(x) (x) +# define likely(x) (!!(x)) # endif #endif /* likely */ @@ -369,7 +369,7 @@ # if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) # define unlikely(cond) __builtin_expect(!!(cond), 0) # else -# define unlikely(x) (x) +# define unlikely(x) (!!(x)) # endif #endif /* unlikely */ @@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif /* DOXYGEN */ -/** Enables support for huge write-transactions */ -#ifndef MDBX_HUGE_TRANSACTIONS -#define MDBX_HUGE_TRANSACTIONS 0 -#endif /* MDBX_HUGE_TRANSACTIONS */ - /** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ #define MDBX_OSX_WANNA_DURABILITY 0 /** Using fsync() with chance of data lost on power failure */ @@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ +/** Controls online database auto-compactification during write-transactions. */ +#ifndef MDBX_ENABLE_REFUND +#define MDBX_ENABLE_REFUND 1 +#endif +#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1) +#error MDBX_ENABLE_REFUND must be defined as 0 or 1 +#endif /* MDBX_ENABLE_REFUND */ + +/** Controls sort order of internal page number lists. + * The database format depend on this option and libmdbx builded with different + * option value are incompatible. */ +#ifndef MDBX_PNL_ASCENDING +#define MDBX_PNL_ASCENDING 0 +#endif +#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1) +#error MDBX_PNL_ASCENDING must be defined as 0 or 1 +#endif /* MDBX_PNL_ASCENDING */ + //------------------------------------------------------------------------------ /** Win32 File Locking API for \ref MDBX_LOCKING */ @@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t { typedef struct MDBX_meta { /* Stamp identifying this as an MDBX file. * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ - uint64_t mm_magic_and_version; + uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_a; + uint32_t mm_txnid_a[2]; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -1962,17 +1975,18 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile uint64_t mm_datasync_sign; +#define META_IS_STEADY(meta) \ + SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign)) + uint32_t mm_datasync_sign[2]; /* txnid that committed this page, the second of a two-phase-update pair */ - mdbx_safe64_t mm_txnid_b; + uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. * This value in couple with mr_snapshot_pages_retired allows fast estimation * of "how much reader is restraining GC recycling". */ - uint64_t mm_pages_retired; + uint32_t mm_pages_retired[2]; /* The analogue /proc/sys/kernel/random/boot_id or similar to determine * whether the system was rebooted after the last use of the database files. @@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo { (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) -#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) +#define MDBX_DATA_MAGIC \ + ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) #define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) @@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo { #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) +#define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 #define MAX_MAPSIZE MAX_MAPSIZE32 +#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) #endif /* MDBX_WORDBITS */ /*----------------------------------------------------------------------------*/ -/* Two kind lists of pages (aka PNL) */ -/* An PNL is an Page Number List, a sorted array of IDs. The first element of - * the array is a counter for how many actual page-numbers are in the list. - * PNLs are sorted in descending order, this allow cut off a page with lowest - * pgno (at the tail) just truncating the list */ -#define MDBX_PNL_ASCENDING 0 +/* An PNL is an Page Number List, a sorted array of IDs. + * The first element of the array is a counter for how many actual page-numbers + * are in the list. By default PNLs are sorted in descending order, this allow + * cut off a page with lowest pgno (at the tail) just truncating the list. The + * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */ typedef pgno_t *MDBX_PNL; #if MDBX_PNL_ASCENDING @@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL; typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ -typedef union MDBX_DP { - __anonymous_struct_extension__ struct { - pgno_t pgno; - MDBX_page *ptr; - }; - __anonymous_struct_extension__ struct { - unsigned sorted; - unsigned length; - }; -} MDBX_DP; - -/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. - * The first element's length member is a count of how many actual - * elements are in the array. */ -typedef MDBX_DP *MDBX_DPL; +typedef struct MDBX_dp { + pgno_t pgno; + MDBX_page *ptr; +} MDBX_dp; + +/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ +typedef struct MDBX_dpl { + unsigned sorted; + unsigned length; + unsigned allocated; +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (!defined(__cplusplus) && defined(_MSC_VER)) + MDBX_dp items[] /* dynamic size with holes at zero and after the last */; +#endif +} MDBX_dpl; /* PNL sizes */ #define MDBX_PNL_GRANULATE 1024 #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#if MDBX_HUGE_TRANSACTIONS -#define MDBX_PNL_MAX \ - ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2) -#else -#define MDBX_PNL_MAX \ - ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4) -#endif /* MDBX_HUGE_TRANSACTIONS */ - #define MDBX_TXL_GRANULATE 32 #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) @@ -2440,14 +2446,16 @@ struct MDBX_txn { MDBX_cursor **cursors; pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ txnid_t last_reclaimed; /* ID of last used record */ +#if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; +#endif /* MDBX_ENABLE_REFUND */ /* dirtylist room: Dirty array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. */ unsigned dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_DPL dirtylist; + MDBX_dpl *dirtylist; /* The list of reclaimed txns from GC */ MDBX_TXL lifo_reclaimed; /* The list of pages that became unused during this transaction. */ @@ -2457,26 +2465,19 @@ struct MDBX_txn { MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ unsigned loose_count; - /* Number of retired to parent pages (tw.retired2parent_pages) */ - unsigned retired2parent_count; - /* The list of parent's txn dirty pages that retired (became unused) - * in this transaction, linked through `mp_next`. */ - MDBX_page *retired2parent_pages; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ MDBX_PNL spill_pages; + unsigned spill_least_removed; } tw; }; }; -/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. */ #if MDBX_WORDBITS >= 64 -#define CURSOR_STACK 28 +#define CURSOR_STACK 32 #else -#define CURSOR_STACK 20 +#define CURSOR_STACK 24 #endif struct MDBX_xcursor; @@ -2606,11 +2607,9 @@ struct MDBX_env { uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; - /* MDBX_DP of pages written during a write txn. */ - MDBX_DPL me_dirtylist; /* Number of freelist items that can fit in a single overflow page */ unsigned me_maxgc_ov1page; unsigned me_branch_nodemax; /* max size of a branch-node */ @@ -2623,6 +2622,17 @@ struct MDBX_env { volatile pgno_t *me_discarded_tail; volatile uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + unsigned me_dp_reserve_len; + struct { + unsigned dp_reserve_limit; + unsigned rp_augment_limit; + unsigned dp_limit; + unsigned dp_initial; + uint8_t dp_loose_limit; + uint8_t spill_max_denominator; + uint8_t spill_min_denominator; + uint8_t spill_parent4child_denominator; + } me_options; struct { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; @@ -3225,7 +3235,14 @@ int main(int argc, char *argv[]) { if (argc < 2) usage(prog); - while ((o = getopt(argc, argv, "Vaefnrs:")) != EOF) { + while ((o = getopt(argc, argv, + "V" + "a" + "e" + "f" + "n" + "r" + "s:")) != EOF) { switch (o) { case 'V': printf("mdbx_stat version %d.%d.%d.%d\n" |