summaryrefslogtreecommitdiff
path: root/libs/libmdbx
diff options
context:
space:
mode:
authorGeorge Hazan <ghazan@miranda.im>2021-01-23 12:25:21 +0300
committerGeorge Hazan <ghazan@miranda.im>2021-01-23 12:25:28 +0300
commit19dfb93282192821ad6d555fc80bcd9732f0b0dc (patch)
treed75fd4ab4719d22e4afb0844d31ff1ceadd9f356 /libs/libmdbx
parent114662b245aac5d6c15db73d7dd75dfc429dc88a (diff)
libmdbx: upgrade to 0.9.2.110
Diffstat (limited to 'libs/libmdbx')
-rw-r--r--libs/libmdbx/src/CMakeLists.txt1
-rw-r--r--libs/libmdbx/src/ChangeLog.md36
-rw-r--r--libs/libmdbx/src/GNUmakefile2
-rw-r--r--libs/libmdbx/src/VERSION2
-rw-r--r--libs/libmdbx/src/cmake/compiler.cmake12
-rw-r--r--libs/libmdbx/src/config.h.in2
-rw-r--r--libs/libmdbx/src/mdbx.c3224
-rw-r--r--libs/libmdbx/src/mdbx.c++130
-rw-r--r--libs/libmdbx/src/mdbx.h229
-rw-r--r--libs/libmdbx/src/mdbx_chk.c150
-rw-r--r--libs/libmdbx/src/mdbx_copy.c130
-rw-r--r--libs/libmdbx/src/mdbx_dump.c141
-rw-r--r--libs/libmdbx/src/mdbx_load.c165
-rw-r--r--libs/libmdbx/src/mdbx_stat.c139
14 files changed, 2836 insertions, 1527 deletions
diff --git a/libs/libmdbx/src/CMakeLists.txt b/libs/libmdbx/src/CMakeLists.txt
index 789fc4685a..ff21fb5e0a 100644
--- a/libs/libmdbx/src/CMakeLists.txt
+++ b/libs/libmdbx/src/CMakeLists.txt
@@ -438,7 +438,6 @@ add_mdbx_option(MDBX_BUILD_TOOLS "Build MDBX tools (mdbx_chk/stat/dump/load/copy
CMAKE_DEPENDENT_OPTION(MDBX_INSTALL_MANPAGES "Install man-pages for MDBX tools (mdbx_chk/stat/dump/load/copy)" ON MDBX_BUILD_TOOLS OFF)
add_mdbx_option(MDBX_TXN_CHECKOWNER "Checking transaction matches the calling thread inside libmdbx's API" ON)
add_mdbx_option(MDBX_ENV_CHECKPID "Paranoid checking PID inside libmdbx's API" AUTO)
-add_mdbx_option(MDBX_HUGE_TRANSACTIONS "Support for huge write-transactions" OFF)
mark_as_advanced(MDBX_ENV_CHECKPID)
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
add_mdbx_option(MDBX_DISABLE_GNU_SOURCE "Don't use GNU/Linux libc extensions" OFF)
diff --git a/libs/libmdbx/src/ChangeLog.md b/libs/libmdbx/src/ChangeLog.md
index 66c5de3705..1d79b89061 100644
--- a/libs/libmdbx/src/ChangeLog.md
+++ b/libs/libmdbx/src/ChangeLog.md
@@ -1,7 +1,7 @@
ChangeLog
---------
-## v0.9.3 (in development)
+## v0.9.3 (in development) scheduled at 2021-02-23
TODO:
- Engage new terminology (https://github.com/erthink/libmdbx/issues/137).
@@ -13,18 +13,48 @@ TODO:
- Packages for ROSA Linux, ALT Linux, Fedora/RHEL, Debian/Ubuntu.
Acknowledgements:
+
- Mahlon E. Smith (http://www.martini.nu/) for FreeBSD port of libmdbx.
- 장세연 (http://www.castis.com) for bug fixing and PR.
-Added features:
+Removed options and features:
+
+ - Drop `MDBX_HUGE_TRANSACTIONS` build-option (now no longer required).
+
+New features:
+
- Package for FreeBSD is available now by Mahlon E. Smith.
+ - New API functions to get/set various options (https://github.com/erthink/libmdbx/issues/128).
+ - the maximum number of named databases for the environment;
+ - the maximum number of threads/reader slots;
+ - threshold (since the last unsteady commit) to force flush the data buffers to disk;
+ - relative period (since the last unsteady commit) to force flush the data buffers to disk;
+ - limit to grow a list of reclaimed/recycled page's numbers for finding a sequence of contiguous pages for large data items;
+ - limit to grow a cache of dirty pages for reuse in the current transaction;
+ - limit of a pre-allocated memory items for dirty pages;
+ - limit of dirty pages for a write transaction;
+ - initial allocation size for dirty pages list of a write transaction;
+ - maximal part of the dirty pages may be spilled when necessary;
+ - minimal part of the dirty pages should be spilled when necessary;
+ - how much of the parent transaction dirty pages will be spilled while start each child transaction;
+ - Unlimited/Dynamic size of retired and dirty page lists (https://github.com/erthink/libmdbx/issues/123).
+ - Added `-p` option (purge subDB before loading) to `mdbx_load` tool.
+ - Reworked spilling of large transaction and committing of nested transactions.
+ - Added `MDBX_ENABLE_REFUND` and `MDBX_PNL_ASCENDING` internal/advanced build options.
Fixes:
+
- Fixed missing cleanup (null assigned) in the C++ commit/abort (https://github.com/erthink/libmdbx/pull/143).
- Fixed `mdbx_realloc()` for case of nullptr and `MDBX_AVOID_CRT=ON` for Windows.
+ - Fixed the possibility to use invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://github.com/erthink/libmdbx/issues/146).
+ - Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://github.com/erthink/libmdbx/issues/153).
+ - Fixed minor/potential memory leak during page flushing and unspilling.
+ - Fixed handling of cursors' and subDBs' states for nested transactions.
+ - Fixed a page leak in the extra rare case when the list of retired pages changed during GC update on transaction commit.
+ - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://github.com/erthink/libmdbx/issues/153).
-## v0.9.2 scheduled at 2020-11-27
+## v0.9.2 at 2020-11-27
Acknowledgements:
diff --git a/libs/libmdbx/src/GNUmakefile b/libs/libmdbx/src/GNUmakefile
index b14000f5f3..da781606ee 100644
--- a/libs/libmdbx/src/GNUmakefile
+++ b/libs/libmdbx/src/GNUmakefile
@@ -72,7 +72,7 @@ strip: all
strip libmdbx.$(SO_SUFFIX) $(TOOLS)
clean:
- rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *~ tmp.db/* \
+ rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *.$(SO_SUFFIX) *.dSYM *~ tmp.db/* \
*.gcov *.log *.err src/*.o test/*.o mdbx_example dist \
config.h src/config.h src/version.c *.tar*
diff --git a/libs/libmdbx/src/VERSION b/libs/libmdbx/src/VERSION
index 418c133552..b57017e6e7 100644
--- a/libs/libmdbx/src/VERSION
+++ b/libs/libmdbx/src/VERSION
@@ -1 +1 @@
-0.9.2.12
+0.9.2.110
diff --git a/libs/libmdbx/src/cmake/compiler.cmake b/libs/libmdbx/src/cmake/compiler.cmake
index ab72b897d1..22324a64f8 100644
--- a/libs/libmdbx/src/cmake/compiler.cmake
+++ b/libs/libmdbx/src/cmake/compiler.cmake
@@ -533,7 +533,7 @@ macro(setup_compile_flags)
add_compile_flags("C;CXX" "-fexceptions")
endif()
if(CC_HAS_FCXX_EXCEPTIONS)
- add_compile_flags("CXX" "-fcxx-exceptions -frtti")
+ add_compile_flags("CXX" "-fcxx-exceptions" "-frtti")
endif()
if(MSVC)
# checks for /EHa or /clr options exists,
@@ -583,13 +583,13 @@ macro(setup_compile_flags)
endif()
if(CC_HAS_WNO_UNKNOWN_PRAGMAS AND NOT HAVE_OPENMP)
- add_compile_flags("C;CXX" -Wno-unknown-pragmas)
+ add_compile_flags("C;CXX" "-Wno-unknown-pragmas")
endif()
if(CC_HAS_SECTIONS)
- add_compile_flags("C;CXX" -ffunction-sections -fdata-sections)
+ add_compile_flags("C;CXX" "-ffunction-sections" "-fdata-sections")
elseif(MSVC)
- add_compile_flags("C;CXX" /Gy)
+ add_compile_flags("C;CXX" "/Gy")
endif()
# We must set -fno-omit-frame-pointer here, since we rely
@@ -649,11 +649,11 @@ macro(setup_compile_flags)
endif()
if(ENABLE_ASAN)
- add_compile_flags("C;CXX" -fsanitize=address)
+ add_compile_flags("C;CXX" "-fsanitize=address")
endif()
if(ENABLE_UBSAN)
- add_compile_flags("C;CXX" -fsanitize=undefined)
+ add_compile_flags("C;CXX" "-fsanitize=undefined" "-fsanitize-undefined-trap-on-error")
endif()
if(ENABLE_GCOV)
diff --git a/libs/libmdbx/src/config.h.in b/libs/libmdbx/src/config.h.in
index 5048d4e021..bb3eea07b4 100644
--- a/libs/libmdbx/src/config.h.in
+++ b/libs/libmdbx/src/config.h.in
@@ -9,10 +9,10 @@
#cmakedefine ENABLE_GPROF
#cmakedefine ENABLE_GCOV
#cmakedefine ENABLE_ASAN
+#cmakedefine ENABLE_UBSAN
#cmakedefine MDBX_FORCE_ASSERTIONS
/* Common */
-#cmakedefine01 MDBX_HUGE_TRANSACTIONS
#cmakedefine01 MDBX_TXN_CHECKOWNER
#cmakedefine MDBX_ENV_CHECKPID_AUTO
#ifndef MDBX_ENV_CHECKPID_AUTO
diff --git a/libs/libmdbx/src/mdbx.c b/libs/libmdbx/src/mdbx.c
index 7901b0b90d..895c263eef 100644
--- a/libs/libmdbx/src/mdbx.c
+++ b/libs/libmdbx/src/mdbx.c
@@ -12,7 +12,7 @@
* <http://www.OpenLDAP.org/license.html>. */
#define MDBX_ALLOY 1
-#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4
+#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@@ -339,7 +339,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define likely(cond) __builtin_expect(!!(cond), 1)
# else
-# define likely(x) (x)
+# define likely(x) (!!(x))
# endif
#endif /* likely */
@@ -347,7 +347,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define unlikely(cond) __builtin_expect(!!(cond), 0)
# else
-# define unlikely(x) (x)
+# define unlikely(x) (!!(x))
# endif
#endif /* unlikely */
@@ -1542,11 +1542,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* DOXYGEN */
-/** Enables support for huge write-transactions */
-#ifndef MDBX_HUGE_TRANSACTIONS
-#define MDBX_HUGE_TRANSACTIONS 0
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
/** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0
/** Using fsync() with chance of data lost on power failure */
@@ -1596,6 +1591,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC)
#endif /* MDBX_TRUST_RTC */
+/** Controls online database auto-compactification during write-transactions. */
+#ifndef MDBX_ENABLE_REFUND
+#define MDBX_ENABLE_REFUND 1
+#endif
+#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
+#error MDBX_ENABLE_REFUND must be defined as 0 or 1
+#endif /* MDBX_ENABLE_REFUND */
+
+/** Controls sort order of internal page number lists.
+ * The database format depend on this option and libmdbx builded with different
+ * option value are incompatible. */
+#ifndef MDBX_PNL_ASCENDING
+#define MDBX_PNL_ASCENDING 0
+#endif
+#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
+#error MDBX_PNL_ASCENDING must be defined as 0 or 1
+#endif /* MDBX_PNL_ASCENDING */
+
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@@ -1917,10 +1930,10 @@ typedef struct mdbx_geo_t {
typedef struct MDBX_meta {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
- uint64_t mm_magic_and_version;
+ uint32_t mm_magic_and_version[2];
/* txnid that committed this page, the first of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_a;
+ uint32_t mm_txnid_a[2];
uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */
uint8_t mm_validator_id; /* ID of checksum and page validation method,
@@ -1940,17 +1953,18 @@ typedef struct MDBX_meta {
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
-#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign)
- volatile uint64_t mm_datasync_sign;
+#define META_IS_STEADY(meta) \
+ SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
+ uint32_t mm_datasync_sign[2];
/* txnid that committed this page, the second of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_b;
+ uint32_t mm_txnid_b[2];
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
* DB was previously handled by libmdbx without corresponding feature.
* This value in couple with mr_snapshot_pages_retired allows fast estimation
* of "how much reader is restraining GC recycling". */
- uint64_t mm_pages_retired;
+ uint32_t mm_pages_retired[2];
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
* whether the system was rebooted after the last use of the database files.
@@ -2199,7 +2213,8 @@ typedef struct MDBX_lockinfo {
(unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \
(unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
-#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
+#define MDBX_DATA_MAGIC \
+ ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
@@ -2237,19 +2252,20 @@ typedef struct MDBX_lockinfo {
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_READERS_LIMIT \
((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader))
+#define MDBX_PGL_LIMIT MAX_PAGENO
#else
#define MDBX_READERS_LIMIT 1024
#define MAX_MAPSIZE MAX_MAPSIZE32
+#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */
/*----------------------------------------------------------------------------*/
-/* Two kind lists of pages (aka PNL) */
-/* An PNL is an Page Number List, a sorted array of IDs. The first element of
- * the array is a counter for how many actual page-numbers are in the list.
- * PNLs are sorted in descending order, this allow cut off a page with lowest
- * pgno (at the tail) just truncating the list */
-#define MDBX_PNL_ASCENDING 0
+/* An PNL is an Page Number List, a sorted array of IDs.
+ * The first element of the array is a counter for how many actual page-numbers
+ * are in the list. By default PNLs are sorted in descending order, this allow
+ * cut off a page with lowest pgno (at the tail) just truncating the list. The
+ * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */
typedef pgno_t *MDBX_PNL;
#if MDBX_PNL_ASCENDING
@@ -2264,37 +2280,27 @@ typedef pgno_t *MDBX_PNL;
typedef txnid_t *MDBX_TXL;
/* An Dirty-Page list item is an pgno/pointer pair. */
-typedef union MDBX_DP {
- __anonymous_struct_extension__ struct {
- pgno_t pgno;
- MDBX_page *ptr;
- };
- __anonymous_struct_extension__ struct {
- unsigned sorted;
- unsigned length;
- };
-} MDBX_DP;
-
-/* An DPL (dirty-page list) is a sorted array of MDBX_DPs.
- * The first element's length member is a count of how many actual
- * elements are in the array. */
-typedef MDBX_DP *MDBX_DPL;
+typedef struct MDBX_dp {
+ pgno_t pgno;
+ MDBX_page *ptr;
+} MDBX_dp;
+
+/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
+typedef struct MDBX_dpl {
+ unsigned sorted;
+ unsigned length;
+ unsigned allocated;
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (!defined(__cplusplus) && defined(_MSC_VER))
+ MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
+#endif
+} MDBX_dpl;
/* PNL sizes */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#if MDBX_HUGE_TRANSACTIONS
-#define MDBX_PNL_MAX \
- ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
-#else
-#define MDBX_PNL_MAX \
- ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL \
(MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
@@ -2418,14 +2424,16 @@ struct MDBX_txn {
MDBX_cursor **cursors;
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
+#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
+#endif /* MDBX_ENABLE_REFUND */
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
- MDBX_DPL dirtylist;
+ MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
@@ -2435,26 +2443,19 @@ struct MDBX_txn {
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
- /* Number of retired to parent pages (tw.retired2parent_pages) */
- unsigned retired2parent_count;
- /* The list of parent's txn dirty pages that retired (became unused)
- * in this transaction, linked through `mp_next`. */
- MDBX_page *retired2parent_pages;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages;
+ unsigned spill_least_removed;
} tw;
};
};
-/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
- * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
- * raise this on a 64 bit machine. */
#if MDBX_WORDBITS >= 64
-#define CURSOR_STACK 28
+#define CURSOR_STACK 32
#else
-#define CURSOR_STACK 20
+#define CURSOR_STACK 24
#endif
struct MDBX_xcursor;
@@ -2584,11 +2585,9 @@ struct MDBX_env {
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
- MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
+ MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
- /* MDBX_DP of pages written during a write txn. */
- MDBX_DPL me_dirtylist;
/* Number of freelist items that can fit in a single overflow page */
unsigned me_maxgc_ov1page;
unsigned me_branch_nodemax; /* max size of a branch-node */
@@ -2601,6 +2600,17 @@ struct MDBX_env {
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
+ unsigned me_dp_reserve_len;
+ struct {
+ unsigned dp_reserve_limit;
+ unsigned rp_augment_limit;
+ unsigned dp_limit;
+ unsigned dp_initial;
+ uint8_t dp_loose_limit;
+ uint8_t spill_max_denominator;
+ uint8_t spill_min_denominator;
+ uint8_t spill_parent4child_denominator;
+ } me_options;
struct {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
@@ -3431,10 +3441,11 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize,
const unsigned page_ln2 = log2n(pagesize);
const size_t hard = 0x7FF00000ul;
const size_t hard_pages = hard >> page_ln2;
- const size_t limit = (hard_pages < MDBX_DPL_TXNFULL / 3)
- ? hard
- : ((size_t)MDBX_DPL_TXNFULL / 3 << page_ln2);
- return (limit < MAX_MAPSIZE) ? limit / 2 : MAX_MAPSIZE / 2;
+ STATIC_ASSERT(MDBX_PGL_LIMIT <= MAX_PAGENO);
+ const size_t pages_limit = MDBX_PGL_LIMIT / 4;
+ const size_t limit =
+ (hard_pages < pages_limit) ? hard : (pages_limit << page_ln2);
+ return (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
}
/* Calculate the size of a leaf node.
@@ -3615,7 +3626,7 @@ __cold static int MDBX_PRINTF_ARGS(2, 3)
if (prev != mp) {
prev = mp;
mdbx_debug_log(MDBX_LOG_ERROR, "badpage", 0,
- "corrupted page #%u, mod-txnid %" PRIaTXN " \n",
+ "corrupted page #%u, mod-txnid %" PRIaTXN "\n",
mp->mp_pgno, mp->mp_txnid);
}
@@ -4032,11 +4043,13 @@ static __always_inline uint64_t safe64_read(const mdbx_safe64_t *ptr) {
return v;
}
+#if 0 /* unused for now */
static __always_inline void safe64_update(mdbx_safe64_t *ptr,
const uint64_t v) {
safe64_reset(ptr, true);
safe64_write(ptr, v);
}
+#endif /* unused for now */
/*----------------------------------------------------------------------------*/
/* rthc (tls keys and destructors) */
@@ -5594,14 +5607,14 @@ static int lcklist_detach_locked(MDBX_env *env) {
* LY: Binary search */
#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \
- static __always_inline TYPE_LIST *NAME(TYPE_LIST *first, unsigned length, \
- const TYPE_ARG item) { \
- TYPE_LIST *const begin = first, *const end = begin + length; \
+ static __always_inline const TYPE_LIST *NAME( \
+ const TYPE_LIST *first, unsigned length, const TYPE_ARG item) { \
+ const TYPE_LIST *const begin = first, *const end = begin + length; \
\
while (length > 3) { \
const unsigned whole = length; \
length >>= 1; \
- TYPE_LIST *const middle = first + length; \
+ const TYPE_LIST *const middle = first + length; \
if (CMP(*middle, item)) { \
first = middle + 1; \
length = whole - length - 1; \
@@ -5631,9 +5644,9 @@ static int lcklist_detach_locked(MDBX_env *env) {
} \
\
if (mdbx_audit_enabled()) { \
- for (TYPE_LIST *scan = begin; scan < first; ++scan) \
+ for (const TYPE_LIST *scan = begin; scan < first; ++scan) \
assert(CMP(*scan, item)); \
- for (TYPE_LIST *scan = first; scan < end; ++scan) \
+ for (const TYPE_LIST *scan = first; scan < end; ++scan) \
assert(!CMP(*scan, item)); \
(void)begin, (void)end; \
} \
@@ -5644,7 +5657,7 @@ static int lcklist_detach_locked(MDBX_env *env) {
/*----------------------------------------------------------------------------*/
static __always_inline size_t pnl2bytes(const size_t size) {
- assert(size > 0 && size <= MDBX_PNL_MAX * 2);
+ assert(size > 0 && size <= MDBX_PGL_LIMIT);
size_t bytes =
ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2),
MDBX_PNL_GRANULATE * sizeof(pgno_t)) -
@@ -5654,7 +5667,7 @@ static __always_inline size_t pnl2bytes(const size_t size) {
static __always_inline pgno_t bytes2pnl(const size_t bytes) {
size_t size = bytes / sizeof(pgno_t);
- assert(size > 2 && size <= MDBX_PNL_MAX * 2);
+ assert(size > 2 && size <= MDBX_PGL_LIMIT);
return (pgno_t)size - 2;
}
@@ -5681,7 +5694,7 @@ static void mdbx_pnl_free(MDBX_PNL pl) {
/* Shrink the PNL to the default size if it has grown larger */
static void mdbx_pnl_shrink(MDBX_PNL *ppl) {
assert(bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) == MDBX_PNL_INITIAL);
- assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PNL_MAX &&
+ assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
MDBX_PNL_SIZE(*ppl) = 0;
if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) >
@@ -5701,17 +5714,19 @@ static void mdbx_pnl_shrink(MDBX_PNL *ppl) {
/* Grow the PNL to the size growed to at least given size */
static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) {
const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl);
- assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PNL_MAX &&
+ assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
if (likely(allocated >= wanna))
return MDBX_SUCCESS;
- if (unlikely(wanna > /* paranoia */ MDBX_PNL_MAX))
+ if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) {
+ mdbx_error("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT);
return MDBX_TXN_FULL;
+ }
- const size_t size = (wanna + wanna - allocated < MDBX_PNL_MAX)
+ const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT)
? wanna + wanna - allocated
- : MDBX_PNL_MAX;
+ : MDBX_PGL_LIMIT;
size_t bytes = pnl2bytes(size);
MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes);
if (likely(pl)) {
@@ -5729,9 +5744,9 @@ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) {
/* Make room for num additional elements in an PNL */
static __always_inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl,
size_t num) {
- assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PNL_MAX &&
+ assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
- assert(num <= MDBX_PNL_MAX);
+ assert(num <= MDBX_PGL_LIMIT);
const size_t wanna = MDBX_PNL_SIZE(*ppl) + num;
return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna)
? MDBX_SUCCESS
@@ -5748,42 +5763,9 @@ static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) {
MDBX_PNL_LAST(pl) = pgno;
}
-/* Append an pgno onto an unsorted PNL */
-static __hot int __must_check_result mdbx_pnl_append(MDBX_PNL *ppl,
- pgno_t pgno) {
- /* Too big? */
- if (unlikely(MDBX_PNL_SIZE(*ppl) == MDBX_PNL_ALLOCLEN(*ppl))) {
- int rc = mdbx_pnl_need(ppl, MDBX_PNL_GRANULATE);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
- }
- mdbx_pnl_xappend(*ppl, pgno);
- return MDBX_SUCCESS;
-}
-
-/* Append an PNL onto an unsorted PNL */
-static int __must_check_result mdbx_pnl_append_list(MDBX_PNL *ppl,
- MDBX_PNL append) {
- const unsigned len = MDBX_PNL_SIZE(append);
- if (likely(len)) {
- int rc = mdbx_pnl_need(ppl, MDBX_PNL_SIZE(append));
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
-
- const MDBX_PNL pnl = *ppl;
- unsigned w = MDBX_PNL_SIZE(pnl), r = 1;
- do
- pnl[++w] = append[r];
- while (++r <= len);
- MDBX_PNL_SIZE(pnl) = w;
- }
- return MDBX_SUCCESS;
-}
-
/* Append an pgno range onto an unsorted PNL */
-static __hot int __must_check_result mdbx_pnl_append_range(MDBX_PNL *ppl,
- pgno_t pgno,
- unsigned n) {
+__always_inline static int __must_check_result
+mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) {
assert(n > 0);
int rc = mdbx_pnl_need(ppl, n);
if (unlikely(rc != MDBX_SUCCESS))
@@ -5792,16 +5774,18 @@ static __hot int __must_check_result mdbx_pnl_append_range(MDBX_PNL *ppl,
const MDBX_PNL pnl = *ppl;
#if MDBX_PNL_ASCENDING
unsigned w = MDBX_PNL_SIZE(pnl);
- do
- pnl[++w] = pgno++;
- while (--n);
+ do {
+ pnl[++w] = pgno;
+ pgno += spilled ? 2 : 1;
+ } while (--n);
MDBX_PNL_SIZE(pnl) = w;
#else
unsigned w = MDBX_PNL_SIZE(pnl) + n;
MDBX_PNL_SIZE(pnl) = w;
- do
- pnl[w--] = --n + pgno;
- while (n);
+ do {
+ pnl[w--] = pgno;
+ pgno += spilled ? 2 : 1;
+ } while (--n);
#endif
return MDBX_SUCCESS;
@@ -5829,12 +5813,11 @@ static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl,
}
static bool mdbx_pnl_check(const MDBX_PNL pl, const pgno_t limit) {
- assert(limit >= MIN_PAGENO && limit <= MAX_PAGENO + 1);
if (likely(MDBX_PNL_SIZE(pl))) {
assert(MDBX_PNL_LEAST(pl) >= MIN_PAGENO);
assert(MDBX_PNL_MOST(pl) < limit);
- assert(MDBX_PNL_SIZE(pl) <= MDBX_PNL_MAX);
- if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PNL_MAX * 3 / 2))
+ assert(MDBX_PNL_SIZE(pl) <= MDBX_PGL_LIMIT);
+ if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT))
return false;
if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO))
return false;
@@ -5880,6 +5863,55 @@ static void __hot mdbx_pnl_xmerge(MDBX_PNL dst, const MDBX_PNL src) {
assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1));
}
+static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) {
+ mdbx_tassert(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) &&
+ txn->tw.spill_least_removed > 0);
+ txn->tw.spill_least_removed =
+ (idx < txn->tw.spill_least_removed) ? idx : txn->tw.spill_least_removed;
+ txn->tw.spill_pages[idx] |= 1;
+ MDBX_PNL_SIZE(txn->tw.spill_pages) -=
+ (idx == MDBX_PNL_SIZE(txn->tw.spill_pages));
+
+ while (unlikely(npages > 1)) {
+ const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1;
+ if (MDBX_PNL_ASCENDING) {
+ if (++idx > MDBX_PNL_SIZE(txn->tw.spill_pages) ||
+ (txn->tw.spill_pages[idx] >> 1) != pgno)
+ return;
+ } else {
+ if (--idx < 1 || (txn->tw.spill_pages[idx] >> 1) != pgno)
+ return;
+ txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed)
+ ? idx
+ : txn->tw.spill_least_removed;
+ }
+ txn->tw.spill_pages[idx] |= 1;
+ MDBX_PNL_SIZE(txn->tw.spill_pages) -=
+ (idx == MDBX_PNL_SIZE(txn->tw.spill_pages));
+ --npages;
+ }
+}
+
+static MDBX_PNL mdbx_spill_purge(MDBX_txn *txn) {
+ mdbx_tassert(txn, txn->tw.spill_least_removed > 0);
+ const MDBX_PNL sl = txn->tw.spill_pages;
+ if (txn->tw.spill_least_removed != INT_MAX) {
+ unsigned len = MDBX_PNL_SIZE(sl), r, w;
+ for (w = r = txn->tw.spill_least_removed; r <= len; ++r) {
+ sl[w] = sl[r];
+ w += 1 - (sl[r] & 1);
+ }
+ for (size_t i = 1; i < w; ++i)
+ mdbx_tassert(txn, (sl[i] & 1) == 0);
+ MDBX_PNL_SIZE(sl) = w - 1;
+ txn->tw.spill_least_removed = INT_MAX;
+ } else {
+ for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i)
+ mdbx_tassert(txn, (sl[i] & 1) == 0);
+ }
+ return sl;
+}
+
SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED)
static __hot void mdbx_pnl_sort(MDBX_PNL pnl) {
pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl));
@@ -5890,22 +5922,50 @@ static __hot void mdbx_pnl_sort(MDBX_PNL pnl) {
* Returns The index of the first item greater than or equal to pgno. */
SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED)
-static __hot unsigned mdbx_pnl_search(MDBX_PNL pnl, pgno_t id) {
+static __hot unsigned mdbx_pnl_search(const MDBX_PNL pnl, pgno_t pgno) {
assert(mdbx_pnl_check4assert(pnl, MAX_PAGENO + 1));
- pgno_t *begin = MDBX_PNL_BEGIN(pnl);
- pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), id);
- pgno_t *end = begin + MDBX_PNL_SIZE(pnl);
+ const pgno_t *begin = MDBX_PNL_BEGIN(pnl);
+ const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno);
+ const pgno_t *end = begin + MDBX_PNL_SIZE(pnl);
assert(it >= begin && it <= end);
if (it != begin)
- assert(MDBX_PNL_ORDERED(it[-1], id));
+ assert(MDBX_PNL_ORDERED(it[-1], pgno));
if (it != end)
- assert(!MDBX_PNL_ORDERED(it[0], id));
+ assert(!MDBX_PNL_ORDERED(it[0], pgno));
return (unsigned)(it - begin + 1);
}
-static __hot unsigned mdbx_pnl_exist(MDBX_PNL pnl, pgno_t id) {
- unsigned n = mdbx_pnl_search(pnl, id);
- return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == id) ? n : 0;
+static __inline unsigned mdbx_pnl_exist(const MDBX_PNL pnl, pgno_t pgno) {
+ unsigned n = mdbx_pnl_search(pnl, pgno);
+ return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ? n : 0;
+}
+
+static __inline unsigned mdbx_pnl_intersect(const MDBX_PNL pnl, pgno_t pgno,
+ unsigned npages) {
+ const unsigned len = MDBX_PNL_SIZE(pnl);
+ if (mdbx_log_enabled(MDBX_LOG_EXTRA)) {
+ mdbx_debug_extra("PNL len %u [", len);
+ for (unsigned i = 1; i <= len; ++i)
+ mdbx_debug_extra_print(" %" PRIaPGNO, pnl[i]);
+ mdbx_debug_extra_print("%s\n", "]");
+ }
+ const pgno_t range_last = pgno + npages - 1;
+#if MDBX_PNL_ASCENDING
+ const unsigned n = mdbx_pnl_search(pnl, pgno);
+ assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || pgno <= pnl[n]));
+ const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= range_last;
+#else
+ const unsigned n = mdbx_pnl_search(pnl, range_last);
+ assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || range_last >= pnl[n]));
+ const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= pgno;
+#endif
+ if (mdbx_assert_enabled()) {
+ bool check = false;
+ for (unsigned i = 0; i < npages; ++i)
+ check |= mdbx_pnl_exist(pnl, pgno + i) != 0;
+ assert(check == rc);
+ }
+ return rc;
}
/*----------------------------------------------------------------------------*/
@@ -5952,8 +6012,10 @@ static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) {
if (likely(allocated >= wanna))
return MDBX_SUCCESS;
- if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX))
+ if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) {
+ mdbx_error("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX);
return MDBX_TXN_FULL;
+ }
const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX)
? wanna + wanna - allocated
@@ -5976,7 +6038,7 @@ static __always_inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl,
size_t num) {
assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX &&
MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl));
- assert(num <= MDBX_PNL_MAX);
+ assert(num <= MDBX_PGL_LIMIT);
const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num;
return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna)
? MDBX_SUCCESS
@@ -6007,14 +6069,69 @@ static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) {
/*----------------------------------------------------------------------------*/
+static __always_inline size_t dpl2bytes(const ptrdiff_t size) {
+ assert(size > 2 && size <= MDBX_PGL_LIMIT);
+ size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) +
+ (size + 2) * sizeof(MDBX_dp),
+ MDBX_PNL_GRANULATE * sizeof(void *) * 2) -
+ MDBX_ASSUME_MALLOC_OVERHEAD;
+ return bytes;
+}
+
+static __always_inline unsigned bytes2dpl(const ptrdiff_t bytes) {
+ size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp);
+ assert(size > 4 && size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
+ return (unsigned)size - 2;
+}
+
+static __always_inline void mdbx_dpl_clear(MDBX_dpl *dl) {
+ dl->sorted = dl->length = 0;
+}
+
+static void mdbx_dpl_free(MDBX_txn *txn) {
+ if (likely(txn->tw.dirtylist)) {
+ mdbx_free(txn->tw.dirtylist);
+ txn->tw.dirtylist = NULL;
+ }
+}
+
+static MDBX_dpl *mdbx_dpl_reserve(MDBX_txn *txn, size_t size) {
+ mdbx_tassert(txn,
+ txn->tw.dirtylist == NULL || txn->tw.dirtylist->length <= size);
+ size_t bytes = dpl2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT);
+ MDBX_dpl *const dl = mdbx_realloc(txn->tw.dirtylist, bytes);
+ if (likely(dl)) {
+#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
+ bytes = malloc_usable_size(dl);
+#endif /* malloc_usable_size */
+ dl->allocated = bytes2dpl(bytes);
+ dl->items[0].pgno = 0;
+ dl->items[0].ptr = nullptr;
+ mdbx_tassert(txn, txn->tw.dirtylist == NULL || dl->length <= dl->allocated);
+ txn->tw.dirtylist = dl;
+ }
+ return dl;
+}
+
+static int mdbx_dpl_alloc(MDBX_txn *txn) {
+ mdbx_tassert(txn,
+ (txn->mt_flags & MDBX_TXN_RDONLY) == 0 && !txn->tw.dirtylist);
+ MDBX_dpl *const dl =
+ mdbx_dpl_reserve(txn, txn->mt_env->me_options.dp_initial);
+ if (unlikely(!dl))
+ return MDBX_ENOMEM;
+ mdbx_dpl_clear(dl);
+ return MDBX_SUCCESS;
+}
+
#define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno)
-SORT_IMPL(dp_sort, false, MDBX_DP, DP_SORT_CMP)
-static __always_inline MDBX_DPL mdbx_dpl_sort(MDBX_DPL dl) {
- assert(dl->length <= MDBX_DPL_TXNFULL);
+SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP)
+static __always_inline MDBX_dpl *mdbx_dpl_sort(MDBX_dpl *dl) {
+ assert(dl->length <= MDBX_PGL_LIMIT);
assert(dl->sorted <= dl->length);
if (dl->sorted != dl->length) {
dl->sorted = dl->length;
- dp_sort(dl + 1, dl + dl->length + 1);
+ dp_sort(dl->items + 1, dl->items + dl->length + 1);
}
return dl;
}
@@ -6022,11 +6139,11 @@ static __always_inline MDBX_DPL mdbx_dpl_sort(MDBX_DPL dl) {
/* Returns the index of the first dirty-page whose pgno
* member is greater than or equal to id. */
#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id))
-SEARCH_IMPL(dp_bsearch, MDBX_DP, pgno_t, DP_SEARCH_CMP)
+SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP)
-static unsigned __hot mdbx_dpl_search(MDBX_DPL dl, pgno_t pgno) {
+static unsigned __hot mdbx_dpl_search(MDBX_dpl *dl, pgno_t pgno) {
if (mdbx_audit_enabled()) {
- for (const MDBX_DP *ptr = dl + dl->sorted; --ptr > dl;) {
+ for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) {
assert(ptr[0].pgno < ptr[1].pgno);
assert(ptr[0].pgno >= NUM_METAS);
}
@@ -6036,21 +6153,15 @@ static unsigned __hot mdbx_dpl_search(MDBX_DPL dl, pgno_t pgno) {
default:
/* sort a whole */
dl->sorted = dl->length;
- dp_sort(dl + 1, dl + dl->length + 1);
- __fallthrough; /* fall through */
+ dp_sort(dl->items + 1, dl->items + dl->length + 1);
+ break;
case 0:
/* whole sorted cases */
- if (mdbx_audit_enabled()) {
- for (const MDBX_DP *ptr = dl + dl->length; --ptr > dl;) {
- assert(ptr[0].pgno < ptr[1].pgno);
- assert(ptr[0].pgno >= NUM_METAS);
- }
- }
- return (unsigned)(dp_bsearch(dl + 1, dl->length, pgno) - dl);
+ break;
#define LINEAR_SEARCH_CASE(N) \
case N: \
- if (dl[dl->length - N + 1].pgno == pgno) \
+ if (dl->items[dl->length - N + 1].pgno == pgno) \
return dl->length - N + 1; \
__fallthrough
@@ -6071,70 +6182,120 @@ static unsigned __hot mdbx_dpl_search(MDBX_DPL dl, pgno_t pgno) {
LINEAR_SEARCH_CASE(3); /* fall through */
LINEAR_SEARCH_CASE(2); /* fall through */
case 1:
- if (dl[dl->length].pgno == pgno)
+ if (dl->items[dl->length].pgno == pgno)
return dl->length;
/* continue bsearch on the sorted part */
- return (unsigned)(dp_bsearch(dl + 1, dl->sorted, pgno) - dl);
+ break;
+ }
+ return (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items);
+}
+
+static __inline bool mdbx_dpl_intersect(MDBX_dpl *dl, pgno_t pgno,
+ unsigned npages) {
+ assert(dl->sorted == dl->length);
+ unsigned const n = mdbx_dpl_search(dl, pgno);
+ assert(n >= 1 && n <= dl->length + 1);
+ assert(n > dl->length || pgno <= dl->items[n].pgno);
+ assert(n == 1 || pgno > dl->items[n - 1].pgno);
+ const MDBX_page *const prev = dl->items[n - 1].ptr;
+ const bool rc =
+ (/* intersection with founded */ n <= dl->length &&
+ pgno + npages > dl->items[n].pgno) ||
+ (/* intersection with prev */ n > 1 && unlikely(IS_OVERFLOW(prev)) &&
+ prev->mp_pgno + prev->mp_pages > pgno);
+ if (mdbx_assert_enabled()) {
+ bool check = false;
+ for (unsigned i = 1; i <= dl->length; ++i) {
+ const MDBX_page *const dp = dl->items[i].ptr;
+ if (!(dp->mp_pgno /* begin */ >= /* end */ pgno + npages ||
+ dp->mp_pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1) /* end */ <=
+ /* begin */ pgno))
+ check |= true;
+ }
+ assert(check == rc);
}
+ return rc;
}
-static __always_inline MDBX_page *mdbx_dpl_find(MDBX_DPL dl, pgno_t pgno) {
- const unsigned i = mdbx_dpl_search(dl, pgno);
+static __always_inline unsigned mdbx_dpl_exist(MDBX_dpl *dl, pgno_t pgno) {
+ unsigned i = mdbx_dpl_search(dl, pgno);
assert((int)i > 0);
- return (i <= dl->length && dl[i].pgno == pgno) ? dl[i].ptr : nullptr;
+ return (i <= dl->length && dl->items[i].pgno == pgno) ? i : 0;
}
-static __hot MDBX_page *mdbx_dpl_remove(MDBX_DPL dl, pgno_t prno) {
- unsigned i = mdbx_dpl_search(dl, prno);
+static __always_inline MDBX_page *mdbx_dpl_find(MDBX_dpl *dl, pgno_t pgno) {
+ const unsigned i = mdbx_dpl_search(dl, pgno);
assert((int)i > 0);
- MDBX_page *mp = nullptr;
- if (i <= dl->length && dl[i].pgno == prno) {
- dl->sorted -= dl->sorted >= i;
- mp = dl[i].ptr;
- while (i < dl->length) {
- dl[i] = dl[i + 1];
- ++i;
- }
- dl->length -= 1;
+ return (i <= dl->length && dl->items[i].pgno == pgno) ? dl->items[i].ptr
+ : nullptr;
+}
+
+static __maybe_unused const MDBX_page *debug_dpl_find(const MDBX_dpl *dl,
+ const pgno_t pgno) {
+ for (unsigned i = dl->length; i > dl->sorted; --i)
+ if (dl->items[i].pgno == pgno)
+ return dl->items[i].ptr;
+
+ if (dl->sorted) {
+ const unsigned i =
+ (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items);
+ if (i <= dl->sorted && dl->items[i].pgno == pgno)
+ return dl->items[i].ptr;
}
- return mp;
+ return nullptr;
+}
+
+static void mdbx_dpl_remove(MDBX_dpl *dl, unsigned i) {
+ assert((int)i > 0 && i <= dl->length);
+ dl->sorted -= dl->sorted >= i;
+ dl->length -= 1;
+ if (dl->length >= i)
+ memmove(dl->items + i, dl->items + i + 1,
+ (dl->length - i + 1) * sizeof(dl->items[0]));
}
static __always_inline int __must_check_result
-mdbx_dpl_append(MDBX_DPL dl, pgno_t pgno, MDBX_page *page) {
- assert(dl->length <= MDBX_DPL_TXNFULL);
+mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page) {
+ MDBX_dpl *dl = txn->tw.dirtylist;
+ assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
if (mdbx_audit_enabled()) {
for (unsigned i = dl->length; i > 0; --i) {
- assert(dl[i].pgno != pgno);
- if (unlikely(dl[i].pgno == pgno))
+ assert(dl->items[i].pgno != pgno);
+ if (unlikely(dl->items[i].pgno == pgno))
return MDBX_PROBLEM;
}
}
- if (unlikely(dl->length == MDBX_DPL_TXNFULL))
- return MDBX_TXN_FULL;
+ if (unlikely(dl->length == dl->allocated)) {
+ if (unlikely(dl->allocated >= MDBX_PGL_LIMIT)) {
+ mdbx_error("DPL is full (MDBX_PGL_LIMIT %u)", MDBX_PGL_LIMIT);
+ return MDBX_TXN_FULL;
+ }
+ const size_t size = (dl->allocated < MDBX_PNL_INITIAL * 42)
+ ? dl->allocated + dl->allocated
+ : dl->allocated + dl->allocated / 2;
+ dl = mdbx_dpl_reserve(txn, size);
+ if (unlikely(!dl))
+ return MDBX_ENOMEM;
+ mdbx_tassert(txn, dl->length < dl->allocated);
+ }
/* append page */
const unsigned n = dl->length + 1;
- if (n == 1 || (dl->sorted >= dl->length && dl[n - 1].pgno < pgno))
+ if (n == 1 || (dl->sorted >= dl->length && dl->items[n - 1].pgno < pgno))
dl->sorted = n;
dl->length = n;
- dl[n].pgno = pgno;
- dl[n].ptr = page;
+ dl->items[n].pgno = pgno;
+ dl->items[n].ptr = page;
return MDBX_SUCCESS;
}
-static __always_inline void mdbx_dpl_clear(MDBX_DPL dl) {
- dl->sorted = dl->length = 0;
-}
-
/*----------------------------------------------------------------------------*/
uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT;
uint8_t mdbx_loglevel = MDBX_LOG_FATAL;
MDBX_debug_func *mdbx_debug_logger;
-static bool mdbx_refund(MDBX_txn *txn);
static __must_check_result int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp);
static __must_check_result int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp);
static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num,
@@ -6155,7 +6316,7 @@ static int mdbx_cursor_touch(MDBX_cursor *mc);
enum {
/* mdbx_txn_end operation number, for logging */
MDBX_END_COMMITTED,
- MDBX_END_EMPTY_COMMIT,
+ MDBX_END_PURE_COMMIT,
MDBX_END_ABORT,
MDBX_END_RESET,
MDBX_END_RESET_TMP,
@@ -6167,7 +6328,7 @@ enum {
#define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */
#define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */
#define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */
-static int mdbx_txn_end(MDBX_txn *txn, unsigned mode);
+static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode);
static int __must_check_result mdbx_page_get(MDBX_cursor *mc, pgno_t pgno,
MDBX_page **mp, int *lvl,
@@ -6233,10 +6394,6 @@ static int __must_check_result mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp);
static int __must_check_result mdbx_audit_ex(MDBX_txn *txn,
unsigned retired_stored,
bool dont_filter_gc);
-static __maybe_unused __always_inline int __must_check_result
-mdbx_audit(MDBX_txn *txn) {
- return mdbx_audit_ex(txn, 0, (txn->mt_flags & MDBX_TXN_RDONLY) != 0);
-}
static int __must_check_result mdbx_page_check(MDBX_cursor *const mc,
const MDBX_page *const mp,
@@ -6572,7 +6729,7 @@ static __maybe_unused void mdbx_page_list(MDBX_page *mp) {
return;
case P_META:
mdbx_verbose("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno,
- page_meta(mp)->mm_txnid_a.inconsistent);
+ unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a));
return;
default:
mdbx_verbose("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags);
@@ -6678,13 +6835,15 @@ int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a,
* Set MDBX_TXN_ERROR on failure. */
static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) {
MDBX_env *env = txn->mt_env;
- MDBX_page *np = env->me_dpages;
+ MDBX_page *np = env->me_dp_reserve;
size_t size = env->me_psize;
if (likely(num == 1 && np)) {
+ mdbx_assert(env, env->me_dp_reserve_len > 0);
ASAN_UNPOISON_MEMORY_REGION(np, size);
VALGRIND_MEMPOOL_ALLOC(env, np, size);
VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next));
- env->me_dpages = np->mp_next;
+ env->me_dp_reserve = np->mp_next;
+ env->me_dp_reserve_len -= 1;
} else {
size = pgno2bytes(env, num);
np = mdbx_malloc(size);
@@ -6713,15 +6872,20 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) {
return np;
}
-/* Free a dirty page */
-static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned pages) {
-#if MDBX_DEBUG
- dp->mp_pgno = MAX_PAGENO + 1;
-#endif
- if (pages == 1) {
- dp->mp_next = env->me_dpages;
+/* Free a shadow dirty page */
+static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) {
+ VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages));
+ ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages));
+ if (MDBX_DEBUG || unlikely(env->me_flags & MDBX_PAGEPERTURB))
+ memset(dp, -1, pgno2bytes(env, npages));
+ if (npages == 1 &&
+ env->me_dp_reserve_len < env->me_options.dp_reserve_limit) {
+ ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next),
+ pgno2bytes(env, npages) - sizeof(dp->mp_next));
+ dp->mp_next = env->me_dp_reserve;
VALGRIND_MEMPOOL_FREE(env, dp);
- env->me_dpages = dp;
+ env->me_dp_reserve = dp;
+ env->me_dp_reserve_len += 1;
} else {
/* large pages just get freed directly */
VALGRIND_MEMPOOL_FREE(env, dp);
@@ -6732,11 +6896,11 @@ static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned pages) {
/* Return all dirty pages to dpage list */
static void mdbx_dlist_free(MDBX_txn *txn) {
MDBX_env *env = txn->mt_env;
- const MDBX_DPL dl = txn->tw.dirtylist;
+ MDBX_dpl *const dl = txn->tw.dirtylist;
const size_t n = dl->length;
for (size_t i = 1; i <= n; i++) {
- MDBX_page *dp = dl[i].ptr;
+ MDBX_page *dp = dl->items[i].ptr;
mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? dp->mp_pages : 1);
}
@@ -6756,13 +6920,15 @@ static __cold __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) {
if (!mdbx_audit_enabled())
return true;
+ const MDBX_dpl *const dl = txn->tw.dirtylist;
unsigned loose = 0;
- for (unsigned i = txn->tw.dirtylist->length; i > 0; --i) {
- const MDBX_page *const dp = txn->tw.dirtylist[i].ptr;
+ for (unsigned i = dl->length; i > 0; --i) {
+ const MDBX_page *const dp = dl->items[i].ptr;
if (!dp)
continue;
- mdbx_tassert(txn, dp->mp_pgno == txn->tw.dirtylist[i].pgno);
- if (unlikely(dp->mp_pgno != txn->tw.dirtylist[i].pgno))
+
+ mdbx_tassert(txn, dp->mp_pgno == dl->items[i].pgno);
+ if (unlikely(dp->mp_pgno != dl->items[i].pgno))
return false;
mdbx_tassert(txn, dp->mp_flags & P_DIRTY);
@@ -6780,9 +6946,9 @@ static __cold __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) {
if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num))
return false;
- if (i < txn->tw.dirtylist->sorted) {
- mdbx_tassert(txn, txn->tw.dirtylist[i + 1].pgno >= dp->mp_pgno + num);
- if (unlikely(txn->tw.dirtylist[i + 1].pgno < dp->mp_pgno + num))
+ if (i < dl->sorted) {
+ mdbx_tassert(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num);
+ if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num))
return false;
}
@@ -6805,19 +6971,17 @@ static __cold __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) {
if (unlikely(loose != txn->tw.loose_count))
return false;
- if (txn->tw.dirtylist->length - txn->tw.dirtylist->sorted < 16) {
- for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) {
- const MDBX_page *const dp =
- mdbx_dpl_find(txn->tw.dirtylist, txn->tw.retired_pages[i]);
- mdbx_tassert(txn, !dp);
- if (unlikely(dp))
- return false;
- }
+ for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) {
+ const MDBX_page *const dp = debug_dpl_find(dl, txn->tw.retired_pages[i]);
+ mdbx_tassert(txn, !dp);
+ if (unlikely(dp))
+ return false;
}
return true;
}
+#if MDBX_ENABLE_REFUND
static void mdbx_refund_reclaimed(MDBX_txn *txn) {
/* Scanning in descend order */
pgno_t next_pgno = txn->mt_next_pgno;
@@ -6842,19 +7006,16 @@ static void mdbx_refund_reclaimed(MDBX_txn *txn) {
mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO,
txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno);
txn->mt_next_pgno = next_pgno;
- mdbx_tassert(
- txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
+ mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - 1));
}
static void mdbx_refund_loose(MDBX_txn *txn) {
- mdbx_tassert(txn, mdbx_dirtylist_check(txn));
mdbx_tassert(txn, txn->tw.loose_pages != nullptr);
mdbx_tassert(txn, txn->tw.loose_count > 0);
- const MDBX_DPL dl = txn->tw.dirtylist;
+ MDBX_dpl *const dl = txn->tw.dirtylist;
mdbx_tassert(txn, dl->length >= txn->tw.loose_count);
- mdbx_tassert(txn, txn->tw.spill_pages == nullptr ||
- dl->length >= MDBX_PNL_SIZE(txn->tw.spill_pages));
pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)];
MDBX_PNL suitable = onstack;
@@ -6871,15 +7032,15 @@ static void mdbx_refund_loose(MDBX_txn *txn) {
mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count);
pgno_t most = MIN_PAGENO;
unsigned w = 0;
- for (const MDBX_page *dp = txn->tw.loose_pages; dp; dp = dp->mp_next) {
- mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY));
- mdbx_tassert(txn, txn->mt_next_pgno > dp->mp_pgno);
- if (likely(txn->mt_next_pgno - txn->tw.loose_count <= dp->mp_pgno)) {
+ for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) {
+ mdbx_tassert(txn, lp->mp_flags == (P_LOOSE | P_DIRTY));
+ mdbx_tassert(txn, txn->mt_next_pgno > lp->mp_pgno);
+ if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) {
mdbx_tassert(txn,
w < ((suitable == onstack) ? bytes2pnl(sizeof(onstack))
: MDBX_PNL_ALLOCLEN(suitable)));
- suitable[++w] = dp->mp_pgno;
- most = (dp->mp_pgno > most) ? dp->mp_pgno : most;
+ suitable[++w] = lp->mp_pgno;
+ most = (lp->mp_pgno > most) ? lp->mp_pgno : most;
}
}
@@ -6905,6 +7066,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) {
refunded, most, txn->mt_next_pgno);
txn->tw.loose_count -= refunded;
txn->tw.dirtyroom += refunded;
+ assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
txn->mt_next_pgno = most;
/* Filter-out dirty list */
@@ -6912,37 +7074,38 @@ static void mdbx_refund_loose(MDBX_txn *txn) {
w = 0;
if (dl->sorted) {
do {
- if (dl[++r].pgno < most) {
+ if (dl->items[++r].pgno < most) {
if (++w != r)
- dl[w] = dl[r];
+ dl->items[w] = dl->items[r];
}
} while (r < dl->sorted);
dl->sorted = w;
}
while (r < dl->length) {
- if (dl[++r].pgno < most) {
+ if (dl->items[++r].pgno < most) {
if (++w != r)
- dl[w] = dl[r];
+ dl->items[w] = dl->items[r];
}
}
dl->length = w;
- mdbx_tassert(txn, txn->mt_parent ||
- txn->tw.dirtyroom + txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
+ mdbx_tassert(txn, txn->mt_parent || txn->tw.dirtyroom + dl->length ==
+ txn->mt_env->me_options.dp_limit);
goto unlink_loose;
}
} else {
/* Dirtylist is mostly sorted, just refund loose pages at the end. */
mdbx_dpl_sort(dl);
- mdbx_tassert(txn, dl->length < 2 || dl[1].pgno < dl[dl->length].pgno);
+ mdbx_tassert(txn, dl->length < 2 ||
+ dl->items[1].pgno < dl->items[dl->length].pgno);
mdbx_tassert(txn, dl->sorted == dl->length);
/* Scan dirtylist tail-forward and cutoff suitable pages. */
- while (dl->length && dl[dl->length].pgno == txn->mt_next_pgno - 1 &&
- dl[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) {
- MDBX_page *dp = dl[dl->length].ptr;
+ while (dl->length && dl->items[dl->length].pgno == txn->mt_next_pgno - 1 &&
+ dl->items[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) {
+ MDBX_page *dp = dl->items[dl->length].ptr;
mdbx_debug("refund-sorted page %" PRIaPGNO, dp->mp_pgno);
- mdbx_tassert(txn, dp->mp_pgno == dl[dl->length].pgno);
+ mdbx_tassert(txn, dp->mp_pgno == dl->items[dl->length].pgno);
+ txn->mt_next_pgno -= 1;
dl->length -= 1;
}
@@ -6951,10 +7114,9 @@ static void mdbx_refund_loose(MDBX_txn *txn) {
dl->sorted = dl->length;
txn->tw.loose_count -= refunded;
txn->tw.dirtyroom += refunded;
- txn->mt_next_pgno -= refunded;
- mdbx_tassert(txn, txn->mt_parent ||
- txn->tw.dirtyroom + txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
+ mdbx_tassert(txn, txn->mt_parent || txn->tw.dirtyroom + dl->length ==
+ txn->mt_env->me_options.dp_limit);
+ assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
/* Filter-out loose chain & dispose refunded pages. */
unlink_loose:
@@ -6973,9 +7135,8 @@ static void mdbx_refund_loose(MDBX_txn *txn) {
}
mdbx_tassert(txn, mdbx_dirtylist_check(txn));
- mdbx_tassert(txn, txn->mt_parent ||
- txn->tw.dirtyroom + txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
+ mdbx_tassert(txn, txn->mt_parent || txn->tw.dirtyroom + dl->length ==
+ txn->mt_env->me_options.dp_limit);
if (suitable != onstack)
mdbx_pnl_free(suitable);
txn->tw.loose_refund_wl = txn->mt_next_pgno;
@@ -7002,11 +7163,27 @@ static bool mdbx_refund(MDBX_txn *txn) {
break;
}
- return before != txn->mt_next_pgno;
+ if (before == txn->mt_next_pgno)
+ return false;
+
+ if (txn->tw.spill_pages)
+ /* Squash deleted pagenums if we refunded any */
+ mdbx_spill_purge(txn);
+
+ return true;
+}
+#else /* MDBX_ENABLE_REFUND */
+static __inline bool mdbx_refund(MDBX_txn *txn) {
+ (void)txn;
+ /* No online auto-compactification. */
+ return false;
}
+#endif /* MDBX_ENABLE_REFUND */
static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno,
unsigned npages) {
+ mdbx_debug("kill%s %u page %" PRIaPGNO, IS_DIRTY(mp) ? " dirty" : "", npages,
+ pgno);
mdbx_assert(env, pgno >= NUM_METAS && npages);
if (IS_DIRTY(mp) || (env->me_flags & MDBX_WRITEMAP)) {
const size_t bytes = pgno2bytes(env, npages);
@@ -7033,6 +7210,30 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno,
}
}
+/* Remove page from dirty list */
+static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di,
+ MDBX_page *const mp,
+ const unsigned npages) {
+ mdbx_tassert(txn, di && di <= txn->tw.dirtylist->length &&
+ txn->tw.dirtylist->items[di].ptr == mp);
+ mdbx_dpl_remove(txn->tw.dirtylist, di);
+ txn->tw.dirtyroom++;
+ assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
+ mdbx_tassert(txn, txn->mt_parent ||
+ txn->tw.dirtyroom + txn->tw.dirtylist->length ==
+ txn->mt_env->me_options.dp_limit);
+ mp->mp_txnid = INVALID_TXNID;
+ mp->mp_flags = 0xFFFF;
+ VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
+ if (txn->mt_flags & MDBX_WRITEMAP) {
+ VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
+ pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
+ ASAN_POISON_MEMORY_REGION(page_data(mp),
+ pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
+ } else
+ mdbx_dpage_free(txn->mt_env, mp, npages);
+}
+
/* Retire, loosen or free a single page.
*
* Saves single pages to a list for future reuse
@@ -7043,95 +7244,212 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno,
*
* If the page wasn't dirtied in this txn, just add it
* to this txn's free list. */
-
static int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp) {
const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
const pgno_t pgno = mp->mp_pgno;
+ const bool is_dirty = IS_DIRTY(mp);
- mp->mp_txnid = INVALID_TXNID;
- if (txn->mt_parent) {
- mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0);
- mdbx_tassert(txn, mp != pgno2page(txn->mt_env, pgno));
- /* If txn has a parent, make sure the page is in our dirty list. */
- MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno);
- /* TODO: use extended flag-mask to track parent's dirty-pages */
- if (dp == nullptr) {
- mp->mp_next = txn->tw.retired2parent_pages;
- txn->tw.retired2parent_pages = mp;
- txn->tw.retired2parent_count += npages;
- return MDBX_SUCCESS;
- }
- if (unlikely(mp != dp)) { /* bad cursor? */
- mdbx_error(
- "wrong page 0x%p #%" PRIaPGNO " in the dirtylist, expecting %p",
- __Wpedantic_format_voidptr(dp), pgno, __Wpedantic_format_voidptr(mp));
- txn->mt_flags |= MDBX_TXN_ERROR;
- return MDBX_PROBLEM;
- }
- /* ok, it's ours */
+ if (is_dirty) {
+ mdbx_tassert(txn, !txn->tw.spill_pages ||
+ !mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1));
+ mdbx_tassert(txn, debug_dpl_find(txn->tw.dirtylist, pgno) == mp ||
+ txn->mt_parent);
+ } else {
+ mdbx_tassert(txn, !debug_dpl_find(txn->tw.dirtylist, pgno));
}
- mdbx_debug("loosen page %" PRIaPGNO, pgno);
- const bool is_dirty = IS_DIRTY(mp);
- if (MDBX_DEBUG != 0 ||
- unlikely((txn->mt_env->me_flags & MDBX_PAGEPERTURB) != 0)) {
- mdbx_kill_page(txn->mt_env, mp, pgno, npages);
- VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
- }
- VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
- pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
- ASAN_POISON_MEMORY_REGION(page_data(mp),
- pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
-
- if (unlikely(npages >
- 1 /* overflow pages doesn't comes to the loose-list */)) {
- if (is_dirty) {
- /* Remove from dirty list */
- MDBX_page *dp = mdbx_dpl_remove(txn->tw.dirtylist, pgno);
- if (unlikely(dp != mp)) {
- mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist",
+ const unsigned di = is_dirty ? mdbx_dpl_exist(txn->tw.dirtylist, pgno) : 0;
+ const unsigned si = (!is_dirty && txn->tw.spill_pages)
+ ? mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)
+ : 0;
+
+ if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) {
+ const char *kind;
+ if (di) {
+ mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == mp);
+ if (unlikely(txn->tw.dirtylist->items[di].ptr != mp)) {
+ wrong_dirty:
+ mdbx_error("wrong dirty page 0x%p #%" PRIaPGNO,
__Wpedantic_format_voidptr(mp), pgno);
txn->mt_flags |= MDBX_TXN_ERROR;
return MDBX_PROBLEM;
}
- txn->tw.dirtyroom++;
- mdbx_tassert(txn, txn->mt_parent ||
- txn->tw.dirtyroom + txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
- if ((txn->mt_flags & MDBX_WRITEMAP) == 0)
- mdbx_dpage_free(txn->mt_env, mp, npages);
+ /* Страница испачкана в этой транзакции, но до этого могла быть
+ * аллоцирована, испачкана и пролита в одной из родительских транзакций.
+ * Её МОЖНО вытолкнуть в нераспределенный хвост. */
+ kind = "dirty";
+ /* Remove from dirty list */
+ mdbx_page_wash(txn, di, mp, npages);
+ } else if (is_dirty) {
+ /* The page MUST BE from parent transaction. */
+ if (mdbx_audit_enabled()) {
+ const MDBX_page *parent_dp = nullptr;
+ for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp;
+ parent = parent->mt_parent) {
+ mdbx_tassert(txn,
+ !parent->tw.spill_pages ||
+ !mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1));
+ parent_dp = debug_dpl_find(parent->tw.dirtylist, pgno);
+ }
+ mdbx_tassert(txn, parent_dp == mp);
+ }
+ kind = "parent-dirty";
+ } else if (si) {
+ /* Страница пролита в этой транзакции, следовательно она аллоцирована
+ * и запачкана в этой или одной из родительских транзакций.
+ * Её МОЖНО вытолкнуть в нераспределенный хвост. */
+ kind = "spilled";
+ mdbx_spill_remove(txn, si, npages);
+ } else {
+ for (MDBX_txn *parent = txn->mt_parent;
+ parent && (parent->mt_flags & MDBX_TXN_SPILLS);
+ parent = parent->mt_parent)
+ if (parent->tw.spill_pages &&
+ mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)) {
+ /* Страница аллоцирована, запачкана и пролита в одной из родительских
+ * транзакций. Её МОЖНО вытолкнуть в нераспределенный хвост. */
+ kind = "parent-spilled";
+ goto refund;
+ }
+ /* Страница используется (входит в MVCC-снимки доступные читателям).
+ * Её НЕЛЬЗЯ вытолкнуть в нераспределенных хвост. */
+ goto retire;
}
+ refund:
+ mdbx_debug("refunded %u %s page %" PRIaPGNO, npages, kind, pgno);
+ txn->mt_next_pgno = pgno;
+ mdbx_refund(txn);
+ return MDBX_SUCCESS;
+ }
- if (unlikely(pgno + npages == txn->mt_next_pgno)) {
- txn->mt_next_pgno = pgno;
- mdbx_refund(txn);
- return MDBX_SUCCESS;
+ if (is_dirty) {
+ if (di) {
+ /* Dirty page from this transaction */
+ mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == mp);
+ if (unlikely(txn->tw.dirtylist->items[di].ptr != mp))
+ goto wrong_dirty;
+
+ /* If suitable we can reuse it through loose list */
+ if (likely(
+ npages == 1 &&
+ txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit &&
+ (!MDBX_ENABLE_REFUND ||
+ /* skip pages near to the end in favor of compactification */
+ txn->mt_next_pgno >
+ pgno + txn->mt_env->me_options.dp_loose_limit ||
+ txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) {
+ mdbx_debug("loosen dirty page %" PRIaPGNO, pgno);
+ mp->mp_flags = P_LOOSE | P_DIRTY;
+ mp->mp_next = txn->tw.loose_pages;
+ txn->tw.loose_pages = mp;
+ txn->tw.loose_count++;
+#if MDBX_ENABLE_REFUND
+ txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl)
+ ? pgno + 2
+ : txn->tw.loose_refund_wl;
+#endif /* MDBX_ENABLE_REFUND */
+ if (MDBX_DEBUG || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
+ memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ);
+ VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
+ txn->mt_env->me_psize - PAGEHDRSZ);
+ ASAN_POISON_MEMORY_REGION(page_data(mp),
+ txn->mt_env->me_psize - PAGEHDRSZ);
+ return MDBX_SUCCESS;
+ }
+
+#if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__)
+ if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
+#endif
+ {
+ /* Страница могла быть изменена в одной из родительских транзакций,
+ * в том числе, позже выгружена и затем снова загружена и изменена.
+ * В обоих случаях её нельзя затирать на диске и помечать недоступной
+ * в asan и/или valgrind */
+ for (MDBX_txn *parent = txn->mt_parent;
+ parent && (parent->mt_flags & MDBX_TXN_SPILLS);
+ parent = parent->mt_parent) {
+ if (parent->tw.spill_pages &&
+ mdbx_pnl_intersect(parent->tw.spill_pages, pgno << 1,
+ npages << 1))
+ goto skip_invalidate;
+ if (mdbx_dpl_intersect(parent->tw.dirtylist, pgno, npages))
+ goto skip_invalidate;
+ }
+
+#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
+ if (MDBX_DEBUG || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
+#endif
+ mdbx_kill_page(txn->mt_env, mp, pgno, npages);
+ if (!(txn->mt_flags & MDBX_WRITEMAP)) {
+ VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)),
+ pgno2bytes(txn->mt_env, npages) -
+ PAGEHDRSZ);
+ ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)),
+ pgno2bytes(txn->mt_env, npages) -
+ PAGEHDRSZ);
+ }
+ }
+ skip_invalidate:
+ /* Remove from dirty list */
+ mdbx_page_wash(txn, di, mp, npages);
+ } else {
+ /* Dirty page MUST BE a clone from (one of) parent transaction(s). */
+ if (mdbx_audit_enabled()) {
+ const MDBX_page *parent_dp = nullptr;
+ /* Check parent(s)'s dirty lists. */
+ for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp;
+ parent = parent->mt_parent) {
+ mdbx_tassert(txn,
+ !parent->tw.spill_pages ||
+ !mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1));
+ parent_dp = debug_dpl_find(parent->tw.dirtylist, pgno);
+ }
+ mdbx_tassert(txn, parent_dp == mp);
+ }
}
+ reclaim:
+ mdbx_debug("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno);
int rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
- mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
- txn->mt_next_pgno));
- return MDBX_SUCCESS;
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
+ mdbx_tassert(txn, mdbx_dirtylist_check(txn));
+ return rc;
}
- mp->mp_pgno = pgno;
- mp->mp_flags = P_LOOSE | P_DIRTY;
- mp->mp_next = txn->tw.loose_pages;
- txn->tw.loose_pages = mp;
- txn->tw.loose_count++;
- if (unlikely(txn->mt_next_pgno == pgno + 1))
- mdbx_refund(txn);
+ if (si) {
+ /* Page ws spilled in this txn */
+ mdbx_spill_remove(txn, si, npages);
+ /* Страница могла быть выделена и затем пролита в этой транзакции,
+ * тогда её необходимо поместить в reclaimed-список.
+ * Либо она могла быть выделена в одной из родительских транзакций и затем
+ * пролита в этой транзакции, тогда её необходимо поместить в retired-список
+ * для последующей фильтрации при коммите. */
+ for (MDBX_txn *parent = txn->mt_parent; parent;
+ parent = parent->mt_parent) {
+ if (mdbx_dpl_exist(parent->tw.dirtylist, pgno))
+ goto retire;
+ }
+ /* Страница точно была выделена в этой транзакции
+ * и теперь может быть использована повторно. */
+ goto reclaim;
+ } else {
+ /* Страница может входить в доступный читателям MVCC-снимок, либо же она
+ * могла быть выделена, а затем пролита в одной из родительских транзакций.
+ * Поэтому пока помещаем её в retired-список, который будет фильтроваться
+ * относительно dirty- и spilled-списков родительских транзакций при коммите
+ * дочерних транзакций, либо же будет записан в GC в неизменном виде. */
+ }
- return MDBX_SUCCESS;
+retire:
+ mdbx_debug("retire %u page %" PRIaPGNO, npages, pgno);
+ int rc = mdbx_pnl_append_range(false, &txn->tw.retired_pages, pgno, npages);
+ mdbx_tassert(txn, mdbx_dirtylist_check(txn));
+ return rc;
}
static int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) {
- const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
- const pgno_t pgno = mp->mp_pgno;
- MDBX_txn *const txn = mc->mc_txn;
-
if (unlikely(mc->mc_flags & C_SUB)) {
MDBX_db *outer = mdbx_outer_db(mc);
mdbx_cassert(mc, !IS_BRANCH(mp) || outer->md_branch_pages > 0);
@@ -7144,36 +7462,10 @@ static int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) {
mc->mc_db->md_branch_pages -= IS_BRANCH(mp);
mdbx_cassert(mc, !IS_LEAF(mp) || mc->mc_db->md_leaf_pages > 0);
mc->mc_db->md_leaf_pages -= IS_LEAF(mp);
- mdbx_cassert(mc, !IS_OVERFLOW(mp) || mc->mc_db->md_overflow_pages >= npages);
- mc->mc_db->md_overflow_pages -= IS_OVERFLOW(mp) ? npages : 0;
-
- if (IS_DIRTY(mp)) {
- int rc = mdbx_page_loose(txn, mp);
- if (unlikely(rc != MDBX_SUCCESS))
- mc->mc_flags &= ~(C_INITIALIZED | C_EOF);
- return rc;
- }
-
- if (txn->tw.spill_pages) {
- const unsigned i = mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1);
- if (i) {
- /* This page is no longer spilled */
- mdbx_tassert(txn, i == MDBX_PNL_SIZE(txn->tw.spill_pages) ||
- txn->tw.spill_pages[i + 1] >= (pgno + npages) << 1);
- txn->tw.spill_pages[i] |= 1;
- if (i == MDBX_PNL_SIZE(txn->tw.spill_pages))
- MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1;
- int rc = mdbx_page_loose(txn, mp);
- if (unlikely(rc != MDBX_SUCCESS))
- mc->mc_flags &= ~(C_INITIALIZED | C_EOF);
- return rc;
- }
- }
-
- mdbx_tassert(txn, mp == pgno2page(txn->mt_env, pgno));
- int rc = mdbx_pnl_append_range(&txn->tw.retired_pages, pgno, npages);
- mdbx_tassert(txn, mdbx_dpl_find(txn->tw.dirtylist, pgno) == nullptr);
- return rc;
+ mdbx_cassert(mc, !IS_OVERFLOW(mp) ||
+ mc->mc_db->md_overflow_pages >= mp->mp_pages);
+ mc->mc_db->md_overflow_pages -= IS_OVERFLOW(mp) ? mp->mp_pages : 0;
+ return mdbx_page_loose(mc->mc_txn, mp);
}
static __must_check_result __always_inline int
@@ -7195,66 +7487,69 @@ mdbx_retire_pgno(MDBX_cursor *mc, const pgno_t pgno) {
return rc;
}
-/* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
+/* Toggle P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
*
* [in] mc A cursor handle for the current operation.
* [in] pflags Flags of the pages to update:
* - P_DIRTY to set P_KEEP,
- * - P_DIRTY|P_KEEP to clear it.
- * [in] all No shortcuts. Needed except after a full mdbx_page_flush().
- *
- * Returns 0 on success, non-zero on failure. */
-static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) {
- const unsigned Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP;
- MDBX_txn *txn = mc->mc_txn;
- MDBX_cursor *m3, *m0 = mc;
- MDBX_xcursor *mx;
- MDBX_page *mp;
- unsigned i, j;
- int rc = MDBX_SUCCESS;
-
- /* Mark pages seen by cursors: First m0, then tracked cursors */
- for (i = txn->mt_numdbs;;) {
- if (mc->mc_flags & C_INITIALIZED) {
- for (m3 = mc;; m3 = &mx->mx_cursor) {
- mp = NULL;
- for (j = 0; j < m3->mc_snum; j++) {
- mp = m3->mc_pg[j];
- if ((mp->mp_flags & Mask) == pflags)
- mp->mp_flags ^= P_KEEP;
- }
- mx = m3->mc_xcursor;
- /* Proceed to mx if it is at a sub-database */
- if (!(mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
- break;
- if (!(mp && IS_LEAF(mp)))
- break;
- if (!(node_flags(page_node(mp, m3->mc_ki[j - 1])) & F_SUBDATA))
- break;
+ * - P_DIRTY|P_KEEP to clear it. */
+static void mdbx_cursor_xkeep(MDBX_cursor *mc, unsigned pflags) {
+ const unsigned mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP;
+ if (mc->mc_flags & C_INITIALIZED) {
+ MDBX_cursor *m3 = mc;
+ for (;;) {
+ MDBX_page *mp = NULL;
+ for (unsigned j = 0; j < m3->mc_snum; j++) {
+ mp = m3->mc_pg[j];
+ if ((mp->mp_flags & mask) == pflags)
+ mp->mp_flags ^= P_KEEP;
}
+ if (!(mp && IS_LEAF(mp)))
+ break;
+ /* Proceed to mx if it is at a sub-database */
+ MDBX_xcursor *mx = m3->mc_xcursor;
+ if (!(mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
+ break;
+ const unsigned nkeys = page_numkeys(mp);
+ unsigned ki = m3->mc_ki[m3->mc_top];
+ mdbx_cassert(mc, nkeys > 0 &&
+ (ki < nkeys ||
+ (ki == nkeys && (mx->mx_cursor.mc_flags & C_EOF))));
+ ki -= ki >= nkeys;
+ if (!(node_flags(page_node(mp, ki)) & F_SUBDATA))
+ break;
+ m3 = &mx->mx_cursor;
}
- mc = mc->mc_next;
- for (; !mc || mc == m0; mc = txn->tw.cursors[--i])
- if (i == 0)
- goto mark_done;
}
+}
+
+/* Mark pages seen by cursors: First m0, then tracked cursors
+ * [in] all No shortcuts. Needed except after a full mdbx_page_flush(). */
+static void mdbx_txn_xkeep(MDBX_txn *txn, MDBX_cursor *m0,
+ const unsigned pflags, const bool all) {
+ if (m0)
+ mdbx_cursor_xkeep(m0, pflags);
+
+ for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i)
+ if (txn->mt_dbistate[i] & DBI_DIRTY)
+ for (MDBX_cursor *mc = txn->tw.cursors[i]; mc; mc = mc->mc_next)
+ if (mc != m0)
+ mdbx_cursor_xkeep(mc, pflags);
-mark_done:
if (all) {
/* Mark dirty root pages */
- for (i = 0; i < txn->mt_numdbs; i++) {
+ const unsigned mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP;
+ for (unsigned i = 0; i < txn->mt_numdbs; i++) {
if (txn->mt_dbistate[i] & DBI_DIRTY) {
pgno_t pgno = txn->mt_dbs[i].md_root;
if (pgno == P_INVALID)
continue;
MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno);
- if (dp && (dp->mp_flags & Mask) == pflags)
+ if (dp && (dp->mp_flags & mask) == pflags)
dp->mp_flags ^= P_KEEP;
}
}
}
-
- return rc;
}
/* Spill pages from the dirty list back to disk.
@@ -7283,110 +7578,137 @@ mark_done:
* we can't spill a page in a child txn if it was already spilled in a
* parent txn. That would alter the parent txns' data even though
* the child hasn't committed yet, and we'd have no way to undo it if
- * the child aborted.
- *
- * [in] mc cursor A cursor handle identifying the transaction and
- * database for which we are checking space.
- * [in] key For a put operation, the key being stored.
- * [in] data For a put operation, the data being stored.
- *
- * Returns 0 on success, non-zero on failure. */
-static int mdbx_page_spill(MDBX_cursor *mc, const MDBX_val *key,
- const MDBX_val *data) {
- if (mc->mc_flags & C_SUB)
+ * the child aborted. */
+static int mdbx_txn_spill(MDBX_txn *txn, MDBX_cursor *m0, unsigned need) {
+#ifndef MDBX_DEBUG_SPILLING
+ if (likely(txn->tw.dirtyroom > need))
return MDBX_SUCCESS;
-
- MDBX_txn *txn = mc->mc_txn;
- MDBX_DPL dl = txn->tw.dirtylist;
-
- /* Estimate how much space this op will take */
- pgno_t i = mc->mc_db->md_depth;
- /* Named DBs also dirty the main DB */
- if (mc->mc_dbi >= CORE_DBS)
- i += txn->mt_dbs[MAIN_DBI].md_depth;
- /* For puts, roughly factor in the key+data size */
- if (key)
- i += bytes2pgno(txn->mt_env, node_size(key, data) + txn->mt_env->me_psize);
- i += i; /* double it for good measure */
- pgno_t need = i;
-
- if (txn->tw.dirtyroom > i)
+ unsigned spill = need - txn->tw.dirtyroom;
+#else
+ /* spill at least one page if defined MDBX_DEBUG_SPILLING */
+ unsigned spill = (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1;
+#endif /* MDBX_DEBUG_SPILLING */
+
+ const unsigned dirty = txn->tw.dirtylist->length;
+ const unsigned spill_min =
+ txn->mt_env->me_options.spill_min_denominator
+ ? dirty / txn->mt_env->me_options.spill_min_denominator
+ : 0;
+ const unsigned spill_max =
+ dirty - (txn->mt_env->me_options.spill_max_denominator
+ ? dirty / txn->mt_env->me_options.spill_max_denominator
+ : 0);
+ spill = (spill > spill_min) ? spill : spill_min;
+ spill = (spill < spill_max) ? spill : spill_max;
+ if (!spill)
return MDBX_SUCCESS;
- /* Less aggressive spill - we originally spilled the entire dirty list,
- * with a few exceptions for cursor pages and DB root pages. But this
- * turns out to be a lot of wasted effort because in a large txn many
- * of those pages will need to be used again. So now we spill only 1/8th
- * of the dirty pages. Testing revealed this to be a good tradeoff,
- * better than 1/2, 1/4, or 1/10. */
- if (need < MDBX_DPL_TXNFULL / 8)
- need = MDBX_DPL_TXNFULL / 8;
+ mdbx_notice("spilling %u dirty-entries (have %u dirty-room, need %u)", spill,
+ txn->tw.dirtyroom, need);
+ mdbx_tassert(txn, txn->tw.dirtylist->length >= spill);
+ int rc;
if (!txn->tw.spill_pages) {
- txn->tw.spill_pages = mdbx_pnl_alloc(need);
- if (unlikely(!txn->tw.spill_pages))
- return MDBX_ENOMEM;
+ txn->tw.spill_least_removed = INT_MAX;
+ txn->tw.spill_pages = mdbx_pnl_alloc(spill);
+ if (unlikely(!txn->tw.spill_pages)) {
+ rc = MDBX_ENOMEM;
+ goto bailout;
+ }
} else {
/* purge deleted slots */
- MDBX_PNL sl = txn->tw.spill_pages;
- pgno_t num = MDBX_PNL_SIZE(sl), j = 0;
- for (i = 1; i <= num; i++) {
- if ((sl[i] & 1) == 0)
- sl[++j] = sl[i];
- }
- MDBX_PNL_SIZE(sl) = j;
+ mdbx_spill_purge(txn);
+ rc = mdbx_pnl_reserve(&txn->tw.spill_pages, spill);
+ (void)rc /* ignore since the resulting list may be shorter
+ and mdbx_pnl_append() will increase pnl on demand */
+ ;
}
/* Preserve pages which may soon be dirtied again */
- int rc = mdbx_pages_xkeep(mc, P_DIRTY, true);
- if (unlikely(rc != MDBX_SUCCESS))
- goto bailout;
+ mdbx_txn_xkeep(txn, m0, P_DIRTY, true);
+ MDBX_dpl *const dl = mdbx_dpl_sort(txn->tw.dirtylist);
/* Save the page IDs of all the pages we're flushing */
/* flush from the tail forward, this saves a lot of shifting later on. */
- for (i = dl->length; i && need; i--) {
- pgno_t pn = dl[i].pgno << 1;
- MDBX_page *dp = dl[i].ptr;
- if (dp->mp_flags & (P_LOOSE | P_KEEP))
+ const unsigned dl_len_before = dl->length;
+ unsigned spilled = 0;
+ unsigned keep = dl_len_before;
+ for (; keep && spill; keep--) {
+ const pgno_t pgno = dl->items[keep].pgno;
+ MDBX_page *dp = dl->items[keep].ptr;
+ const unsigned npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1;
+ if (dp->mp_flags & (P_LOOSE | P_KEEP)) {
+ mdbx_debug("skip %s %u page %" PRIaPGNO,
+ (dp->mp_flags & P_LOOSE) ? "loose" : "keep", npages,
+ dp->mp_pgno);
+ skip:
continue;
+ }
/* Can't spill twice,
- * make sure it's not already in a parent's spill list. */
- if (txn->mt_parent) {
- MDBX_txn *parent;
- for (parent = txn->mt_parent; parent; parent = parent->mt_parent) {
+ * make sure it's not already in a parent's spill list(s). */
+ MDBX_txn *parent = txn->mt_parent;
+ if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) {
+ do
if (parent->tw.spill_pages &&
- mdbx_pnl_exist(parent->tw.spill_pages, pn)) {
+ mdbx_pnl_intersect(parent->tw.spill_pages, pgno << 1,
+ npages << 1)) {
+ mdbx_debug("skip parent-spilled %u page %" PRIaPGNO, npages, pgno);
dp->mp_flags |= P_KEEP;
- break;
+ goto skip;
}
- }
- if (parent)
- continue;
+ while ((parent = parent->mt_parent) != nullptr);
}
- rc = mdbx_pnl_append(&txn->tw.spill_pages, pn);
+ mdbx_debug("spill %u page %" PRIaPGNO, npages, dp->mp_pgno);
+ rc = mdbx_pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
- need--;
+ spill--;
+ spilled += 1;
}
mdbx_pnl_sort(txn->tw.spill_pages);
/* Flush the spilled part of dirty list */
- rc = mdbx_page_flush(txn, i);
+ rc = mdbx_page_flush(txn, keep);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
/* Reset any dirty pages we kept that page_flush didn't see */
- rc = mdbx_pages_xkeep(mc, P_DIRTY | P_KEEP, i != 0);
+ mdbx_tassert(txn, dl_len_before - spilled == dl->length);
+ mdbx_txn_xkeep(txn, m0, P_DIRTY | P_KEEP, keep > 0);
+ mdbx_tassert(txn, mdbx_dirtylist_check(txn));
bailout:
txn->mt_flags |= rc ? MDBX_TXN_ERROR : MDBX_TXN_SPILLS;
return rc;
}
+static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key,
+ const MDBX_val *data) {
+ if (mc->mc_flags & C_SUB)
+ return MDBX_SUCCESS;
+ MDBX_txn *txn = mc->mc_txn;
+ if (txn->mt_flags & MDBX_WRITEMAP)
+ return MDBX_SUCCESS;
+
+ /* Estimate how much space this operation will take: */
+ /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */
+ unsigned need = CURSOR_STACK + 3;
+ /* 2) GC/FreeDB for any payload */
+ if (mc->mc_dbi > FREE_DBI) {
+ need += txn->mt_dbs[FREE_DBI].md_depth + 3;
+ /* 3) Named DBs also dirty the main DB */
+ if (mc->mc_dbi > MAIN_DBI)
+ need += txn->mt_dbs[MAIN_DBI].md_depth + 3;
+ }
+ /* 4) Factor the key+data which to be put in */
+ need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1;
+ return mdbx_txn_spill(txn, mc, need);
+}
+
/*----------------------------------------------------------------------------*/
static __always_inline bool meta_bootid_match(const MDBX_meta *meta) {
- return meta->mm_bootid.x == bootid.x && meta->mm_bootid.y == bootid.y &&
+ return memcmp(&meta->mm_bootid, &bootid, 16) == 0 &&
(bootid.x | bootid.y) != 0;
}
@@ -7402,9 +7724,8 @@ static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta,
static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta,
const bool allow_volatile) {
- mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env));
- txnid_t a = safe64_read(&meta->mm_txnid_a);
- txnid_t b = safe64_read(&meta->mm_txnid_b);
+ txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a);
+ txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b);
if (allow_volatile)
return (a == b) ? a : 0;
mdbx_assert(env, a == b);
@@ -7423,33 +7744,35 @@ static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env,
static __inline void mdbx_meta_update_begin(const MDBX_env *env,
MDBX_meta *meta, txnid_t txnid) {
- mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env));
- mdbx_assert(env, meta->mm_txnid_a.inconsistent < txnid &&
- meta->mm_txnid_b.inconsistent < txnid);
+ mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
+ mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid &&
+ unaligned_peek_u64(4, meta->mm_txnid_b) < txnid);
(void)env;
- safe64_update(&meta->mm_txnid_a, txnid);
+ unaligned_poke_u64(4, meta->mm_txnid_b, 0);
+ unaligned_poke_u64(4, meta->mm_txnid_a, txnid);
}
static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta,
txnid_t txnid) {
- mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env));
- mdbx_assert(env, meta->mm_txnid_a.inconsistent == txnid);
- mdbx_assert(env, meta->mm_txnid_b.inconsistent < txnid);
+ mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
+ mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid);
+ mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid);
(void)env;
mdbx_jitter4testing(true);
- meta->mm_bootid = bootid;
- safe64_update(&meta->mm_txnid_b, txnid);
+ memcpy(&meta->mm_bootid, &bootid, 16);
+ unaligned_poke_u64(4, meta->mm_txnid_b, txnid);
}
static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta,
txnid_t txnid) {
- mdbx_assert(env, meta < METAPAGE(env, 0) || meta > METAPAGE_END(env));
+ mdbx_assert(env, !env->me_map || meta < METAPAGE(env, 0) ||
+ meta >= METAPAGE_END(env));
(void)env;
/* update inconsistent since this function used ONLY for filling meta-image
* for writing, but not the actual meta-page */
- meta->mm_bootid = bootid;
- meta->mm_txnid_a.inconsistent = txnid;
- meta->mm_txnid_b.inconsistent = txnid;
+ memcpy(&meta->mm_bootid, &bootid, 16);
+ unaligned_poke_u64(4, meta->mm_txnid_a, txnid);
+ unaligned_poke_u64(4, meta->mm_txnid_b, txnid);
}
static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) {
@@ -7579,8 +7902,10 @@ static txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) {
static const char *mdbx_durable_str(const MDBX_meta *const meta) {
if (META_IS_STEADY(meta))
- return (meta->mm_datasync_sign == mdbx_meta_sign(meta)) ? "Steady"
- : "Tainted";
+ return (unaligned_peek_u64(4, meta->mm_datasync_sign) ==
+ mdbx_meta_sign(meta))
+ ? "Steady"
+ : "Tainted";
return "Weak";
}
@@ -7662,15 +7987,22 @@ static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) {
static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) {
mp->mp_txnid = INVALID_TXNID;
mp->mp_flags |= P_DIRTY;
- const int rc = mdbx_dpl_append(txn->tw.dirtylist, mp->mp_pgno, mp);
+ if (unlikely(txn->tw.dirtyroom == 0)) {
+ mdbx_error("Dirtyroom is depleted, DPL length %u",
+ txn->tw.dirtylist->length);
+ return MDBX_TXN_FULL;
+ }
+ const int rc = mdbx_dpl_append(txn, mp->mp_pgno, mp);
if (unlikely(rc != MDBX_SUCCESS)) {
txn->mt_flags |= MDBX_TXN_ERROR;
return rc;
}
txn->tw.dirtyroom--;
+ assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
mdbx_tassert(txn, txn->mt_parent ||
txn->tw.dirtyroom + txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
+ txn->mt_env->me_options.dp_limit);
+ mdbx_tassert(txn, mdbx_dirtylist_check(txn));
return MDBX_SUCCESS;
}
@@ -8000,7 +8332,7 @@ static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady,
mdbx_warning("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady,
data_page(meta)->mp_pgno);
if (env->me_flags & MDBX_WRITEMAP)
- meta->mm_datasync_sign = wipe;
+ unaligned_poke_u64(4, meta->mm_datasync_sign, wipe);
else
return mdbx_pwrite(env->me_lazy_fd, &wipe, sizeof(meta->mm_datasync_sign),
(uint8_t *)&meta->mm_datasync_sign - env->me_map);
@@ -8096,32 +8428,40 @@ static __inline txnid_t pp_txnid2chk(const MDBX_txn *txn) {
__hot static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num,
MDBX_page **const mp, int flags) {
int rc;
- MDBX_txn *txn = mc->mc_txn;
- MDBX_env *env = txn->mt_env;
+ MDBX_txn *const txn = mc->mc_txn;
+ MDBX_env *const env = txn->mt_env;
MDBX_page *np;
+ const unsigned coalesce_threshold =
+ env->me_maxgc_ov1page - env->me_maxgc_ov1page / 4;
if (likely(flags & MDBX_ALLOC_GC)) {
flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM);
- if (unlikely(mc->mc_flags & C_RECLAIMING)) {
- /* If mc is updating the GC, then the retired-list cannot play
- * catch-up with itself by growing while trying to save it. */
- flags &= ~MDBX_ALLOC_GC;
- } else if (unlikely(txn->mt_dbs[FREE_DBI].md_entries == 0)) {
- /* avoid (recursive) search inside empty tree and while tree is updating,
- * https://github.com/erthink/libmdbx/issues/31 */
- flags &= ~MDBX_ALLOC_GC;
- }
+ if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > coalesce_threshold)
+ flags &= ~MDBX_COALESCE;
+ if (unlikely(
+ /* If mc is updating the GC, then the retired-list cannot play
+ catch-up with itself by growing while trying to save it. */
+ (mc->mc_flags & C_RECLAIMING) ||
+ /* avoid (recursive) search inside empty tree and while tree is
+ updating, https://github.com/erthink/libmdbx/issues/31 */
+ txn->mt_dbs[FREE_DBI].md_entries == 0 ||
+ /* If our dirty list is already full, we can't touch GC */
+ (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth &&
+ !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY))))
+ flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE);
}
if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) {
/* If there are any loose pages, just use them */
mdbx_assert(env, mp && num);
if (likely(txn->tw.loose_pages)) {
+#if MDBX_ENABLE_REFUND
if (txn->tw.loose_refund_wl > txn->mt_next_pgno) {
mdbx_refund(txn);
if (unlikely(!txn->tw.loose_pages))
- goto skip_cache;
+ goto no_loose;
}
+#endif /* MDBX_ENABLE_REFUND */
np = txn->tw.loose_pages;
txn->tw.loose_pages = np->mp_next;
@@ -8137,10 +8477,13 @@ __hot static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num,
return MDBX_SUCCESS;
}
}
-skip_cache:
+#if MDBX_ENABLE_REFUND
+no_loose:
+#endif /* MDBX_ENABLE_REFUND */
- mdbx_tassert(
- txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
pgno_t pgno, *re_list = txn->tw.reclaimed_pglist;
unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list);
txnid_t oldest = 0, last = 0;
@@ -8156,8 +8499,8 @@ skip_cache:
* Prefer pages with lower pgno. */
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno));
- if (likely(flags & MDBX_ALLOC_CACHE) && re_len > wanna_range &&
- (!(flags & MDBX_COALESCE) || op == MDBX_FIRST)) {
+ if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE &&
+ re_len > wanna_range) {
mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno &&
MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno);
range_begin = MDBX_PNL_ASCENDING ? 1 : re_len;
@@ -8187,11 +8530,6 @@ skip_cache:
}
if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */
- if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) &&
- !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)) {
- /* If our dirty list is already full, we can't touch GC */
- flags &= ~MDBX_ALLOC_GC;
- }
if (unlikely(!(flags & MDBX_ALLOC_GC)))
break /* reclaiming is prohibited for now */;
@@ -8302,22 +8640,16 @@ skip_cache:
goto fail;
}
const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl);
- /* TODO: provide a user-configurable threshold */
- const unsigned threshold_2_stop_gc_reclaiming = MDBX_PNL_MAX / 4;
- if (unlikely(gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >
- threshold_2_stop_gc_reclaiming) &&
+ if (flags != MDBX_ALLOC_GC &&
+ unlikely(gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >
+ env->me_options.rp_augment_limit) &&
(pgno_add(txn->mt_next_pgno, num) <= txn->mt_geo.upper ||
gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >=
- MDBX_PNL_MAX / 16 * 15)) {
+ MDBX_PGL_LIMIT / 16 * 15)) {
/* Stop reclaiming to avoid overflow the page list.
* This is a rare case while search for a continuously multi-page region
* in a large database. https://github.com/erthink/libmdbx/issues/123 */
- flags &= ~MDBX_ALLOC_GC;
- if (unlikely((flags & MDBX_ALLOC_ALL) == 0)) {
- /* Oh, we can't do anything */
- rc = MDBX_TXN_FULL;
- goto fail;
- }
+ flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE);
break;
}
rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len);
@@ -8337,10 +8669,9 @@ skip_cache:
mdbx_debug_extra("PNL read txn %" PRIaTXN " root %" PRIaPGNO
" num %u, PNL",
last, txn->mt_dbs[FREE_DBI].md_root, gc_len);
- unsigned i;
- for (i = gc_len; i; i--)
+ for (unsigned i = gc_len; i; i--)
mdbx_debug_extra_print(" %" PRIaPGNO, gc_pnl[i]);
- mdbx_debug_extra_print("%s", "\n");
+ mdbx_debug_extra_print("%s\n", ".");
}
/* Merge in descending sorted order */
@@ -8351,56 +8682,32 @@ skip_cache:
rc = MDBX_CORRUPTED;
goto fail;
}
+ mdbx_tassert(txn, mdbx_dirtylist_check(txn));
re_len = MDBX_PNL_SIZE(re_list);
mdbx_tassert(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno);
- if (re_len && unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) {
+ if (MDBX_ENABLE_REFUND && re_len &&
+ unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) {
/* Refund suitable pages into "unallocated" space */
mdbx_refund(txn);
re_list = txn->tw.reclaimed_pglist;
re_len = MDBX_PNL_SIZE(re_list);
}
- if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) {
- /* Done for a kick-reclaim mode, actually no page needed */
+ /* Done for a kick-reclaim mode, actually no page needed */
+ if (unlikely((flags & MDBX_ALLOC_CACHE) == 0))
return MDBX_SUCCESS;
- }
/* Don't try to coalesce too much. */
- if (unlikely(re_len > MDBX_DPL_TXNFULL / 42))
- break;
- if (re_len /* current size */ >= env->me_maxgc_ov1page ||
+ if (re_len /* current size */ > coalesce_threshold ||
(re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >=
- env->me_maxgc_ov1page / 2))
+ coalesce_threshold / 2))
flags &= ~MDBX_COALESCE;
}
- if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE &&
- re_len > wanna_range) {
- range_begin = MDBX_PNL_ASCENDING ? 1 : re_len;
- pgno = MDBX_PNL_LEAST(re_list);
- if (likely(wanna_range == 0))
- goto done;
-#if MDBX_PNL_ASCENDING
- mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1);
- while (true) {
- unsigned range_end = range_begin + wanna_range;
- if (re_list[range_end] - pgno == wanna_range)
- goto done;
- if (range_end == re_len)
- break;
- pgno = re_list[++range_begin];
- }
-#else
- mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len);
- while (true) {
- if (re_list[range_begin - wanna_range] - pgno == wanna_range)
- goto done;
- if (range_begin == wanna_range)
- break;
- pgno = re_list[--range_begin];
- }
-#endif /* MDBX_PNL sort-order */
+ if (F_ISSET(flags, MDBX_COALESCE | MDBX_ALLOC_CACHE)) {
+ flags -= MDBX_COALESCE;
+ continue;
}
/* There is no suitable pages in the GC and to be able to allocate
@@ -8512,8 +8819,9 @@ skip_cache:
}
fail:
- mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
- txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
if (likely(mp)) {
*mp = nullptr;
txn->mt_flags |= MDBX_TXN_ERROR;
@@ -8525,10 +8833,12 @@ skip_cache:
done:
if (unlikely(mp == nullptr))
return MDBX_SUCCESS;
+ if (unlikely(txn->tw.dirtyroom < 1))
+ return MDBX_TXN_FULL;
mdbx_ensure(env, pgno >= NUM_METAS);
if (env->me_flags & MDBX_WRITEMAP) {
np = pgno2page(env, pgno);
- /* LY: reset no-access flag from mdbx_loose_page() */
+ /* LY: reset no-access flag from mdbx_page_loose() */
VALGRIND_MAKE_MEM_UNDEFINED(np, pgno2bytes(env, num));
ASAN_UNPOISON_MEMORY_REGION(np, pgno2bytes(env, num));
} else {
@@ -8552,8 +8862,9 @@ done:
for (unsigned i = range_begin - num; i < re_len;)
re_list[++i] = re_list[++range_begin];
#endif
- mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
- txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
} else {
txn->mt_next_pgno = pgno + num;
mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno);
@@ -8566,14 +8877,18 @@ done:
np->mp_pgno = pgno;
np->mp_leaf2_ksize = 0;
np->mp_flags = 0;
- np->mp_pages = num;
+ if ((mdbx_assert_enabled() || mdbx_audit_enabled()) && num > 1) {
+ np->mp_pages = num;
+ np->mp_flags = P_OVERFLOW;
+ }
rc = mdbx_page_dirty(txn, np);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
*mp = np;
- mdbx_tassert(
- txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
return MDBX_SUCCESS;
}
@@ -8609,44 +8924,46 @@ __hot static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, size_t psize) {
* ret is unchanged if mp wasn't spilled. */
static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp,
MDBX_page **ret) {
- MDBX_env *env = txn->mt_env;
- pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
-
- for (const MDBX_txn *tx2 = txn; tx2; tx2 = tx2->mt_parent) {
- if (!tx2->tw.spill_pages)
+ mdbx_tassert(txn, !IS_DIRTY(mp));
+ const pgno_t spilled_pgno = mp->mp_pgno << 1;
+ const MDBX_txn *scan = txn;
+ do {
+ if ((scan->mt_flags & MDBX_TXN_SPILLS) == 0)
+ break;
+ if (!scan->tw.spill_pages)
continue;
- unsigned i = mdbx_pnl_exist(tx2->tw.spill_pages, pn);
- if (!i)
+ const unsigned si = mdbx_pnl_exist(scan->tw.spill_pages, spilled_pgno);
+ if (!si)
continue;
- if (txn->tw.dirtyroom == 0)
- return MDBX_TXN_FULL;
- unsigned num = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
+ const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
MDBX_page *np = mp;
- if ((env->me_flags & MDBX_WRITEMAP) == 0) {
- np = mdbx_page_malloc(txn, num);
+ if ((txn->mt_flags & MDBX_WRITEMAP) == 0) {
+ np = mdbx_page_malloc(txn, npages);
if (unlikely(!np))
return MDBX_ENOMEM;
- if (unlikely(num > 1))
- memcpy(np, mp, pgno2bytes(env, num));
+ if (likely(npages == 1))
+ mdbx_page_copy(np, mp, txn->mt_env->me_psize);
else
- mdbx_page_copy(np, mp, env->me_psize);
+ memcpy(np, mp, pgno2bytes(txn->mt_env, npages));
}
mdbx_debug("unspill page %" PRIaPGNO, mp->mp_pgno);
- if (tx2 == txn) {
+ if (scan == txn) {
/* If in current txn, this page is no longer spilled.
* If it happens to be the last page, truncate the spill list.
* Otherwise mark it as deleted by setting the LSB. */
- txn->tw.spill_pages[i] |= 1;
- if (i == MDBX_PNL_SIZE(txn->tw.spill_pages))
- MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1;
+ mdbx_spill_remove(txn, si, npages);
} /* otherwise, if belonging to a parent txn, the
* page remains spilled until child commits */
int rc = mdbx_page_dirty(txn, np);
- if (likely(rc == MDBX_SUCCESS))
- *ret = np;
- return rc;
- }
+ if (unlikely(rc != MDBX_SUCCESS)) {
+ if ((txn->mt_flags & MDBX_WRITEMAP) == 0)
+ mdbx_dpage_free(txn->mt_env, np, npages);
+ return rc;
+ }
+ *ret = np;
+ break;
+ } while ((scan = scan->mt_parent) != nullptr);
return MDBX_SUCCESS;
}
@@ -8657,14 +8974,31 @@ static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp,
*
* Returns 0 on success, non-zero on failure. */
__hot static int mdbx_page_touch(MDBX_cursor *mc) {
- MDBX_page *mp = mc->mc_pg[mc->mc_top], *np;
+ MDBX_page *const mp = mc->mc_pg[mc->mc_top], *np;
MDBX_txn *txn = mc->mc_txn;
MDBX_cursor *m2, *m3;
pgno_t pgno;
int rc;
- mdbx_cassert(mc, !IS_OVERFLOW(mp));
- if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
+ if (mdbx_assert_enabled()) {
+ if (mc->mc_dbi >= CORE_DBS) {
+ if (mc->mc_flags & C_SUB) {
+ MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db);
+ MDBX_cursor_couple *couple =
+ container_of(mx, MDBX_cursor_couple, inner);
+ mdbx_cassert(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db);
+ mdbx_cassert(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx);
+ mdbx_cassert(mc, *couple->outer.mc_dbistate & DBI_DIRTY);
+ } else {
+ mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY);
+ }
+ mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY);
+ }
+ mdbx_cassert(mc, !IS_OVERFLOW(mp));
+ mdbx_tassert(txn, mdbx_dirtylist_check(txn));
+ }
+
+ if (!IS_DIRTY(mp)) {
if (txn->mt_flags & MDBX_TXN_SPILLS) {
np = NULL;
rc = mdbx_page_unspill(txn, mp, &np);
@@ -8677,12 +9011,12 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) {
if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, 1)) ||
(rc = mdbx_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL))))
goto fail;
+
pgno = np->mp_pgno;
mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc),
mp->mp_pgno, pgno);
mdbx_cassert(mc, mp->mp_pgno != pgno);
mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno);
- mdbx_tassert(txn, mdbx_dpl_find(txn->tw.dirtylist, mp->mp_pgno) == nullptr);
/* Update the parent page, if any, to point to the new page */
if (mc->mc_top) {
MDBX_page *parent = mc->mc_pg[mc->mc_top - 1];
@@ -8710,18 +9044,25 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) {
}
mdbx_debug("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno);
- mdbx_cassert(mc, txn->tw.dirtylist->length <= MDBX_DPL_TXNFULL);
+ mdbx_cassert(mc, txn->tw.dirtylist->length <=
+ MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
/* No - copy it */
np = mdbx_page_malloc(txn, 1);
if (unlikely(!np)) {
rc = MDBX_ENOMEM;
goto fail;
}
- rc = mdbx_dpl_append(txn->tw.dirtylist, pgno, np);
+ /* insert a clone of parent's dirty page, so don't touch dirtyroom */
+ rc = mdbx_dpl_append(txn, pgno, np);
if (unlikely(rc)) {
mdbx_dpage_free(txn->mt_env, np, 1);
goto fail;
}
+
+ np->mp_pgno = pgno;
+ np->mp_txnid = INVALID_TXNID;
+ np->mp_flags |= P_DIRTY;
+ mdbx_tassert(txn, mdbx_dirtylist_check(txn));
} else {
return MDBX_SUCCESS;
}
@@ -8956,13 +9297,11 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) {
if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */)
mc->mc_signature = stage /* Promote closed state to parent txn */;
else if (merge) {
- /* Preserve changes from nested to parent txn */
+ /* Restore pointers to parent txn */
mc->mc_next = bk->mc_next;
mc->mc_backup = bk->mc_backup;
mc->mc_txn = bk->mc_txn;
- *bk->mc_db = *mc->mc_db;
mc->mc_db = bk->mc_db;
- *bk->mc_dbistate = *mc->mc_dbistate;
mc->mc_dbistate = bk->mc_dbistate;
if (mx) {
if (mx != bk->mc_xcursor) {
@@ -9283,7 +9622,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
if (likely(r)) {
safe64_reset(&r->mr_txnid, false);
r->mr_snapshot_pages_used = meta->mm_geo.next;
- r->mr_snapshot_pages_retired = meta->mm_pages_retired;
+ r->mr_snapshot_pages_retired =
+ unaligned_peek_u64(4, meta->mm_pages_retired);
safe64_write(&r->mr_txnid, snap);
mdbx_jitter4testing(false);
mdbx_assert(env, r->mr_pid == mdbx_getpid());
@@ -9321,7 +9661,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
txn->mt_canary = meta->mm_canary;
if (likely(r)) {
r->mr_snapshot_pages_used = meta->mm_geo.next;
- r->mr_snapshot_pages_retired = meta->mm_pages_retired;
+ r->mr_snapshot_pages_retired =
+ unaligned_peek_u64(4, meta->mm_pages_retired);
r->mr_txnid.inconsistent = txn->mt_txnid;
mdbx_jitter4testing(false);
mdbx_assert(env, r->mr_pid == mdbx_getpid());
@@ -9388,8 +9729,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
const txnid_t snap = mdbx_meta_txnid_stable(env, meta);
txn->mt_txnid = safe64_txnid_next(snap);
if (unlikely(txn->mt_txnid > MAX_TXNID)) {
- mdbx_error("%s", "txnid overflow!");
rc = MDBX_TXN_FULL;
+ mdbx_error("txnid overflow, raise %d", rc);
goto bailout;
}
@@ -9397,11 +9738,14 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
txn->mt_child = NULL;
txn->tw.loose_pages = NULL;
txn->tw.loose_count = 0;
- txn->tw.dirtyroom = MDBX_DPL_TXNFULL;
- txn->tw.dirtylist = env->me_dirtylist;
+#if MDBX_ENABLE_REFUND
+ txn->tw.loose_refund_wl = 0;
+#endif /* MDBX_ENABLE_REFUND */
+ txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit;
mdbx_dpl_clear(txn->tw.dirtylist);
MDBX_PNL_SIZE(txn->tw.retired_pages) = 0;
txn->tw.spill_pages = NULL;
+ txn->tw.spill_least_removed = 0;
txn->tw.last_reclaimed = 0;
if (txn->tw.lifo_reclaimed)
MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0;
@@ -9412,7 +9756,6 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db));
/* Moved to here to avoid a data race in read TXNs */
txn->mt_geo = meta->mm_geo;
- txn->tw.loose_refund_wl = txn->mt_next_pgno;
}
/* Setup db info */
@@ -9599,6 +9942,16 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
if (unlikely(rc != MDBX_SUCCESS))
return rc;
+ if (env->me_options.spill_parent4child_denominator) {
+ /* Spill dirty-pages of parent to provide dirtyroom for child txn */
+ rc = mdbx_txn_spill(parent, nullptr,
+ parent->tw.dirtylist->length /
+ env->me_options.spill_parent4child_denominator);
+ if (unlikely(rc != MDBX_SUCCESS))
+ return rc;
+ }
+ mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0);
+
flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS);
/* Child txns save MDBX_pgstate and use own copy of cursors */
size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1);
@@ -9628,26 +9981,59 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
txn->mt_env = env;
if (parent) {
- mdbx_tassert(txn, mdbx_dirtylist_check(parent));
+ mdbx_tassert(parent, mdbx_dirtylist_check(parent));
txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs);
txn->mt_dbiseqs = parent->mt_dbiseqs;
- txn->tw.dirtylist = mdbx_malloc(sizeof(MDBX_DP) * (MDBX_DPL_TXNFULL + 1));
- txn->tw.reclaimed_pglist =
- mdbx_pnl_alloc(MDBX_PNL_ALLOCLEN(parent->tw.reclaimed_pglist));
- if (!txn->tw.dirtylist || !txn->tw.reclaimed_pglist) {
+ rc = mdbx_dpl_alloc(txn);
+ if (likely(rc == MDBX_SUCCESS)) {
+ txn->tw.reclaimed_pglist =
+ mdbx_pnl_alloc(MDBX_PNL_ALLOCLEN(parent->tw.reclaimed_pglist));
+ if (unlikely(!txn->tw.reclaimed_pglist))
+ rc = MDBX_ENOMEM;
+ }
+ if (unlikely(rc != MDBX_SUCCESS)) {
+ nested_failed:
mdbx_pnl_free(txn->tw.reclaimed_pglist);
- mdbx_free(txn->tw.dirtylist);
+ mdbx_dpl_free(txn);
mdbx_free(txn);
- return MDBX_ENOMEM;
+ return rc;
}
- mdbx_dpl_clear(txn->tw.dirtylist);
+ txn->tw.dirtyroom = parent->tw.dirtyroom;
+
+ /* Move loose pages to reclaimed list */
+ if (parent->tw.loose_count) {
+ do {
+ MDBX_page *lp = parent->tw.loose_pages;
+ const unsigned di = mdbx_dpl_exist(parent->tw.dirtylist, lp->mp_pgno);
+ mdbx_tassert(parent, di && parent->tw.dirtylist->items[di].ptr == lp);
+ mdbx_tassert(parent, lp->mp_flags == (P_LOOSE | P_DIRTY));
+ rc =
+ mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1);
+ if (unlikely(rc != MDBX_SUCCESS))
+ goto nested_failed;
+ parent->tw.loose_pages = lp->mp_next;
+ /* Remove from dirty list */
+ mdbx_page_wash(parent, di, lp, 1);
+ } while (parent->tw.loose_pages);
+ parent->tw.loose_count = 0;
+#if MDBX_ENABLE_REFUND
+ parent->tw.loose_refund_wl = 0;
+#endif /* MDBX_ENABLE_REFUND */
+ mdbx_tassert(parent, mdbx_dirtylist_check(parent));
+ }
+
+ mdbx_dpl_sort(parent->tw.dirtylist);
+ if (parent->tw.spill_pages)
+ mdbx_spill_purge(parent);
+
memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist,
MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist));
mdbx_assert(env, mdbx_pnl_check4assert(
txn->tw.reclaimed_pglist,
(txn->mt_next_pgno /* LY: intentional assignment here,
only for assertion */
- = parent->mt_next_pgno)));
+ = parent->mt_next_pgno) -
+ MDBX_ENABLE_REFUND));
txn->tw.last_reclaimed = parent->tw.last_reclaimed;
if (parent->tw.lifo_reclaimed) {
@@ -9661,9 +10047,10 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
(void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.retired_pages);
txn->mt_txnid = parent->mt_txnid;
- txn->tw.dirtyroom = parent->tw.dirtyroom;
txn->mt_geo = parent->mt_geo;
- txn->tw.loose_refund_wl = parent->tw.loose_refund_wl;
+#if MDBX_ENABLE_REFUND
+ txn->tw.loose_refund_wl = 0;
+#endif /* MDBX_ENABLE_REFUND */
txn->mt_canary = parent->mt_canary;
parent->mt_flags |= MDBX_TXN_HAS_CHILD;
parent->mt_child = txn;
@@ -9678,7 +10065,8 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
mdbx_tassert(parent,
parent->mt_parent ||
parent->tw.dirtyroom + parent->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
+ env->me_options.dp_limit);
+ assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
env->me_txn = txn;
rc = mdbx_cursor_shadow(parent, txn);
if (unlikely(rc != MDBX_SUCCESS))
@@ -9699,10 +10087,12 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
mdbx_assert(env, (txn->mt_flags &
~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP |
/* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0);
- else
- mdbx_assert(env,
- (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED |
- MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC)) == 0);
+ else {
+ mdbx_assert(env, (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED |
+ MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC |
+ MDBX_TXN_SPILLS)) == 0);
+ assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed);
+ }
txn->mt_signature = MDBX_MT_SIGNATURE;
txn->mt_userctx = context;
*ret = txn;
@@ -9743,7 +10133,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
/* fetch info from volatile head */
head_meta = mdbx_meta_head(env);
head_txnid = mdbx_meta_txnid_fluid(env, head_meta);
- head_retired = head_meta->mm_pages_retired;
+ head_retired = unaligned_peek_u64(4, head_meta->mm_pages_retired);
info->txn_space_limit_soft = pgno2bytes(env, head_meta->mm_geo.now);
info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper);
info->txn_space_leftover =
@@ -9802,7 +10192,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
: MDBX_PNL_SIZE(txn->tw.retired_pages));
info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom);
info->txn_space_dirty =
- pgno2bytes(env, MDBX_DPL_TXNFULL - txn->tw.dirtyroom);
+ pgno2bytes(env, txn->mt_env->me_options.dp_limit - txn->tw.dirtyroom);
info->txn_reader_lag = INT64_MAX;
MDBX_lockinfo *const lck = env->me_lck;
if (scan_rlt && lck) {
@@ -9849,13 +10239,50 @@ int mdbx_txn_flags(const MDBX_txn *txn) {
return txn->mt_flags;
}
+/* Check for misused dbi handles */
+#define TXN_DBI_CHANGED(txn, dbi) \
+ ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
+
+static void dbi_import_locked(MDBX_txn *txn) {
+ MDBX_env *const env = txn->mt_env;
+ const unsigned n = env->me_numdbs;
+ for (unsigned i = CORE_DBS; i < n; ++i) {
+ if (i >= txn->mt_numdbs) {
+ txn->mt_dbistate[i] = 0;
+ if (!(txn->mt_flags & MDBX_TXN_RDONLY))
+ txn->tw.cursors[i] = NULL;
+ }
+ if ((env->me_dbflags[i] & DB_VALID) &&
+ !(txn->mt_dbistate[i] & DBI_USRVALID)) {
+ txn->mt_dbiseqs[i] = env->me_dbiseqs[i];
+ txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS;
+ txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE;
+ mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL);
+ }
+ }
+ txn->mt_numdbs = n;
+}
+
+/* Import DBI which opened after txn started into context */
+static __cold bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) {
+ if (dbi < CORE_DBS || dbi >= txn->mt_env->me_numdbs)
+ return false;
+
+ mdbx_ensure(txn->mt_env, mdbx_fastmutex_acquire(&txn->mt_env->me_dbi_lock) ==
+ MDBX_SUCCESS);
+ dbi_import_locked(txn);
+ mdbx_ensure(txn->mt_env, mdbx_fastmutex_release(&txn->mt_env->me_dbi_lock) ==
+ MDBX_SUCCESS);
+ return txn->mt_dbistate[dbi] & DBI_USRVALID;
+}
+
/* Export or close DBI handles opened in this txn. */
-static void mdbx_dbis_update(MDBX_txn *txn, int keep) {
+static void dbi_update(MDBX_txn *txn, int keep) {
mdbx_tassert(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0);
MDBX_dbi n = txn->mt_numdbs;
if (n) {
bool locked = false;
- MDBX_env *env = txn->mt_env;
+ MDBX_env *const env = txn->mt_env;
for (unsigned i = n; --i >= CORE_DBS;) {
if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0))
@@ -9865,11 +10292,10 @@ static void mdbx_dbis_update(MDBX_txn *txn, int keep) {
mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS);
locked = true;
}
+ if (env->me_numdbs <= i || txn->mt_dbiseqs[i] != env->me_dbiseqs[i])
+ continue /* dbi explicitly closed and/or then re-opened by other txn */;
if (keep) {
env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID;
- mdbx_compiler_barrier();
- if (env->me_numdbs <= i)
- env->me_numdbs = i + 1;
} else {
char *ptr = env->me_dbxs[i].md_name.iov_base;
if (ptr) {
@@ -9883,17 +10309,95 @@ static void mdbx_dbis_update(MDBX_txn *txn, int keep) {
}
}
+ n = env->me_numdbs;
+ if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) {
+ if (!locked) {
+ mdbx_ensure(env,
+ mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS);
+ locked = true;
+ }
+
+ n = env->me_numdbs;
+ while (n > CORE_DBS && !(env->me_dbflags[n - 1] & DB_VALID))
+ --n;
+ env->me_numdbs = n;
+ }
+
if (unlikely(locked))
mdbx_ensure(env,
mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
}
}
+/* Filter-out pgno list from transaction's dirty-page list */
+static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl,
+ const bool spilled) {
+ if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) {
+ mdbx_tassert(txn, mdbx_pnl_check4assert(pl, txn->mt_next_pgno << spilled));
+ MDBX_dpl *dl = mdbx_dpl_sort(txn->tw.dirtylist);
+
+ /* Scanning in ascend order */
+ const int step = MDBX_PNL_ASCENDING ? 1 : -1;
+ const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl);
+ const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(pl) + 1 : 0;
+ mdbx_tassert(txn, pl[begin] <= pl[end - step]);
+
+ unsigned r = mdbx_dpl_search(dl, pl[begin] >> spilled);
+ mdbx_tassert(txn, dl->sorted == dl->length);
+ for (int i = begin; r <= dl->length;) { /* scan loop */
+ assert(i != end);
+ mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0);
+ pgno_t pl_pgno = pl[i] >> spilled;
+ pgno_t dp_pgno = dl->items[r].pgno;
+ if (likely(dp_pgno != pl_pgno)) {
+ const bool cmp = dp_pgno < pl_pgno;
+ r += cmp;
+ i += cmp ? 0 : step;
+ if (likely(i != end))
+ continue;
+ return;
+ }
+
+ /* update loop */
+ unsigned w = r;
+ remove_dl:
+ if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) {
+ MDBX_page *dp = dl->items[r].ptr;
+ mdbx_dpage_free(txn->mt_env, dp, IS_OVERFLOW(dp) ? dp->mp_pages : 1);
+ }
+ ++r;
+ next_i:
+ i += step;
+ if (unlikely(i == end)) {
+ while (r <= dl->length)
+ dl->items[w++] = dl->items[r++];
+ } else {
+ while (r <= dl->length) {
+ assert(i != end);
+ mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0);
+ pl_pgno = pl[i] >> spilled;
+ dp_pgno = dl->items[r].pgno;
+ if (dp_pgno < pl_pgno)
+ dl->items[w++] = dl->items[r++];
+ else if (dp_pgno > pl_pgno)
+ goto next_i;
+ else
+ goto remove_dl;
+ }
+ }
+ dl->sorted = dl->length = w - 1;
+ txn->tw.dirtyroom += r - w;
+ assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
+ return;
+ }
+ }
+}
+
/* End a transaction, except successful commit of a nested transaction.
* May be called twice for readonly txns: First reset it, then abort.
* [in] txn the transaction handle to end
* [in] mode why and how to end the transaction */
-static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) {
+static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) {
MDBX_env *env = txn->mt_env;
static const char *const names[] = MDBX_END_NAMES;
@@ -9955,18 +10459,20 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) {
#endif
if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */
mdbx_cursors_eot(txn, false);
- if (!(env->me_flags & MDBX_WRITEMAP))
- mdbx_dlist_free(txn);
txn->mt_flags = MDBX_TXN_FINISHED;
txn->mt_owner = 0;
env->me_txn = txn->mt_parent;
+ mdbx_pnl_free(txn->tw.spill_pages);
+ txn->tw.spill_pages = nullptr;
if (txn == env->me_txn0) {
mdbx_assert(env, txn->mt_parent == NULL);
/* Export or close DBI handles created in this txn */
- mdbx_dbis_update(txn, mode & MDBX_END_UPDATE);
+ dbi_update(txn, mode & MDBX_END_UPDATE);
mdbx_pnl_shrink(&txn->tw.retired_pages);
mdbx_pnl_shrink(&txn->tw.reclaimed_pglist);
+ if (!(env->me_flags & MDBX_WRITEMAP))
+ mdbx_dlist_free(txn);
/* The writer mutex was locked in mdbx_txn_begin. */
mdbx_txn_unlock(env);
} else {
@@ -9975,10 +10481,9 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) {
mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE);
mdbx_assert(env, parent->mt_child == txn &&
(parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0);
- mdbx_assert(env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
- txn->mt_next_pgno));
- mdbx_pnl_free(txn->tw.reclaimed_pglist);
- mdbx_pnl_free(txn->tw.spill_pages);
+ mdbx_assert(
+ env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
if (txn->tw.lifo_reclaimed) {
mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >=
@@ -9996,9 +10501,14 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) {
parent->tw.retired_pages = txn->tw.retired_pages;
}
- mdbx_free(txn->tw.dirtylist);
- parent->mt_child = NULL;
+ parent->mt_child = nullptr;
parent->mt_flags &= ~MDBX_TXN_HAS_CHILD;
+ mdbx_tassert(parent, mdbx_dirtylist_check(parent));
+ mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0);
+ if (!(env->me_flags & MDBX_WRITEMAP))
+ mdbx_dlist_free(txn);
+ mdbx_dpl_free(txn);
+ mdbx_pnl_free(txn->tw.reclaimed_pglist);
if (parent->mt_geo.upper != txn->mt_geo.upper ||
parent->mt_geo.now != txn->mt_geo.now) {
@@ -10080,6 +10590,7 @@ int mdbx_txn_abort(MDBX_txn *txn) {
if (txn->mt_child)
mdbx_txn_abort(txn->mt_child);
+ mdbx_tassert(txn, mdbx_dirtylist_check(txn));
return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE);
}
@@ -10091,10 +10602,7 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored,
pgno_t pending = 0;
if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) {
pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) +
- (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored) +
- txn->tw.retired2parent_count;
- for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent)
- pending += parent->tw.loose_count;
+ (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored);
}
MDBX_cursor_couple cx;
@@ -10158,10 +10666,8 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored,
memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base,
node_ks(node)) == 0) {
txn->mt_dbistate[k] |= DBI_AUDITED;
- if (txn->mt_dbistate[k] & DBI_DIRTY) {
- mdbx_tassert(txn, (txn->mt_dbistate[k] & DBI_STALE) == 0);
+ if (!(txn->mt_dbistate[k] & MDBX_DBI_STALE))
db = txn->mt_dbs + k;
- }
break;
}
}
@@ -10179,10 +10685,14 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored,
if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) !=
DBI_VALID)
continue;
- if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) {
- count += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages +
- txn->mt_dbs[i].md_overflow_pages;
- } else {
+ for (MDBX_txn *t = txn; t; t = t->mt_parent)
+ if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) {
+ count += t->mt_dbs[i].md_branch_pages + t->mt_dbs[i].md_leaf_pages +
+ t->mt_dbs[i].md_overflow_pages;
+ txn->mt_dbistate[i] |= DBI_AUDITED;
+ break;
+ }
+ if (!(txn->mt_dbistate[i] & DBI_AUDITED)) {
mdbx_warning("audit %s@%" PRIaTXN
": unable account dbi %d / \"%*s\", state 0x%02x",
txn->mt_parent ? "nested-" : "", txn->mt_txnid, i,
@@ -10197,12 +10707,11 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored,
if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0)
mdbx_error("audit @%" PRIaTXN ": %u(pending) = %u(loose-count) + "
- "%u(reclaimed-list) + %u(retired-pending) - %u(retired-stored) "
- "+ %u(retired2parent)",
+ "%u(reclaimed-list) + %u(retired-pending) - %u(retired-stored)",
txn->mt_txnid, pending, txn->tw.loose_count,
MDBX_PNL_SIZE(txn->tw.reclaimed_pglist),
txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0,
- retired_stored, txn->tw.retired2parent_count);
+ retired_stored);
mdbx_error("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO
"(free) + %" PRIaPGNO "(count) = %" PRIaPGNO
"(total) <> %" PRIaPGNO "(next-pgno)",
@@ -10289,11 +10798,12 @@ retry:
++loop;
retry_noaccount:
mdbx_trace("%s", " >> restart");
- mdbx_tassert(
- txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
mdbx_tassert(txn, mdbx_dirtylist_check(txn));
mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
+ txn->mt_env->me_options.dp_limit);
if (unlikely(/* paranoia */ loop > ((MDBX_DEBUG > 0) ? 9 : 99))) {
mdbx_error("too more loops %u, bailout", loop);
rc = MDBX_PROBLEM;
@@ -10313,8 +10823,9 @@ retry_noaccount:
MDBX_val key, data;
mdbx_trace("%s", " >> continue");
- mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
- txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
if (lifo) {
if (cleaned_gc_slot < (txn->tw.lifo_reclaimed
? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
@@ -10385,11 +10896,12 @@ retry_noaccount:
}
}
- mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
- txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
mdbx_tassert(txn, mdbx_dirtylist_check(txn));
mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
+ txn->mt_env->me_options.dp_limit);
if (mdbx_audit_enabled()) {
rc = mdbx_audit_ex(txn, retired_stored, false);
if (unlikely(rc != MDBX_SUCCESS))
@@ -10398,8 +10910,9 @@ retry_noaccount:
/* return suitable into unallocated space */
if (mdbx_refund(txn)) {
- mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
- txn->mt_next_pgno));
+ mdbx_tassert(
+ txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
if (mdbx_audit_enabled()) {
rc = mdbx_audit_ex(txn, retired_stored, false);
if (unlikely(rc != MDBX_SUCCESS))
@@ -10447,16 +10960,17 @@ retry_noaccount:
}
/* filter-out list of dirty-pages from loose-pages */
- const MDBX_DPL dl = txn->tw.dirtylist;
+ MDBX_dpl *const dl = txn->tw.dirtylist;
unsigned w = 0;
for (unsigned r = w; ++r <= dl->length;) {
- MDBX_page *dp = dl[r].ptr;
+ MDBX_page *dp = dl->items[r].ptr;
mdbx_tassert(txn, (dp->mp_flags & P_DIRTY));
- mdbx_tassert(txn, dl[r].pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1) <=
- txn->mt_next_pgno);
+ mdbx_tassert(txn,
+ dl->items[r].pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1) <=
+ txn->mt_next_pgno);
if ((dp->mp_flags & P_LOOSE) == 0) {
if (++w != r)
- dl[w] = dl[r];
+ dl->items[w] = dl->items[r];
} else {
mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY));
if ((env->me_flags & MDBX_WRITEMAP) == 0)
@@ -10469,8 +10983,12 @@ retry_noaccount:
dl->length = w;
dl->sorted = 0;
txn->tw.dirtyroom += txn->tw.loose_count;
+ assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
txn->tw.loose_pages = NULL;
txn->tw.loose_count = 0;
+#if MDBX_ENABLE_REFUND
+ txn->tw.loose_refund_wl = 0;
+#endif /* MDBX_ENABLE_REFUND */
}
const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist);
@@ -10512,7 +11030,7 @@ retry_noaccount:
txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i);
for (; i; i--)
mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]);
- mdbx_debug_extra_print("%s", "\n");
+ mdbx_debug_extra_print("%s\n", ".");
}
if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) {
mdbx_trace("%s.reclaimed-list changed %u -> %u, retry", dbg_prefix_mode,
@@ -10525,8 +11043,9 @@ retry_noaccount:
}
/* handle reclaimed and lost pages - merge and store both into gc */
- mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
- txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
mdbx_tassert(txn, txn->tw.loose_count == 0);
mdbx_trace("%s", " >> reserving");
@@ -10759,8 +11278,9 @@ retry_noaccount:
mdbx_prep_backlog(txn, &couple.outer, data.iov_len);
rc = mdbx_cursor_put(&couple.outer, &key, &data,
MDBX_RESERVE | MDBX_NOOVERWRITE);
- mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
- txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
@@ -10791,8 +11311,9 @@ retry_noaccount:
? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot
: reused_gc_slot;
rc = MDBX_SUCCESS;
- mdbx_tassert(
- txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno));
+ mdbx_tassert(txn,
+ mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+ txn->mt_next_pgno - MDBX_ENABLE_REFUND));
mdbx_tassert(txn, mdbx_dirtylist_check(txn));
if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) {
MDBX_val key, data;
@@ -10885,6 +11406,13 @@ retry_noaccount:
mdbx_notice("%s", "** restart: reclaimed-slots changed");
goto retry;
}
+ if (unlikely(retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages))) {
+ mdbx_tassert(txn,
+ retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages));
+ mdbx_notice("** restart: retired-list growth (%u -> %u)",
+ retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages));
+ goto retry;
+ }
pgno_t *dst = data.iov_base;
*dst++ = chunk;
@@ -10956,9 +11484,11 @@ static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov,
rc = mdbx_pwritev(env->me_lazy_fd, iov, iov_items, iov_off, iov_bytes);
}
- if (unlikely(rc != MDBX_SUCCESS)) {
+ if (unlikely(rc != MDBX_SUCCESS))
mdbx_error("Write error: %s", mdbx_strerror(rc));
- txn->mt_flags |= MDBX_TXN_ERROR;
+ else {
+ VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + iov_off, iov_bytes);
+ ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + iov_off, iov_bytes);
}
for (unsigned i = 0; i < iov_items; i++)
@@ -10973,30 +11503,30 @@ static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov,
* Returns 0 on success, non-zero on failure. */
__hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) {
struct iovec iov[MDBX_COMMIT_PAGES];
- const MDBX_DPL dl = (keep || txn->tw.loose_count > 1)
- ? mdbx_dpl_sort(txn->tw.dirtylist)
- : txn->tw.dirtylist;
+ MDBX_dpl *const dl = mdbx_dpl_sort(txn->tw.dirtylist);
MDBX_env *const env = txn->mt_env;
pgno_t flush_begin = MAX_PAGENO;
pgno_t flush_end = MIN_PAGENO;
+ int rc = MDBX_SUCCESS;
unsigned iov_items = 0;
size_t iov_bytes = 0;
size_t iov_off = 0;
+
unsigned r, w;
for (r = w = keep; ++r <= dl->length;) {
- MDBX_page *dp = dl[r].ptr;
+ MDBX_page *dp = dl->items[r].ptr;
mdbx_tassert(txn,
dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno);
mdbx_tassert(txn, dp->mp_flags & P_DIRTY);
/* Don't flush this page yet */
if (dp->mp_flags & P_KEEP) {
- dp->mp_flags &= ~P_KEEP;
- dl[++w] = dl[r];
+ dp->mp_flags -= P_KEEP;
+ dl->items[++w] = dl->items[r];
continue;
}
if (dp->mp_flags & P_LOOSE) {
- dl[++w] = dl[r];
+ dl->items[++w] = dl->items[r];
continue;
}
@@ -11005,7 +11535,7 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) {
flush_end =
(flush_end > dp->mp_pgno + npages) ? flush_end : dp->mp_pgno + npages;
*env->me_unsynced_pages += npages;
- dp->mp_flags &= ~P_DIRTY;
+ dp->mp_flags -= P_DIRTY;
dp->mp_txnid = pp_txnid2chk(txn);
if ((env->me_flags & MDBX_WRITEMAP) == 0) {
@@ -11013,9 +11543,7 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) {
if (iov_off + iov_bytes != pgno2bytes(env, dp->mp_pgno) ||
iov_items == ARRAY_LENGTH(iov) || iov_bytes + size > MAX_WRITE) {
if (iov_items) {
- int rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
+ rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes);
#if defined(__linux__) || defined(__gnu_linux__)
if (mdbx_linux_kernel_version >= 0x02060b00)
/* Linux kernels older than version 2.6.11 ignore the addr and nbytes
@@ -11026,6 +11554,12 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) {
env->me_os_psize);
iov_items = 0;
iov_bytes = 0;
+ if (unlikely(rc != MDBX_SUCCESS)) {
+ do
+ dl->items[++w] = dl->items[r];
+ while (++r <= dl->length);
+ break;
+ }
}
iov_off = pgno2bytes(env, dp->mp_pgno);
}
@@ -11036,10 +11570,20 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) {
}
}
- if (iov_items) {
- int rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
+ mdbx_tassert(txn, dl->sorted == dl->length && r == dl->length + 1);
+ txn->tw.dirtyroom += dl->length - w;
+ assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit);
+ dl->sorted = dl->length = w;
+ mdbx_tassert(txn, txn->mt_parent ||
+ txn->tw.dirtyroom + txn->tw.dirtylist->length ==
+ txn->mt_env->me_options.dp_limit);
+
+ if (iov_items)
+ rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes);
+
+ if (unlikely(rc != MDBX_SUCCESS)) {
+ txn->mt_flags |= MDBX_TXN_ERROR;
+ return rc;
}
#if defined(__linux__) || defined(__gnu_linux__)
@@ -11056,55 +11600,358 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) {
/* TODO: use flush_begin & flush_end for msync() & sync_file_range(). */
(void)flush_begin;
(void)flush_end;
-
- txn->tw.dirtyroom += r - 1 - w;
- dl->length = w;
- mdbx_tassert(txn, txn->mt_parent ||
- txn->tw.dirtyroom + txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
return MDBX_SUCCESS;
}
-/* Check for misused dbi handles */
-#define TXN_DBI_CHANGED(txn, dbi) \
- ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
-
-/* Import DBI which opened after txn started into context */
-static __cold bool mdbx_txn_import_dbi(MDBX_txn *txn, MDBX_dbi dbi) {
- MDBX_env *env = txn->mt_env;
- if (dbi < CORE_DBS || dbi >= env->me_numdbs)
- return false;
-
- mdbx_ensure(env, mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS);
- const unsigned snap_numdbs = env->me_numdbs;
- mdbx_compiler_barrier();
- for (unsigned i = CORE_DBS; i < snap_numdbs; ++i) {
- if (i >= txn->mt_numdbs)
- txn->mt_dbistate[i] = 0;
- if (!(txn->mt_dbistate[i] & DBI_USRVALID) &&
- (env->me_dbflags[i] & DB_VALID)) {
- txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS;
- txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE;
- mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL);
- }
- }
- txn->mt_numdbs = snap_numdbs;
-
- mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
- return txn->mt_dbistate[dbi] & DBI_USRVALID;
-}
-
/* Check txn and dbi arguments to a function */
static __always_inline bool mdbx_txn_dbi_exists(MDBX_txn *txn, MDBX_dbi dbi,
unsigned validity) {
if (likely(dbi < txn->mt_numdbs && (txn->mt_dbistate[dbi] & validity)))
return true;
- return mdbx_txn_import_dbi(txn, dbi);
+ return dbi_import(txn, dbi);
}
int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); }
+/* Merge child txn into parent */
+static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn,
+ const unsigned parent_retired_len) {
+ MDBX_dpl *const src = mdbx_dpl_sort(txn->tw.dirtylist);
+
+ /* Remove refunded pages from parent's dirty list */
+ MDBX_dpl *const dst = mdbx_dpl_sort(parent->tw.dirtylist);
+ while (MDBX_ENABLE_REFUND && dst->length &&
+ dst->items[dst->length].pgno >= parent->mt_next_pgno) {
+ if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) {
+ MDBX_page *dp = dst->items[dst->length].ptr;
+ mdbx_dpage_free(txn->mt_env, dp, IS_OVERFLOW(dp) ? dp->mp_pages : 1);
+ }
+ dst->length -= 1;
+ }
+ parent->tw.dirtyroom += dst->sorted - dst->length;
+ assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit);
+ dst->sorted = dst->length;
+ mdbx_tassert(parent,
+ parent->mt_parent ||
+ parent->tw.dirtyroom + parent->tw.dirtylist->length ==
+ parent->mt_env->me_options.dp_limit);
+
+ /* Remove reclaimed pages from parent's dirty list */
+ const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist;
+ mdbx_dpl_sift(parent, reclaimed_list, false);
+
+ /* Move retired pages from parent's dirty & spilled list to reclaimed */
+ unsigned r, w, d, s, l;
+ for (r = w = parent_retired_len;
+ ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) {
+ const pgno_t pgno = parent->tw.retired_pages[r];
+ const unsigned di = mdbx_dpl_exist(parent->tw.dirtylist, pgno);
+ const unsigned si = (!di && unlikely(parent->tw.spill_pages))
+ ? mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)
+ : 0;
+ unsigned npages;
+ const char *kind;
+ if (di) {
+ MDBX_page *dp = parent->tw.dirtylist->items[di].ptr;
+ mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH |
+ P_OVERFLOW | P_DIRTY)) == 0);
+ npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1;
+ mdbx_page_wash(parent, di, dp, npages);
+ kind = "dirty";
+ l = 1;
+ if (unlikely(npages > l)) {
+ /* OVERFLOW-страница могла быть переиспользована по частям. Тогда
+ * в retired-списке может быть только начало последовательности,
+ * а остаток растащен по dirty, spilled и reclaimed спискам. Поэтому
+ * переносим в reclaimed с проверкой на обрыв последовательности.
+ * В любом случае, все осколки будут учтены и отфильтрованы, т.е. если
+ * страница была разбита на части, то важно удалить dirty-элемент,
+ * а все осколки будут учтены отдельно. */
+
+ /* Список retired страниц не сортирован, но для ускорения сортировки
+ * дополняется в соответствии с MDBX_PNL_ASCENDING */
+#if MDBX_PNL_ASCENDING
+ const unsigned len = MDBX_PNL_SIZE(parent->tw.retired_pages);
+ while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) {
+ ++r;
+ if (++l == npages)
+ break;
+ }
+#else
+ while (w > parent_retired_len &&
+ parent->tw.retired_pages[w - 1] == pgno + l) {
+ --w;
+ if (++l == npages)
+ break;
+ }
+#endif
+ }
+ } else if (unlikely(si)) {
+ l = npages = 1;
+ mdbx_spill_remove(parent, si, 1);
+ kind = "spilled";
+ } else {
+ parent->tw.retired_pages[++w] = pgno;
+ continue;
+ }
+
+ mdbx_debug("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l,
+ kind, pgno);
+ int err = mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l);
+ mdbx_ensure(txn->mt_env, err == MDBX_SUCCESS);
+ }
+ MDBX_PNL_SIZE(parent->tw.retired_pages) = w;
+
+ /* Filter-out parent spill list */
+ if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) {
+ const MDBX_PNL sl = mdbx_spill_purge(parent);
+ unsigned len = MDBX_PNL_SIZE(sl);
+ if (len) {
+ /* Remove refunded pages from parent's spill list */
+ if (MDBX_ENABLE_REFUND &&
+ MDBX_PNL_MOST(sl) >= (parent->mt_next_pgno << 1)) {
+#if MDBX_PNL_ASCENDING
+ unsigned i = MDBX_PNL_SIZE(sl);
+ assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl));
+ do {
+ if ((sl[i] & 1) == 0)
+ mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
+ i -= 1;
+ } while (i && sl[i] >= (parent->mt_next_pgno << 1));
+ MDBX_PNL_SIZE(sl) = i;
+#else
+ assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl));
+ unsigned i = 0;
+ do {
+ ++i;
+ if ((sl[i] & 1) == 0)
+ mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
+ } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1));
+ MDBX_PNL_SIZE(sl) = len -= i;
+ memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0]));
+#endif
+ }
+ mdbx_tassert(txn, mdbx_pnl_check4assert(sl, parent->mt_next_pgno << 1));
+
+ /* Remove reclaimed pages from parent's spill list */
+ s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list);
+ /* Scanning from end to begin */
+ while (s && r) {
+ if (sl[s] & 1) {
+ --s;
+ continue;
+ }
+ const pgno_t spilled_pgno = sl[s] >> 1;
+ const pgno_t reclaimed_pgno = reclaimed_list[r];
+ if (reclaimed_pgno != spilled_pgno) {
+ const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno);
+ s -= !cmp;
+ r -= cmp;
+ } else {
+ mdbx_debug("remove reclaimed parent's spilled page %" PRIaPGNO,
+ reclaimed_pgno);
+ mdbx_spill_remove(parent, s, 1);
+ --s;
+ --r;
+ }
+ }
+
+ /* Remove anything in our dirty list from parent's spill list */
+ /* Scanning spill list in descend order */
+ const int step = MDBX_PNL_ASCENDING ? -1 : 1;
+ s = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sl) : 1;
+ d = src->length;
+ while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_SIZE(sl))) {
+ if (sl[s] & 1) {
+ s += step;
+ continue;
+ }
+ const pgno_t spilled_pgno = sl[s] >> 1;
+ const pgno_t dirty_pgno_form = src->items[d].pgno;
+ MDBX_page *dp = src->items[d].ptr;
+ const unsigned npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1;
+ const pgno_t dirty_pgno_to = dirty_pgno_form + npages;
+ if (dirty_pgno_form > spilled_pgno) {
+ --d;
+ continue;
+ }
+ if (dirty_pgno_to <= spilled_pgno) {
+ s += step;
+ continue;
+ }
+
+ mdbx_debug("remove dirtied parent's spilled %u page %" PRIaPGNO, npages,
+ dirty_pgno_form);
+ mdbx_spill_remove(parent, s, 1);
+ s += step;
+ }
+
+ /* Squash deleted pagenums if we deleted any */
+ mdbx_spill_purge(parent);
+ }
+ }
+
+ /* Remove anything in our spill list from parent's dirty list */
+ if (txn->tw.spill_pages) {
+ mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.spill_pages,
+ parent->mt_next_pgno << 1));
+ mdbx_dpl_sift(parent, txn->tw.spill_pages, true);
+ mdbx_tassert(parent,
+ parent->mt_parent ||
+ parent->tw.dirtyroom + parent->tw.dirtylist->length ==
+ parent->mt_env->me_options.dp_limit);
+ }
+
+ /* Find length of merging our dirty list with parent's and release
+ * filter-out pages */
+ for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) {
+ const MDBX_page *sp = src->items[s].ptr;
+ mdbx_tassert(parent,
+ (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW |
+ P_DIRTY | P_LOOSE)) == 0);
+ const unsigned s_npages = IS_OVERFLOW(sp) ? sp->mp_pages : 1;
+ const pgno_t s_pgno = src->items[s].pgno;
+
+ MDBX_page *dp = dst->items[d].ptr;
+ mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH |
+ P_OVERFLOW | P_DIRTY)) == 0);
+ const unsigned d_npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1;
+ const pgno_t d_pgno = dst->items[d].pgno;
+
+ if (d_pgno >= s_pgno + s_npages) {
+ --d;
+ ++l;
+ } else if (d_pgno + d_npages <= s_pgno) {
+ --s;
+ ++l;
+ } else {
+ dst->items[d--].ptr = nullptr;
+ if ((txn->mt_flags & MDBX_WRITEMAP) == 0)
+ mdbx_dpage_free(txn->mt_env, dp, d_npages);
+ }
+ }
+ assert(dst->sorted == dst->length);
+ mdbx_tassert(parent, dst->allocated >= l + d + s);
+ dst->sorted = l + d + s; /* the merged length */
+
+ /* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */
+ if (dst->sorted >= dst->length) {
+ /* from end to begin with dst extending */
+ for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) {
+ if (unlikely(l <= d)) {
+ /* squash to get a gap of free space for merge */
+ for (r = w = 1; r <= d; ++r)
+ if (dst->items[r].ptr) {
+ if (w != r) {
+ dst->items[w] = dst->items[r];
+ dst->items[r].ptr = nullptr;
+ }
+ ++w;
+ }
+ mdbx_notice("squash to begin for extending-merge %u -> %u", d, w - 1);
+ d = w - 1;
+ continue;
+ }
+ assert(l > d);
+ if (dst->items[d].ptr) {
+ dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno)
+ ? dst->items[d--]
+ : src->items[s--];
+ } else
+ --d;
+ }
+ if (s > 0) {
+ assert(l == s);
+ while (d > 0) {
+ assert(dst->items[d].ptr == nullptr);
+ --d;
+ }
+ do {
+ assert(l > 0);
+ dst->items[l--] = src->items[s--];
+ } while (s > 0);
+ } else {
+ assert(l == d);
+ while (l > 0) {
+ assert(dst->items[l].ptr != nullptr);
+ --l;
+ }
+ }
+ } else {
+ /* from begin to end with dst shrinking (a lot of new overflow pages) */
+ for (l = s = d = 1; s <= src->length && d <= dst->length;) {
+ if (unlikely(l >= d)) {
+ /* squash to get a gap of free space for merge */
+ for (r = w = dst->length; r >= d; --r)
+ if (dst->items[r].ptr) {
+ if (w != r) {
+ dst->items[w] = dst->items[r];
+ dst->items[r].ptr = nullptr;
+ }
+ --w;
+ }
+ mdbx_notice("squash to end for shrinking-merge %u -> %u", d, w + 1);
+ d = w + 1;
+ continue;
+ }
+ assert(l < d);
+ if (dst->items[d].ptr) {
+ dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno)
+ ? dst->items[d++]
+ : src->items[s++];
+ } else
+ ++d;
+ }
+ if (s <= src->length) {
+ assert(dst->sorted - l == src->length - s);
+ while (d <= dst->length) {
+ assert(dst->items[d].ptr == nullptr);
+ --d;
+ }
+ do {
+ assert(l <= dst->sorted);
+ dst->items[l++] = src->items[s++];
+ } while (s <= src->length);
+ } else {
+ assert(dst->sorted - l == dst->length - d);
+ while (l <= dst->sorted) {
+ assert(l <= d && d <= dst->length && dst->items[d].ptr);
+ dst->items[l++] = dst->items[d++];
+ }
+ }
+ }
+ parent->tw.dirtyroom -= dst->sorted - dst->length;
+ assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit);
+ dst->length = dst->sorted;
+ mdbx_tassert(parent,
+ parent->mt_parent ||
+ parent->tw.dirtyroom + parent->tw.dirtylist->length ==
+ parent->mt_env->me_options.dp_limit);
+ mdbx_tassert(parent, mdbx_dirtylist_check(parent));
+ mdbx_dpl_free(txn);
+
+ if (txn->tw.spill_pages) {
+ if (parent->tw.spill_pages) {
+ /* Must not fail since space was preserved above. */
+ mdbx_pnl_xmerge(parent->tw.spill_pages, txn->tw.spill_pages);
+ mdbx_pnl_free(txn->tw.spill_pages);
+ } else {
+ parent->tw.spill_pages = txn->tw.spill_pages;
+ parent->tw.spill_least_removed = txn->tw.spill_least_removed;
+ }
+ mdbx_tassert(parent, mdbx_dirtylist_check(parent));
+ }
+
+ parent->mt_flags &= ~MDBX_TXN_HAS_CHILD;
+ if (parent->tw.spill_pages) {
+ assert(mdbx_pnl_check4assert(parent->tw.spill_pages,
+ parent->mt_next_pgno << 1));
+ if (MDBX_PNL_SIZE(parent->tw.spill_pages))
+ parent->mt_flags |= MDBX_TXN_SPILLS;
+ }
+}
+
int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
STATIC_ASSERT(MDBX_TXN_FINISHED ==
MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR);
@@ -11132,7 +11979,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
/* mdbx_txn_end() mode for a commit which writes nothing */
unsigned end_mode =
- MDBX_END_EMPTY_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE;
+ MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE;
if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)))
goto done;
@@ -11150,6 +11997,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
}
if (txn->mt_parent) {
+ mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0);
mdbx_assert(env, txn != env->me_txn0);
MDBX_txn *const parent = txn->mt_parent;
mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE);
@@ -11157,8 +12005,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
(parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0);
mdbx_assert(env, mdbx_dirtylist_check(txn));
- if (txn->tw.dirtylist->length == 0 &&
- (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0 &&
+ if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) &&
parent->mt_numdbs == txn->mt_numdbs) {
for (int i = txn->mt_numdbs; --i >= 0;) {
mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0);
@@ -11172,20 +12019,47 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
sizeof(parent->mt_geo)) == 0);
mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary,
sizeof(parent->mt_canary)) == 0);
+ mdbx_tassert(txn, !txn->tw.spill_pages ||
+ MDBX_PNL_SIZE(txn->tw.spill_pages) == 0);
+ mdbx_tassert(txn, txn->tw.loose_count == 0);
- end_mode = MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE;
+ /* fast completion of pure nested transaction */
+ end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE;
goto done;
}
/* Preserve space for spill list to avoid parent's state corruption
* if allocation fails. */
- if (txn->tw.spill_pages && parent->tw.spill_pages) {
- rc = mdbx_pnl_need(&parent->tw.spill_pages,
- MDBX_PNL_SIZE(txn->tw.spill_pages));
+ const unsigned parent_retired_len =
+ (unsigned)(uintptr_t)parent->tw.retired_pages;
+ mdbx_tassert(txn,
+ parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages));
+ const unsigned retired_delta =
+ MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len;
+ if (retired_delta) {
+ rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, retired_delta);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
+ if (txn->tw.spill_pages) {
+ if (parent->tw.spill_pages) {
+ rc = mdbx_pnl_need(&parent->tw.spill_pages,
+ MDBX_PNL_SIZE(txn->tw.spill_pages));
+ if (unlikely(rc != MDBX_SUCCESS))
+ goto fail;
+ }
+ mdbx_spill_purge(txn);
+ }
+
+ if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length >
+ parent->tw.dirtylist->allocated &&
+ !mdbx_dpl_reserve(parent, txn->tw.dirtylist->length +
+ parent->tw.dirtylist->length))) {
+ rc = MDBX_ENOMEM;
+ goto fail;
+ }
+
//-------------------------------------------------------------------------
parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed;
@@ -11203,234 +12077,64 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
parent->mt_canary = txn->mt_canary;
parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY;
+ /* Move loose pages to parent */
+#if MDBX_ENABLE_REFUND
+ parent->tw.loose_refund_wl = txn->tw.loose_refund_wl;
+#endif /* MDBX_ENABLE_REFUND */
+ parent->tw.loose_count = txn->tw.loose_count;
+ parent->tw.loose_pages = txn->tw.loose_pages;
+
/* Merge our cursors into parent's and close them */
mdbx_cursors_eot(txn, true);
end_mode |= MDBX_END_EOTDONE;
- /* Update parent's DB table. */
+ /* Update parent's DBs array */
memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db));
parent->mt_numdbs = txn->mt_numdbs;
parent->mt_dbistate[FREE_DBI] = txn->mt_dbistate[FREE_DBI];
parent->mt_dbistate[MAIN_DBI] = txn->mt_dbistate[MAIN_DBI];
for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) {
- /* preserve parent's DB_NEW status */
- parent->mt_dbistate[i] = txn->mt_dbistate[i] | (parent->mt_dbistate[i] &
- (DBI_CREAT | DBI_FRESH));
- }
- ts_1 = latency ? mdbx_osal_monotime() : 0;
-
- /* Remove refunded pages from parent's dirty & spill lists */
- MDBX_DPL dst = mdbx_dpl_sort(parent->tw.dirtylist);
- while (dst->length && dst[dst->length].pgno >= parent->mt_next_pgno) {
- MDBX_page *mp = dst[dst->length].ptr;
- if (mp && (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0)
- mdbx_dpage_free(txn->mt_env, mp, IS_OVERFLOW(mp) ? mp->mp_pages : 1);
- dst->length -= 1;
- }
- parent->tw.dirtyroom += dst->sorted - dst->length;
- dst->sorted = dst->length;
- mdbx_tassert(parent,
- parent->mt_parent ||
- parent->tw.dirtyroom + parent->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
-
- if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0 &&
- MDBX_PNL_MOST(parent->tw.spill_pages) >= parent->mt_next_pgno << 1) {
- const MDBX_PNL ps = parent->tw.spill_pages;
-#if MDBX_PNL_ASCENDING
- unsigned i = MDBX_PNL_SIZE(ps);
- assert(MDBX_PNL_MOST(ps) == MDBX_PNL_LAST(ps));
- do
- i -= 1;
- while (i && ps[i] >= parent->mt_next_pgno << 1);
- MDBX_PNL_SIZE(ps) = i;
-#else
- assert(MDBX_PNL_MOST(ps) == MDBX_PNL_FIRST(ps));
- unsigned i = 1, len = MDBX_PNL_SIZE(ps);
- while (i < len && ps[i + 1] >= parent->mt_next_pgno << 1)
- ++i;
- MDBX_PNL_SIZE(ps) = len -= i;
- for (unsigned k = 1; k <= len; ++k)
- ps[k] = ps[k + i];
-#endif
- }
-
- /* Remove anything in our dirty list from parent's spill list */
- MDBX_DPL src = mdbx_dpl_sort(txn->tw.dirtylist);
- if (likely(src->length > 0) && parent->tw.spill_pages &&
- MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) {
- MDBX_PNL sp = parent->tw.spill_pages;
- assert(mdbx_pnl_check4assert(sp, txn->mt_next_pgno));
-
- const unsigned len = MDBX_PNL_SIZE(parent->tw.spill_pages);
- MDBX_PNL_SIZE(sp) = ~(pgno_t)0;
-
- /* Mark our dirty pages as deleted in parent spill list */
- unsigned r, w, i = 1;
- w = r = len;
- do {
- pgno_t pn = src[i].pgno << 1;
- while (pn > sp[r])
- r--;
- if (pn == sp[r]) {
- sp[r] = 1;
- w = --r;
- }
- } while (++i <= src->length);
-
- /* Squash deleted pagenums if we deleted any */
- for (r = w; ++r <= len;)
- if ((sp[r] & 1) == 0)
- sp[++w] = sp[r];
- MDBX_PNL_SIZE(sp) = w;
- assert(mdbx_pnl_check4assert(sp, txn->mt_next_pgno << 1));
- }
-
- /* Remove anything in our spill list from parent's dirty list */
- if (txn->tw.spill_pages && MDBX_PNL_SIZE(txn->tw.spill_pages) > 0) {
- const MDBX_PNL sp = txn->tw.spill_pages;
- mdbx_pnl_sort(sp);
- /* Scanning in ascend order */
- const int step = MDBX_PNL_ASCENDING ? 1 : -1;
- const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(sp);
- const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sp) + 1 : 0;
- mdbx_tassert(txn, sp[begin] <= sp[end - step]);
-
- unsigned r, w = r = mdbx_dpl_search(dst, sp[begin] >> 1);
- mdbx_tassert(txn, dst->sorted == dst->length);
- for (int i = begin; r <= dst->length;) {
- mdbx_tassert(txn, (sp[i] & 1) == 0);
- const pgno_t pgno = sp[i] >> 1;
- if (dst[r].pgno < pgno) {
- dst[w++] = dst[r++];
- } else if (dst[r].pgno > pgno) {
- i += step;
- if (i == end)
- while (r <= dst->length)
- dst[w++] = dst[r++];
- } else {
- MDBX_page *dp = dst[r++].ptr;
- if ((env->me_flags & MDBX_WRITEMAP) == 0)
- mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? dp->mp_pages : 1);
- }
- }
- mdbx_tassert(txn, r == dst->length + 1);
- dst->length = w;
- parent->tw.dirtyroom += r - w;
- }
- assert(dst->sorted == dst->length);
- mdbx_tassert(parent,
- parent->mt_parent ||
- parent->tw.dirtyroom + parent->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
-
- unsigned d, s, l;
- /* Find length of merging our dirty list with parent's */
- for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0; ++l) {
- const pgno_t s_pgno = src[s].pgno;
- const pgno_t d_pgno = dst[d].pgno;
- d -= d_pgno >= s_pgno;
- s -= d_pgno <= s_pgno;
- }
- assert(dst->sorted == dst->length);
- dst->sorted = l += d + s;
- assert(dst->sorted >= dst->length);
- parent->tw.dirtyroom -= dst->sorted - dst->length;
-
- /* Merge our dirty list into parent's */
- for (d = dst->length, s = src->length; d > 0 && s > 0; --l) {
- if (dst[d].pgno > src[s].pgno)
- dst[l] = dst[d--];
- else if (dst[d].pgno < src[s].pgno)
- dst[l] = src[s--];
- else {
- MDBX_page *dp = dst[d--].ptr;
- if (dp && (env->me_flags & MDBX_WRITEMAP) == 0)
- mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? dp->mp_pgno : 1);
- dst[l] = src[s--];
- }
- }
- if (s) {
- do
- dst[l--] = src[s--];
- while (s > 0);
- } else if (d) {
- do
- dst[l--] = dst[d--];
- while (d > 0);
- }
- assert(l == 0);
- dst->length = dst->sorted;
- mdbx_free(txn->tw.dirtylist);
- txn->tw.dirtylist = nullptr;
- mdbx_tassert(parent,
- parent->mt_parent ||
- parent->tw.dirtyroom + parent->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
-
- if (txn->tw.spill_pages) {
- if (parent->tw.spill_pages) {
- /* Must not fail since space was preserved above. */
- rc = mdbx_pnl_append_list(&parent->tw.spill_pages, txn->tw.spill_pages);
- mdbx_assert(env, rc == MDBX_SUCCESS);
- (void)rc;
- mdbx_pnl_free(txn->tw.spill_pages);
- mdbx_pnl_sort(parent->tw.spill_pages);
- } else {
- parent->tw.spill_pages = txn->tw.spill_pages;
- }
- }
-
- parent->mt_flags &= ~(MDBX_TXN_SPILLS | MDBX_TXN_HAS_CHILD);
- if (parent->tw.spill_pages) {
- assert(mdbx_pnl_check4assert(parent->tw.spill_pages,
- parent->mt_next_pgno << 1));
- if (MDBX_PNL_SIZE(parent->tw.spill_pages))
- parent->mt_flags |= MDBX_TXN_SPILLS;
+ /* preserve parent's status */
+ const uint8_t state =
+ txn->mt_dbistate[i] |
+ (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY));
+ mdbx_debug("db %u dbi-state %s 0x%02x -> 0x%02x", i,
+ (parent->mt_dbistate[i] != state) ? "update" : "still",
+ parent->mt_dbistate[i], state);
+ parent->mt_dbistate[i] = state;
}
+ ts_1 = latency ? mdbx_osal_monotime() : 0;
+ mdbx_txn_merge(parent, txn, parent_retired_len);
ts_2 = latency ? mdbx_osal_monotime() : 0;
- /* Append our loose page list to parent's */
- if (txn->tw.loose_pages) {
- MDBX_page **lp = &parent->tw.loose_pages;
- while (*lp)
- lp = &(*lp)->mp_next;
- *lp = txn->tw.loose_pages;
- parent->tw.loose_count += txn->tw.loose_count;
- }
- if (txn->tw.retired2parent_pages) {
- MDBX_page *mp = txn->tw.retired2parent_pages;
- do {
- MDBX_page *next = mp->mp_next;
- rc = mdbx_page_loose(parent, mp);
- if (unlikely(rc != MDBX_SUCCESS))
- goto fail;
- mp = next;
- } while (mp);
- }
-
env->me_txn = parent;
parent->mt_child = NULL;
mdbx_tassert(parent, mdbx_dirtylist_check(parent));
- /* Scan parent's loose page for suitable for refund */
- for (MDBX_page *mp = parent->tw.loose_pages; mp; mp = mp->mp_next) {
- if (mp->mp_pgno == parent->mt_next_pgno - 1) {
- mdbx_refund(parent);
- break;
- }
+#if MDBX_ENABLE_REFUND
+ mdbx_refund(parent);
+ if (mdbx_assert_enabled()) {
+ /* Check parent's loose pages not suitable for refund */
+ for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next)
+ mdbx_tassert(parent, lp->mp_pgno < parent->tw.loose_refund_wl &&
+ lp->mp_pgno + 1 < parent->mt_next_pgno);
+ /* Check parent's reclaimed pages not suitable for refund */
+ if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist))
+ mdbx_tassert(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 <
+ parent->mt_next_pgno);
}
+#endif /* MDBX_ENABLE_REFUND */
ts_4 = ts_3 = latency ? mdbx_osal_monotime() : 0;
txn->mt_signature = 0;
mdbx_free(txn);
- mdbx_tassert(parent, mdbx_dirtylist_check(parent));
+ mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0);
rc = MDBX_SUCCESS;
goto provide_latency;
}
mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
+ txn->mt_env->me_options.dp_limit);
mdbx_cursors_eot(txn, false);
end_mode |= MDBX_END_EOTDONE;
@@ -11463,6 +12167,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
goto fail;
}
MDBX_db *db = &txn->mt_dbs[i];
+ mdbx_debug("update main's entry for sub-db %u, mod_txnid %" PRIaTXN
+ " -> %" PRIaTXN,
+ i, pp_txnid2chk(txn), db->md_mod_txnid);
db->md_mod_txnid = pp_txnid2chk(txn);
data.iov_base = db;
WITH_CURSOR_TRACKING(couple.outer,
@@ -11497,12 +12204,13 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
txn->mt_dbs[MAIN_DBI].md_mod_txnid = pp_txnid2chk(txn);
MDBX_meta meta, *head = mdbx_meta_head(env);
- meta.mm_magic_and_version = head->mm_magic_and_version;
+ memcpy(meta.mm_magic_and_version, head->mm_magic_and_version, 8);
meta.mm_extra_flags = head->mm_extra_flags;
meta.mm_validator_id = head->mm_validator_id;
meta.mm_extra_pagehdr = head->mm_extra_pagehdr;
- meta.mm_pages_retired =
- head->mm_pages_retired + MDBX_PNL_SIZE(txn->tw.retired_pages);
+ unaligned_poke_u64(4, meta.mm_pages_retired,
+ unaligned_peek_u64(4, head->mm_pages_retired) +
+ MDBX_PNL_SIZE(txn->tw.retired_pages));
meta.mm_geo = txn->mt_geo;
meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
@@ -11550,13 +12258,14 @@ static __cold int
mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize,
const MDBX_page *const page, const unsigned meta_number,
MDBX_meta *dest, const unsigned guess_pagesize) {
- if (meta->mm_magic_and_version != MDBX_DATA_MAGIC &&
- meta->mm_magic_and_version != MDBX_DATA_MAGIC_DEVEL) {
+ const uint64_t magic_and_version =
+ unaligned_peek_u64(4, &meta->mm_magic_and_version);
+ if (magic_and_version != MDBX_DATA_MAGIC &&
+ magic_and_version != MDBX_DATA_MAGIC_DEVEL) {
mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number,
- meta->mm_magic_and_version);
- return ((meta->mm_magic_and_version >> 8) != MDBX_MAGIC)
- ? MDBX_INVALID
- : MDBX_VERSION_MISMATCH;
+ magic_and_version);
+ return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID
+ : MDBX_VERSION_MISMATCH;
}
if (page->mp_pgno != meta_number) {
@@ -11583,16 +12292,19 @@ mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize,
mdbx_verbose("meta[%u] took pagesize %u", meta_number, meta->mm_psize);
}
- if (safe64_read(&meta->mm_txnid_a) != safe64_read(&meta->mm_txnid_b)) {
+ if (unaligned_peek_u64(4, &meta->mm_txnid_a) !=
+ unaligned_peek_u64(4, &meta->mm_txnid_b)) {
mdbx_warning("meta[%u] not completely updated, skip it", meta_number);
return MDBX_RESULT_TRUE;
}
/* LY: check signature as a checksum */
- if (META_IS_STEADY(meta) && meta->mm_datasync_sign != mdbx_meta_sign(meta)) {
+ if (META_IS_STEADY(meta) &&
+ unaligned_peek_u64(4, &meta->mm_datasync_sign) != mdbx_meta_sign(meta)) {
mdbx_warning("meta[%u] has invalid steady-checksum (0x%" PRIx64
" != 0x%" PRIx64 "), skip it",
- meta_number, meta->mm_datasync_sign, mdbx_meta_sign(meta));
+ meta_number, unaligned_peek_u64(4, &meta->mm_datasync_sign),
+ mdbx_meta_sign(meta));
return MDBX_RESULT_TRUE;
}
@@ -11603,7 +12315,7 @@ mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize,
meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower,
meta->mm_geo.next, meta->mm_geo.now, meta->mm_geo.upper,
meta->mm_geo.grow, meta->mm_geo.shrink,
- meta->mm_txnid_a.inconsistent, mdbx_durable_str(meta));
+ unaligned_peek_u64(4, meta->mm_txnid_a), mdbx_durable_str(meta));
/* LY: check min-pages value */
if (meta->mm_geo.lower < MIN_PAGENO || meta->mm_geo.lower > MAX_PAGENO) {
@@ -11731,7 +12443,7 @@ mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize,
return MDBX_CORRUPTED;
}
- if (safe64_read(&meta->mm_txnid_a) == 0) {
+ if (unaligned_peek_u64(4, &meta->mm_txnid_a) == 0) {
mdbx_warning("meta[%u] has zero txnid, skip it", meta_number);
return MDBX_RESULT_TRUE;
}
@@ -11749,7 +12461,7 @@ static __cold int mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
return rc;
memset(dest, 0, sizeof(MDBX_meta));
- dest->mm_datasync_sign = MDBX_DATASIGN_WEAK;
+ unaligned_poke_u64(4, dest->mm_datasync_sign, MDBX_DATASIGN_WEAK);
rc = MDBX_CORRUPTED;
/* Read twice all meta pages so we can find the latest one. */
@@ -11850,7 +12562,7 @@ static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model,
model->mp_pgno = num;
model->mp_flags = P_META;
MDBX_meta *const model_meta = page_meta(model);
- model_meta->mm_magic_and_version = MDBX_DATA_MAGIC;
+ unaligned_poke_u64(4, model_meta->mm_magic_and_version, MDBX_DATA_MAGIC);
model_meta->mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower);
model_meta->mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper);
@@ -11877,7 +12589,8 @@ static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model,
model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID;
model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID;
mdbx_meta_set_txnid(env, model_meta, MIN_TXNID + num);
- model_meta->mm_datasync_sign = mdbx_meta_sign(model_meta);
+ unaligned_poke_u64(4, model_meta->mm_datasync_sign,
+ mdbx_meta_sign(model_meta));
return (MDBX_page *)((uint8_t *)model + env->me_psize);
}
@@ -11996,12 +12709,12 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
shrink = pending->mm_geo.now - bottom;
pending->mm_geo.now = bottom;
if (unlikely(mdbx_meta_txnid_stable(env, head) ==
- pending->mm_txnid_a.inconsistent)) {
+ unaligned_peek_u64(4, pending->mm_txnid_a))) {
const txnid_t txnid =
- safe64_txnid_next(pending->mm_txnid_a.inconsistent);
+ safe64_txnid_next(unaligned_peek_u64(4, pending->mm_txnid_a));
if (unlikely(txnid > MAX_TXNID)) {
- mdbx_error("%s", "txnid overflow!");
rc = MDBX_TXN_FULL;
+ mdbx_error("txnid overflow, raise %d", rc);
goto fail;
}
mdbx_meta_set_txnid(env, pending, txnid);
@@ -12037,16 +12750,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
/* Steady or Weak */
if (rc == MDBX_RESULT_FALSE /* carry steady */) {
- pending->mm_datasync_sign = mdbx_meta_sign(pending);
*env->me_unsynced_pages = 0;
*env->me_sync_timestamp = mdbx_osal_monotime();
+ unaligned_poke_u64(4, pending->mm_datasync_sign, mdbx_meta_sign(pending));
} else {
assert(rc == MDBX_RESULT_TRUE /* carry non-steady */);
- pending->mm_datasync_sign = MDBX_DATASIGN_WEAK;
+ unaligned_poke_u64(4, pending->mm_datasync_sign, MDBX_DATASIGN_WEAK);
}
MDBX_meta *target = nullptr;
- if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a.inconsistent) {
+ if (mdbx_meta_txnid_stable(env, head) ==
+ unaligned_peek_u64(4, pending->mm_txnid_a)) {
mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs,
sizeof(head->mm_dbs)) == 0);
mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary,
@@ -12077,7 +12791,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower,
pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper,
pending->mm_geo.grow, pending->mm_geo.shrink,
- pending->mm_txnid_a.inconsistent, mdbx_durable_str(pending));
+ unaligned_peek_u64(4, pending->mm_txnid_a),
+ mdbx_durable_str(pending));
mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO
"/%" PRIaPGNO,
@@ -12100,14 +12815,16 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_assert(env, !mdbx_meta_eq(env, pending, meta2));
mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
- mdbx_ensure(env, target == head || mdbx_meta_txnid_stable(env, target) <
- pending->mm_txnid_a.inconsistent);
+ mdbx_ensure(env,
+ target == head || mdbx_meta_txnid_stable(env, target) <
+ unaligned_peek_u64(4, pending->mm_txnid_a));
if (flags & MDBX_WRITEMAP) {
mdbx_jitter4testing(true);
if (likely(target != head)) {
/* LY: 'invalidate' the meta. */
- mdbx_meta_update_begin(env, target, pending->mm_txnid_a.inconsistent);
- target->mm_datasync_sign = MDBX_DATASIGN_WEAK;
+ mdbx_meta_update_begin(env, target,
+ unaligned_peek_u64(4, pending->mm_txnid_a));
+ unaligned_poke_u64(4, target->mm_datasync_sign, MDBX_DATASIGN_WEAK);
#ifndef NDEBUG
/* debug: provoke failure to catch a violators, but don't touch mm_psize
* and mm_flags to allow readers catch actual pagesize. */
@@ -12122,18 +12839,19 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI];
target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI];
target->mm_canary = pending->mm_canary;
- target->mm_pages_retired = pending->mm_pages_retired;
+ memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8);
mdbx_jitter4testing(true);
mdbx_flush_incoherent_cpu_writeback();
/* LY: 'commit' the meta */
- mdbx_meta_update_end(env, target, pending->mm_txnid_b.inconsistent);
+ mdbx_meta_update_end(env, target,
+ unaligned_peek_u64(4, pending->mm_txnid_b));
mdbx_jitter4testing(true);
} else {
/* dangerous case (target == head), only mm_datasync_sign could
* me updated, check assertions once again */
mdbx_ensure(env, mdbx_meta_txnid_stable(env, head) ==
- pending->mm_txnid_a.inconsistent &&
+ unaligned_peek_u64(4, pending->mm_txnid_a) &&
!META_IS_STEADY(head) && META_IS_STEADY(pending));
mdbx_ensure(env, memcmp(&head->mm_geo, &pending->mm_geo,
sizeof(head->mm_geo)) == 0);
@@ -12142,7 +12860,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_ensure(env, memcmp(&head->mm_canary, &pending->mm_canary,
sizeof(head->mm_canary)) == 0);
}
- target->mm_datasync_sign = pending->mm_datasync_sign;
+ memcpy(target->mm_datasync_sign, pending->mm_datasync_sign, 8);
mdbx_flush_incoherent_cpu_writeback();
mdbx_jitter4testing(true);
/* sync meta-pages */
@@ -12179,7 +12897,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
if (flags & MDBX_NOMETASYNC)
*env->me_unsynced_pages += 1;
else
- *env->me_meta_sync_txnid = pending->mm_txnid_a.low;
+ *env->me_meta_sync_txnid =
+ (uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a);
/* LY: shrink datafile if needed */
if (unlikely(shrink)) {
@@ -12211,10 +12930,10 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) {
env->me_psize = (unsigned)pagesize;
STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4);
- STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_DPL_TXNFULL);
+ STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT / 4);
const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
- mdbx_ensure(env,
- maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_DPL_TXNFULL);
+ mdbx_ensure(env, maxgc_ov1page > 42 &&
+ maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4);
env->me_maxgc_ov1page = (unsigned)maxgc_ov1page;
STATIC_ASSERT(LEAF_NODEMAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42);
@@ -12229,6 +12948,12 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) {
env->me_psize2log = (uint8_t)log2n(pagesize);
mdbx_assert(env, pgno2bytes(env, 1) == pagesize);
mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2);
+
+ const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE);
+ if (env->me_options.dp_limit > max_pgno - NUM_METAS)
+ env->me_options.dp_limit = max_pgno - NUM_METAS;
+ if (env->me_options.dp_initial > env->me_options.dp_limit)
+ env->me_options.dp_initial = env->me_options.dp_limit;
}
__cold int mdbx_env_create(MDBX_env **penv) {
@@ -12244,6 +12969,19 @@ __cold int mdbx_env_create(MDBX_env **penv) {
env->me_pid = mdbx_getpid();
env->me_stuck_meta = -1;
+ env->me_options.dp_reserve_limit = 1024;
+ env->me_options.rp_augment_limit = 256 * 1024;
+ env->me_options.dp_limit = 64 * 1024;
+ if (env->me_options.dp_limit > MAX_PAGENO - NUM_METAS)
+ env->me_options.dp_limit = MAX_PAGENO - NUM_METAS;
+ env->me_options.dp_initial = MDBX_PNL_INITIAL;
+ if (env->me_options.dp_initial > env->me_options.dp_limit)
+ env->me_options.dp_initial = env->me_options.dp_limit;
+ env->me_options.spill_max_denominator = 8;
+ env->me_options.spill_min_denominator = 8;
+ env->me_options.spill_parent4child_denominator = 0;
+ env->me_options.dp_loose_limit = 64;
+
int rc;
const size_t os_psize = mdbx_syspagesize();
if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) {
@@ -12616,8 +13354,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
const txnid_t txnid =
safe64_txnid_next(mdbx_meta_txnid_stable(env, head));
if (unlikely(txnid > MAX_TXNID)) {
- mdbx_error("%s", "txnid overflow!");
rc = MDBX_TXN_FULL;
+ mdbx_error("txnid overflow, raise %d", rc);
goto bailout;
}
mdbx_meta_set_txnid(env, &meta, txnid);
@@ -12639,57 +13377,19 @@ __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) {
}
__cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) {
- int rc = check_env(env);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
-
- if (unlikely(dbs > MDBX_MAX_DBI))
- return MDBX_EINVAL;
-
- if (unlikely(env->me_map))
- return MDBX_EPERM;
-
- env->me_maxdbs = dbs + CORE_DBS;
- return MDBX_SUCCESS;
+ return __inline_mdbx_env_set_maxdbs(env, dbs);
}
-__cold int mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs) {
- int rc = check_env(env);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
-
- if (unlikely(!dbs))
- return MDBX_EINVAL;
-
- *dbs = env->me_maxdbs;
- return MDBX_SUCCESS;
+__cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) {
+ return __inline_mdbx_env_get_maxdbs(env, dbs);
}
__cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) {
- int rc = check_env(env);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
-
- if (unlikely(readers < 1 || readers > MDBX_READERS_LIMIT))
- return MDBX_EINVAL;
-
- if (unlikely(env->me_map))
- return MDBX_EPERM;
-
- env->me_maxreaders = readers;
- return MDBX_SUCCESS;
+ return __inline_mdbx_env_set_maxreaders(env, readers);
}
__cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) {
- int rc = check_env(env);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
-
- if (unlikely(!readers))
- return MDBX_EINVAL;
-
- *readers = env->me_maxreaders;
- return MDBX_SUCCESS;
+ return __inline_mdbx_env_get_maxreaders(env, readers);
}
/* Further setup required for opening an MDBX environment */
@@ -12741,7 +13441,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root,
meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now,
meta.mm_geo.upper, meta.mm_geo.grow, meta.mm_geo.shrink,
- meta.mm_txnid_a.inconsistent, mdbx_durable_str(&meta));
+ unaligned_peek_u64(4, meta.mm_txnid_a), mdbx_durable_str(&meta));
mdbx_setup_pagesize(env, meta.mm_psize);
const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next);
@@ -12808,7 +13508,8 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root,
meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now,
meta.mm_geo.upper, meta.mm_geo.grow, meta.mm_geo.shrink,
- meta.mm_txnid_a.inconsistent, mdbx_durable_str(&meta));
+ unaligned_peek_u64(4, meta.mm_txnid_a),
+ mdbx_durable_str(&meta));
} else {
/* fetch back 'now/current' size, since it was ignored during comparison
* and may differ. */
@@ -12983,9 +13684,9 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
if (env->me_flags & MDBX_WRITEMAP) {
/* It is possible to update txnid without safe64_write(),
* since DB opened exclusive for now */
- head->mm_txnid_a.inconsistent = undo_txnid;
- head->mm_datasync_sign = MDBX_DATASIGN_WEAK;
- head->mm_txnid_b.inconsistent = undo_txnid;
+ unaligned_poke_u64(4, head->mm_txnid_a, undo_txnid);
+ unaligned_poke_u64(4, head->mm_datasync_sign, MDBX_DATASIGN_WEAK);
+ unaligned_poke_u64(4, head->mm_txnid_b, undo_txnid);
const size_t offset = (uint8_t *)data_page(head) - env->me_dxb_mmap.dxb;
const size_t paged_offset = floor_powerof2(offset, env->me_os_psize);
const size_t paged_length = ceil_powerof2(
@@ -12994,7 +13695,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
} else {
MDBX_meta rollback = *head;
mdbx_meta_set_txnid(env, &rollback, undo_txnid);
- rollback.mm_datasync_sign = MDBX_DATASIGN_WEAK;
+ unaligned_poke_u64(4, rollback.mm_datasync_sign, MDBX_DATASIGN_WEAK);
err = mdbx_pwrite(env->me_lazy_fd, &rollback, sizeof(MDBX_meta),
(uint8_t *)head - (uint8_t *)env->me_map);
}
@@ -13056,7 +13757,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
const txnid_t txnid = mdbx_meta_txnid_stable(env, head);
const txnid_t next_txnid = safe64_txnid_next(txnid);
if (unlikely(txnid > MAX_TXNID)) {
- mdbx_error("%s", "txnid overflow!");
+ mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL);
return MDBX_TXN_FULL;
}
mdbx_notice("updating meta.geo: "
@@ -13498,7 +14199,7 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta) {
page->mp_flags = P_META;
MDBX_meta *meta = page_meta(page);
- meta->mm_magic_and_version = MDBX_DATA_MAGIC;
+ unaligned_poke_u64(4, meta->mm_magic_and_version, MDBX_DATA_MAGIC);
meta->mm_psize = env->me_psize;
txnid_t txnid = mdbx_meta_txnid_stable(env, meta);
const txnid_t txnid0 = mdbx_meta_txnid_stable(env, METAPAGE(env, 0));
@@ -13513,11 +14214,11 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta) {
if (!META_IS_STEADY(meta) || mdbx_recent_committed_txnid(env) != txnid) {
if (unlikely(txnid > MAX_TXNID)) {
- mdbx_error("%s", "txnid overflow!");
+ mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL);
return MDBX_TXN_FULL;
}
mdbx_meta_set_txnid(env, meta, txnid);
- meta->mm_datasync_sign = mdbx_meta_sign(meta);
+ unaligned_poke_u64(4, meta->mm_datasync_sign, mdbx_meta_sign(meta));
}
if (env->me_flags & MDBX_WRITEMAP) {
@@ -13779,15 +14480,9 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname,
}
}
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
- env->me_dirtylist = mdbx_calloc(MDBX_DPL_TXNFULL + 1, sizeof(MDBX_DP));
- if (!env->me_dirtylist)
- rc = MDBX_ENOMEM;
}
env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE;
- if (unlikely(rc != MDBX_SUCCESS))
- goto bailout;
-
env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1);
env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx));
env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0]));
@@ -13943,10 +14638,13 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname,
txn->mt_dbxs = env->me_dbxs;
txn->mt_flags = MDBX_TXN_FINISHED;
env->me_txn0 = txn;
- txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL);
- txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL);
- if (!txn->tw.retired_pages || !txn->tw.reclaimed_pglist)
- rc = MDBX_ENOMEM;
+ rc = mdbx_dpl_alloc(txn);
+ if (likely(rc == MDBX_SUCCESS)) {
+ txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL);
+ txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL);
+ if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist))
+ rc = MDBX_ENOMEM;
+ }
} else
rc = MDBX_ENOMEM;
}
@@ -13958,7 +14656,8 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname,
MDBX_db *db = &meta->mm_dbs[MAIN_DBI];
mdbx_debug("opened database version %u, pagesize %u",
- (uint8_t)meta->mm_magic_and_version, env->me_psize);
+ (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version),
+ env->me_psize);
mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN,
data_page(meta)->mp_pgno, mdbx_meta_txnid_fluid(env, meta));
mdbx_debug("depth: %u", db->md_depth);
@@ -14043,8 +14742,8 @@ static __cold int mdbx_env_close0(MDBX_env *env) {
mdbx_free(env->me_dbiseqs);
mdbx_free(env->me_dbflags);
mdbx_free(env->me_pathname);
- mdbx_free(env->me_dirtylist);
if (env->me_txn0) {
+ mdbx_dpl_free(env->me_txn0);
mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed);
mdbx_pnl_free(env->me_txn0->tw.retired_pages);
mdbx_pnl_free(env->me_txn0->tw.spill_pages);
@@ -14120,10 +14819,10 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) {
mdbx_ensure(env, mdbx_ipclock_destroy(&env->me_lckless_stub.wlock) == 0);
#endif /* MDBX_LOCKING */
- while ((dp = env->me_dpages) != NULL) {
+ while ((dp = env->me_dp_reserve) != NULL) {
ASAN_UNPOISON_MEMORY_REGION(&dp->mp_next, sizeof(dp->mp_next));
VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
- env->me_dpages = dp->mp_next;
+ env->me_dp_reserve = dp->mp_next;
mdbx_free(dp);
}
VALGRIND_DESTROY_MEMPOOL(env);
@@ -14394,9 +15093,10 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret,
MDBX_page *p = nullptr;
mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
mdbx_assert(env, pp_txnid >= MIN_TXNID && pp_txnid <= txn->mt_txnid);
- const uint16_t illegal_bits = (txn->mt_flags & MDBX_TXN_RDONLY)
- ? P_LOOSE | P_SUBP | P_META | P_DIRTY
- : P_LOOSE | P_SUBP | P_META;
+ const uint16_t illegal_bits =
+ (txn->mt_flags & MDBX_TXN_RDONLY)
+ ? ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)
+ : ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_DIRTY);
int level;
if (unlikely((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0)) {
level = 1;
@@ -14405,7 +15105,7 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret,
* because the dirty list got full. Bring this page
* back in from the map (but don't unspill it here,
* leave that unless page_touch happens again). */
- if (txn->tw.spill_pages &&
+ if (unlikely(txn->mt_flags & MDBX_TXN_SPILLS) && txn->tw.spill_pages &&
mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) {
pp_txnid = txn->mt_txnid;
goto spilled;
@@ -15664,11 +16364,11 @@ static int mdbx_cursor_touch(MDBX_cursor *mc) {
rc = mdbx_cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
+ *mc->mc_dbistate |= DBI_DIRTY;
+ mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY;
rc = mdbx_page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY);
if (unlikely(rc))
return rc;
- *mc->mc_dbistate |= DBI_DIRTY;
- mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY;
}
mc->mc_top = 0;
if (mc->mc_snum) {
@@ -15700,6 +16400,9 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
if (unlikely(rc != MDBX_SUCCESS))
return rc;
+ if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)))
+ return MDBX_BAD_DBI;
+
mdbx_cassert(mc, cursor_is_tracked(mc));
env = mc->mc_txn->mt_env;
@@ -15943,7 +16646,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
} else {
rdata = data;
}
- if (unlikely(rc2 = mdbx_page_spill(mc, key, rdata)))
+ if (unlikely(rc2 = mdbx_cursor_spill(mc, key, rdata)))
return rc2;
}
@@ -15974,8 +16677,6 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
mc->mc_xcursor->mx_dbx.md_klen_max =
data->iov_len);
}
- *mc->mc_dbistate |= DBI_DIRTY;
- mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY;
if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED)
np->mp_flags |= P_LEAF2;
mc->mc_flags |= C_INITIALIZED;
@@ -16091,7 +16792,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
if (unlikely(!np))
return MDBX_ENOMEM;
/* Note - this page is already counted in parent's dirtyroom */
- rc2 = mdbx_dpl_append(mc->mc_txn->tw.dirtylist, pg, np);
+ rc2 = mdbx_dpl_append(mc->mc_txn, pg, np);
if (unlikely(rc2 != MDBX_SUCCESS)) {
rc = rc2;
mdbx_dpage_free(env, np, ovpages);
@@ -16111,6 +16812,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data,
whole - off);
memcpy(np, omp, PAGEHDRSZ); /* Copy header of page */
omp = np;
+ mdbx_cassert(mc, mdbx_dirtylist_check(mc->mc_txn));
}
node_set_ds(node, data->iov_len);
if (F_ISSET(flags, MDBX_RESERVE))
@@ -16499,6 +17201,9 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) {
if (unlikely(rc != MDBX_SUCCESS))
return rc;
+ if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)))
+ return MDBX_BAD_DBI;
+
if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
return MDBX_ENODATA;
@@ -16506,7 +17211,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) {
return MDBX_NOTFOUND;
if (unlikely(!(flags & MDBX_NOSPILL) &&
- (rc = mdbx_page_spill(mc, NULL, NULL))))
+ (rc = mdbx_cursor_spill(mc, NULL, NULL))))
return rc;
rc = mdbx_cursor_touch(mc);
@@ -16619,13 +17324,15 @@ static int mdbx_page_new(MDBX_cursor *mc, unsigned flags, unsigned num,
if (unlikely((rc = mdbx_page_alloc(mc, num, &np, MDBX_ALLOC_ALL))))
return rc;
*mp = np;
- mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno,
- mc->mc_txn->mt_env->me_psize);
+ mdbx_debug("db %u allocated new page %" PRIaPGNO ", num %u", mc->mc_dbi,
+ np->mp_pgno, num);
np->mp_flags = (uint16_t)(flags | P_DIRTY);
np->mp_txnid = INVALID_TXNID;
np->mp_lower = 0;
np->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ);
+ *mc->mc_dbistate |= DBI_DIRTY;
+ mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY;
mc->mc_db->md_branch_pages += IS_BRANCH(np);
mc->mc_db->md_leaf_pages += IS_LEAF(np);
if (unlikely(IS_OVERFLOW(np))) {
@@ -17135,6 +17842,9 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple,
/* Initialize a cursor for a given transaction and database. */
static int mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) {
STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0);
+ if (unlikely(TXN_DBI_CHANGED(txn, dbi)))
+ return MDBX_BAD_DBI;
+
return mdbx_couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn,
&txn->mt_dbs[dbi], &txn->mt_dbxs[dbi],
&txn->mt_dbistate[dbi]);
@@ -17306,19 +18016,21 @@ again:
}
void mdbx_cursor_close(MDBX_cursor *mc) {
- if (mc) {
+ if (likely(mc)) {
mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_LIVE ||
mc->mc_signature == MDBX_MC_READY4CLOSE);
+ MDBX_txn *const txn = mc->mc_txn;
if (!mc->mc_backup) {
+ mc->mc_txn = NULL;
/* Remove from txn, if tracked.
* A read-only txn (!C_UNTRACK) may have been freed already,
* so do not peek inside it. Only write txns track cursors. */
if (mc->mc_flags & C_UNTRACK) {
- mdbx_cassert(mc, !(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY));
- MDBX_cursor **prev = &mc->mc_txn->tw.cursors[mc->mc_dbi];
+ mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS);
+ MDBX_cursor **prev = &txn->tw.cursors[mc->mc_dbi];
while (*prev && *prev != mc)
prev = &(*prev)->mc_next;
- mdbx_cassert(mc, *prev == mc);
+ mdbx_tassert(txn, *prev == mc);
*prev = mc->mc_next;
}
mc->mc_signature = 0;
@@ -17326,7 +18038,8 @@ void mdbx_cursor_close(MDBX_cursor *mc) {
mdbx_free(mc);
} else {
/* Cursor closed before nested txn ends */
- mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE);
+ mdbx_tassert(txn, mc->mc_signature == MDBX_MC_LIVE);
+ mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS);
mc->mc_signature = MDBX_MC_WAIT4EOT;
}
}
@@ -18029,6 +18742,7 @@ static void cursor_copy_internal(const MDBX_cursor *csrc, MDBX_cursor *cdst) {
cdst->mc_snum = csrc->mc_snum;
cdst->mc_top = csrc->mc_top;
cdst->mc_flags = csrc->mc_flags;
+ cdst->mc_dbistate = csrc->mc_dbistate;
for (unsigned i = 0; i < csrc->mc_snum; i++) {
cdst->mc_pg[i] = csrc->mc_pg[i];
@@ -18316,7 +19030,7 @@ static __cold int mdbx_page_check(MDBX_cursor *const mc,
if (IS_OVERFLOW(mp)) {
if (unlikely(mp->mp_pages < 1 && mp->mp_pages >= MAX_PAGENO / 2))
return bad_page(mp, "invalid overflow n-pages (%u)\n", mp->mp_pages);
- if (unlikely(mp->mp_pgno > mc->mc_txn->mt_next_pgno - mp->mp_pages))
+ if (unlikely(mp->mp_pgno + mp->mp_pages > mc->mc_txn->mt_next_pgno))
return bad_page(mp, "overflow page beyond (%u) next-pgno\n",
mp->mp_pgno + mp->mp_pages);
return MDBX_SUCCESS;
@@ -18583,7 +19297,7 @@ static __cold int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) {
mdbx_tassert(mc->mc_txn, mc->mc_txn->mt_parent ||
mc->mc_txn->tw.dirtyroom +
mc->mc_txn->tw.dirtylist->length ==
- MDBX_DPL_TXNFULL);
+ mc->mc_txn->mt_env->me_options.dp_limit);
mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1);
if (unlikely(mc->mc_top != mc->mc_snum - 1))
return MDBX_CURSOR_FULL;
@@ -19642,7 +20356,7 @@ static __cold void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
/* Update signature */
assert(meta->mm_geo.now >= meta->mm_geo.next);
- meta->mm_datasync_sign = mdbx_meta_sign(meta);
+ unaligned_poke_u64(4, meta->mm_datasync_sign, mdbx_meta_sign(meta));
}
/* Make resizeable */
@@ -19831,7 +20545,7 @@ static __cold int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
make_sizeable(headcopy);
/* Update signature to steady */
- headcopy->mm_datasync_sign = mdbx_meta_sign(headcopy);
+ unaligned_poke_u64(4, headcopy->mm_datasync_sign, mdbx_meta_sign(headcopy));
/* Copy the data */
const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno);
@@ -20276,18 +20990,15 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn,
const MDBX_meta *const recent_meta = mdbx_meta_head(env);
arg->mi_recent_txnid = mdbx_meta_txnid_fluid(env, recent_meta);
arg->mi_meta0_txnid = mdbx_meta_txnid_fluid(env, meta0);
- arg->mi_meta0_sign = meta0->mm_datasync_sign;
+ arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_datasync_sign);
arg->mi_meta1_txnid = mdbx_meta_txnid_fluid(env, meta1);
- arg->mi_meta1_sign = meta1->mm_datasync_sign;
+ arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_datasync_sign);
arg->mi_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2);
- arg->mi_meta2_sign = meta2->mm_datasync_sign;
+ arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_datasync_sign);
if (likely(bytes > size_before_bootid)) {
- arg->mi_bootid.meta0.x = meta0->mm_bootid.x;
- arg->mi_bootid.meta1.x = meta0->mm_bootid.x;
- arg->mi_bootid.meta2.x = meta0->mm_bootid.x;
- arg->mi_bootid.meta0.y = meta0->mm_bootid.y;
- arg->mi_bootid.meta1.y = meta0->mm_bootid.y;
- arg->mi_bootid.meta2.y = meta0->mm_bootid.y;
+ memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16);
+ memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16);
+ memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16);
}
const MDBX_meta *txn_meta = recent_meta;
@@ -20314,11 +21025,14 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn,
arg->mi_mapsize = env->me_dxb_mmap.limit;
mdbx_compiler_barrier();
if (likely(arg->mi_meta0_txnid == mdbx_meta_txnid_fluid(env, meta0) &&
- arg->mi_meta0_sign == meta0->mm_datasync_sign &&
+ arg->mi_meta0_sign ==
+ unaligned_peek_u64(4, meta0->mm_datasync_sign) &&
arg->mi_meta1_txnid == mdbx_meta_txnid_fluid(env, meta1) &&
- arg->mi_meta1_sign == meta1->mm_datasync_sign &&
+ arg->mi_meta1_sign ==
+ unaligned_peek_u64(4, meta1->mm_datasync_sign) &&
arg->mi_meta2_txnid == mdbx_meta_txnid_fluid(env, meta2) &&
- arg->mi_meta2_sign == meta2->mm_datasync_sign &&
+ arg->mi_meta2_sign ==
+ unaligned_peek_u64(4, meta2->mm_datasync_sign) &&
recent_meta == mdbx_meta_head(env) &&
arg->mi_recent_txnid == mdbx_meta_txnid_fluid(env, recent_meta)))
break;
@@ -20562,18 +21276,8 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags,
goto early_bailout;
}
- if (txn->mt_numdbs < env->me_numdbs) {
- /* Import handles from env */
- for (unsigned i = txn->mt_numdbs; i < env->me_numdbs; ++i) {
- txn->mt_dbistate[i] = 0;
- if (env->me_dbflags[i] & DB_VALID) {
- txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS;
- txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE;
- mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL);
- }
- }
- txn->mt_numdbs = env->me_numdbs;
- }
+ /* Import handles from env */
+ dbi_import_locked(txn);
/* Rescan after mutex acquisition & import handles */
for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) {
@@ -20633,18 +21337,16 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags,
txn->mt_dbistate[slot] = (uint8_t)dbiflags;
txn->mt_dbxs[slot].md_name.iov_base = namedup;
txn->mt_dbxs[slot].md_name.iov_len = len;
- if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0)
- txn->tw.cursors[slot] = NULL;
- txn->mt_numdbs += (slot == txn->mt_numdbs);
- if ((dbiflags & DBI_CREAT) == 0) {
+ txn->mt_dbiseqs[slot] = ++env->me_dbiseqs[slot];
+ if (!(dbiflags & DBI_CREAT))
env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID;
+ if (txn->mt_numdbs == slot) {
mdbx_compiler_barrier();
- if (env->me_numdbs <= slot)
- env->me_numdbs = slot + 1;
- } else {
- env->me_dbiseqs[slot]++;
+ txn->mt_numdbs = env->me_numdbs = slot + 1;
+ if (!(txn->mt_flags & MDBX_TXN_RDONLY))
+ txn->tw.cursors[slot] = NULL;
}
- txn->mt_dbiseqs[slot] = env->me_dbiseqs[slot];
+ mdbx_assert(env, env->me_numdbs > slot);
*dbi = slot;
}
@@ -20702,10 +21404,15 @@ static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) {
return MDBX_BAD_DBI;
env->me_dbflags[dbi] = 0;
+ env->me_dbiseqs[dbi]++;
env->me_dbxs[dbi].md_name.iov_len = 0;
mdbx_compiler_barrier();
env->me_dbxs[dbi].md_name.iov_base = NULL;
mdbx_free(ptr);
+
+ if (env->me_numdbs == dbi + 1)
+ env->me_numdbs = dbi;
+
return MDBX_SUCCESS;
}
@@ -20719,7 +21426,9 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) {
rc = mdbx_fastmutex_acquire(&env->me_dbi_lock);
if (likely(rc == MDBX_SUCCESS)) {
- rc = mdbx_dbi_close_locked(env, dbi);
+ rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID))
+ ? mdbx_dbi_close_locked(env, dbi)
+ : MDBX_BAD_DBI;
mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
}
return rc;
@@ -20885,7 +21594,6 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) {
txn->mt_flags |= MDBX_TXN_ERROR;
goto bailout;
}
- env->me_dbiseqs[dbi]++;
mdbx_dbi_close_locked(env, dbi);
mdbx_ensure(env,
mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS);
@@ -20974,11 +21682,14 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func,
if (txnid) {
retry_header:;
const MDBX_meta *const recent_meta = mdbx_meta_head(env);
- const uint64_t head_pages_retired = recent_meta->mm_pages_retired;
+ const uint64_t head_pages_retired =
+ unaligned_peek_u64(4, recent_meta->mm_pages_retired);
const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, recent_meta);
mdbx_compiler_barrier();
- if (unlikely(recent_meta != mdbx_meta_head(env) ||
- head_pages_retired != recent_meta->mm_pages_retired) ||
+ if (unlikely(
+ recent_meta != mdbx_meta_head(env) ||
+ head_pages_retired !=
+ unaligned_peek_u64(4, recent_meta->mm_pages_retired)) ||
head_txnid != mdbx_meta_txnid_fluid(env, recent_meta))
goto retry_header;
@@ -21228,7 +21939,8 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env,
const MDBX_meta *head_meta = mdbx_meta_head(env);
const txnid_t gap =
(mdbx_meta_txnid_stable(env, head_meta) - laggard) / MDBX_TXNID_STEP;
- const uint64_t head_retired = head_meta->mm_pages_retired;
+ const uint64_t head_retired =
+ unaligned_peek_u64(4, head_meta->mm_pages_retired);
const size_t space =
(oldest_retired > head_retired)
? pgno2bytes(env, (pgno_t)(oldest_retired - head_retired))
@@ -21261,43 +21973,11 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env,
}
__cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) {
- int rc = check_env(env);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
-
- if (unlikely(env->me_flags & MDBX_RDONLY))
- return MDBX_EACCESS;
-
- if (unlikely(!env->me_map))
- return MDBX_EPERM;
-
- *env->me_autosync_threshold = bytes2pgno(env, threshold + env->me_psize - 1);
- if (threshold) {
- rc = mdbx_env_sync_poll(env);
- if (unlikely(MDBX_IS_ERROR(rc)))
- return rc;
- }
- return MDBX_SUCCESS;
+ return __inline_mdbx_env_set_syncbytes(env, threshold);
}
__cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) {
- int rc = check_env(env);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
-
- if (unlikely(env->me_flags & MDBX_RDONLY))
- return MDBX_EACCESS;
-
- if (unlikely(!env->me_map))
- return MDBX_EPERM;
-
- *env->me_autosync_period = mdbx_osal_16dot16_to_monotime(seconds_16dot16);
- if (seconds_16dot16) {
- rc = mdbx_env_sync_poll(env);
- if (unlikely(MDBX_IS_ERROR(rc)))
- return rc;
- }
- return MDBX_SUCCESS;
+ return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16);
}
__cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) {
@@ -22457,7 +23137,7 @@ __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) {
return -1;
STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
- const uint64_t limit = pagesize * (uint64_t)(MDBX_DPL_TXNFULL - 1);
+ const uint64_t limit = pagesize * (uint64_t)(MDBX_PGL_LIMIT - 1);
return (limit < (intptr_t)MAX_MAPSIZE) ? (intptr_t)limit
: (intptr_t)MAX_MAPSIZE;
}
@@ -22701,6 +23381,229 @@ __cold MDBX_cmp_func *mdbx_get_datacmp(unsigned flags) {
return get_default_datacmp(flags);
}
+__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
+ const uint64_t value) {
+ int err = check_env(env);
+ if (unlikely(err != MDBX_SUCCESS))
+ return err;
+
+ const bool lock_needed = (env->me_map && env->me_txn0 &&
+ env->me_txn0->mt_owner != mdbx_thread_self());
+ bool should_unlock = false;
+ switch (option) {
+ case MDBX_opt_sync_bytes:
+ if (unlikely(env->me_flags & MDBX_RDONLY))
+ return MDBX_EACCESS;
+ if (unlikely(!env->me_autosync_threshold))
+ return MDBX_EPERM;
+ if (sizeof(value) > sizeof(size_t) && unlikely(value != (size_t)value))
+ return MDBX_TOO_LARGE;
+ if ((*env->me_autosync_threshold =
+ bytes2pgno(env, (size_t)value + env->me_psize - 1)) != 0) {
+ err = mdbx_env_sync_poll(env);
+ if (unlikely(MDBX_IS_ERROR(err)))
+ return err;
+ }
+ break;
+
+ case MDBX_opt_sync_period:
+ if (unlikely(env->me_flags & MDBX_RDONLY))
+ return MDBX_EACCESS;
+ if (unlikely(!env->me_autosync_period))
+ return MDBX_EPERM;
+ if (unlikely(value > UINT32_MAX))
+ return MDBX_TOO_LARGE;
+ if ((*env->me_autosync_period =
+ mdbx_osal_16dot16_to_monotime((uint32_t)value)) != 0) {
+ err = mdbx_env_sync_poll(env);
+ if (unlikely(MDBX_IS_ERROR(err)))
+ return err;
+ }
+ break;
+
+ case MDBX_opt_max_db:
+ if (unlikely(value > MDBX_MAX_DBI))
+ return MDBX_EINVAL;
+ if (unlikely(env->me_map))
+ return MDBX_EPERM;
+ env->me_maxdbs = (unsigned)value + CORE_DBS;
+ break;
+
+ case MDBX_opt_max_readers:
+ if (unlikely(value < 1 || value > MDBX_READERS_LIMIT))
+ return MDBX_EINVAL;
+ if (unlikely(env->me_map))
+ return MDBX_EPERM;
+ env->me_maxreaders = (unsigned)value;
+ break;
+
+ case MDBX_opt_dp_reserve_limit:
+ if (unlikely(value > INT_MAX))
+ return MDBX_EINVAL;
+ if (env->me_options.dp_reserve_limit != (unsigned)value) {
+ if (lock_needed) {
+ err = mdbx_txn_lock(env, false);
+ if (unlikely(err != MDBX_SUCCESS))
+ return err;
+ should_unlock = true;
+ }
+ env->me_options.dp_reserve_limit = (unsigned)value;
+ while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) {
+ mdbx_assert(env, env->me_dp_reserve != NULL);
+ MDBX_page *mp = env->me_dp_reserve;
+ ASAN_UNPOISON_MEMORY_REGION(&mp->mp_next, sizeof(mp->mp_next));
+ VALGRIND_MAKE_MEM_DEFINED(&mp->mp_next, sizeof(mp->mp_next));
+ env->me_dp_reserve = mp->mp_next;
+ mdbx_free(mp);
+ env->me_dp_reserve_len -= 1;
+ }
+ }
+ break;
+
+ case MDBX_opt_rp_augment_limit:
+ if (unlikely(value > MDBX_PGL_LIMIT))
+ return MDBX_EINVAL;
+ env->me_options.rp_augment_limit = (unsigned)value;
+ break;
+
+ case MDBX_opt_txn_dp_limit:
+ case MDBX_opt_txn_dp_initial:
+ if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4 ||
+ value > bytes2pgno(env, env->me_dbgeo.upper) - NUM_METAS))
+ return MDBX_EINVAL;
+ if (unlikely(env->me_txn0 == NULL))
+ return MDBX_EACCESS;
+ if (lock_needed) {
+ err = mdbx_txn_lock(env, false);
+ if (unlikely(err != MDBX_SUCCESS))
+ return err;
+ should_unlock = true;
+ }
+ if (env->me_txn)
+ err = MDBX_EPERM /* unable change during transaction */;
+ else {
+ mdbx_dpl_clear(env->me_txn0->tw.dirtylist);
+ const unsigned value32 = (unsigned)value;
+ if (option == MDBX_opt_txn_dp_initial &&
+ env->me_options.dp_initial != value32) {
+ if (env->me_options.dp_limit < value32)
+ env->me_options.dp_limit = value32;
+ if (env->me_txn0->tw.dirtylist->allocated < value32 &&
+ !mdbx_dpl_reserve(env->me_txn0, value32))
+ err = MDBX_ENOMEM;
+ else
+ env->me_options.dp_initial = value32;
+ }
+ if (option == MDBX_opt_txn_dp_limit &&
+ env->me_options.dp_limit != value32) {
+ if (env->me_txn0->tw.dirtylist->allocated > value32 &&
+ !mdbx_dpl_reserve(env->me_txn0, value32))
+ err = MDBX_ENOMEM;
+ else {
+ if (env->me_options.dp_initial > value32)
+ env->me_options.dp_initial = value32;
+ env->me_options.dp_limit = value32;
+ }
+ }
+ }
+ break;
+
+ case MDBX_opt_spill_max_denominator:
+ if (unlikely(value > 255))
+ return MDBX_EINVAL;
+ env->me_options.spill_max_denominator = (uint8_t)value;
+ break;
+ case MDBX_opt_spill_min_denominator:
+ if (unlikely(value > 255))
+ return MDBX_EINVAL;
+ env->me_options.spill_min_denominator = (uint8_t)value;
+ break;
+ case MDBX_opt_spill_parent4child_denominator:
+ if (unlikely(value > 255))
+ return MDBX_EINVAL;
+ env->me_options.spill_parent4child_denominator = (uint8_t)value;
+ break;
+
+ case MDBX_opt_loose_limit:
+ if (unlikely(value > 255))
+ return MDBX_EINVAL;
+ env->me_options.dp_loose_limit = (uint8_t)value;
+ break;
+
+ default:
+ return MDBX_EINVAL;
+ }
+
+ if (should_unlock)
+ mdbx_txn_unlock(env);
+ return err;
+}
+
+__cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option,
+ uint64_t *value) {
+ int err = check_env(env);
+ if (unlikely(err != MDBX_SUCCESS))
+ return err;
+ if (unlikely(!value))
+ return MDBX_EINVAL;
+
+ switch (option) {
+ case MDBX_opt_sync_bytes:
+ if (unlikely(!env->me_autosync_threshold))
+ return MDBX_EPERM;
+ *value = *env->me_autosync_threshold;
+ break;
+
+ case MDBX_opt_sync_period:
+ if (unlikely(!env->me_autosync_period))
+ return MDBX_EPERM;
+ *value = mdbx_osal_monotime_to_16dot16(*env->me_autosync_period);
+ break;
+
+ case MDBX_opt_max_db:
+ *value = env->me_maxdbs - CORE_DBS;
+ break;
+
+ case MDBX_opt_max_readers:
+ *value = env->me_maxreaders;
+ break;
+
+ case MDBX_opt_dp_reserve_limit:
+ *value = env->me_options.dp_reserve_limit;
+ break;
+
+ case MDBX_opt_rp_augment_limit:
+ *value = env->me_options.rp_augment_limit;
+ break;
+
+ case MDBX_opt_txn_dp_limit:
+ *value = env->me_options.dp_limit;
+ break;
+ case MDBX_opt_txn_dp_initial:
+ *value = env->me_options.dp_initial;
+ break;
+
+ case MDBX_opt_spill_max_denominator:
+ *value = env->me_options.spill_max_denominator;
+ break;
+ case MDBX_opt_spill_min_denominator:
+ *value = env->me_options.spill_min_denominator;
+ break;
+ case MDBX_opt_spill_parent4child_denominator:
+ *value = env->me_options.spill_parent4child_denominator;
+ break;
+
+ case MDBX_opt_loose_limit:
+ *value = env->me_options.dp_loose_limit;
+ break;
+
+ default:
+ return MDBX_EINVAL;
+ }
+
+ return MDBX_SUCCESS;
+}
+
/*** Attribute support functions for Nexenta **********************************/
#ifdef MDBX_NEXENTA_ATTRS
@@ -22962,14 +23865,12 @@ __dll_export
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
-#if MDBX_HUGE_TRANSACTIONS
- " MDBX_HUGE_TRANSACTIONS=YES"
-#endif /* MDBX_HUGE_TRANSACTIONS */
" MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG
" MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
" MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG
" MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG
" MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG
+ " MDBX_ENABLE_REFUND=" STRINGIFY(MDBX_ENABLE_REFUND)
#ifdef __SANITIZE_ADDRESS__
" SANITIZE_ADDRESS=YES"
#endif /* __SANITIZE_ADDRESS__ */
@@ -23608,6 +24509,10 @@ MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) {
#endif
}
+#if !(defined(_WIN32) || defined(_WIN64))
+static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); }
+#endif /*! Windows */
+
MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname) {
#if defined(_WIN32) || defined(_WIN64)
const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX);
@@ -23746,6 +24651,29 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose,
flags |= O_CLOEXEC;
#endif /* O_CLOEXEC */
+ /* Safeguard for https://github.com/erthink/libmdbx/issues/144 */
+#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2
+ int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1;
+ static const char dev_null[] = "/dev/null";
+ if (!is_valid_fd(STDIN_FILENO)) {
+ mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN",
+ STDIN_FILENO, dev_null);
+ stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY);
+ }
+ if (!is_valid_fd(STDOUT_FILENO)) {
+ mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub",
+ "OUT", STDOUT_FILENO, dev_null);
+ stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY);
+ }
+ if (!is_valid_fd(STDERR_FILENO)) {
+ mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub",
+ "ERR", STDERR_FILENO, dev_null);
+ stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY);
+ }
+#else
+#error "Unexpected or unsupported UNIX or POSIX system"
+#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */
+
*fd = open(pathname, flags, unix_mode_bits);
#if defined(O_DIRECT)
if (*fd < 0 && (flags & O_DIRECT) &&
@@ -23754,6 +24682,45 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose,
*fd = open(pathname, flags, unix_mode_bits);
}
#endif /* O_DIRECT */
+
+ /* Safeguard for https://github.com/erthink/libmdbx/issues/144 */
+#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2
+ if (*fd == STDIN_FILENO) {
+ mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN",
+ STDIN_FILENO);
+ assert(stub_fd0 == -1);
+ *fd = dup(stub_fd0 = *fd);
+ }
+ if (*fd == STDOUT_FILENO) {
+ mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT",
+ STDOUT_FILENO);
+ assert(stub_fd1 == -1);
+ *fd = dup(stub_fd1 = *fd);
+ }
+ if (*fd == STDERR_FILENO) {
+ mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR",
+ STDERR_FILENO);
+ assert(stub_fd2 == -1);
+ *fd = dup(stub_fd2 = *fd);
+ }
+ if (stub_fd0 != -1)
+ close(stub_fd0);
+ if (stub_fd1 != -1)
+ close(stub_fd1);
+ if (stub_fd2 != -1)
+ close(stub_fd2);
+ if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) {
+ mdbx_error(
+ "Rejecting the use of a FD in the range "
+ "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption",
+ STDIN_FILENO, STDERR_FILENO);
+ close(*fd);
+ return EBADF;
+ }
+#else
+#error "Unexpected or unsupported UNIX or POSIX system"
+#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */
+
if (*fd < 0)
return errno;
@@ -23777,6 +24744,7 @@ MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) {
#if defined(_WIN32) || defined(_WIN64)
return CloseHandle(fd) ? MDBX_SUCCESS : GetLastError();
#else
+ assert(fd > STDERR_FILENO);
return (close(fd) == 0) ? MDBX_SUCCESS : errno;
#endif
}
@@ -25423,9 +26391,9 @@ __dll_export
0,
9,
2,
- 12,
- {"2020-12-05T09:45:03+03:00", "455c7d633b3e490fe4f1e8c9999adccdf358f699", "3e7459b428373404e3c8ce238bd4061fa9d18f79",
- "v0.9.2-12-g3e7459b4"},
+ 110,
+ {"2021-01-23T01:13:20+03:00", "8a2017c7d169c0ee1b9b21cc67ea939d7abe5f08", "4e13d1239ab6580ed04cab377b746e7f8c214a9b",
+ "v0.9.2-110-g4e13d12"},
sourcery};
__dll_export
diff --git a/libs/libmdbx/src/mdbx.c++ b/libs/libmdbx/src/mdbx.c++
index 4f54ad8efb..049c93d588 100644
--- a/libs/libmdbx/src/mdbx.c++
+++ b/libs/libmdbx/src/mdbx.c++
@@ -12,7 +12,7 @@
* <http://www.OpenLDAP.org/license.html>. */
#define MDBX_ALLOY 1
-#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4
+#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@@ -339,7 +339,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define likely(cond) __builtin_expect(!!(cond), 1)
# else
-# define likely(x) (x)
+# define likely(x) (!!(x))
# endif
#endif /* likely */
@@ -347,7 +347,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define unlikely(cond) __builtin_expect(!!(cond), 0)
# else
-# define unlikely(x) (x)
+# define unlikely(x) (!!(x))
# endif
#endif /* unlikely */
@@ -1542,11 +1542,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* DOXYGEN */
-/** Enables support for huge write-transactions */
-#ifndef MDBX_HUGE_TRANSACTIONS
-#define MDBX_HUGE_TRANSACTIONS 0
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
/** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0
/** Using fsync() with chance of data lost on power failure */
@@ -1596,6 +1591,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC)
#endif /* MDBX_TRUST_RTC */
+/** Controls online database auto-compactification during write-transactions. */
+#ifndef MDBX_ENABLE_REFUND
+#define MDBX_ENABLE_REFUND 1
+#endif
+#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
+#error MDBX_ENABLE_REFUND must be defined as 0 or 1
+#endif /* MDBX_ENABLE_REFUND */
+
+/** Controls sort order of internal page number lists.
+ * The database format depend on this option and libmdbx builded with different
+ * option value are incompatible. */
+#ifndef MDBX_PNL_ASCENDING
+#define MDBX_PNL_ASCENDING 0
+#endif
+#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
+#error MDBX_PNL_ASCENDING must be defined as 0 or 1
+#endif /* MDBX_PNL_ASCENDING */
+
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@@ -1917,10 +1930,10 @@ typedef struct mdbx_geo_t {
typedef struct MDBX_meta {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
- uint64_t mm_magic_and_version;
+ uint32_t mm_magic_and_version[2];
/* txnid that committed this page, the first of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_a;
+ uint32_t mm_txnid_a[2];
uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */
uint8_t mm_validator_id; /* ID of checksum and page validation method,
@@ -1940,17 +1953,18 @@ typedef struct MDBX_meta {
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
-#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign)
- volatile uint64_t mm_datasync_sign;
+#define META_IS_STEADY(meta) \
+ SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
+ uint32_t mm_datasync_sign[2];
/* txnid that committed this page, the second of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_b;
+ uint32_t mm_txnid_b[2];
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
* DB was previously handled by libmdbx without corresponding feature.
* This value in couple with mr_snapshot_pages_retired allows fast estimation
* of "how much reader is restraining GC recycling". */
- uint64_t mm_pages_retired;
+ uint32_t mm_pages_retired[2];
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
* whether the system was rebooted after the last use of the database files.
@@ -2199,7 +2213,8 @@ typedef struct MDBX_lockinfo {
(unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \
(unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
-#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
+#define MDBX_DATA_MAGIC \
+ ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
@@ -2237,19 +2252,20 @@ typedef struct MDBX_lockinfo {
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_READERS_LIMIT \
((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader))
+#define MDBX_PGL_LIMIT MAX_PAGENO
#else
#define MDBX_READERS_LIMIT 1024
#define MAX_MAPSIZE MAX_MAPSIZE32
+#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */
/*----------------------------------------------------------------------------*/
-/* Two kind lists of pages (aka PNL) */
-/* An PNL is an Page Number List, a sorted array of IDs. The first element of
- * the array is a counter for how many actual page-numbers are in the list.
- * PNLs are sorted in descending order, this allow cut off a page with lowest
- * pgno (at the tail) just truncating the list */
-#define MDBX_PNL_ASCENDING 0
+/* An PNL is an Page Number List, a sorted array of IDs.
+ * The first element of the array is a counter for how many actual page-numbers
+ * are in the list. By default PNLs are sorted in descending order, this allow
+ * cut off a page with lowest pgno (at the tail) just truncating the list. The
+ * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */
typedef pgno_t *MDBX_PNL;
#if MDBX_PNL_ASCENDING
@@ -2264,37 +2280,27 @@ typedef pgno_t *MDBX_PNL;
typedef txnid_t *MDBX_TXL;
/* An Dirty-Page list item is an pgno/pointer pair. */
-typedef union MDBX_DP {
- __anonymous_struct_extension__ struct {
- pgno_t pgno;
- MDBX_page *ptr;
- };
- __anonymous_struct_extension__ struct {
- unsigned sorted;
- unsigned length;
- };
-} MDBX_DP;
-
-/* An DPL (dirty-page list) is a sorted array of MDBX_DPs.
- * The first element's length member is a count of how many actual
- * elements are in the array. */
-typedef MDBX_DP *MDBX_DPL;
+typedef struct MDBX_dp {
+ pgno_t pgno;
+ MDBX_page *ptr;
+} MDBX_dp;
+
+/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
+typedef struct MDBX_dpl {
+ unsigned sorted;
+ unsigned length;
+ unsigned allocated;
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (!defined(__cplusplus) && defined(_MSC_VER))
+ MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
+#endif
+} MDBX_dpl;
/* PNL sizes */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#if MDBX_HUGE_TRANSACTIONS
-#define MDBX_PNL_MAX \
- ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
-#else
-#define MDBX_PNL_MAX \
- ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL \
(MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
@@ -2418,14 +2424,16 @@ struct MDBX_txn {
MDBX_cursor **cursors;
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
+#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
+#endif /* MDBX_ENABLE_REFUND */
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
- MDBX_DPL dirtylist;
+ MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
@@ -2435,26 +2443,19 @@ struct MDBX_txn {
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
- /* Number of retired to parent pages (tw.retired2parent_pages) */
- unsigned retired2parent_count;
- /* The list of parent's txn dirty pages that retired (became unused)
- * in this transaction, linked through `mp_next`. */
- MDBX_page *retired2parent_pages;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages;
+ unsigned spill_least_removed;
} tw;
};
};
-/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
- * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
- * raise this on a 64 bit machine. */
#if MDBX_WORDBITS >= 64
-#define CURSOR_STACK 28
+#define CURSOR_STACK 32
#else
-#define CURSOR_STACK 20
+#define CURSOR_STACK 24
#endif
struct MDBX_xcursor;
@@ -2584,11 +2585,9 @@ struct MDBX_env {
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
- MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
+ MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
- /* MDBX_DP of pages written during a write txn. */
- MDBX_DPL me_dirtylist;
/* Number of freelist items that can fit in a single overflow page */
unsigned me_maxgc_ov1page;
unsigned me_branch_nodemax; /* max size of a branch-node */
@@ -2601,6 +2600,17 @@ struct MDBX_env {
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
+ unsigned me_dp_reserve_len;
+ struct {
+ unsigned dp_reserve_limit;
+ unsigned rp_augment_limit;
+ unsigned dp_limit;
+ unsigned dp_initial;
+ uint8_t dp_loose_limit;
+ uint8_t spill_max_denominator;
+ uint8_t spill_min_denominator;
+ uint8_t spill_parent4child_denominator;
+ } me_options;
struct {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
diff --git a/libs/libmdbx/src/mdbx.h b/libs/libmdbx/src/mdbx.h
index 2a6a2d7893..d5c5ec99ce 100644
--- a/libs/libmdbx/src/mdbx.h
+++ b/libs/libmdbx/src/mdbx.h
@@ -823,7 +823,7 @@ DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags_t)
* \param [in] env An environment handle returned by \ref mdbx_env_create().
* \param [in] msg The assertion message, not including newline. */
typedef void MDBX_debug_func(MDBX_log_level_t loglevel, const char *function,
- int line, const char *msg,
+ int line, const char *fmt,
va_list args) MDBX_CXX17_NOEXCEPT;
/** \brief The "don't change `logger`" value for mdbx_setup_debug() */
@@ -1793,6 +1793,176 @@ LIBMDBX_API const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf,
* \returns a non-zero error value on failure and 0 on success. */
LIBMDBX_API int mdbx_env_create(MDBX_env **penv);
+/** \brief MDBX environment options. */
+enum MDBX_option_t {
+ /** \brief Controls the maximum number of named databases for the environment.
+ *
+ * \details By default only unnamed key-value database could used and
+ * appropriate value should set by `MDBX_opt_max_db` to using any more named
+ * subDB(s). To reduce overhead, use the minimum sufficient value. This option
+ * may only set after \ref mdbx_env_create() and before \ref mdbx_env_open().
+ *
+ * \see mdbx_env_set_maxdbs() \see mdbx_env_get_maxdbs() */
+ MDBX_opt_max_db,
+
+ /** \brief Defines the maximum number of threads/reader slots
+ * for all processes interacting with the database.
+ *
+ * \details This defines the number of slots in the lock table that is used to
+ * track readers in the the environment. The default is about 100 for 4K
+ * system page size. Starting a read-only transaction normally ties a lock
+ * table slot to the current thread until the environment closes or the thread
+ * exits. If \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin() instead ties the
+ * slot to the \ref MDBX_txn object until it or the \ref MDBX_env object is
+ * destroyed. This option may only set after \ref mdbx_env_create() and before
+ * \ref mdbx_env_open(), and has an effect only when the database is opened by
+ * the first process interacts with the database.
+ *
+ * \see mdbx_env_set_maxreaders() \see mdbx_env_get_maxreaders() */
+ MDBX_opt_max_readers,
+
+ /** \brief Controls interprocess/shared threshold to force flush the data
+ * buffers to disk, if \ref MDBX_SAFE_NOSYNC is used.
+ *
+ * \see mdbx_env_set_syncbytes() \see mdbx_env_get_syncbytes() */
+ MDBX_opt_sync_bytes,
+
+ /** \brief Controls interprocess/shared relative period since the last
+ * unsteady commit to force flush the data buffers to disk,
+ * if \ref MDBX_SAFE_NOSYNC is used.
+ * \see mdbx_env_set_syncperiod() \see mdbx_env_get_syncperiod() */
+ MDBX_opt_sync_period,
+
+ /** \brief Controls the in-process limit to grow a list of reclaimed/recycled
+ * page's numbers for finding a sequence of contiguous pages for large data
+ * items.
+ *
+ * \details A long values requires allocation of contiguous database pages.
+ * To find such sequences, it may be necessary to accumulate very large lists,
+ * especially when placing very long values (more than a megabyte) in a large
+ * databases (several tens of gigabytes), which is much expensive in extreme
+ * cases. This threshold allows you to avoid such costs by allocating new
+ * pages at the end of the database (with its possible growth on disk),
+ * instead of further accumulating/reclaiming Garbage Collection records.
+ *
+ * On the other hand, too small threshold will lead to unreasonable database
+ * growth, or/and to the inability of put long values.
+ *
+ * The `MDBX_opt_rp_augment_limit` controls described limit for the current
+ * process. Default is 262144, it is usually enough for most cases. */
+ MDBX_opt_rp_augment_limit,
+
+ /** \brief Controls the in-process limit to grow a cache of dirty
+ * pages for reuse in the current transaction.
+ *
+ * \details A 'dirty page' refers to a page that has been updated in memory
+ * only, the changes to a dirty page are not yet stored on disk.
+ * To reduce overhead, it is reasonable to release not all such pages
+ * immediately, but to leave some ones in cache for reuse in the current
+ * transaction.
+ *
+ * The `MDBX_opt_loose_limit` allows you to set a limit for such cache inside
+ * the current process. Should be in the range 0..255, default is 64. */
+ MDBX_opt_loose_limit,
+
+ /** \brief Controls the in-process limit of a pre-allocated memory items
+ * for dirty pages.
+ *
+ * \details A 'dirty page' refers to a page that has been updated in memory
+ * only, the changes to a dirty page are not yet stored on disk.
+ * Without \ref MDBX_WRITEMAP dirty pages are allocated from memory and
+ * released when a transaction is committed. To reduce overhead, it is
+ * reasonable to release not all ones, but to leave some allocations in
+ * reserve for reuse in the next transaction(s).
+ *
+ * The `MDBX_opt_dp_reserve_limit` allows you to set a limit for such reserve
+ * inside the current process. Default is 1024. */
+ MDBX_opt_dp_reserve_limit,
+
+ /** \brief Controls the in-process limit of dirty pages
+ * for a write transaction.
+ *
+ * \details A 'dirty page' refers to a page that has been updated in memory
+ * only, the changes to a dirty page are not yet stored on disk.
+ * Without \ref MDBX_WRITEMAP dirty pages are allocated from memory and will
+ * be busy until are written to disk. Therefore for a large transactions is
+ * reasonable to limit dirty pages collecting above an some threshold but
+ * spill to disk instead.
+ *
+ * The `MDBX_opt_txn_dp_limit` controls described threshold for the current
+ * process. Default is 65536, it is usually enough for most cases. */
+ MDBX_opt_txn_dp_limit,
+
+ /** \brief Controls the in-process initial allocation size for dirty pages
+ * list of a write transaction. Default is 1024. */
+ MDBX_opt_txn_dp_initial,
+
+ /** \brief Controls the in-process how maximal part of the dirty pages may be
+ * spilled when necessary.
+ *
+ * \details The `MDBX_opt_spill_max_denominator` defines the denominator for
+ * limiting from the top for part of the current dirty pages may be spilled
+ * when the free room for a new dirty pages (i.e. distance to the
+ * `MDBX_opt_txn_dp_limit` threshold) is not enough to perform requested
+ * operation.
+ * Exactly `max_pages_to_spill = dirty_pages - dirty_pages / N`,
+ * where `N` is the value set by `MDBX_opt_spill_max_denominator`.
+ *
+ * Should be in the range 0..255, where zero means no limit, i.e. all dirty
+ * pages could be spilled. Default is 8, i.e. no more than 7/8 of the current
+ * dirty pages may be spilled when reached the condition described above. */
+ MDBX_opt_spill_max_denominator,
+
+ /** \brief Controls the in-process how minimal part of the dirty pages should
+ * be spilled when necessary.
+ *
+ * \details The `MDBX_opt_spill_min_denominator` defines the denominator for
+ * limiting from the bottom for part of the current dirty pages should be
+ * spilled when the free room for a new dirty pages (i.e. distance to the
+ * `MDBX_opt_txn_dp_limit` threshold) is not enough to perform requested
+ * operation.
+ * Exactly `min_pages_to_spill = dirty_pages / N`,
+ * where `N` is the value set by `MDBX_opt_spill_min_denominator`.
+ *
+ * Should be in the range 0..255, where zero means no restriction at the
+ * bottom. Default is 8, i.e. at least the 1/8 of the current dirty pages
+ * should be spilled when reached the condition described above. */
+ MDBX_opt_spill_min_denominator,
+
+ /** \brief Controls the in-process how much of the parent transaction dirty
+ * pages will be spilled while start each child transaction.
+ *
+ * \details The `MDBX_opt_spill_parent4child_denominator` defines the
+ * denominator to determine how much of parent transaction dirty pages will be
+ * spilled explicitly while start each child transaction.
+ * Exactly `pages_to_spill = dirty_pages / N`,
+ * where `N` is the value set by `MDBX_opt_spill_parent4child_denominator`.
+ *
+ * For a stack of nested transactions each dirty page could be spilled only
+ * once, and parent's dirty pages couldn't be spilled while child
+ * transaction(s) are running. Therefore a child transaction could reach
+ * \ref MDBX_TXN_FULL when parent(s) transaction has spilled too less (and
+ * child reach the limit of dirty pages), either when parent(s) has spilled
+ * too more (since child can't spill already spilled pages). So there is no
+ * universal golden ratio.
+ *
+ * Should be in the range 0..255, where zero means no explicit spilling will
+ * be performed during starting nested transactions.
+ * Default is 0, i.e. by default no spilling performed during starting nested
+ * transactions, that correspond historically behaviour. */
+ MDBX_opt_spill_parent4child_denominator,
+};
+#ifndef __cplusplus
+/** \ingroup c_settings */
+typedef enum MDBX_option_t MDBX_option_t;
+#endif
+
+LIBMDBX_API int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
+ const uint64_t value);
+LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env,
+ const MDBX_option_t option,
+ uint64_t *value);
+
/** \brief Open an environment instance.
* \ingroup c_opening
*
@@ -2178,7 +2348,10 @@ LIBMDBX_INLINE_API(int, mdbx_env_sync_poll, (MDBX_env * env)) {
* a synchronous flush would be made.
*
* \returns A non-zero error value on failure and 0 on success. */
-LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold);
+LIBMDBX_INLINE_API(int, mdbx_env_set_syncbytes,
+ (MDBX_env * env, size_t threshold)) {
+ return mdbx_env_set_option(env, MDBX_opt_sync_bytes, threshold);
+}
/** \brief Sets relative period since the last unsteady commit to force flush
* the data buffers to disk, even of \ref MDBX_SAFE_NOSYNC flag in the
@@ -2210,8 +2383,10 @@ LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold);
* the last unsteady commit.
*
* \returns A non-zero error value on failure and 0 on success. */
-LIBMDBX_API int mdbx_env_set_syncperiod(MDBX_env *env,
- unsigned seconds_16dot16);
+LIBMDBX_INLINE_API(int, mdbx_env_set_syncperiod,
+ (MDBX_env * env, unsigned seconds_16dot16)) {
+ return mdbx_env_set_option(env, MDBX_opt_sync_period, seconds_16dot16);
+}
/** \brief Close the environment and release the memory map.
* \ingroup c_opening
@@ -2582,17 +2757,18 @@ mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags);
MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
mdbx_limits_txnsize_max(intptr_t pagesize);
-/** \brief Set the maximum number of threads/reader slots for the environment.
- * \ingroup c_settings
+/** \brief Set the maximum number of threads/reader slots for for all processes
+ * interacts with the database. \ingroup c_settings
*
- * This defines the number of slots in the lock table that is used to track
- * readers in the the environment. The default is 119 for 4K system page size.
- * Starting a read-only transaction normally ties a lock table slot to the
- * current thread until the environment closes or the thread exits. If
+ * \details This defines the number of slots in the lock table that is used to
+ * track readers in the the environment. The default is about 100 for 4K system
+ * page size. Starting a read-only transaction normally ties a lock table slot
+ * to the current thread until the environment closes or the thread exits. If
* \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin() instead ties the slot to the
* \ref MDBX_txn object until it or the \ref MDBX_env object is destroyed.
* This function may only be called after \ref mdbx_env_create() and before
- * \ref mdbx_env_open().
+ * \ref mdbx_env_open(), and has an effect only when the database is opened by
+ * the first process interacts with the database.
* \see mdbx_env_get_maxreaders()
*
* \param [in] env An environment handle returned
@@ -2603,7 +2779,10 @@ mdbx_limits_txnsize_max(intptr_t pagesize);
* some possible errors are:
* \retval MDBX_EINVAL An invalid parameter was specified.
* \retval MDBX_EPERM The environment is already open. */
-LIBMDBX_API int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers);
+LIBMDBX_INLINE_API(int, mdbx_env_set_maxreaders,
+ (MDBX_env * env, unsigned readers)) {
+ return mdbx_env_set_option(env, MDBX_opt_max_readers, readers);
+}
/** \brief Get the maximum number of threads/reader slots for the environment.
* \ingroup c_statinfo
@@ -2616,7 +2795,16 @@ LIBMDBX_API int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers);
* \returns A non-zero error value on failure and 0 on success,
* some possible errors are:
* \retval MDBX_EINVAL An invalid parameter was specified. */
-LIBMDBX_API int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers);
+LIBMDBX_INLINE_API(int, mdbx_env_get_maxreaders,
+ (const MDBX_env *env, unsigned *readers)) {
+ int rc = MDBX_EINVAL;
+ if (readers) {
+ uint64_t proxy = 0;
+ rc = mdbx_env_get_option(env, MDBX_opt_max_readers, &proxy);
+ *readers = (unsigned)proxy;
+ }
+ return rc;
+}
/** \brief Set the maximum number of named databases for the environment.
* \ingroup c_settings
@@ -2639,7 +2827,9 @@ LIBMDBX_API int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers);
* some possible errors are:
* \retval MDBX_EINVAL An invalid parameter was specified.
* \retval MDBX_EPERM The environment is already open. */
-LIBMDBX_API int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs);
+LIBMDBX_INLINE_API(int, mdbx_env_set_maxdbs, (MDBX_env * env, MDBX_dbi dbs)) {
+ return mdbx_env_set_option(env, MDBX_opt_max_db, dbs);
+}
/** \brief Get the maximum number of named databases for the environment.
* \ingroup c_statinfo
@@ -2651,7 +2841,16 @@ LIBMDBX_API int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs);
* \returns A non-zero error value on failure and 0 on success,
* some possible errors are:
* \retval MDBX_EINVAL An invalid parameter was specified. */
-LIBMDBX_API int mdbx_env_get_maxdbs(MDBX_env *env, MDBX_dbi *dbs);
+LIBMDBX_INLINE_API(int, mdbx_env_get_maxdbs,
+ (const MDBX_env *env, MDBX_dbi *dbs)) {
+ int rc = MDBX_EINVAL;
+ if (dbs) {
+ uint64_t proxy = 0;
+ rc = mdbx_env_get_option(env, MDBX_opt_max_db, &proxy);
+ *dbs = (MDBX_dbi)proxy;
+ }
+ return rc;
+}
/** \brief Get the maximum size of keys can write.
* \ingroup c_statinfo
diff --git a/libs/libmdbx/src/mdbx_chk.c b/libs/libmdbx/src/mdbx_chk.c
index dd505dd96c..906c187347 100644
--- a/libs/libmdbx/src/mdbx_chk.c
+++ b/libs/libmdbx/src/mdbx_chk.c
@@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
-#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4
+#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@@ -361,7 +361,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define likely(cond) __builtin_expect(!!(cond), 1)
# else
-# define likely(x) (x)
+# define likely(x) (!!(x))
# endif
#endif /* likely */
@@ -369,7 +369,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define unlikely(cond) __builtin_expect(!!(cond), 0)
# else
-# define unlikely(x) (x)
+# define unlikely(x) (!!(x))
# endif
#endif /* unlikely */
@@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* DOXYGEN */
-/** Enables support for huge write-transactions */
-#ifndef MDBX_HUGE_TRANSACTIONS
-#define MDBX_HUGE_TRANSACTIONS 0
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
/** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0
/** Using fsync() with chance of data lost on power failure */
@@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC)
#endif /* MDBX_TRUST_RTC */
+/** Controls online database auto-compactification during write-transactions. */
+#ifndef MDBX_ENABLE_REFUND
+#define MDBX_ENABLE_REFUND 1
+#endif
+#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
+#error MDBX_ENABLE_REFUND must be defined as 0 or 1
+#endif /* MDBX_ENABLE_REFUND */
+
+/** Controls sort order of internal page number lists.
+ * The database format depend on this option and libmdbx builded with different
+ * option value are incompatible. */
+#ifndef MDBX_PNL_ASCENDING
+#define MDBX_PNL_ASCENDING 0
+#endif
+#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
+#error MDBX_PNL_ASCENDING must be defined as 0 or 1
+#endif /* MDBX_PNL_ASCENDING */
+
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t {
typedef struct MDBX_meta {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
- uint64_t mm_magic_and_version;
+ uint32_t mm_magic_and_version[2];
/* txnid that committed this page, the first of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_a;
+ uint32_t mm_txnid_a[2];
uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */
uint8_t mm_validator_id; /* ID of checksum and page validation method,
@@ -1962,17 +1975,18 @@ typedef struct MDBX_meta {
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
-#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign)
- volatile uint64_t mm_datasync_sign;
+#define META_IS_STEADY(meta) \
+ SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
+ uint32_t mm_datasync_sign[2];
/* txnid that committed this page, the second of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_b;
+ uint32_t mm_txnid_b[2];
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
* DB was previously handled by libmdbx without corresponding feature.
* This value in couple with mr_snapshot_pages_retired allows fast estimation
* of "how much reader is restraining GC recycling". */
- uint64_t mm_pages_retired;
+ uint32_t mm_pages_retired[2];
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
* whether the system was rebooted after the last use of the database files.
@@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo {
(unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \
(unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
-#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
+#define MDBX_DATA_MAGIC \
+ ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
@@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo {
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_READERS_LIMIT \
((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader))
+#define MDBX_PGL_LIMIT MAX_PAGENO
#else
#define MDBX_READERS_LIMIT 1024
#define MAX_MAPSIZE MAX_MAPSIZE32
+#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */
/*----------------------------------------------------------------------------*/
-/* Two kind lists of pages (aka PNL) */
-/* An PNL is an Page Number List, a sorted array of IDs. The first element of
- * the array is a counter for how many actual page-numbers are in the list.
- * PNLs are sorted in descending order, this allow cut off a page with lowest
- * pgno (at the tail) just truncating the list */
-#define MDBX_PNL_ASCENDING 0
+/* An PNL is an Page Number List, a sorted array of IDs.
+ * The first element of the array is a counter for how many actual page-numbers
+ * are in the list. By default PNLs are sorted in descending order, this allow
+ * cut off a page with lowest pgno (at the tail) just truncating the list. The
+ * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */
typedef pgno_t *MDBX_PNL;
#if MDBX_PNL_ASCENDING
@@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL;
typedef txnid_t *MDBX_TXL;
/* An Dirty-Page list item is an pgno/pointer pair. */
-typedef union MDBX_DP {
- __anonymous_struct_extension__ struct {
- pgno_t pgno;
- MDBX_page *ptr;
- };
- __anonymous_struct_extension__ struct {
- unsigned sorted;
- unsigned length;
- };
-} MDBX_DP;
-
-/* An DPL (dirty-page list) is a sorted array of MDBX_DPs.
- * The first element's length member is a count of how many actual
- * elements are in the array. */
-typedef MDBX_DP *MDBX_DPL;
+typedef struct MDBX_dp {
+ pgno_t pgno;
+ MDBX_page *ptr;
+} MDBX_dp;
+
+/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
+typedef struct MDBX_dpl {
+ unsigned sorted;
+ unsigned length;
+ unsigned allocated;
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (!defined(__cplusplus) && defined(_MSC_VER))
+ MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
+#endif
+} MDBX_dpl;
/* PNL sizes */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#if MDBX_HUGE_TRANSACTIONS
-#define MDBX_PNL_MAX \
- ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
-#else
-#define MDBX_PNL_MAX \
- ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL \
(MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
@@ -2440,14 +2446,16 @@ struct MDBX_txn {
MDBX_cursor **cursors;
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
+#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
+#endif /* MDBX_ENABLE_REFUND */
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
- MDBX_DPL dirtylist;
+ MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
@@ -2457,26 +2465,19 @@ struct MDBX_txn {
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
- /* Number of retired to parent pages (tw.retired2parent_pages) */
- unsigned retired2parent_count;
- /* The list of parent's txn dirty pages that retired (became unused)
- * in this transaction, linked through `mp_next`. */
- MDBX_page *retired2parent_pages;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages;
+ unsigned spill_least_removed;
} tw;
};
};
-/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
- * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
- * raise this on a 64 bit machine. */
#if MDBX_WORDBITS >= 64
-#define CURSOR_STACK 28
+#define CURSOR_STACK 32
#else
-#define CURSOR_STACK 20
+#define CURSOR_STACK 24
#endif
struct MDBX_xcursor;
@@ -2606,11 +2607,9 @@ struct MDBX_env {
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
- MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
+ MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
- /* MDBX_DP of pages written during a write txn. */
- MDBX_DPL me_dirtylist;
/* Number of freelist items that can fit in a single overflow page */
unsigned me_maxgc_ov1page;
unsigned me_branch_nodemax; /* max size of a branch-node */
@@ -2623,6 +2622,17 @@ struct MDBX_env {
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
+ unsigned me_dp_reserve_len;
+ struct {
+ unsigned dp_reserve_limit;
+ unsigned rp_augment_limit;
+ unsigned dp_limit;
+ unsigned dp_initial;
+ uint8_t dp_loose_limit;
+ uint8_t spill_max_denominator;
+ uint8_t spill_min_denominator;
+ uint8_t spill_parent4child_denominator;
+ } me_options;
struct {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
@@ -3596,7 +3606,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key,
problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR,
data->iov_len);
size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0;
- if (number < 1 || number > MDBX_PNL_MAX)
+ if (number < 1 || number > MDBX_PGL_LIMIT)
problem_add("entry", txnid, "wrong idl length", "%" PRIuPTR, number);
else if ((number + 1) * sizeof(pgno_t) > data->iov_len) {
problem_add("entry", txnid, "trimmed idl",
@@ -4176,7 +4186,21 @@ int main(int argc, char *argv[]) {
if (argc < 2)
usage(prog);
- for (int i; (i = getopt(argc, argv, "012TVvqnwctdis:")) != EOF;) {
+ for (int i; (i = getopt(argc, argv,
+ "0"
+ "1"
+ "2"
+ "T"
+ "V"
+ "v"
+ "q"
+ "n"
+ "w"
+ "c"
+ "t"
+ "d"
+ "i"
+ "s:")) != EOF;) {
switch (i) {
case 'V':
printf("mdbx_chk version %d.%d.%d.%d\n"
@@ -4697,7 +4721,7 @@ int main(int argc, char *argv[]) {
walk.pgcount, alloc_pages - gc_pages);
}
if (unused_pages != gc_pages) {
- error("gc pages mismatch (%" PRIu64 "(walked) != %" PRIu64 "(GC))\n",
+ error("gc pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n",
unused_pages, gc_pages);
}
} else if (verbose) {
diff --git a/libs/libmdbx/src/mdbx_copy.c b/libs/libmdbx/src/mdbx_copy.c
index cde9923308..0c831b4528 100644
--- a/libs/libmdbx/src/mdbx_copy.c
+++ b/libs/libmdbx/src/mdbx_copy.c
@@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
-#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4
+#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@@ -361,7 +361,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define likely(cond) __builtin_expect(!!(cond), 1)
# else
-# define likely(x) (x)
+# define likely(x) (!!(x))
# endif
#endif /* likely */
@@ -369,7 +369,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define unlikely(cond) __builtin_expect(!!(cond), 0)
# else
-# define unlikely(x) (x)
+# define unlikely(x) (!!(x))
# endif
#endif /* unlikely */
@@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* DOXYGEN */
-/** Enables support for huge write-transactions */
-#ifndef MDBX_HUGE_TRANSACTIONS
-#define MDBX_HUGE_TRANSACTIONS 0
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
/** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0
/** Using fsync() with chance of data lost on power failure */
@@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC)
#endif /* MDBX_TRUST_RTC */
+/** Controls online database auto-compactification during write-transactions. */
+#ifndef MDBX_ENABLE_REFUND
+#define MDBX_ENABLE_REFUND 1
+#endif
+#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
+#error MDBX_ENABLE_REFUND must be defined as 0 or 1
+#endif /* MDBX_ENABLE_REFUND */
+
+/** Controls sort order of internal page number lists.
+ * The database format depend on this option and libmdbx builded with different
+ * option value are incompatible. */
+#ifndef MDBX_PNL_ASCENDING
+#define MDBX_PNL_ASCENDING 0
+#endif
+#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
+#error MDBX_PNL_ASCENDING must be defined as 0 or 1
+#endif /* MDBX_PNL_ASCENDING */
+
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t {
typedef struct MDBX_meta {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
- uint64_t mm_magic_and_version;
+ uint32_t mm_magic_and_version[2];
/* txnid that committed this page, the first of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_a;
+ uint32_t mm_txnid_a[2];
uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */
uint8_t mm_validator_id; /* ID of checksum and page validation method,
@@ -1962,17 +1975,18 @@ typedef struct MDBX_meta {
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
-#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign)
- volatile uint64_t mm_datasync_sign;
+#define META_IS_STEADY(meta) \
+ SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
+ uint32_t mm_datasync_sign[2];
/* txnid that committed this page, the second of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_b;
+ uint32_t mm_txnid_b[2];
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
* DB was previously handled by libmdbx without corresponding feature.
* This value in couple with mr_snapshot_pages_retired allows fast estimation
* of "how much reader is restraining GC recycling". */
- uint64_t mm_pages_retired;
+ uint32_t mm_pages_retired[2];
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
* whether the system was rebooted after the last use of the database files.
@@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo {
(unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \
(unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
-#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
+#define MDBX_DATA_MAGIC \
+ ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
@@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo {
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_READERS_LIMIT \
((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader))
+#define MDBX_PGL_LIMIT MAX_PAGENO
#else
#define MDBX_READERS_LIMIT 1024
#define MAX_MAPSIZE MAX_MAPSIZE32
+#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */
/*----------------------------------------------------------------------------*/
-/* Two kind lists of pages (aka PNL) */
-/* An PNL is an Page Number List, a sorted array of IDs. The first element of
- * the array is a counter for how many actual page-numbers are in the list.
- * PNLs are sorted in descending order, this allow cut off a page with lowest
- * pgno (at the tail) just truncating the list */
-#define MDBX_PNL_ASCENDING 0
+/* An PNL is an Page Number List, a sorted array of IDs.
+ * The first element of the array is a counter for how many actual page-numbers
+ * are in the list. By default PNLs are sorted in descending order, this allow
+ * cut off a page with lowest pgno (at the tail) just truncating the list. The
+ * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */
typedef pgno_t *MDBX_PNL;
#if MDBX_PNL_ASCENDING
@@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL;
typedef txnid_t *MDBX_TXL;
/* An Dirty-Page list item is an pgno/pointer pair. */
-typedef union MDBX_DP {
- __anonymous_struct_extension__ struct {
- pgno_t pgno;
- MDBX_page *ptr;
- };
- __anonymous_struct_extension__ struct {
- unsigned sorted;
- unsigned length;
- };
-} MDBX_DP;
-
-/* An DPL (dirty-page list) is a sorted array of MDBX_DPs.
- * The first element's length member is a count of how many actual
- * elements are in the array. */
-typedef MDBX_DP *MDBX_DPL;
+typedef struct MDBX_dp {
+ pgno_t pgno;
+ MDBX_page *ptr;
+} MDBX_dp;
+
+/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
+typedef struct MDBX_dpl {
+ unsigned sorted;
+ unsigned length;
+ unsigned allocated;
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (!defined(__cplusplus) && defined(_MSC_VER))
+ MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
+#endif
+} MDBX_dpl;
/* PNL sizes */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#if MDBX_HUGE_TRANSACTIONS
-#define MDBX_PNL_MAX \
- ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
-#else
-#define MDBX_PNL_MAX \
- ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL \
(MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
@@ -2440,14 +2446,16 @@ struct MDBX_txn {
MDBX_cursor **cursors;
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
+#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
+#endif /* MDBX_ENABLE_REFUND */
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
- MDBX_DPL dirtylist;
+ MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
@@ -2457,26 +2465,19 @@ struct MDBX_txn {
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
- /* Number of retired to parent pages (tw.retired2parent_pages) */
- unsigned retired2parent_count;
- /* The list of parent's txn dirty pages that retired (became unused)
- * in this transaction, linked through `mp_next`. */
- MDBX_page *retired2parent_pages;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages;
+ unsigned spill_least_removed;
} tw;
};
};
-/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
- * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
- * raise this on a 64 bit machine. */
#if MDBX_WORDBITS >= 64
-#define CURSOR_STACK 28
+#define CURSOR_STACK 32
#else
-#define CURSOR_STACK 20
+#define CURSOR_STACK 24
#endif
struct MDBX_xcursor;
@@ -2606,11 +2607,9 @@ struct MDBX_env {
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
- MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
+ MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
- /* MDBX_DP of pages written during a write txn. */
- MDBX_DPL me_dirtylist;
/* Number of freelist items that can fit in a single overflow page */
unsigned me_maxgc_ov1page;
unsigned me_branch_nodemax; /* max size of a branch-node */
@@ -2623,6 +2622,17 @@ struct MDBX_env {
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
+ unsigned me_dp_reserve_len;
+ struct {
+ unsigned dp_reserve_limit;
+ unsigned rp_augment_limit;
+ unsigned dp_limit;
+ unsigned dp_initial;
+ uint8_t dp_loose_limit;
+ uint8_t spill_max_denominator;
+ uint8_t spill_min_denominator;
+ uint8_t spill_parent4child_denominator;
+ } me_options;
struct {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
diff --git a/libs/libmdbx/src/mdbx_dump.c b/libs/libmdbx/src/mdbx_dump.c
index c51247a121..6b7a5aaa40 100644
--- a/libs/libmdbx/src/mdbx_dump.c
+++ b/libs/libmdbx/src/mdbx_dump.c
@@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
-#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4
+#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@@ -361,7 +361,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define likely(cond) __builtin_expect(!!(cond), 1)
# else
-# define likely(x) (x)
+# define likely(x) (!!(x))
# endif
#endif /* likely */
@@ -369,7 +369,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define unlikely(cond) __builtin_expect(!!(cond), 0)
# else
-# define unlikely(x) (x)
+# define unlikely(x) (!!(x))
# endif
#endif /* unlikely */
@@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* DOXYGEN */
-/** Enables support for huge write-transactions */
-#ifndef MDBX_HUGE_TRANSACTIONS
-#define MDBX_HUGE_TRANSACTIONS 0
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
/** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0
/** Using fsync() with chance of data lost on power failure */
@@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC)
#endif /* MDBX_TRUST_RTC */
+/** Controls online database auto-compactification during write-transactions. */
+#ifndef MDBX_ENABLE_REFUND
+#define MDBX_ENABLE_REFUND 1
+#endif
+#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
+#error MDBX_ENABLE_REFUND must be defined as 0 or 1
+#endif /* MDBX_ENABLE_REFUND */
+
+/** Controls sort order of internal page number lists.
+ * The database format depend on this option and libmdbx builded with different
+ * option value are incompatible. */
+#ifndef MDBX_PNL_ASCENDING
+#define MDBX_PNL_ASCENDING 0
+#endif
+#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
+#error MDBX_PNL_ASCENDING must be defined as 0 or 1
+#endif /* MDBX_PNL_ASCENDING */
+
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t {
typedef struct MDBX_meta {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
- uint64_t mm_magic_and_version;
+ uint32_t mm_magic_and_version[2];
/* txnid that committed this page, the first of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_a;
+ uint32_t mm_txnid_a[2];
uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */
uint8_t mm_validator_id; /* ID of checksum and page validation method,
@@ -1962,17 +1975,18 @@ typedef struct MDBX_meta {
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
-#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign)
- volatile uint64_t mm_datasync_sign;
+#define META_IS_STEADY(meta) \
+ SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
+ uint32_t mm_datasync_sign[2];
/* txnid that committed this page, the second of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_b;
+ uint32_t mm_txnid_b[2];
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
* DB was previously handled by libmdbx without corresponding feature.
* This value in couple with mr_snapshot_pages_retired allows fast estimation
* of "how much reader is restraining GC recycling". */
- uint64_t mm_pages_retired;
+ uint32_t mm_pages_retired[2];
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
* whether the system was rebooted after the last use of the database files.
@@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo {
(unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \
(unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
-#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
+#define MDBX_DATA_MAGIC \
+ ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
@@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo {
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_READERS_LIMIT \
((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader))
+#define MDBX_PGL_LIMIT MAX_PAGENO
#else
#define MDBX_READERS_LIMIT 1024
#define MAX_MAPSIZE MAX_MAPSIZE32
+#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */
/*----------------------------------------------------------------------------*/
-/* Two kind lists of pages (aka PNL) */
-/* An PNL is an Page Number List, a sorted array of IDs. The first element of
- * the array is a counter for how many actual page-numbers are in the list.
- * PNLs are sorted in descending order, this allow cut off a page with lowest
- * pgno (at the tail) just truncating the list */
-#define MDBX_PNL_ASCENDING 0
+/* An PNL is an Page Number List, a sorted array of IDs.
+ * The first element of the array is a counter for how many actual page-numbers
+ * are in the list. By default PNLs are sorted in descending order, this allow
+ * cut off a page with lowest pgno (at the tail) just truncating the list. The
+ * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */
typedef pgno_t *MDBX_PNL;
#if MDBX_PNL_ASCENDING
@@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL;
typedef txnid_t *MDBX_TXL;
/* An Dirty-Page list item is an pgno/pointer pair. */
-typedef union MDBX_DP {
- __anonymous_struct_extension__ struct {
- pgno_t pgno;
- MDBX_page *ptr;
- };
- __anonymous_struct_extension__ struct {
- unsigned sorted;
- unsigned length;
- };
-} MDBX_DP;
-
-/* An DPL (dirty-page list) is a sorted array of MDBX_DPs.
- * The first element's length member is a count of how many actual
- * elements are in the array. */
-typedef MDBX_DP *MDBX_DPL;
+typedef struct MDBX_dp {
+ pgno_t pgno;
+ MDBX_page *ptr;
+} MDBX_dp;
+
+/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
+typedef struct MDBX_dpl {
+ unsigned sorted;
+ unsigned length;
+ unsigned allocated;
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (!defined(__cplusplus) && defined(_MSC_VER))
+ MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
+#endif
+} MDBX_dpl;
/* PNL sizes */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#if MDBX_HUGE_TRANSACTIONS
-#define MDBX_PNL_MAX \
- ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
-#else
-#define MDBX_PNL_MAX \
- ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL \
(MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
@@ -2440,14 +2446,16 @@ struct MDBX_txn {
MDBX_cursor **cursors;
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
+#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
+#endif /* MDBX_ENABLE_REFUND */
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
- MDBX_DPL dirtylist;
+ MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
@@ -2457,26 +2465,19 @@ struct MDBX_txn {
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
- /* Number of retired to parent pages (tw.retired2parent_pages) */
- unsigned retired2parent_count;
- /* The list of parent's txn dirty pages that retired (became unused)
- * in this transaction, linked through `mp_next`. */
- MDBX_page *retired2parent_pages;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages;
+ unsigned spill_least_removed;
} tw;
};
};
-/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
- * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
- * raise this on a 64 bit machine. */
#if MDBX_WORDBITS >= 64
-#define CURSOR_STACK 28
+#define CURSOR_STACK 32
#else
-#define CURSOR_STACK 20
+#define CURSOR_STACK 24
#endif
struct MDBX_xcursor;
@@ -2606,11 +2607,9 @@ struct MDBX_env {
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
- MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
+ MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
- /* MDBX_DP of pages written during a write txn. */
- MDBX_DPL me_dirtylist;
/* Number of freelist items that can fit in a single overflow page */
unsigned me_maxgc_ov1page;
unsigned me_branch_nodemax; /* max size of a branch-node */
@@ -2623,6 +2622,17 @@ struct MDBX_env {
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
+ unsigned me_dp_reserve_len;
+ struct {
+ unsigned dp_reserve_limit;
+ unsigned rp_augment_limit;
+ unsigned dp_limit;
+ unsigned dp_initial;
+ uint8_t dp_loose_limit;
+ uint8_t spill_max_denominator;
+ uint8_t spill_min_denominator;
+ uint8_t spill_parent4child_denominator;
+ } me_options;
struct {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
@@ -3370,7 +3380,16 @@ int main(int argc, char *argv[]) {
if (argc < 2)
usage();
- while ((i = getopt(argc, argv, "af:lnps:Vrq")) != EOF) {
+ while ((i = getopt(argc, argv,
+ "a"
+ "f:"
+ "l"
+ "n"
+ "p"
+ "s:"
+ "V"
+ "r"
+ "q")) != EOF) {
switch (i) {
case 'V':
printf("mdbx_dump version %d.%d.%d.%d\n"
diff --git a/libs/libmdbx/src/mdbx_load.c b/libs/libmdbx/src/mdbx_load.c
index d1ac7e9238..ad9ddc90e6 100644
--- a/libs/libmdbx/src/mdbx_load.c
+++ b/libs/libmdbx/src/mdbx_load.c
@@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
-#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4
+#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@@ -361,7 +361,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define likely(cond) __builtin_expect(!!(cond), 1)
# else
-# define likely(x) (x)
+# define likely(x) (!!(x))
# endif
#endif /* likely */
@@ -369,7 +369,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define unlikely(cond) __builtin_expect(!!(cond), 0)
# else
-# define unlikely(x) (x)
+# define unlikely(x) (!!(x))
# endif
#endif /* unlikely */
@@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* DOXYGEN */
-/** Enables support for huge write-transactions */
-#ifndef MDBX_HUGE_TRANSACTIONS
-#define MDBX_HUGE_TRANSACTIONS 0
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
/** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0
/** Using fsync() with chance of data lost on power failure */
@@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC)
#endif /* MDBX_TRUST_RTC */
+/** Controls online database auto-compactification during write-transactions. */
+#ifndef MDBX_ENABLE_REFUND
+#define MDBX_ENABLE_REFUND 1
+#endif
+#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
+#error MDBX_ENABLE_REFUND must be defined as 0 or 1
+#endif /* MDBX_ENABLE_REFUND */
+
+/** Controls sort order of internal page number lists.
+ * The database format depend on this option and libmdbx builded with different
+ * option value are incompatible. */
+#ifndef MDBX_PNL_ASCENDING
+#define MDBX_PNL_ASCENDING 0
+#endif
+#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
+#error MDBX_PNL_ASCENDING must be defined as 0 or 1
+#endif /* MDBX_PNL_ASCENDING */
+
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t {
typedef struct MDBX_meta {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
- uint64_t mm_magic_and_version;
+ uint32_t mm_magic_and_version[2];
/* txnid that committed this page, the first of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_a;
+ uint32_t mm_txnid_a[2];
uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */
uint8_t mm_validator_id; /* ID of checksum and page validation method,
@@ -1962,17 +1975,18 @@ typedef struct MDBX_meta {
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
-#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign)
- volatile uint64_t mm_datasync_sign;
+#define META_IS_STEADY(meta) \
+ SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
+ uint32_t mm_datasync_sign[2];
/* txnid that committed this page, the second of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_b;
+ uint32_t mm_txnid_b[2];
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
* DB was previously handled by libmdbx without corresponding feature.
* This value in couple with mr_snapshot_pages_retired allows fast estimation
* of "how much reader is restraining GC recycling". */
- uint64_t mm_pages_retired;
+ uint32_t mm_pages_retired[2];
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
* whether the system was rebooted after the last use of the database files.
@@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo {
(unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \
(unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
-#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
+#define MDBX_DATA_MAGIC \
+ ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
@@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo {
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_READERS_LIMIT \
((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader))
+#define MDBX_PGL_LIMIT MAX_PAGENO
#else
#define MDBX_READERS_LIMIT 1024
#define MAX_MAPSIZE MAX_MAPSIZE32
+#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */
/*----------------------------------------------------------------------------*/
-/* Two kind lists of pages (aka PNL) */
-/* An PNL is an Page Number List, a sorted array of IDs. The first element of
- * the array is a counter for how many actual page-numbers are in the list.
- * PNLs are sorted in descending order, this allow cut off a page with lowest
- * pgno (at the tail) just truncating the list */
-#define MDBX_PNL_ASCENDING 0
+/* An PNL is an Page Number List, a sorted array of IDs.
+ * The first element of the array is a counter for how many actual page-numbers
+ * are in the list. By default PNLs are sorted in descending order, this allow
+ * cut off a page with lowest pgno (at the tail) just truncating the list. The
+ * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. */
typedef pgno_t *MDBX_PNL;
#if MDBX_PNL_ASCENDING
@@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL;
typedef txnid_t *MDBX_TXL;
/* An Dirty-Page list item is an pgno/pointer pair. */
-typedef union MDBX_DP {
- __anonymous_struct_extension__ struct {
- pgno_t pgno;
- MDBX_page *ptr;
- };
- __anonymous_struct_extension__ struct {
- unsigned sorted;
- unsigned length;
- };
-} MDBX_DP;
-
-/* An DPL (dirty-page list) is a sorted array of MDBX_DPs.
- * The first element's length member is a count of how many actual
- * elements are in the array. */
-typedef MDBX_DP *MDBX_DPL;
+typedef struct MDBX_dp {
+ pgno_t pgno;
+ MDBX_page *ptr;
+} MDBX_dp;
+
+/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */
+typedef struct MDBX_dpl {
+ unsigned sorted;
+ unsigned length;
+ unsigned allocated;
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (!defined(__cplusplus) && defined(_MSC_VER))
+ MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
+#endif
+} MDBX_dpl;
/* PNL sizes */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#if MDBX_HUGE_TRANSACTIONS
-#define MDBX_PNL_MAX \
- ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
-#else
-#define MDBX_PNL_MAX \
- ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL \
(MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
@@ -2440,14 +2446,16 @@ struct MDBX_txn {
MDBX_cursor **cursors;
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
+#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
+#endif /* MDBX_ENABLE_REFUND */
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
- MDBX_DPL dirtylist;
+ MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
@@ -2457,26 +2465,19 @@ struct MDBX_txn {
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
- /* Number of retired to parent pages (tw.retired2parent_pages) */
- unsigned retired2parent_count;
- /* The list of parent's txn dirty pages that retired (became unused)
- * in this transaction, linked through `mp_next`. */
- MDBX_page *retired2parent_pages;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages;
+ unsigned spill_least_removed;
} tw;
};
};
-/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
- * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
- * raise this on a 64 bit machine. */
#if MDBX_WORDBITS >= 64
-#define CURSOR_STACK 28
+#define CURSOR_STACK 32
#else
-#define CURSOR_STACK 20
+#define CURSOR_STACK 24
#endif
struct MDBX_xcursor;
@@ -2606,11 +2607,9 @@ struct MDBX_env {
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
- MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
+ MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
- /* MDBX_DP of pages written during a write txn. */
- MDBX_DPL me_dirtylist;
/* Number of freelist items that can fit in a single overflow page */
unsigned me_maxgc_ov1page;
unsigned me_branch_nodemax; /* max size of a branch-node */
@@ -2623,6 +2622,17 @@ struct MDBX_env {
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
+ unsigned me_dp_reserve_len;
+ struct {
+ unsigned dp_reserve_limit;
+ unsigned rp_augment_limit;
+ unsigned dp_limit;
+ unsigned dp_initial;
+ uint8_t dp_loose_limit;
+ uint8_t spill_max_denominator;
+ uint8_t spill_min_denominator;
+ uint8_t spill_parent4child_denominator;
+ } me_options;
struct {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
@@ -3566,16 +3576,17 @@ static int readline(MDBX_val *out, MDBX_val *buf) {
static void usage(void) {
fprintf(stderr,
- "usage: %s [-V] [-q] [-a] [-f file] [-s name] [-N] [-T] [-r] [-n]"
- " dbpath\n"
+ "usage: %s "
+ "[-V] [-q] [-a] [-f file] [-s name] [-N] [-p] [-T] [-r] [-n] dbpath\n"
" -V\t\tprint version and exit\n"
" -q\t\tbe quiet\n"
" -a\t\tappend records in input order (required for custom "
"comparators)\n"
" -f file\tread from file instead of stdin\n"
" -s name\tload into named subDB\n"
- " -N\t\tdon't overwrite existing records when loading (), just skip "
- "them\n"
+ " -N\t\tdon't overwrite existing records when loading, just skip "
+ "ones\n"
+ " -p\t\tpurge subDB before loading\n"
" -T\t\tread plaintext\n"
" -r\t\trescue mode (ignore errors to load corrupted DB dump)\n"
" -n\t\tdon't use subdirectory for newly created database "
@@ -3598,15 +3609,26 @@ int main(int argc, char *argv[]) {
MDBX_cursor *mc = nullptr;
MDBX_dbi dbi;
char *envname = nullptr;
- int envflags = MDBX_UTTERLY_NOSYNC, putflags = MDBX_UPSERT;
+ int envflags = MDBX_SAFE_NOSYNC | MDBX_ACCEDE, putflags = MDBX_UPSERT;
bool quiet = false;
bool rescue = false;
+ bool purge = false;
prog = argv[0];
if (argc < 2)
usage();
- while ((i = getopt(argc, argv, "af:ns:NTVrq")) != EOF) {
+ while ((i = getopt(argc, argv,
+ "a"
+ "f:"
+ "n"
+ "s:"
+ "N"
+ "p"
+ "T"
+ "V"
+ "r"
+ "q")) != EOF) {
switch (i) {
case 'V':
printf("mdbx_load version %d.%d.%d.%d\n"
@@ -3641,6 +3663,9 @@ int main(int argc, char *argv[]) {
case 'N':
putflags |= MDBX_NOOVERWRITE | MDBX_NODUPDATA;
break;
+ case 'p':
+ purge = true;
+ break;
case 'T':
mode |= NOHDR | PRINT;
break;
@@ -3814,6 +3839,14 @@ int main(int argc, char *argv[]) {
}
}
+ if (purge) {
+ rc = mdbx_drop(txn, dbi, false);
+ if (unlikely(rc != MDBX_SUCCESS)) {
+ error("mdbx_drop", rc);
+ goto txn_abort;
+ }
+ }
+
if (putflags & MDBX_APPEND)
putflags = (dbi_flags & MDBX_DUPSORT) ? putflags | MDBX_APPENDDUP
: putflags & ~MDBX_APPENDDUP;
diff --git a/libs/libmdbx/src/mdbx_stat.c b/libs/libmdbx/src/mdbx_stat.c
index aa5d501728..d19c672ff0 100644
--- a/libs/libmdbx/src/mdbx_stat.c
+++ b/libs/libmdbx/src/mdbx_stat.c
@@ -34,7 +34,7 @@
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. */
-#define MDBX_BUILD_SOURCERY b30bc0044d83cd1275fa00662c8265e39091a931353b79a46d21c9536795acb2_v0_9_2_12_g3e7459b4
+#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12
#ifdef MDBX_CONFIG_H
#include MDBX_CONFIG_H
#endif
@@ -361,7 +361,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define likely(cond) __builtin_expect(!!(cond), 1)
# else
-# define likely(x) (x)
+# define likely(x) (!!(x))
# endif
#endif /* likely */
@@ -369,7 +369,7 @@
# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__)
# define unlikely(cond) __builtin_expect(!!(cond), 0)
# else
-# define unlikely(x) (x)
+# define unlikely(x) (!!(x))
# endif
#endif /* unlikely */
@@ -1564,11 +1564,6 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif /* DOXYGEN */
-/** Enables support for huge write-transactions */
-#ifndef MDBX_HUGE_TRANSACTIONS
-#define MDBX_HUGE_TRANSACTIONS 0
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
/** Using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_DURABILITY 0
/** Using fsync() with chance of data lost on power failure */
@@ -1618,6 +1613,24 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC)
#endif /* MDBX_TRUST_RTC */
+/** Controls online database auto-compactification during write-transactions. */
+#ifndef MDBX_ENABLE_REFUND
+#define MDBX_ENABLE_REFUND 1
+#endif
+#if !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
+#error MDBX_ENABLE_REFUND must be defined as 0 or 1
+#endif /* MDBX_ENABLE_REFUND */
+
+/** Controls sort order of internal page number lists.
+ * The database format depends on this option, and libmdbx builds with different
+ * option values are incompatible. */
+#ifndef MDBX_PNL_ASCENDING
+#define MDBX_PNL_ASCENDING 0
+#endif
+#if !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
+#error MDBX_PNL_ASCENDING must be defined as 0 or 1
+#endif /* MDBX_PNL_ASCENDING */
+
//------------------------------------------------------------------------------
/** Win32 File Locking API for \ref MDBX_LOCKING */
@@ -1939,10 +1952,10 @@ typedef struct mdbx_geo_t {
typedef struct MDBX_meta {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
- uint64_t mm_magic_and_version;
+ uint32_t mm_magic_and_version[2];
/* txnid that committed this page, the first of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_a;
+ uint32_t mm_txnid_a[2];
uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */
uint8_t mm_validator_id; /* ID of checksum and page validation method,
@@ -1962,17 +1975,18 @@ typedef struct MDBX_meta {
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
-#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign)
- volatile uint64_t mm_datasync_sign;
+#define META_IS_STEADY(meta) \
+ SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
+ uint32_t mm_datasync_sign[2];
/* txnid that committed this page, the second of a two-phase-update pair */
- mdbx_safe64_t mm_txnid_b;
+ uint32_t mm_txnid_b[2];
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
* DB was previously handled by libmdbx without corresponding feature.
 * This value, together with mr_snapshot_pages_retired, allows fast estimation
* of "how much reader is restraining GC recycling". */
- uint64_t mm_pages_retired;
+ uint32_t mm_pages_retired[2];
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
* whether the system was rebooted after the last use of the database files.
@@ -2221,7 +2235,8 @@ typedef struct MDBX_lockinfo {
(unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \
(unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
-#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
+#define MDBX_DATA_MAGIC \
+ ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
@@ -2259,19 +2274,20 @@ typedef struct MDBX_lockinfo {
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_READERS_LIMIT \
((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader))
+#define MDBX_PGL_LIMIT MAX_PAGENO
#else
#define MDBX_READERS_LIMIT 1024
#define MAX_MAPSIZE MAX_MAPSIZE32
+#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
#endif /* MDBX_WORDBITS */
/*----------------------------------------------------------------------------*/
-/* Two kind lists of pages (aka PNL) */
-/* An PNL is an Page Number List, a sorted array of IDs. The first element of
- * the array is a counter for how many actual page-numbers are in the list.
- * PNLs are sorted in descending order, this allow cut off a page with lowest
- * pgno (at the tail) just truncating the list */
-#define MDBX_PNL_ASCENDING 0
+/* A PNL is a Page Number List, a sorted array of IDs.
+ * The first element of the array is a counter for how many actual page-numbers
+ * are in the list. By default PNLs are sorted in descending order; this allows
+ * cutting off the page with the lowest pgno (at the tail) by just truncating
+ * the list. The sort order of PNLs is controlled by the MDBX_PNL_ASCENDING
+ * build option. */
typedef pgno_t *MDBX_PNL;
#if MDBX_PNL_ASCENDING
@@ -2286,37 +2302,27 @@ typedef pgno_t *MDBX_PNL;
typedef txnid_t *MDBX_TXL;
/* A Dirty-Page list item is a pgno/pointer pair. */
-typedef union MDBX_DP {
- __anonymous_struct_extension__ struct {
- pgno_t pgno;
- MDBX_page *ptr;
- };
- __anonymous_struct_extension__ struct {
- unsigned sorted;
- unsigned length;
- };
-} MDBX_DP;
-
-/* An DPL (dirty-page list) is a sorted array of MDBX_DPs.
- * The first element's length member is a count of how many actual
- * elements are in the array. */
-typedef MDBX_DP *MDBX_DPL;
+typedef struct MDBX_dp {
+ pgno_t pgno;
+ MDBX_page *ptr;
+} MDBX_dp;
+
+/* A DPL (dirty-page list) is a sorted array of MDBX_DPs. */
+typedef struct MDBX_dpl {
+ unsigned sorted;
+ unsigned length;
+ unsigned allocated;
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (!defined(__cplusplus) && defined(_MSC_VER))
+ MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
+#endif
+} MDBX_dpl;
/* PNL sizes */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL \
(MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#if MDBX_HUGE_TRANSACTIONS
-#define MDBX_PNL_MAX \
- ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 2)
-#else
-#define MDBX_PNL_MAX \
- ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
-#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)
-#endif /* MDBX_HUGE_TRANSACTIONS */
-
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL \
(MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
@@ -2440,14 +2446,16 @@ struct MDBX_txn {
MDBX_cursor **cursors;
pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
txnid_t last_reclaimed; /* ID of last used record */
+#if MDBX_ENABLE_REFUND
pgno_t loose_refund_wl /* FIXME: describe */;
+#endif /* MDBX_ENABLE_REFUND */
/* dirtylist room: Dirty array size - dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
unsigned dirtyroom;
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
- MDBX_DPL dirtylist;
+ MDBX_dpl *dirtylist;
/* The list of reclaimed txns from GC */
MDBX_TXL lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
@@ -2457,26 +2465,19 @@ struct MDBX_txn {
MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */
unsigned loose_count;
- /* Number of retired to parent pages (tw.retired2parent_pages) */
- unsigned retired2parent_count;
- /* The list of parent's txn dirty pages that retired (became unused)
- * in this transaction, linked through `mp_next`. */
- MDBX_page *retired2parent_pages;
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages;
+ unsigned spill_least_removed;
} tw;
};
};
-/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
- * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
- * raise this on a 64 bit machine. */
#if MDBX_WORDBITS >= 64
-#define CURSOR_STACK 28
+#define CURSOR_STACK 32
#else
-#define CURSOR_STACK 20
+#define CURSOR_STACK 24
#endif
struct MDBX_xcursor;
@@ -2606,11 +2607,9 @@ struct MDBX_env {
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
- MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
+ MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
- /* MDBX_DP of pages written during a write txn. */
- MDBX_DPL me_dirtylist;
/* Number of freelist items that can fit in a single overflow page */
unsigned me_maxgc_ov1page;
unsigned me_branch_nodemax; /* max size of a branch-node */
@@ -2623,6 +2622,17 @@ struct MDBX_env {
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
+ unsigned me_dp_reserve_len;
+ struct {
+ unsigned dp_reserve_limit;
+ unsigned rp_augment_limit;
+ unsigned dp_limit;
+ unsigned dp_initial;
+ uint8_t dp_loose_limit;
+ uint8_t spill_max_denominator;
+ uint8_t spill_min_denominator;
+ uint8_t spill_parent4child_denominator;
+ } me_options;
struct {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
@@ -3225,7 +3235,14 @@ int main(int argc, char *argv[]) {
if (argc < 2)
usage(prog);
- while ((o = getopt(argc, argv, "Vaefnrs:")) != EOF) {
+ while ((o = getopt(argc, argv,
+ "V"
+ "a"
+ "e"
+ "f"
+ "n"
+ "r"
+ "s:")) != EOF) {
switch (o) {
case 'V':
printf("mdbx_stat version %d.%d.%d.%d\n"