diff options
author | George Hazan <ghazan@miranda.im> | 2018-08-15 12:26:44 +0300 |
---|---|---|
committer | George Hazan <ghazan@miranda.im> | 2018-08-15 12:26:44 +0300 |
commit | 5c70a399598e4b0665fc194f1d6c5b3c2cae9972 (patch) | |
tree | 470829d90edf3827d89012727d2bf5df60d44cfc /libs/libmdbx | |
parent | 61b92178e79ffc114a80bab4f057eb6f4c3f967f (diff) |
merge with libmdbx
Diffstat (limited to 'libs/libmdbx')
-rw-r--r-- | libs/libmdbx/src/src/bits.h | 5 | ||||
-rw-r--r-- | libs/libmdbx/src/src/defs.h | 7 | ||||
-rw-r--r-- | libs/libmdbx/src/src/lck-posix.c | 26 | ||||
-rw-r--r-- | libs/libmdbx/src/src/lck-windows.c | 45 | ||||
-rw-r--r-- | libs/libmdbx/src/src/mdbx.c | 605 | ||||
-rw-r--r-- | libs/libmdbx/src/src/osal.h | 1 | ||||
-rw-r--r-- | libs/libmdbx/src/src/tools/mdbx_chk.c | 2 |
7 files changed, 379 insertions, 312 deletions
diff --git a/libs/libmdbx/src/src/bits.h b/libs/libmdbx/src/src/bits.h index 955a583264..fca28e25e5 100644 --- a/libs/libmdbx/src/src/bits.h +++ b/libs/libmdbx/src/src/bits.h @@ -514,6 +514,7 @@ typedef MDBX_ID2 *MDBX_ID2L; #define MDBX_PNL_DB_MAX (MDBX_PNL_DB_SIZE - 1) #define MDBX_PNL_UM_MAX (MDBX_PNL_UM_SIZE - 1) +#define MDBX_PNL_MAX (MAX_PAGENO / 4) #define MDBX_PNL_SIZEOF(pl) (((pl)[0] + 1) * sizeof(pgno_t)) #define MDBX_PNL_IS_ZERO(pl) ((pl)[0] == 0) @@ -759,8 +760,8 @@ struct MDBX_env { MDBX_PNL me_free_pgs; /* ID2L of pages written during a write txn. Length MDBX_PNL_UM_SIZE. */ MDBX_ID2L me_dirtylist; - /* Max number of freelist items that can fit in a single overflow page */ - unsigned me_maxfree_1pg; + /* Number of freelist items that can fit in a single overflow page */ + unsigned me_maxgc_ov1page; /* Max size of a node on a page */ unsigned me_nodemax; unsigned me_maxkey_limit; /* max size of a key */ diff --git a/libs/libmdbx/src/src/defs.h b/libs/libmdbx/src/src/defs.h index b6076cc1b3..4b045efc1d 100644 --- a/libs/libmdbx/src/src/defs.h +++ b/libs/libmdbx/src/src/defs.h @@ -327,6 +327,13 @@ # define mdbx_func_ "<mdbx_unknown>" #endif +#if defined(__GNUC__) || __has_attribute(format) +#define __printf_args(format_index, first_arg) \ + __attribute__((format(printf, format_index, first_arg))) +#else +#define __printf_args(format_index, first_arg) +#endif + /*----------------------------------------------------------------------------*/ #if defined(USE_VALGRIND) diff --git a/libs/libmdbx/src/src/lck-posix.c b/libs/libmdbx/src/src/lck-posix.c index 869b98c054..0aa9d85078 100644 --- a/libs/libmdbx/src/src/lck-posix.c +++ b/libs/libmdbx/src/src/lck-posix.c @@ -48,7 +48,7 @@ static __cold __attribute__((destructor)) void mdbx_global_destructor(void) { #endif #define LCK_WHOLE OFF_T_MAX -static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset, +static int mdbx_lck_op(mdbx_filehandle_t fd, int op, short lck, off_t offset, off_t len) { for (;;) { int rc; @@ -68,11 +68,19 @@ static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset, } } -static __inline int mdbx_lck_exclusive(int lfd) { +static __inline int mdbx_lck_exclusive(int lfd, bool fallback2shared) { assert(lfd != INVALID_HANDLE_VALUE); if (flock(lfd, LOCK_EX | LOCK_NB)) return errno; - return mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); + int rc = mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); + if (rc != 0 && fallback2shared) { + while (flock(lfd, LOCK_SH)) { + int rc = errno; + if (rc != EINTR) + return rc; + } + } + return rc; } static __inline int mdbx_lck_shared(int lfd) { @@ -90,8 +98,6 @@ int mdbx_lck_downgrade(MDBX_env *env, bool complete) { return complete ? mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS; } -int mdbx_lck_upgrade(MDBX_env *env) { return mdbx_lck_exclusive(env->me_lfd); } - int mdbx_rpid_set(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid, 1); @@ -150,6 +156,10 @@ int __cold mdbx_lck_init(MDBX_env *env) { goto bailout; #endif /* PTHREAD_PRIO_INHERIT */ + rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (rc) + goto bailout; + rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma); if (rc) goto bailout; @@ -163,7 +173,7 @@ bailout: void __cold mdbx_lck_destroy(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* try get exclusive access */ - if (env->me_lck && mdbx_lck_exclusive(env->me_lfd) == 0) { + if (env->me_lck && mdbx_lck_exclusive(env->me_lfd, false) == 0) { mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_); int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); if (rc == 0) @@ -232,7 +242,7 @@ static int __cold internal_seize_lck(int lfd) { assert(lfd != INVALID_HANDLE_VALUE); /* try exclusive access */ - int rc = mdbx_lck_exclusive(lfd); + int rc = mdbx_lck_exclusive(lfd, false); if (rc == 0) /* got exclusive */ return MDBX_RESULT_TRUE; @@ -241,7 +251,7 @@ static int __cold internal_seize_lck(int lfd) { rc = mdbx_lck_shared(lfd); if (rc == 0) { /* got shared, try exclusive again */ - rc = mdbx_lck_exclusive(lfd); + rc = mdbx_lck_exclusive(lfd, true); if (rc == 0) /* now got exclusive */ return MDBX_RESULT_TRUE; diff --git a/libs/libmdbx/src/src/lck-windows.c b/libs/libmdbx/src/src/lck-windows.c index 02b074e9fc..7da0755916 100644 --- a/libs/libmdbx/src/src/lck-windows.c +++ b/libs/libmdbx/src/src/lck-windows.c @@ -457,51 +457,6 @@ int mdbx_lck_downgrade(MDBX_env *env, bool complete) { return MDBX_SUCCESS /* 7) now at S-? (used), done */; } -int mdbx_lck_upgrade(MDBX_env *env) { - /* Transite from locked state (S-E) to exclusive-write (E-E) */ - assert(env->me_fd != INVALID_HANDLE_VALUE); - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert((env->me_flags & MDBX_EXCLUSIVE) == 0); - - if (env->me_flags & MDBX_EXCLUSIVE) - return MDBX_RESULT_TRUE /* files were must be opened non-shareable */; - - /* 1) must be at S-E (locked), transite to ?_E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "S-E(locked) >> ?-E(middle)", GetLastError()); - - /* 3) now on ?-E (middle), try E-E (exclusive-write) */ - mdbx_jitter4testing(false); - if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) - return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive-write), done */ - - /* 5) still on ?-E (middle) */ - int rc = GetLastError(); - mdbx_jitter4testing(false); - if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { - /* 6) something went wrong, report but continue */ - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, - "?-E(middle) >> E-E(exclusive-write)", rc); - } - - /* 7) still on ?-E (middle), try restore S-E (locked) */ - mdbx_jitter4testing(false); - rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) - ? MDBX_RESULT_FALSE - : GetLastError(); - - mdbx_jitter4testing(false); - if (rc != MDBX_RESULT_FALSE) { - mdbx_fatal("%s(%s) failed: errcode %u", mdbx_func_, - "?-E(middle) >> S-E(locked)", rc); - return rc; - } - - /* 8) now on S-E (locked) */ - return MDBX_RESULT_FALSE; -} - void mdbx_lck_destroy(MDBX_env *env) { int rc; diff --git a/libs/libmdbx/src/src/mdbx.c b/libs/libmdbx/src/src/mdbx.c index 57d6ec1928..f5b864595d 100644 --- a/libs/libmdbx/src/src/mdbx.c +++ b/libs/libmdbx/src/src/mdbx.c @@ -501,6 +501,7 @@ __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { * Allocates memory for an PNL of the given size. * Returns PNL on success, NULL on failure. */ static MDBX_PNL mdbx_pnl_alloc(size_t size) { + assert(size <= MDBX_PNL_MAX); MDBX_PNL pl = malloc((size + 2) * sizeof(pgno_t)); if (likely(pl)) { *pl++ = (pgno_t)size; @@ -536,12 +537,15 @@ static void mdbx_txl_free(MDBX_TXL list) { /* Append ID to PNL. The PNL must be big enough. */ static __inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t id) { - assert(pl[0] + (size_t)1 < MDBX_PNL_ALLOCLEN(pl)); + assert(pl[0] + (size_t)1 <= MDBX_PNL_ALLOCLEN(pl)); pl[pl[0] += 1] = id; } -static bool mdbx_pnl_check(MDBX_PNL pl) { +static bool mdbx_pnl_check(MDBX_PNL pl, bool allocated) { if (pl) { + if (allocated) { + assert(pl[0] <= MDBX_PNL_MAX && pl[0] <= pl[-1]); + } for (const pgno_t *ptr = pl + pl[0]; --ptr > pl;) { assert(MDBX_PNL_ORDERED(ptr[0], ptr[1])); assert(ptr[0] >= NUM_METAS); @@ -629,7 +633,7 @@ static void __hot mdbx_pnl_sort(MDBX_PNL pnl) { } #undef PNL_SMALL #undef PNL_SWAP - assert(mdbx_pnl_check(pnl)); + assert(mdbx_pnl_check(pnl, false)); } /* Search for an ID in an PNL. @@ -637,7 +641,7 @@ static void __hot mdbx_pnl_sort(MDBX_PNL pnl) { * [in] id The ID to search for. * Returns The index of the first ID greater than or equal to id. */ static unsigned __hot mdbx_pnl_search(MDBX_PNL pnl, pgno_t id) { - assert(mdbx_pnl_check(pnl)); + assert(mdbx_pnl_check(pnl, true)); /* binary search of id in pl * if found, returns position of id @@ -687,18 +691,23 @@ static void mdbx_pnl_shrink(MDBX_PNL *ppl) { /* Grow an PNL. * Return the PNL to the size growed by given number. * [in,out] ppl Address of the PNL to grow. */ -static int mdbx_pnl_grow(MDBX_PNL *ppl, size_t num) { +static int __must_check_result mdbx_pnl_grow(MDBX_PNL *ppl, size_t num) { MDBX_PNL idn = *ppl - 1; + assert(idn[0] <= MDBX_PNL_MAX && idn[0] <= idn[-1]); + assert(num <= MDBX_PNL_MAX); + num += *idn; + if (unlikely(num > MDBX_PNL_MAX)) + return MDBX_TXN_FULL; /* grow it */ - idn = realloc(idn, (*idn + num + 2) * sizeof(pgno_t)); + idn = realloc(idn, (num + 2) * sizeof(pgno_t)); if (unlikely(!idn)) return MDBX_ENOMEM; *idn++ += (pgno_t)num; *ppl = idn; - return 0; + return MDBX_SUCCESS; } -static int mdbx_txl_grow(MDBX_TXL *ptr, size_t num) { +static int __must_check_result mdbx_txl_grow(MDBX_TXL *ptr, size_t num) { MDBX_TXL list = *ptr - 1; /* grow it */ list = realloc(list, ((size_t)*list + num + 2) * sizeof(txnid_t)); @@ -706,85 +715,96 @@ static int mdbx_txl_grow(MDBX_TXL *ptr, size_t num) { return MDBX_ENOMEM; *list++ += num; *ptr = list; - return 0; + return MDBX_SUCCESS; } /* Make room for num additional elements in an PNL. * [in,out] ppl Address of the PNL. * [in] num Number of elements to make room for. * Returns 0 on success, MDBX_ENOMEM on failure. */ -static int mdbx_pnl_need(MDBX_PNL *ppl, size_t num) { +static int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl, size_t num) { MDBX_PNL pl = *ppl; + assert(pl[0] <= MDBX_PNL_MAX && pl[0] <= pl[-1]); + assert(num <= MDBX_PNL_MAX); num += pl[0]; if (unlikely(num > pl[-1])) { - num = (num + num / 4 + (256 + 2)) & -256; + if (unlikely(num > MDBX_PNL_MAX)) + return MDBX_TXN_FULL; + num = (num + num / 4 + (256 + 2)) & ~255u; + num = (num < MDBX_PNL_MAX + 2) ? num : MDBX_PNL_MAX + 2; pl = realloc(pl - 1, num * sizeof(pgno_t)); if (unlikely(!pl)) return MDBX_ENOMEM; *pl++ = (pgno_t)num - 2; *ppl = pl; } - return 0; + return MDBX_SUCCESS; } /* Append an ID onto an PNL. * [in,out] ppl Address of the PNL to append to. * [in] id The ID to append. * Returns 0 on success, MDBX_ENOMEM if the PNL is too large. */ -static int mdbx_pnl_append(MDBX_PNL *ppl, pgno_t id) { +static int __must_check_result mdbx_pnl_append(MDBX_PNL *ppl, pgno_t id) { MDBX_PNL pl = *ppl; /* Too big? */ if (unlikely(pl[0] >= pl[-1])) { - if (mdbx_pnl_grow(ppl, MDBX_PNL_UM_MAX)) - return MDBX_ENOMEM; + int rc = mdbx_pnl_grow(ppl, MDBX_PNL_UM_MAX); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; pl = *ppl; } pl[0]++; pl[pl[0]] = id; - return 0; + return MDBX_SUCCESS; } -static int mdbx_txl_append(MDBX_TXL *ptr, txnid_t id) { +static int __must_check_result mdbx_txl_append(MDBX_TXL *ptr, txnid_t id) { MDBX_TXL list = *ptr; /* Too big? */ if (unlikely(list[0] >= list[-1])) { - if (mdbx_txl_grow(ptr, (size_t)list[0])) - return MDBX_ENOMEM; + int rc = mdbx_txl_grow(ptr, (size_t)list[0]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; list = *ptr; } list[0]++; list[list[0]] = id; - return 0; + return MDBX_SUCCESS; } /* Append an PNL onto an PNL. * [in,out] ppl Address of the PNL to append to. * [in] app The PNL to append. * Returns 0 on success, MDBX_ENOMEM if the PNL is too large. */ -static int mdbx_pnl_append_list(MDBX_PNL *ppl, MDBX_PNL app) { +static int __must_check_result mdbx_pnl_append_list(MDBX_PNL *ppl, + MDBX_PNL app) { MDBX_PNL pnl = *ppl; /* Too big? */ if (unlikely(pnl[0] + app[0] >= pnl[-1])) { - if (mdbx_pnl_grow(ppl, app[0])) - return MDBX_ENOMEM; + int rc = mdbx_pnl_grow(ppl, app[0]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; pnl = *ppl; } memcpy(&pnl[pnl[0] + 1], &app[1], app[0] * sizeof(pgno_t)); pnl[0] += app[0]; - return 0; + return MDBX_SUCCESS; } -static int mdbx_txl_append_list(MDBX_TXL *ptr, MDBX_TXL append) { +static int __must_check_result mdbx_txl_append_list(MDBX_TXL *ptr, + MDBX_TXL append) { MDBX_TXL list = *ptr; /* Too big? */ if (unlikely(list[0] + append[0] >= list[-1])) { - if (mdbx_txl_grow(ptr, (size_t)append[0])) - return MDBX_ENOMEM; + int rc = mdbx_txl_grow(ptr, (size_t)append[0]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; list = *ptr; } memcpy(&list[list[0] + 1], &append[1], (size_t)append[0] * sizeof(txnid_t)); list[0] += append[0]; - return 0; + return MDBX_SUCCESS; } /* Append an ID range onto an PNL. @@ -792,27 +812,29 @@ static int mdbx_txl_append_list(MDBX_TXL *ptr, MDBX_TXL append) { * [in] id The lowest ID to append. * [in] n Number of IDs to append. * Returns 0 on success, MDBX_ENOMEM if the PNL is too large. */ -static int mdbx_pnl_append_range(MDBX_PNL *ppl, pgno_t id, size_t n) { +static int __must_check_result mdbx_pnl_append_range(MDBX_PNL *ppl, pgno_t id, + size_t n) { pgno_t *pnl = *ppl, len = pnl[0]; /* Too big? */ if (unlikely(len + n > pnl[-1])) { - if (mdbx_pnl_grow(ppl, n | MDBX_PNL_UM_MAX)) - return MDBX_ENOMEM; + int rc = mdbx_pnl_grow(ppl, n | MDBX_PNL_UM_MAX); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; pnl = *ppl; } pnl[0] = len + (pgno_t)n; pnl += len; while (n) pnl[n--] = id++; - return 0; + return MDBX_SUCCESS; } /* Merge an PNL onto an PNL. The destination PNL must be big enough. * [in] pl The PNL to merge into. * [in] merge The PNL to merge. */ static void __hot mdbx_pnl_xmerge(MDBX_PNL pnl, MDBX_PNL merge) { - assert(mdbx_pnl_check(pnl)); - assert(mdbx_pnl_check(merge)); + assert(mdbx_pnl_check(pnl, true)); + assert(mdbx_pnl_check(merge, false)); pgno_t old_id, merge_id, i = merge[0], j = pnl[0], k = i + j, total = k; pnl[0] = MDBX_PNL_ASCENDING ? 0 : ~(pgno_t)0; /* delimiter for pl scan below */ @@ -824,7 +846,7 @@ static void __hot mdbx_pnl_xmerge(MDBX_PNL pnl, MDBX_PNL merge) { pnl[k--] = merge_id; } pnl[0] = total; - assert(mdbx_pnl_check(pnl)); + assert(mdbx_pnl_check(pnl, true)); } /* Search for an ID in an ID2L. @@ -873,45 +895,45 @@ static unsigned __hot mdbx_mid2l_search(MDBX_ID2L pnl, pgno_t id) { * [in,out] pnl The ID2L to insert into. * [in] id The ID2 to insert. * Returns 0 on success, -1 if the ID was already present in the ID2L. */ -static int mdbx_mid2l_insert(MDBX_ID2L pnl, MDBX_ID2 *id) { +static int __must_check_result mdbx_mid2l_insert(MDBX_ID2L pnl, MDBX_ID2 *id) { unsigned x = mdbx_mid2l_search(pnl, id->mid); if (unlikely(x < 1)) - return /* internal error */ -2; + return /* internal error */ MDBX_PROBLEM; if (x <= pnl[0].mid && pnl[x].mid == id->mid) - return /* duplicate */ -1; + return /* duplicate */ MDBX_PROBLEM; if (unlikely(pnl[0].mid >= MDBX_PNL_UM_MAX)) - return /* too big */ -2; + return /* too big */ MDBX_TXN_FULL; /* insert id */ pnl[0].mid++; for (unsigned i = (unsigned)pnl[0].mid; i > x; i--) pnl[i] = pnl[i - 1]; pnl[x] = *id; - return 0; + return MDBX_SUCCESS; } /* Append an ID2 into a ID2L. * [in,out] pnl The ID2L to append into. * [in] id The ID2 to append. * Returns 0 on success, -2 if the ID2L is too big. */ -static int mdbx_mid2l_append(MDBX_ID2L pnl, MDBX_ID2 *id) { +static int __must_check_result mdbx_mid2l_append(MDBX_ID2L pnl, MDBX_ID2 *id) { #if MDBX_DEBUG for (unsigned i = pnl[0].mid; i > 0; --i) { assert(pnl[i].mid != id->mid); if (unlikely(pnl[i].mid == id->mid)) - return -1; + return MDBX_PROBLEM; } #endif /* Too big? */ if (unlikely(pnl[0].mid >= MDBX_PNL_UM_MAX)) - return -2; + return /* too big */ MDBX_TXN_FULL; pnl[0].mid++; pnl[pnl[0].mid] = *id; - return 0; + return MDBX_SUCCESS; } /*----------------------------------------------------------------------------*/ @@ -1974,7 +1996,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { } /* Add a page to the txn's dirty list */ -static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { +static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { MDBX_ID2 mid; int rc, (*insert)(MDBX_ID2L, MDBX_ID2 *); @@ -1986,8 +2008,12 @@ static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { mid.mid = mp->mp_pgno; mid.mptr = mp; rc = insert(txn->mt_rw_dirtylist, &mid); - mdbx_tassert(txn, rc == 0); + if (unlikely(rc != MDBX_SUCCESS)) { + txn->mt_flags |= MDBX_TXN_ERROR; + return rc; + } txn->mt_dirtyroom--; + return MDBX_SUCCESS; } static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, @@ -2144,7 +2170,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (likely(flags & MDBX_ALLOC_GC)) { flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); if (unlikely(mc->mc_flags & C_RECLAIMING)) { - /* If mc is updating the freeDB, then the freelist cannot play + /* If mc is updating the freeDB, then the befree-list cannot play * catch-up with itself by growing while trying to save it. */ flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM); @@ -2171,7 +2197,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } } - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); pgno_t pgno, *repg_list = env->me_reclaimed_pglist; unsigned repg_pos = 0, repg_len = repg_list ? repg_list[0] : 0; txnid_t oldest = 0, last = 0; @@ -2191,7 +2217,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, /* Seek a big enough contiguous page range. * Prefer pages with lower pgno. */ - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (likely(flags & MDBX_ALLOC_CACHE) && repg_len > wanna_range && (!(flags & MDBX_COALESCE) || op == MDBX_FIRST)) { #if MDBX_PNL_ASCENDING @@ -2305,7 +2331,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, pgno_t *re_pnl = (pgno_t *)data.iov_base; mdbx_tassert(txn, re_pnl[0] == 0 || data.iov_len == (re_pnl[0] + 1) * sizeof(pgno_t)); - mdbx_tassert(txn, mdbx_pnl_check(re_pnl)); + mdbx_tassert(txn, mdbx_pnl_check(re_pnl, false)); repg_pos = re_pnl[0]; if (!repg_list) { if (unlikely(!(env->me_reclaimed_pglist = repg_list = @@ -2375,7 +2401,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, mdbx_info("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, tail - txn->mt_next_pgno, tail, txn->mt_next_pgno); txn->mt_next_pgno = tail; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); } } @@ -2383,8 +2409,8 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (repg_len > MDBX_PNL_UM_SIZE / 2) break; if (flags & MDBX_COALESCE) { - if (repg_len /* current size */ >= env->me_maxfree_1pg / 2 || - repg_pos /* prev size */ >= env->me_maxfree_1pg / 4) + if (repg_len /* current size */ >= env->me_maxgc_ov1page || + repg_pos /* prev size */ >= env->me_maxgc_ov1page / 2) flags &= ~MDBX_COALESCE; } } @@ -2485,7 +2511,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } fail: - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (mp) { *mp = NULL; txn->mt_flags |= MDBX_TXN_ERROR; @@ -2516,7 +2542,7 @@ done: repg_list[0] = repg_len -= num; for (unsigned i = repg_pos - num; i < repg_len;) repg_list[++i] = repg_list[++repg_pos]; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); } else { txn->mt_next_pgno = pgno + num; mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno); @@ -2530,10 +2556,12 @@ done: np->mp_leaf2_ksize = 0; np->mp_flags = 0; np->mp_pages = num; - mdbx_page_dirty(txn, np); + rc = mdbx_page_dirty(txn, np); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; *mp = np; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); return MDBX_SUCCESS; } @@ -2568,7 +2596,8 @@ static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) { * [in] mp the page being referenced. It must not be dirty. * [out] ret the writable page, if any. * ret is unchanged if mp wasn't spilled. */ -static int mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { +static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, + MDBX_page **ret) { MDBX_env *env = txn->mt_env; const MDBX_txn *tx2; unsigned x; @@ -2607,7 +2636,10 @@ static int mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { } /* otherwise, if belonging to a parent txn, the * page remains spilled until child commits */ - mdbx_page_dirty(txn, np); + int rc = mdbx_page_dirty(txn, np); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + np->mp_flags |= P_DIRTY; *ret = np; break; @@ -2669,8 +2701,8 @@ static int mdbx_page_touch(MDBX_cursor *mc) { " in the dirtylist[%d], expecting %p", dl[x].mptr, pgno, x, mp); mc->mc_flags &= ~(C_INITIALIZED | C_EOF); - txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PROBLEM; + rc = MDBX_PROBLEM; + goto fail; } return MDBX_SUCCESS; } @@ -2680,12 +2712,15 @@ static int mdbx_page_touch(MDBX_cursor *mc) { mdbx_cassert(mc, dl[0].mid < MDBX_PNL_UM_MAX); /* No - copy it */ np = mdbx_page_malloc(txn, 1); - if (unlikely(!np)) - return MDBX_ENOMEM; + if (unlikely(!np)) { + rc = MDBX_ENOMEM; + goto fail; + } mid.mid = pgno; mid.mptr = np; rc = mdbx_mid2l_insert(dl, &mid); - mdbx_cassert(mc, rc == 0); + if (unlikely(rc)) + goto fail; } else { return MDBX_SUCCESS; } @@ -2921,7 +2956,6 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { return MDBX_BAD_RSLOT; } else if (env->me_lck) { unsigned slot, nreaders; - const mdbx_pid_t pid = env->me_pid; const mdbx_tid_t tid = mdbx_thread_self(); mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); @@ -2931,13 +2965,13 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { return rc; rc = MDBX_SUCCESS; - if (unlikely(env->me_live_reader != pid)) { + if (unlikely(env->me_live_reader != env->me_pid)) { rc = mdbx_rpid_set(env); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_rdt_unlock(env); return rc; } - env->me_live_reader = pid; + env->me_live_reader = env->me_pid; } while (1) { @@ -2970,11 +3004,13 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { env->me_lck->mti_numreaders = ++nreaders; if (env->me_close_readers < nreaders) env->me_close_readers = nreaders; - r->mr_pid = pid; + r->mr_pid = env->me_pid; mdbx_rdt_unlock(env); - if (likely(env->me_flags & MDBX_ENV_TXKEY)) + if (likely(env->me_flags & MDBX_ENV_TXKEY)) { + assert(env->me_live_reader == env->me_pid); mdbx_thread_rthc_set(env->me_txkey, r); + } } while (1) { @@ -3485,88 +3521,103 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { return MDBX_SUCCESS; } -/* Save the freelist as of this transaction to the freeDB. - * This changes the freelist. Keep trying until it stabilizes. */ -static int mdbx_freelist_save(MDBX_txn *txn) { +/* Cleanup reclaimed GC records, than save the befree-list as of this + * transaction to GC (aka freeDB). This recursive changes the reclaimed-list + * loose-list and befree-list. Keep trying until it stabilizes. */ +static int mdbx_update_gc(MDBX_txn *txn) { /* env->me_reclaimed_pglist[] can grow and shrink during this call. - * env->me_last_reclaimed and txn->mt_free_pages[] can only grow. - * Page numbers cannot disappear from txn->mt_free_pages[]. */ - MDBX_cursor mc; + * env->me_last_reclaimed and txn->mt_befree_pages[] can only grow. + * Page numbers cannot disappear from txn->mt_befree_pages[]. */ MDBX_env *env = txn->mt_env; - int rc, more = 1; - txnid_t cleanup_reclaimed_id = 0, head_id = 0; - pgno_t befree_count = 0; - intptr_t head_room = 0, total_room = 0; - unsigned cleanup_reclaimed_pos = 0, refill_reclaimed_pos = 0; const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; - rc = mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); + MDBX_cursor mc; + int rc = mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; - /* MDBX_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ - const intptr_t clean_limit = - (env->me_flags & (MDBX_NOMEMINIT | MDBX_WRITEMAP)) ? SSIZE_MAX - : env->me_maxfree_1pg; + const char *dbg_prefix_mode = lifo ? " lifo" : " fifo"; + mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); + (void)dbg_prefix_mode; + unsigned befree_stored = 0, loop = 0; + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); + +retry: + mdbx_trace(" >> restart"); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); + if (unlikely(/* paranoia */ ++loop > 42)) { + mdbx_error("too more loops %u, bailout", loop); + rc = MDBX_PROBLEM; + goto bailout; + } + + unsigned placed = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, + filled_gc_slot = ~0u; + txnid_t cleaned_gc_id = 0, + head_gc_id = lifo ? *env->me_oldest : env->me_last_reclaimed; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); -again_on_freelist_change: - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); while (1) { - /* Come back here after each Put() in case freelist changed */ + /* Come back here after each Put() in case befree-list changed */ MDBX_val key, data; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (!lifo) { /* If using records from freeDB which we have not yet deleted, * now delete them and any we reserved for me_reclaimed_pglist. */ - while (cleanup_reclaimed_id < env->me_last_reclaimed) { + while (cleaned_gc_id < env->me_last_reclaimed) { rc = mdbx_cursor_first(&mc, &key, NULL); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; rc = mdbx_prep_backlog(txn, &mc); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - cleanup_reclaimed_id = head_id = *(txnid_t *)key.iov_base; - total_room = head_room = 0; - more = 1; - mdbx_tassert(txn, cleanup_reclaimed_id <= env->me_last_reclaimed); + cleaned_gc_id = head_gc_id = *(txnid_t *)key.iov_base; + mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); + placed = 0; + mdbx_tassert(txn, cleaned_gc_id <= env->me_last_reclaimed); mc.mc_flags |= C_RECLAIMING; + mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, + cleaned_gc_id); rc = mdbx_cursor_del(&mc, 0); mc.mc_flags ^= C_RECLAIMING; - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } else if (txn->mt_lifo_reclaimed) { /* LY: cleanup reclaimed records. */ - while (cleanup_reclaimed_pos < txn->mt_lifo_reclaimed[0]) { - cleanup_reclaimed_id = txn->mt_lifo_reclaimed[++cleanup_reclaimed_pos]; - key.iov_base = &cleanup_reclaimed_id; - key.iov_len = sizeof(cleanup_reclaimed_id); + while (cleaned_gc_slot < txn->mt_lifo_reclaimed[0]) { + cleaned_gc_id = txn->mt_lifo_reclaimed[++cleaned_gc_slot]; + head_gc_id = (head_gc_id > cleaned_gc_id) ? cleaned_gc_id : head_gc_id; + key.iov_base = &cleaned_gc_id; + key.iov_len = sizeof(cleaned_gc_id); rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_SET); if (likely(rc != MDBX_NOTFOUND)) { - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; rc = mdbx_prep_backlog(txn, &mc); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); mc.mc_flags |= C_RECLAIMING; + mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, + cleaned_gc_slot, cleaned_gc_id); rc = mdbx_cursor_del(&mc, 0); mc.mc_flags ^= C_RECLAIMING; - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } } - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + // handle loose pages - put ones into the reclaimed- or befree-list + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (txn->mt_loose_pages) { /* Return loose page numbers to me_reclaimed_pglist, * though usually none are left at this point. * The pages themselves remain in dirtylist. */ if (unlikely(!env->me_reclaimed_pglist) && !(lifo && env->me_last_reclaimed > 1)) { - /* Put loose page numbers in mt_free_pages, + /* Put loose page numbers in mt_befree_pages, * since unable to return them to me_reclaimed_pglist. */ if (unlikely((rc = mdbx_pnl_need(&txn->mt_befree_pages, txn->mt_loose_count)) != 0)) @@ -3575,8 +3626,9 @@ again_on_freelist_change: mdbx_pnl_xappend(txn->mt_befree_pages, mp->mp_pgno); } else { /* Room for loose pages + temp PNL with same */ - if ((rc = mdbx_pnl_need(&env->me_reclaimed_pglist, - 2 * txn->mt_loose_count + 1)) != 0) + rc = mdbx_pnl_need(&env->me_reclaimed_pglist, + 2 * txn->mt_loose_count + 1); + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; MDBX_PNL loose = env->me_reclaimed_pglist + MDBX_PNL_ALLOCLEN(env->me_reclaimed_pglist) - @@ -3612,9 +3664,9 @@ again_on_freelist_change: txn->mt_loose_count = 0; } - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + // handle reclaimed pages - return suitable into unallocated space + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (env->me_reclaimed_pglist) { - /* Refund suitable pages into "unallocated" space */ pgno_t tail = txn->mt_next_pgno; pgno_t *const begin = env->me_reclaimed_pglist + 1; pgno_t *const end = begin + env->me_reclaimed_pglist[0]; @@ -3640,86 +3692,75 @@ again_on_freelist_change: mdbx_info("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, tail - txn->mt_next_pgno, tail, txn->mt_next_pgno); txn->mt_next_pgno = tail; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); } } - /* Save the PNL of pages freed by this txn, to a single record */ - if (befree_count < txn->mt_befree_pages[0]) { - if (unlikely(!befree_count)) { - /* Make sure last page of freeDB is touched and on freelist */ + // handle befree-list - store ones into singe gc-record + if (befree_stored < txn->mt_befree_pages[0]) { + if (unlikely(!befree_stored)) { + /* Make sure last page of freeDB is touched and on befree-list */ rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); - if (unlikely(rc && rc != MDBX_NOTFOUND)) + if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND)) goto bailout; } - pgno_t *befree_pages = txn->mt_befree_pages; /* Write to last page of freeDB */ key.iov_len = sizeof(txn->mt_txnid); key.iov_base = &txn->mt_txnid; do { - befree_count = befree_pages[0]; - data.iov_len = MDBX_PNL_SIZEOF(befree_pages); + data.iov_len = MDBX_PNL_SIZEOF(txn->mt_befree_pages); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - /* Retry if mt_free_pages[] grew during the Put() */ - befree_pages = txn->mt_befree_pages; - } while (befree_count < befree_pages[0]); + /* Retry if mt_befree_pages[] grew during the Put() */ + } while (data.iov_len < MDBX_PNL_SIZEOF(txn->mt_befree_pages)); - mdbx_pnl_sort(befree_pages); - memcpy(data.iov_base, befree_pages, data.iov_len); + befree_stored = (unsigned)txn->mt_befree_pages[0]; + mdbx_pnl_sort(txn->mt_befree_pages); + memcpy(data.iov_base, txn->mt_befree_pages, data.iov_len); + + mdbx_trace("%s.put-befree #%u @ %" PRIaTXN, dbg_prefix_mode, + befree_stored, txn->mt_txnid); if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { - unsigned i = (unsigned)befree_pages[0]; + unsigned i = befree_stored; mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO "", befree_pages[i]); + mdbx_debug_extra_print(" %" PRIaPGNO "", txn->mt_befree_pages[i]); mdbx_debug_extra_print("\n"); } continue; } - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); - const intptr_t rpl_len = - (env->me_reclaimed_pglist ? env->me_reclaimed_pglist[0] : 0) + - txn->mt_loose_count; - if (rpl_len && refill_reclaimed_pos == 0) - refill_reclaimed_pos = 1; + // handle reclaimed and loost pages - merge and store both into gc + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); + mdbx_tassert(txn, txn->mt_loose_count == 0); + const unsigned amount = + env->me_reclaimed_pglist ? env->me_reclaimed_pglist[0] : 0; + const unsigned left = amount - placed; - /* Reserve records for me_reclaimed_pglist[]. Split it if multi-page, - * to avoid searching freeDB for a page range. Use keys in - * range [1,me_last_reclaimed]: Smaller than txnid of oldest reader. */ - if (total_room >= rpl_len) { - if (total_room == rpl_len || --more < 0) - break; - } else if (head_room >= (intptr_t)env->me_maxfree_1pg && head_id > 1) { - /* Keep current record (overflow page), add a new one */ - head_id--; - refill_reclaimed_pos++; - head_room = 0; - } + mdbx_trace("%s: amount %u, placed %d, left %d", dbg_prefix_mode, amount, + placed, (int)left); + if (0 >= (int)left) + break; + mdbx_trace(" >> reserving"); + txnid_t reservation_gc_id; + const unsigned lifo_gc_slots = + txn->mt_lifo_reclaimed ? (unsigned)txn->mt_lifo_reclaimed[0] : 0; if (lifo) { - if (refill_reclaimed_pos > - (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)) { + if (reused_gc_slot >= lifo_gc_slots) { /* LY: need just a txn-id for save page list. */ rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); - if (likely(rc == 0)) + if (likely(rc == MDBX_SUCCESS)) /* LY: ok, reclaimed from freedb. */ continue; if (unlikely(rc != MDBX_NOTFOUND)) /* LY: other troubles... */ goto bailout; - /* LY: freedb is empty, will look any free txn-id in high2low order. */ - if (unlikely(env->me_last_reclaimed < 1)) { - /* LY: not any txn in the past of freedb. */ - rc = MDBX_MAP_FULL; - goto bailout; - } - if (unlikely(!txn->mt_lifo_reclaimed)) { txn->mt_lifo_reclaimed = mdbx_txl_alloc(); if (unlikely(!txn->mt_lifo_reclaimed)) { @@ -3727,147 +3768,190 @@ again_on_freelist_change: goto bailout; } } - /* LY: append the list. */ - rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, - env->me_last_reclaimed - 1); - if (unlikely(rc)) + /* LY: freedb is empty, will look any free txn-id in high2low order. */ + rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, --head_gc_id); + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - --env->me_last_reclaimed; - /* LY: note that freeDB cleanup is not needed. */ - ++cleanup_reclaimed_pos; + cleaned_gc_slot += 1 /* mark GC cleanup is not needed. */; + + mdbx_trace("%s: append @%" PRIaTXN + " to lifo-reclaimed, cleaned-gc-slot = %u", + dbg_prefix_mode, head_gc_id, cleaned_gc_slot); } mdbx_tassert(txn, txn->mt_lifo_reclaimed != NULL); - head_id = txn->mt_lifo_reclaimed[refill_reclaimed_pos]; + reservation_gc_id = txn->mt_lifo_reclaimed[++reused_gc_slot]; + mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", + dbg_prefix_mode, reservation_gc_id, reused_gc_slot); + head_gc_id = + (head_gc_id > reservation_gc_id) ? reservation_gc_id : head_gc_id; } else { mdbx_tassert(txn, txn->mt_lifo_reclaimed == NULL); + reused_gc_slot++ /* just count reserved records */; + reservation_gc_id = head_gc_id--; + mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, + reservation_gc_id); + } + + mdbx_trace("%s: head_gc_id %" PRIaTXN + ", reused_gc_slot %u, lifo_gc_slots %u, reservation-id " + "%" PRIaTXN, + dbg_prefix_mode, head_gc_id, reused_gc_slot, lifo_gc_slots, + reservation_gc_id); + + const bool no_slots_more = + head_gc_id < 2 && (!lifo || reused_gc_slot >= lifo_gc_slots); + const unsigned chunk = + (left < env->me_maxgc_ov1page || no_slots_more) + ? left + : (left < env->me_maxgc_ov1page * 2) + ? /* the half to each of the last two chunks */ left / 2 + : env->me_maxgc_ov1page; + + mdbx_trace("%s: chunk %u, no_slots_more %s, gc-per-ovpage %u", + dbg_prefix_mode, chunk, no_slots_more ? "yes" : "no", + env->me_maxgc_ov1page); + + mdbx_tassert(txn, reservation_gc_id < *env->me_oldest); + if (unlikely(reservation_gc_id < 1)) { + /* LY: not any txn in the past of freedb. */ + rc = MDBX_PROBLEM; + goto bailout; } - /* (Re)write {key = head_id, PNL length = head_room} */ - total_room -= head_room; - head_room = rpl_len - total_room; - if (head_room > (intptr_t)env->me_maxfree_1pg && head_id > 1) { - /* Overflow multi-page for part of me_reclaimed_pglist */ - head_room /= (head_id < INT16_MAX) ? (pgno_t)head_id - : INT16_MAX; /* amortize page sizes */ - head_room += env->me_maxfree_1pg - head_room % (env->me_maxfree_1pg + 1); - } else if (head_room < 0) { - /* Rare case, not bothering to delete this record */ - head_room = 0; - continue; - } - key.iov_len = sizeof(head_id); - key.iov_base = &head_id; - data.iov_len = (head_room + 1) * sizeof(pgno_t); - rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); - if (unlikely(rc)) + key.iov_len = sizeof(reservation_gc_id); + key.iov_base = &reservation_gc_id; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, + placed + 1, placed + chunk + 1, reservation_gc_id); + rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* PNL is initially empty, zero out at least the length */ - pgno_t *pgs = (pgno_t *)data.iov_base; - intptr_t i = head_room > clean_limit ? head_room : 0; - do { - pgs[i] = 0; - } while (--i >= 0); - total_room += head_room; + memset(data.iov_base, 0, sizeof(pgno_t)); + placed += chunk; + mdbx_trace("%s.placed %u (+%u), continue", dbg_prefix_mode, placed, chunk); continue; } mdbx_tassert(txn, - cleanup_reclaimed_pos == + cleaned_gc_slot == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - /* Fill in the reserved me_reclaimed_pglist records */ + mdbx_trace(" >> filling"); + /* Fill in the reserved records */ + filled_gc_slot = reused_gc_slot; rc = MDBX_SUCCESS; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); + mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (env->me_reclaimed_pglist && env->me_reclaimed_pglist[0]) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; - size_t rpl_left = env->me_reclaimed_pglist[0]; - pgno_t *rpl_end = env->me_reclaimed_pglist + rpl_left; - if (txn->mt_lifo_reclaimed == 0) { + unsigned left = env->me_reclaimed_pglist[0]; + pgno_t *end = env->me_reclaimed_pglist + left; + if (txn->mt_lifo_reclaimed == nullptr) { mdbx_tassert(txn, lifo == 0); rc = mdbx_cursor_first(&mc, &key, &data); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { mdbx_tassert(txn, lifo != 0); } while (1) { - txnid_t id; - if (txn->mt_lifo_reclaimed == 0) { + txnid_t fill_gc_id; + mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left, + (unsigned)env->me_reclaimed_pglist[0]); + if (txn->mt_lifo_reclaimed == nullptr) { mdbx_tassert(txn, lifo == 0); - id = *(txnid_t *)key.iov_base; - mdbx_tassert(txn, id <= env->me_last_reclaimed); + fill_gc_id = *(txnid_t *)key.iov_base; + if (filled_gc_slot-- /* just countdown reserved records */ == 0 || + fill_gc_id > env->me_last_reclaimed) { + mdbx_notice( + "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN + " > last_reclaimed %" PRIaTXN, + filled_gc_slot, fill_gc_id, env->me_last_reclaimed); + goto retry; + } } else { mdbx_tassert(txn, lifo != 0); - mdbx_tassert(txn, - refill_reclaimed_pos > 0 && - refill_reclaimed_pos <= txn->mt_lifo_reclaimed[0]); - id = txn->mt_lifo_reclaimed[refill_reclaimed_pos--]; - key.iov_base = &id; - key.iov_len = sizeof(id); + if (filled_gc_slot == 0) { + mdbx_notice("** restart: reserve depleted (filled_slot == 0)"); + goto retry; + } + mdbx_tassert(txn, filled_gc_slot > 0 && + filled_gc_slot <= txn->mt_lifo_reclaimed[0]); + fill_gc_id = txn->mt_lifo_reclaimed[filled_gc_slot--]; + mdbx_trace("%s.seek-reservaton @%" PRIaTXN " at lifo_reclaimed[%u]", + dbg_prefix_mode, fill_gc_id, (unsigned)filled_gc_slot); + key.iov_base = &fill_gc_id; + key.iov_len = sizeof(fill_gc_id); rc = mdbx_cursor_get(&mc, &key, &data, MDBX_SET); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } mdbx_tassert( - txn, cleanup_reclaimed_pos == + txn, cleaned_gc_slot == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); - size_t chunk_len = (data.iov_len / sizeof(pgno_t)) - 1; - if (chunk_len > rpl_left) - chunk_len = rpl_left; - data.iov_len = (chunk_len + 1) * sizeof(pgno_t); - key.iov_base = &id; - key.iov_len = sizeof(id); - - rpl_end -= chunk_len; - data.iov_base = rpl_end; - pgno_t save = rpl_end[0]; - rpl_end[0] = (pgno_t)chunk_len; - mdbx_tassert(txn, mdbx_pnl_check(rpl_end)); + const size_t space = (data.iov_len / sizeof(pgno_t)) - 1; + const unsigned chunk = (space > left) ? left : (unsigned)space; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + mdbx_tassert(txn, fill_gc_id > 0 && fill_gc_id < *env->me_oldest); + key.iov_base = &fill_gc_id; + key.iov_len = sizeof(fill_gc_id); + + end -= chunk; + data.iov_base = end; + pgno_t save = end[0]; + end[0] = (pgno_t)chunk; + mdbx_tassert(txn, mdbx_pnl_check(end, false)); mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT); mc.mc_flags ^= C_RECLAIMING; - mdbx_tassert(txn, mdbx_pnl_check(rpl_end)); + mdbx_tassert(txn, mdbx_pnl_check(end, false)); mdbx_tassert( - txn, cleanup_reclaimed_pos == + txn, cleaned_gc_slot == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - rpl_end[0] = save; - if (unlikely(rc)) + pgno_t *from = end + 1, *to = end + end[0]; + mdbx_trace("%s.fill: %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO + "] @%" PRIaTXN, + dbg_prefix_mode, (unsigned)end[0], + (unsigned)(from - env->me_reclaimed_pglist), *from, + (unsigned)(to - env->me_reclaimed_pglist), *to, fill_gc_id); + end[0] = save; + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - rpl_left -= chunk_len; - if (rpl_left == 0) + left -= chunk; + if (left == 0) { + rc = MDBX_SUCCESS; break; + } if (!lifo) { rc = mdbx_cursor_next(&mc, &key, &data, MDBX_NEXT); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } } -bailout: + mdbx_tassert(txn, rc == MDBX_SUCCESS); if (txn->mt_lifo_reclaimed) { - mdbx_tassert(txn, rc || cleanup_reclaimed_pos == txn->mt_lifo_reclaimed[0]); - if (rc == MDBX_SUCCESS && - cleanup_reclaimed_pos != txn->mt_lifo_reclaimed[0]) { - mdbx_tassert(txn, cleanup_reclaimed_pos < txn->mt_lifo_reclaimed[0]); - /* LY: zeroed cleanup_idx to force cleanup - * and refill created freeDB records. */ - cleanup_reclaimed_pos = 0; - /* LY: restart filling */ - total_room = head_room = refill_reclaimed_pos = 0; - more = 1; - goto again_on_freelist_change; + mdbx_tassert(txn, cleaned_gc_slot == txn->mt_lifo_reclaimed[0]); + if (unlikely(filled_gc_slot != 0)) { + mdbx_notice("** restart: reserve excess (filled-slot %u > 0)", + filled_gc_slot); + goto retry; } + } + +bailout: + if (txn->mt_lifo_reclaimed) { txn->mt_lifo_reclaimed[0] = 0; if (txn != env->me_txn0) { mdbx_txl_free(txn->mt_lifo_reclaimed); @@ -3875,6 +3959,7 @@ bailout: } } + mdbx_trace("<<< rc = %d", rc); return rc; } @@ -3997,7 +4082,7 @@ static __cold bool mdbx_txn_import_dbi(MDBX_txn *txn, MDBX_dbi dbi) { (env->me_dbflags[i] & MDBX_VALID)) { txn->mt_dbs[i].md_flags = env->me_dbflags[i] & PERSISTENT_FLAGS; txn->mt_dbflags[i] = DB_VALID | DB_USRVALID | DB_STALE; - assert(txn->mt_dbxs[i].md_cmp != NULL); + mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); } } txn->mt_numdbs = snap_numdbs; @@ -4241,7 +4326,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { } } - rc = mdbx_freelist_save(txn); + rc = mdbx_update_gc(txn); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -4847,7 +4932,8 @@ int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { #define mdbx_maxkey(nodemax) ((nodemax) - (NODESIZE + sizeof(MDBX_db))) -#define mdbx_maxfree1pg(pagesize) (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) +#define mdbx_maxgc_ov1page(pagesize) \ + (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) int mdbx_get_maxkeysize(size_t pagesize) { if (pagesize == 0) @@ -4869,11 +4955,11 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { mdbx_ensure(env, pagesize <= MAX_PAGESIZE); env->me_psize = (unsigned)pagesize; - STATIC_ASSERT(mdbx_maxfree1pg(MIN_PAGESIZE) > 42); - STATIC_ASSERT(mdbx_maxfree1pg(MAX_PAGESIZE) < MDBX_PNL_DB_MAX); - const intptr_t maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - mdbx_ensure(env, maxfree_1pg > 42 && maxfree_1pg < MDBX_PNL_DB_MAX); - env->me_maxfree_1pg = (unsigned)maxfree_1pg; + STATIC_ASSERT(mdbx_maxgc_ov1page(MIN_PAGESIZE) > 42); + STATIC_ASSERT(mdbx_maxgc_ov1page(MAX_PAGESIZE) < MDBX_PNL_DB_MAX); + const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + mdbx_ensure(env, maxgc_ov1page > 42 && maxgc_ov1page < MDBX_PNL_DB_MAX); + env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; STATIC_ASSERT(mdbx_nodemax(MIN_PAGESIZE) > 42); STATIC_ASSERT(mdbx_nodemax(MAX_PAGESIZE) < UINT16_MAX); @@ -5945,6 +6031,8 @@ static void __cold mdbx_env_close0(MDBX_env *env) { if (env->me_flags & MDBX_ENV_TXKEY) mdbx_rthc_remove(env->me_txkey); + if (env->me_live_reader) + (void)mdbx_rpid_clear(env); if (env->me_map) { mdbx_munmap(&env->me_dxb_mmap); @@ -6352,9 +6440,14 @@ static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, mapped: p = pgno2page(env, pgno); - /* TODO: check p->mp_validator here */ done: + if ((p->mp_flags & P_OVERFLOW) == 0 && + unlikely(p->mp_upper < p->mp_lower || + PAGEHDRSZ + p->mp_upper > env->me_psize)) + return MDBX_CORRUPTED; + /* TODO: more checks here, including p->mp_validator */ + *ret = p; if (lvl) *lvl = level; @@ -7492,7 +7585,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, DVAL((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); int dupdata_flag = 0; - if (flags & MDBX_CURRENT) { + if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) { /* Опция MDBX_CURRENT означает, что запрошено обновление текущей записи, * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает * со значением в текущей позиции курсора. @@ -7713,7 +7806,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, offset *= 4; /* space for 4 more */ break; } - /* FALLTHRU: Big enough MDBX_DUPFIXaED sub-page */ + /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */ __fallthrough; case MDBX_CURRENT | MDBX_NODUPDATA: case MDBX_CURRENT: @@ -7823,7 +7916,10 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, id2.mptr = np; /* Note - this page is already counted in parent's dirtyroom */ rc2 = mdbx_mid2l_insert(mc->mc_txn->mt_rw_dirtylist, &id2); - mdbx_cassert(mc, rc2 == 0); + if (unlikely(rc2 != MDBX_SUCCESS)) { + rc = rc2; + goto fail; + } /* Currently we make the page look as with put() in the * parent txn, in case the user peeks at MDBX_RESERVEd @@ -7860,8 +7956,6 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, memcpy(olddata.iov_base, data->iov_base, data->iov_len); else { mdbx_cassert(mc, NUMKEYS(mc->mc_pg[mc->mc_top]) == 1); - mdbx_cassert(mc, mc->mc_pg[mc->mc_top]->mp_upper == - mc->mc_pg[mc->mc_top]->mp_lower); mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) && !IS_LEAF2(mc->mc_pg[mc->mc_top])); mdbx_cassert(mc, NODEDSZ(leaf) == 0); @@ -7869,7 +7963,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, key->iov_len < UINT16_MAX); leaf->mn_ksize = (uint16_t)key->iov_len; memcpy(NODEKEY(leaf), key->iov_base, key->iov_len); - assert((char *)NODEDATA(leaf) + NODEDSZ(leaf) < + assert((char *)NODEKEY(leaf) + NODEDSZ(leaf) < (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); goto fix_parent; } @@ -8014,6 +8108,7 @@ new_sub: /* should not happen, we deleted that item */ rc = MDBX_PROBLEM; } +fail: mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; } @@ -10914,10 +11009,10 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, } unsigned dbflag = DB_FRESH | DB_VALID | DB_USRVALID; + MDBX_db db_dummy; if (unlikely(rc)) { /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ assert(rc == MDBX_NOTFOUND); - MDBX_db db_dummy; memset(&db_dummy, 0, sizeof(db_dummy)); db_dummy.md_root = P_INVALID; db_dummy.md_flags = user_flags & PERSISTENT_FLAGS; diff --git a/libs/libmdbx/src/src/osal.h b/libs/libmdbx/src/src/osal.h index daa79064f9..a1feb998d2 100644 --- a/libs/libmdbx/src/src/osal.h +++ b/libs/libmdbx/src/src/osal.h @@ -552,7 +552,6 @@ int mdbx_lck_init(MDBX_env *env); int mdbx_lck_seize(MDBX_env *env); int mdbx_lck_downgrade(MDBX_env *env, bool complete); -int mdbx_lck_upgrade(MDBX_env *env); void mdbx_lck_destroy(MDBX_env *env); int mdbx_rdt_lock(MDBX_env *env); diff --git a/libs/libmdbx/src/src/tools/mdbx_chk.c b/libs/libmdbx/src/src/tools/mdbx_chk.c index 51096c4053..772102ccb6 100644 --- a/libs/libmdbx/src/src/tools/mdbx_chk.c +++ b/libs/libmdbx/src/src/tools/mdbx_chk.c @@ -340,7 +340,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, data->iov_len); else { const pgno_t number = *iptr++; - if (number >= MDBX_PNL_UM_MAX) + if (number < 1 || number > MDBX_PNL_MAX) problem_add("entry", record_number, "wrong idl length", "%" PRIiPTR "", number); else if ((number + 1) * sizeof(pgno_t) != data->iov_len) |