diff options
Diffstat (limited to 'libs/libmdbx/src')
-rw-r--r-- | libs/libmdbx/src/CMakeLists.txt | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/ChangeLog.md | 25 | ||||
-rw-r--r-- | libs/libmdbx/src/README.md | 9 | ||||
-rw-r--r-- | libs/libmdbx/src/VERSION | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/cmake/compiler.cmake | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/cmake/profile.cmake | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/cmake/utils.cmake | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/man1/mdbx_chk.1 | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/man1/mdbx_copy.1 | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/man1/mdbx_dump.1 | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/man1/mdbx_load.1 | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/man1/mdbx_stat.1 | 2 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.c | 1245 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.c++ | 170 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.h | 36 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx.h++ | 6 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_chk.c | 170 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_copy.c | 168 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_dump.c | 168 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_load.c | 168 | ||||
-rw-r--r-- | libs/libmdbx/src/mdbx_stat.c | 168 |
21 files changed, 1327 insertions, 1026 deletions
diff --git a/libs/libmdbx/src/CMakeLists.txt b/libs/libmdbx/src/CMakeLists.txt index ff21fb5e0a..46afa66fca 100644 --- a/libs/libmdbx/src/CMakeLists.txt +++ b/libs/libmdbx/src/CMakeLists.txt @@ -1,5 +1,5 @@ ## -## Copyright 2020 Leonid Yuriev <leo@yuriev.ru> +## Copyright 2020-2021 Leonid Yuriev <leo@yuriev.ru> ## and other libmdbx authors: please see AUTHORS file. ## All rights reserved. ## diff --git a/libs/libmdbx/src/ChangeLog.md b/libs/libmdbx/src/ChangeLog.md index 1d79b89061..50b911f5a7 100644 --- a/libs/libmdbx/src/ChangeLog.md +++ b/libs/libmdbx/src/ChangeLog.md @@ -6,16 +6,18 @@ ChangeLog TODO: - Engage new terminology (https://github.com/erthink/libmdbx/issues/137). - Rework/speedup the implementation of the dirty page list (lazy compactification, lazy sorting via merge). - - Resolve few TODOs (https://github.com/erthink/libmdbx/issues/123, https://github.com/erthink/libmdbx/issues/124, - https://github.com/erthink/libmdbx/issues/127, https://github.com/erthink/libmdbx/issues/128, + - Resolve few TODOs (https://github.com/erthink/libmdbx/issues/124, https://github.com/erthink/libmdbx/issues/127, https://github.com/erthink/libmdbx/issues/132, https://github.com/erthink/libmdbx/issues/115). - Finalize C++ API (few typos and trivia bugs are still likely for now). - - Packages for ROSA Linux, ALT Linux, Fedora/RHEL, Debian/Ubuntu. + - Packages for [ROSA Linux](https://www.rosalinux.ru/), [ALT Linux](https://www.altlinux.org/), Fedora/RHEL, Debian/Ubuntu. Acknowledgements: - - Mahlon E. Smith (http://www.martini.nu/) for FreeBSD port of libmdbx. - - 장세연 (http://www.castis.com) for bug fixing and PR. + - [Mahlon E. Smith](http://www.martini.nu/) for [FreeBSD port of libmdbx](https://svnweb.freebsd.org/ports/head/databases/mdbx/). + - [장세연](http://www.castis.com) for bug fixing and PR. + - [Clément Renault](https://github.com/Kerollmops/heed) for [Heed](https://github.com/Kerollmops/heed) fully typed Rust wrapper. 
+ - [Alex Sharov](https://github.com/AskAlexSharov) for bug reporting. + - [Noel Kuntze](https://github.com/Thermi) for bug reporting. Removed options and features: @@ -24,7 +26,7 @@ Removed options and features: New features: - Package for FreeBSD is available now by Mahlon E. Smith. - - New API functions to get/set various options (https://github.com/erthink/libmdbx/issues/128). + - New API functions to get/set various options (https://github.com/erthink/libmdbx/issues/128): - the maximum number of named databases for the environment; - the maximum number of threads/reader slots; - threshold (since the last unsteady commit) to force flush the data buffers to disk; @@ -39,8 +41,14 @@ New features: - how much of the parent transaction dirty pages will be spilled while start each child transaction; - Unlimited/Dynamic size of retired and dirty page lists (https://github.com/erthink/libmdbx/issues/123). - Added `-p` option (purge subDB before loading) to `mdbx_load` tool. - - Reworked spilling of large transaction and committing of nested transactions. + - Reworked spilling of large transaction and committing of nested transactions: + - page spilling code reworked to avoid the flaws and bugs inherited from LMDB; + - limit for number of dirty pages now is controllable at runtime; + - spilled pages, including overflow/large pages, can now be reused and refunded/compactified in nested transactions; + - more effective refunding/compactification, especially for the loose page cache. - Added `MDBX_ENABLE_REFUND` and `MDBX_PNL_ASCENDING` internal/advanced build options. + - Added `mdbx_default_pagesize()` function. + - Better support for architectures with a weak/relaxed memory consistency model (ARM, AARCH64, PPC, MIPS, RISC-V, etc) by means of [C11 atomics](https://en.cppreference.com/w/c/atomic). Fixes: @@ -52,6 +60,9 @@ Fixes: - Fixed handling states of cursors's and subDBs's for nested transactions. 
- Fixed page leak in extra rare case the list of retired pages changed during update GC on transaction commit. - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://github.com/erthink/libmdbx/issues/153). + - Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://github.com/erthink/libmdbx/issues/123). + - Fixed auto-recovery (`weak->steady` with the same boot-id) when the database size at the last weak checkpoint is larger than at the last steady checkpoint. + - Fixed operation on systems with unusually small/large page size, including PowerPC (https://github.com/erthink/libmdbx/issues/157). ## v0.9.2 at 2020-11-27 diff --git a/libs/libmdbx/src/README.md b/libs/libmdbx/src/README.md index 799c993c91..66da37c454 100644 --- a/libs/libmdbx/src/README.md +++ b/libs/libmdbx/src/README.md @@ -72,11 +72,12 @@ _MithrilDB_ is a rightly relevant name. <!-- section-end --> [![https://t.me/libmdbx](https://raw.githubusercontent.com/wiki/erthink/libmdbx/img/telegram.png)](https://t.me/libmdbx) -[![Build Status](https://travis-ci.org/erthink/libmdbx.svg?branch=master)](https://travis-ci.org/erthink/libmdbx) -[![Build status](https://ci.appveyor.com/api/projects/status/ue94mlopn50dqiqg/branch/master?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master) +[![GithubCI](https://github.com/erthink/libmdbx/workflows/CI/badge.svg)](https://github.com/erthink/libmdbx/actions?query=workflow%3ACI) +[![TravisCI](https://travis-ci.org/erthink/libmdbx.svg?branch=master)](https://travis-ci.org/erthink/libmdbx) +[![AppveyorCI](https://ci.appveyor.com/api/projects/status/ue94mlopn50dqiqg/branch/master?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master) [![CircleCI](https://circleci.com/gh/erthink/libmdbx/tree/master.svg?style=svg)](https://circleci.com/gh/erthink/libmdbx/tree/master) 
+[![CirrusCI](https://api.cirrus-ci.com/github/erthink/libmdbx.svg)](https://cirrus-ci.com/github/erthink/libmdbx) [![Coverity Scan Status](https://scan.coverity.com/projects/12915/badge.svg)](https://scan.coverity.com/projects/reopen-libmdbx) -[![Build Status](https://api.cirrus-ci.com/github/erthink/libmdbx.svg)](https://cirrus-ci.com/github/erthink/libmdbx) *The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.* @@ -484,7 +485,7 @@ Bindings | Runtime | GitHub | Author | | ------- | ------ | ------ | | [Nim](https://en.wikipedia.org/wiki/Nim_(programming_language)) | [NimDBX](https://github.com/snej/nimdbx) | [Jens Alfke](https://github.com/snej) -| Rust | [mdbx-rs](https://github.com/Kerollmops/mdbx-rs) | [Clément Renault](https://github.com/Kerollmops) | +| Rust | [heed](https://github.com/Kerollmops/heed), [mdbx-rs](https://github.com/Kerollmops/mdbx-rs) | [Clément Renault](https://github.com/Kerollmops) | | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | | .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | diff --git a/libs/libmdbx/src/VERSION b/libs/libmdbx/src/VERSION index b57017e6e7..ad5a80ff3c 100644 --- a/libs/libmdbx/src/VERSION +++ b/libs/libmdbx/src/VERSION @@ -1 +1 @@ -0.9.2.110 +0.9.2.136 diff --git a/libs/libmdbx/src/cmake/compiler.cmake b/libs/libmdbx/src/cmake/compiler.cmake index 22324a64f8..ef49da85b1 100644 --- a/libs/libmdbx/src/cmake/compiler.cmake +++ b/libs/libmdbx/src/cmake/compiler.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2020 Leonid Yuriev <leo@yuriev.ru>. +## Copyright (c) 2012-2021 Leonid Yuriev <leo@yuriev.ru>. ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. 
diff --git a/libs/libmdbx/src/cmake/profile.cmake b/libs/libmdbx/src/cmake/profile.cmake index cf285653c5..66f4ed873d 100644 --- a/libs/libmdbx/src/cmake/profile.cmake +++ b/libs/libmdbx/src/cmake/profile.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2020 Leonid Yuriev <leo@yuriev.ru>. +## Copyright (c) 2012-2021 Leonid Yuriev <leo@yuriev.ru>. ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/libs/libmdbx/src/cmake/utils.cmake b/libs/libmdbx/src/cmake/utils.cmake index 4a48a15a63..daf4b39d28 100644 --- a/libs/libmdbx/src/cmake/utils.cmake +++ b/libs/libmdbx/src/cmake/utils.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2020 Leonid Yuriev <leo@yuriev.ru>. +## Copyright (c) 2012-2021 Leonid Yuriev <leo@yuriev.ru>. ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/libs/libmdbx/src/man1/mdbx_chk.1 b/libs/libmdbx/src/man1/mdbx_chk.1 index 0348f90b06..1c13a91310 100644 --- a/libs/libmdbx/src/man1/mdbx_chk.1 +++ b/libs/libmdbx/src/man1/mdbx_chk.1 @@ -1,4 +1,4 @@ -.\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>. +.\" Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDBX_CHK 1 "2020-09-25" "MDBX 0.9.1" .SH NAME diff --git a/libs/libmdbx/src/man1/mdbx_copy.1 b/libs/libmdbx/src/man1/mdbx_copy.1 index 83fbc904b9..2a448cea8c 100644 --- a/libs/libmdbx/src/man1/mdbx_copy.1 +++ b/libs/libmdbx/src/man1/mdbx_copy.1 @@ -1,4 +1,4 @@ -.\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>. +.\" Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>. .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
diff --git a/libs/libmdbx/src/man1/mdbx_dump.1 b/libs/libmdbx/src/man1/mdbx_dump.1 index bdb4d55366..5a940a9d35 100644 --- a/libs/libmdbx/src/man1/mdbx_dump.1 +++ b/libs/libmdbx/src/man1/mdbx_dump.1 @@ -1,4 +1,4 @@ -.\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>. +.\" Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>. .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. diff --git a/libs/libmdbx/src/man1/mdbx_load.1 b/libs/libmdbx/src/man1/mdbx_load.1 index 950ae0283e..88db9740e1 100644 --- a/libs/libmdbx/src/man1/mdbx_load.1 +++ b/libs/libmdbx/src/man1/mdbx_load.1 @@ -1,4 +1,4 @@ -.\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>. +.\" Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>. .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. diff --git a/libs/libmdbx/src/man1/mdbx_stat.1 b/libs/libmdbx/src/man1/mdbx_stat.1 index ec0ffb1280..5cb70c2b3a 100644 --- a/libs/libmdbx/src/man1/mdbx_stat.1 +++ b/libs/libmdbx/src/man1/mdbx_stat.1 @@ -1,4 +1,4 @@ -.\" Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>. +.\" Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>. .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. diff --git a/libs/libmdbx/src/mdbx.c b/libs/libmdbx/src/mdbx.c index 895c263eef..6ceafe5097 100644 --- a/libs/libmdbx/src/mdbx.c +++ b/libs/libmdbx/src/mdbx.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * <http://www.OpenLDAP.org/license.html>. 
*/ #define MDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 +#define MDBX_BUILD_SOURCERY 37ec6726f69ef69c4e52fcfb87d700f44f37246b1b635f455d2a429776a8eec6_v0_9_2_136_g9f0ff86 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -104,7 +104,7 @@ #include "mdbx.h" /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -536,7 +536,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -999,15 +999,17 @@ extern void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ /* Atomics */ -#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic)) #include <cstdatomic> -#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ +#define MDBX_HAVE_C11ATOMICS +#elif !defined(__cplusplus) && \ + (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ !(defined(__GNUC__) || defined(__clang__))) #include <stdatomic.h> +#define MDBX_HAVE_C11ATOMICS #elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ #elif defined(_MSC_VER) #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ #pragma warning(disable : 4133) /* 'function': incompatible types - from \ @@ -1043,14 +1045,6 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { _ReadWriteBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel 
Compiler may mimic GCC and MSC */ __memory_barrier(); - if (type > MDBX_BARRIER_COMPILER) -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) - _mm_mfence(); -#else -#error "Unknown target for Intel Compiler, please report to us." -#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __compiler_barrier(); #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -1065,21 +1059,23 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) - atomic_thread_fence(__ATOMIC_SEQ_CST); +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) +#ifdef __clang__ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#else __atomic_thread_fence(__ATOMIC_SEQ_CST); +#endif #elif defined(__clang__) || defined(__GNUC__) __sync_synchronize(); -#elif defined(_MSC_VER) +#elif defined(_WIN32) || defined(_WIN64) MemoryBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__ia32__) _mm_mfence(); #else -#error "Unknown target for Intel Compiler, please report to us." 
+ __mf(); #endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __machine_rw_barrier(); @@ -1809,6 +1805,31 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /*----------------------------------------------------------------------------*/ /* Basic constants and types */ +typedef union { + volatile uint32_t weak; +#ifdef MDBX_HAVE_C11ATOMICS + volatile _Atomic uint32_t c11a; +#endif /* MDBX_HAVE_C11ATOMICS */ +} MDBX_atomic_uint32_t; + +typedef union { + volatile uint64_t weak; +#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) + volatile _Atomic uint64_t c11a; +#endif +#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC + __anonymous_struct_extension__ struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + MDBX_atomic_uint32_t low, high; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + MDBX_atomic_uint32_t high, low; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + }; +#endif +} MDBX_atomic_uint64_t; + /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the * maximum size of a data item. Data items larger than this size will @@ -1847,6 +1868,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ typedef uint32_t pgno_t; +typedef MDBX_atomic_uint32_t atomic_pgno_t; #define PRIaPGNO PRIu32 #define MAX_PAGENO UINT32_C(0x7FFFffff) #define MIN_PAGENO NUM_METAS @@ -1855,6 +1877,7 @@ typedef uint32_t pgno_t; /* A transaction ID. */ typedef uint64_t txnid_t; +typedef MDBX_atomic_uint64_t atomic_txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) @@ -1881,24 +1904,6 @@ typedef uint16_t indx_t; /* Core structures for database and shared memory (i.e. 
format definition) */ #pragma pack(push, 1) -typedef union mdbx_safe64 { - volatile uint64_t inconsistent; -#if MDBX_64BIT_ATOMIC - volatile uint64_t atomic; -#endif /* MDBX_64BIT_ATOMIC */ - __anonymous_struct_extension__ struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - volatile uint32_t low; - volatile uint32_t high; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - volatile uint32_t high; - volatile uint32_t low; -#else -#error "FIXME: Unsupported byte order" -#endif /* __BYTE_ORDER__ */ - }; -} mdbx_safe64_t; - /* Information about a single database in the environment. */ typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ @@ -2101,7 +2106,7 @@ typedef struct MDBX_reader { * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - mdbx_safe64_t /* txnid_t */ mr_txnid; + MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; /* The information we store in a single slot of the reader table. * In addition to a transaction ID, we also record the process and @@ -2113,23 +2118,18 @@ typedef struct MDBX_reader { * opening the lock file. */ /* The thread ID of the thread owning this txn. */ -#if MDBX_WORDBITS >= 64 - volatile uint64_t mr_tid; -#else - volatile uint32_t mr_tid; - volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch, - unused for now */ -#endif + MDBX_atomic_uint64_t mr_tid; + /* The process ID of the process owning this reader txn. */ - volatile uint32_t mr_pid; + MDBX_atomic_uint32_t mr_pid; /* The number of pages used in the reader's MVCC snapshot, * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ - volatile pgno_t mr_snapshot_pages_used; + atomic_pgno_t mr_snapshot_pages_used; /* Number of retired pages at the time this reader starts transaction. So, * at any time the difference mm_pages_retired - mr_snapshot_pages_retired * will give the number of pages which this reader restraining from reuse. 
*/ - volatile uint64_t mr_snapshot_pages_retired; + MDBX_atomic_uint64_t mr_snapshot_pages_retired; } MDBX_reader; /* The header for the reader table (a memory-mapped lock file). */ @@ -2142,25 +2142,25 @@ typedef struct MDBX_lockinfo { uint32_t mti_os_and_format; /* Flags which environment was opened. */ - volatile uint32_t mti_envmode; + MDBX_atomic_uint32_t mti_envmode; /* Threshold of un-synced-with-disk pages for auto-sync feature, * zero means no-threshold, i.e. auto-sync is disabled. */ - volatile pgno_t mti_autosync_threshold; + atomic_pgno_t mti_autosync_threshold; /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ - volatile uint32_t mti_meta_sync_txnid; + MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. * The time value is represented in a suitable system-dependent form, for * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). * Zero means timed auto-sync is disabled. */ - volatile uint64_t mti_autosync_period; + MDBX_atomic_uint64_t mti_autosync_period; /* Marker to distinguish uniqueness of DB/CLK.*/ - volatile uint64_t mti_bait_uniqueness; + MDBX_atomic_uint64_t mti_bait_uniqueness; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2169,21 +2169,21 @@ typedef struct MDBX_lockinfo { mdbx_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ - volatile txnid_t mti_oldest_reader; + atomic_txnid_t mti_oldest_reader; /* Timestamp of the last steady sync. Value is represented in a suitable * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or * clock_gettime(CLOCK_MONOTONIC). */ - volatile uint64_t mti_sync_timestamp; + MDBX_atomic_uint64_t mti_sync_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. 
*/ - volatile pgno_t mti_unsynced_pages; + atomic_pgno_t mti_unsynced_pages; /* Number of page which was discarded last time by madvise(MADV_FREE). */ - volatile pgno_t mti_discarded_tail; + atomic_pgno_t mti_discarded_tail; /* Timestamp of the last readers check. */ - volatile uint64_t mti_reader_check_timestamp; + MDBX_atomic_uint64_t mti_reader_check_timestamp; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2195,8 +2195,8 @@ typedef struct MDBX_lockinfo { /* The number of slots that have been used in the reader table. * This always records the maximum count, it is not decremented * when readers release their slots. */ - volatile unsigned mti_numreaders; - volatile unsigned mti_readers_refresh_flag; + MDBX_atomic_uint32_t mti_numreaders; + MDBX_atomic_uint32_t mti_readers_refresh_flag; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2251,7 +2251,7 @@ typedef struct MDBX_lockinfo { #if MDBX_WORDBITS >= 64 #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ - ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) + ((MAX_PAGESIZE - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) #define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 @@ -2535,7 +2535,7 @@ typedef struct MDBX_cursor_couple { /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) - uint32_t me_signature; + MDBX_atomic_uint32_t me_signature; /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ @@ -2581,11 +2581,11 @@ struct MDBX_env { mdbx_ipclock_t *me_wlock; #endif /* MDBX_LOCKING > 0 */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ - volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + atomic_txnid_t *me_oldest; /* ID of oldest reader last time we looked */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; /* Number of freelist items that can fit in a single overflow page */ @@ -2593,12 +2593,12 @@ struct MDBX_env { unsigned me_branch_nodemax; /* max size of a branch-node */ uint32_t me_live_reader; /* have liveness lock in reader table */ void *me_userctx; /* User-settable context */ - volatile uint64_t *me_sync_timestamp; - volatile uint64_t *me_autosync_period; - volatile pgno_t *me_unsynced_pages; - volatile pgno_t *me_autosync_threshold; - volatile pgno_t *me_discarded_tail; - volatile uint32_t *me_meta_sync_txnid; + MDBX_atomic_uint64_t *me_sync_timestamp; + MDBX_atomic_uint64_t *me_autosync_period; + atomic_pgno_t *me_unsynced_pages; + atomic_pgno_t *me_autosync_threshold; + atomic_pgno_t *me_discarded_tail; + MDBX_atomic_uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ unsigned me_dp_reserve_len; struct { @@ -2615,13 +2615,13 @@ struct MDBX_env { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; #endif /* MDBX_LOCKING > 0 */ - txnid_t oldest; - uint64_t sync_timestamp; - uint64_t autosync_period; - pgno_t autosync_pending; - pgno_t autosync_threshold; - pgno_t discarded_tail; - uint32_t 
meta_sync_txnid; + atomic_txnid_t oldest; + MDBX_atomic_uint64_t sync_timestamp; + MDBX_atomic_uint64_t autosync_period; + atomic_pgno_t autosync_pending; + atomic_pgno_t autosync_threshold; + atomic_pgno_t discarded_tail; + MDBX_atomic_uint32_t meta_sync_txnid; } me_lckless_stub; #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ @@ -3043,7 +3043,7 @@ static __maybe_unused void static_checks(void) { } #endif /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru>. + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>. * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -3383,16 +3383,24 @@ __cold int mdbx_env_get_maxkeysize(const MDBX_env *env) { __cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env, MDBX_db_flags_t flags) { - if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) + if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) return -1; return (int)mdbx_limits_keysize_max((intptr_t)env->me_psize, flags); } +size_t mdbx_default_pagesize(void) { + size_t pagesize = mdbx_syspagesize(); + mdbx_ensure(nullptr, is_powerof2(pagesize)); + pagesize = (pagesize >= MIN_PAGESIZE) ? pagesize : MIN_PAGESIZE; + pagesize = (pagesize <= MAX_PAGESIZE) ? 
pagesize : MAX_PAGESIZE; + return pagesize; +} + __cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags) { if (pagesize < 1) - pagesize = (intptr_t)mdbx_syspagesize(); + pagesize = (intptr_t)mdbx_default_pagesize(); if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || !is_powerof2((size_t)pagesize))) @@ -3417,7 +3425,7 @@ __cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize, __cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env, MDBX_db_flags_t flags) { - if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) + if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) return -1; return (int)mdbx_limits_valsize_max((intptr_t)env->me_psize, flags); @@ -3426,7 +3434,7 @@ __cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env, __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags) { if (pagesize < 1) - pagesize = (intptr_t)mdbx_syspagesize(); + pagesize = (intptr_t)mdbx_default_pagesize(); if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || !is_powerof2((size_t)pagesize))) @@ -3789,10 +3797,163 @@ size_t __hot mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { /*------------------------------------------------------------------------------ * safe read/write volatile 64-bit fields on 32-bit architectures. 
*/ +enum MDBX_memory_order { + mo_Relaxed, + mo_AcquireRelease, + mo_SequentialConsistency +}; + +#ifdef MDBX_HAVE_C11ATOMICS + +#if defined(__clang__) && __clang__ < 8 +#define MDBX_WORKAROUND_CLANG_C11ATOMICS_BUG(type, ptr) \ + ((volatile _Atomic(type) *)(ptr)) +#else +#define MDBX_WORKAROUND_CLANG_C11ATOMICS_BUG(type, ptr) (ptr) +#endif /* __clang__ < 8.x */ + +static __always_inline memory_order mo_c11_store(enum MDBX_memory_order fence) { + switch (fence) { + default: + assert(false); + __unreachable(); + case mo_Relaxed: + return memory_order_relaxed; + case mo_AcquireRelease: + return memory_order_release; + case mo_SequentialConsistency: + return memory_order_seq_cst; + } +} + +static __always_inline memory_order mo_c11_load(enum MDBX_memory_order fence) { + switch (fence) { + default: + assert(false); + __unreachable(); + case mo_Relaxed: + return memory_order_relaxed; + case mo_AcquireRelease: + return memory_order_acquire; + case mo_SequentialConsistency: + return memory_order_seq_cst; + } +} +#endif /* MDBX_HAVE_C11ATOMICS */ + +static __maybe_unused __always_inline void mdbx_memory_fence(bool checkpoint, + bool write) { +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence( + checkpoint ? memory_order_seq_cst + : (write ? 
memory_order_release : memory_order_acquire)); +#else /* MDBX_HAVE_C11ATOMICS */ + mdbx_compiler_barrier(); + if (checkpoint || (write && MDBX_CPU_WRITEBACK_INCOHERENT)) + mdbx_memory_barrier(); +#endif /* MDBX_HAVE_C11ATOMICS */ +} + +static __maybe_unused __always_inline uint32_t +atomic_store32(MDBX_atomic_uint32_t *ptr, const uint32_t value, + enum MDBX_memory_order order) { + STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(&ptr->c11a)); + atomic_store_explicit(&ptr->c11a, value, mo_c11_store(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + if (order != mo_Relaxed) + mdbx_compiler_barrier(); + ptr->weak = value; + if (order != mo_Relaxed) + mdbx_memory_fence(order == mo_SequentialConsistency, true); +#endif /* MDBX_HAVE_C11ATOMICS */ + return value; +} + +static __maybe_unused __always_inline uint32_t +atomic_load32(const MDBX_atomic_uint32_t *ptr, enum MDBX_memory_order order) { + STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(&ptr->c11a)); + return atomic_load_explicit( + MDBX_WORKAROUND_CLANG_C11ATOMICS_BUG(uint32_t, &ptr->c11a), + mo_c11_load(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + if (order != mo_Relaxed) + mdbx_memory_fence(order == mo_SequentialConsistency, false); + const uint32_t value = ptr->weak; + if (order != mo_Relaxed) + mdbx_compiler_barrier(); + return value; +#endif /* MDBX_HAVE_C11ATOMICS */ +} + +static __always_inline uint64_t atomic_store64(MDBX_atomic_uint64_t *ptr, + const uint64_t value, + enum MDBX_memory_order order) { + STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); +#if MDBX_64BIT_ATOMIC +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(&ptr->c11a)); + atomic_store_explicit(&ptr->c11a, value, mo_c11_store(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + if (order != mo_Relaxed) + mdbx_compiler_barrier(); + ptr->weak = value; + if (order != mo_Relaxed) + mdbx_memory_fence(order == mo_SequentialConsistency, 
true); +#endif /* MDBX_HAVE_C11ATOMICS */ +#else /* !MDBX_64BIT_ATOMIC */ + atomic_store32(&ptr->low, (uint32_t)value, + (order == mo_Relaxed) ? mo_Relaxed : mo_AcquireRelease); + mdbx_jitter4testing(true); + atomic_store32(&ptr->high, (uint32_t)(value >> 32), order); + mdbx_jitter4testing(true); +#endif /* !MDBX_64BIT_ATOMIC */ + return value; +} + +static __always_inline uint64_t atomic_load64(const MDBX_atomic_uint64_t *ptr, + enum MDBX_memory_order order) { + STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); +#if MDBX_64BIT_ATOMIC +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(&ptr->c11a)); + return atomic_load_explicit( + MDBX_WORKAROUND_CLANG_C11ATOMICS_BUG(uint64_t, &ptr->c11a), + mo_c11_load(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + if (order != mo_Relaxed) + mdbx_memory_fence(order == mo_SequentialConsistency, false); + const uint64_t value = ptr->weak; + if (order != mo_Relaxed) + mdbx_compiler_barrier(); + return value; +#endif /* MDBX_HAVE_C11ATOMICS */ +#else /* !MDBX_64BIT_ATOMIC */ + uint64_t value = (uint64_t)atomic_load32(&ptr->high, order) << 32; + mdbx_jitter4testing(true); + value |= atomic_load32(&ptr->low, (order == mo_Relaxed) ? mo_Relaxed + : mo_AcquireRelease); + mdbx_jitter4testing(true); + for (;;) { + uint64_t again = (uint64_t)atomic_load32(&ptr->high, order) << 32; + mdbx_jitter4testing(true); + again |= atomic_load32( + &ptr->low, (order == mo_Relaxed) ? 
mo_Relaxed : mo_AcquireRelease); + mdbx_jitter4testing(true); + if (likely(value == again)) + return value; + value = again; + } +#endif /* !MDBX_64BIT_ATOMIC */ +} + static __always_inline void atomic_yield(void) { #if defined(_WIN32) || defined(_WIN64) YieldProcessor(); -#elif defined(__x86_64__) || defined(__i386__) || defined(__e2k__) +#elif defined(__ia32__) || defined(__e2k__) __builtin_ia32_pause(); #elif defined(__ia64__) #if defined(__HP_cc__) || defined(__HP_aCC__) @@ -3822,97 +3983,76 @@ static __always_inline void atomic_yield(void) { } #if MDBX_64BIT_CAS -static __always_inline bool atomic_cas64(volatile uint64_t *p, uint64_t c, +static __always_inline bool atomic_cas64(MDBX_atomic_uint64_t *p, uint64_t c, uint64_t v) { -#if !defined(__STDC_NO_ATOMICS__) && \ - (defined(ATOMIC_VAR_INIT) || defined(ATOMIC_LLONG_LOCK_FREE) || \ - __has_extension(c_atomic)) +#ifdef MDBX_HAVE_C11ATOMICS STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); #ifdef ATOMIC_LLONG_LOCK_FREE STATIC_ASSERT(ATOMIC_LLONG_LOCK_FREE > 0); #if ATOMIC_LLONG_LOCK_FREE < 2 - assert(atomic_is_lock_free(p)); -#endif -#else - assert(atomic_is_lock_free(p)); -#endif -#ifdef __clang__ - STATIC_ASSERT(sizeof(_Atomic uint64_t) == sizeof(uint64_t)); - return atomic_compare_exchange_strong((_Atomic volatile uint64_t *)p, &c, v); -#else - return atomic_compare_exchange_strong(p, &c, v); + assert(atomic_is_lock_free(&p->c11a)); +#endif /* ATOMIC_LLONG_LOCK_FREE < 2 */ +#else /* defined(ATOMIC_LLONG_LOCK_FREE) */ + assert(atomic_is_lock_free(&p->c11a)); #endif + return atomic_compare_exchange_strong(&p->c11a, &c, v); #elif defined(__GNUC__) || defined(__clang__) - return __sync_bool_compare_and_swap(p, c, v); + return __sync_bool_compare_and_swap(&p->weak, c, v); #elif defined(_MSC_VER) - return c == - (uint64_t)_InterlockedCompareExchange64((volatile int64_t *)p, v, c); + return c == (uint64_t)_InterlockedCompareExchange64(&p->weak, v, c); #elif defined(__APPLE__) - return 
OSAtomicCompareAndSwap64Barrier(c, v, (volatile uint64_t *)p); + return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak); #else #error FIXME: Unsupported compiler #endif } #endif /* MDBX_64BIT_CAS */ -static __always_inline bool atomic_cas32(volatile uint32_t *p, uint32_t c, +static __always_inline bool atomic_cas32(MDBX_atomic_uint32_t *p, uint32_t c, uint32_t v) { -#if !defined(__STDC_NO_ATOMICS__) && \ - (defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) || \ - __has_extension(c_atomic)) +#ifdef MDBX_HAVE_C11ATOMICS STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); #ifdef ATOMIC_INT_LOCK_FREE STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); #if ATOMIC_INT_LOCK_FREE < 2 - assert(atomic_is_lock_free(p)); -#endif -#else - assert(atomic_is_lock_free(p)); + assert(atomic_is_lock_free(&p->c11a)); #endif -#ifdef __clang__ - STATIC_ASSERT(sizeof(_Atomic uint32_t) == sizeof(uint32_t)); - return atomic_compare_exchange_strong((_Atomic volatile uint32_t *)p, &c, v); #else - return atomic_compare_exchange_strong(p, &c, v); + assert(atomic_is_lock_free(&p->c11a)); #endif + return atomic_compare_exchange_strong(&p->c11a, &c, v); #elif defined(__GNUC__) || defined(__clang__) - return __sync_bool_compare_and_swap(p, c, v); + return __sync_bool_compare_and_swap(&p->weak, c, v); #elif defined(_MSC_VER) STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); - return c == (uint32_t)_InterlockedCompareExchange((volatile long *)p, v, c); + return c == (uint32_t)_InterlockedCompareExchange(&p->weak, v, c); #elif defined(__APPLE__) - return OSAtomicCompareAndSwap32Barrier(c, v, (volatile int32_t *)p); + return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak); #else #error FIXME: Unsupported compiler #endif } -static __always_inline uint32_t atomic_add32(volatile uint32_t *p, uint32_t v) { -#if !defined(__STDC_NO_ATOMICS__) && \ - (defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) || \ - __has_extension(c_atomic)) +static __always_inline uint32_t 
atomic_add32(MDBX_atomic_uint32_t *p, + uint32_t v) { +#ifdef MDBX_HAVE_C11ATOMICS STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); #ifdef ATOMIC_INT_LOCK_FREE STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); #if ATOMIC_INT_LOCK_FREE < 2 - assert(atomic_is_lock_free(p)); + assert(atomic_is_lock_free(&p->c11a)); #endif #else - assert(atomic_is_lock_free(p)); -#endif -#ifdef __clang__ - STATIC_ASSERT(sizeof(_Atomic uint32_t) == sizeof(uint32_t)); - return atomic_fetch_add((_Atomic volatile uint32_t *)p, v); -#else - return atomic_fetch_add(p, v); + assert(atomic_is_lock_free(&p->c11a)); #endif + return atomic_fetch_add(&p->c11a, v); #elif defined(__GNUC__) || defined(__clang__) - return __sync_fetch_and_add(p, v); + return __sync_fetch_and_add(&p->weak, v); #elif defined(_MSC_VER) STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); - return _InterlockedExchangeAdd((volatile long *)p, v); + return _InterlockedExchangeAdd(&p->weak, v); #elif defined(__APPLE__) - return OSAtomicAdd32Barrier(v, (volatile int32_t *)p); + return OSAtomicAdd32Barrier(v, &p->weak); #else #error FIXME: Unsupported compiler #endif @@ -3920,24 +4060,6 @@ static __always_inline uint32_t atomic_add32(volatile uint32_t *p, uint32_t v) { #define atomic_sub32(p, v) atomic_add32(p, 0 - (v)) -static __maybe_unused __always_inline bool safe64_is_valid(uint64_t v) { -#if MDBX_WORDBITS >= 64 - return v < SAFE64_INVALID_THRESHOLD; -#else - return (v >> 32) != UINT32_MAX; -#endif /* MDBX_WORDBITS */ -} - -static __maybe_unused __always_inline bool -safe64_is_valid_ptr(const mdbx_safe64_t *ptr) { - mdbx_compiler_barrier(); -#if MDBX_64BIT_ATOMIC - return ptr->atomic < SAFE64_INVALID_THRESHOLD; -#else - return ptr->high != UINT32_MAX; -#endif /* MDBX_64BIT_ATOMIC */ -} - static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) { txnid += MDBX_TXNID_STEP; #if !MDBX_64BIT_CAS @@ -3947,35 +4069,34 @@ static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) { return txnid; } -static 
__always_inline void safe64_reset(mdbx_safe64_t *ptr, +static __always_inline void safe64_reset(MDBX_atomic_uint64_t *ptr, bool single_writer) { - mdbx_compiler_barrier(); #if !MDBX_64BIT_CAS if (!single_writer) { STATIC_ASSERT(MDBX_TXNID_STEP > 1); /* it is safe to increment low-part to avoid ABA, since MDBX_TXNID_STEP > 1 * and overflow was preserved in safe64_txnid_next() */ atomic_add32(&ptr->low, 1) /* avoid ABA in safe64_reset_compare() */; - ptr->high = UINT32_MAX /* atomically make >= SAFE64_INVALID_THRESHOLD */; + atomic_store32( + &ptr->high, UINT32_MAX, + mo_AcquireRelease) /* atomically make >= SAFE64_INVALID_THRESHOLD */; atomic_add32(&ptr->low, 1) /* avoid ABA in safe64_reset_compare() */; } else +#elif MDBX_64BIT_ATOMIC + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ + atomic_store64(ptr, UINT64_MAX, + single_writer ? mo_AcquireRelease : mo_SequentialConsistency); #else - (void)single_writer; -#endif /* !MDBX_64BIT_CAS */ -#if MDBX_64BIT_ATOMIC - ptr->atomic = UINT64_MAX; -#else - /* atomically make value >= SAFE64_INVALID_THRESHOLD */ - ptr->high = UINT32_MAX; + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ + atomic_store32(&ptr->high, UINT32_MAX, + single_writer ? mo_AcquireRelease : mo_SequentialConsistency); #endif /* MDBX_64BIT_ATOMIC */ - assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); - mdbx_flush_incoherent_cpu_writeback(); + assert(ptr->weak >= SAFE64_INVALID_THRESHOLD); mdbx_jitter4testing(true); } -static __always_inline bool safe64_reset_compare(mdbx_safe64_t *ptr, +static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *ptr, txnid_t compare) { - mdbx_compiler_barrier(); /* LY: This function is used to reset `mr_txnid` from hsr-handler in case * the asynchronously cancellation of read transaction. 
Therefore, * there may be a collision between the cleanup performed here and @@ -3983,16 +4104,16 @@ static __always_inline bool safe64_reset_compare(mdbx_safe64_t *ptr, * in another proces/thread. In general we MUST NOT reset the `mr_txnid` * if a new transaction was started (i.e. if `mr_txnid` was changed). */ #if MDBX_64BIT_CAS - bool rc = atomic_cas64(&ptr->inconsistent, compare, UINT64_MAX); - mdbx_flush_incoherent_cpu_writeback(); + bool rc = atomic_cas64(ptr, compare, UINT64_MAX); #else /* LY: There is no gold ratio here since shared mutex is too costly, * in such way we must acquire/release it for every update of mr_txnid, * i.e. twice for each read transaction). */ bool rc = false; - if (likely(ptr->low == (uint32_t)compare && + if (likely(atomic_load32(&ptr->low, mo_AcquireRelease) == (uint32_t)compare && atomic_cas32(&ptr->high, (uint32_t)(compare >> 32), UINT32_MAX))) { - if (unlikely(ptr->low != (uint32_t)compare)) + if (unlikely(atomic_load32(&ptr->low, mo_AcquireRelease) != + (uint32_t)compare)) atomic_cas32(&ptr->high, UINT32_MAX, (uint32_t)(compare >> 32)); else rc = true; @@ -4002,51 +4123,53 @@ static __always_inline bool safe64_reset_compare(mdbx_safe64_t *ptr, return rc; } -static __always_inline void safe64_write(mdbx_safe64_t *ptr, const uint64_t v) { - mdbx_compiler_barrier(); - assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); +static __always_inline void safe64_write(MDBX_atomic_uint64_t *ptr, + const uint64_t v) { + assert(ptr->weak >= SAFE64_INVALID_THRESHOLD); #if MDBX_64BIT_ATOMIC - ptr->atomic = v; + atomic_store64(ptr, v, mo_AcquireRelease); #else /* MDBX_64BIT_ATOMIC */ /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ - ptr->low = (uint32_t)v; - assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); - mdbx_flush_incoherent_cpu_writeback(); + atomic_store32(&ptr->low, (uint32_t)v, mo_AcquireRelease); + assert(ptr->weak >= SAFE64_INVALID_THRESHOLD); mdbx_jitter4testing(true); /* update high-part from 
SAFE64_INVALID_THRESHOLD to actual value */ - ptr->high = (uint32_t)(v >> 32); + atomic_store32(&ptr->high, (uint32_t)(v >> 32), mo_AcquireRelease); #endif /* MDBX_64BIT_ATOMIC */ - assert(ptr->inconsistent == v); - mdbx_flush_incoherent_cpu_writeback(); + assert(ptr->weak == v); mdbx_jitter4testing(true); } -static __always_inline uint64_t safe64_read(const mdbx_safe64_t *ptr) { - mdbx_compiler_barrier(); +static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *ptr) { mdbx_jitter4testing(true); - uint64_t v; -#if MDBX_64BIT_ATOMIC - v = ptr->atomic; -#else /* MDBX_64BIT_ATOMIC */ - uint32_t hi, lo; - do { - hi = ptr->high; - mdbx_compiler_barrier(); - mdbx_jitter4testing(true); - lo = ptr->low; - mdbx_compiler_barrier(); - mdbx_jitter4testing(true); - } while (unlikely(hi != ptr->high)); - v = lo | (uint64_t)hi << 32; -#endif /* MDBX_64BIT_ATOMIC */ + uint64_t v = atomic_load64(ptr, mo_AcquireRelease); mdbx_jitter4testing(true); return v; } -#if 0 /* unused for now */ -static __always_inline void safe64_update(mdbx_safe64_t *ptr, +#if 0 /* unused for now */ + static __maybe_unused __always_inline bool safe64_is_valid(uint64_t v) { +#if MDBX_WORDBITS >= 64 + return v < SAFE64_INVALID_THRESHOLD; +#else + return (v >> 32) != UINT32_MAX; +#endif /* MDBX_WORDBITS */ +} + + static __maybe_unused __always_inline bool + safe64_is_valid_ptr(const MDBX_atomic_uint64_t *ptr) { +#if MDBX_64BIT_ATOMIC + return atomic_load64(ptr, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD; +#else + return atomic_load32(&ptr->high, mo_AcquireRelease) != UINT32_MAX; +#endif /* MDBX_64BIT_ATOMIC */ +} + +static __always_inline void safe64_update(MDBX_atomic_uint64_t *ptr, const uint64_t v) { +#if MDBX_64BIT_ATOMIC safe64_reset(ptr, true); +#endif /* MDBX_64BIT_ATOMIC */ safe64_write(ptr, v); } #endif /* unused for now */ @@ -4089,7 +4212,7 @@ static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; static 
pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; static mdbx_thread_key_t rthc_key; -static volatile uint32_t rthc_pending; +static MDBX_atomic_uint32_t rthc_pending; static void __cold workaround_glibc_bug21031(void) { /* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031 @@ -4246,11 +4369,11 @@ __cold void mdbx_rthc_thread_dtor(void *ptr) { mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), i, __Wpedantic_format_voidptr(rthc_table[i].begin), __Wpedantic_format_voidptr(rthc_table[i].end), - (int)(rthc - rthc_table[i].begin), rthc->mr_pid, self_pid); - if (rthc->mr_pid == self_pid) { + (int)(rthc - rthc_table[i].begin), rthc->mr_pid.weak, self_pid); + if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { mdbx_trace("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", mdbx_thread_self(), __Wpedantic_format_voidptr(rthc)); - rthc->mr_pid = 0; + atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); } } @@ -4265,7 +4388,7 @@ __cold void mdbx_rthc_thread_dtor(void *ptr) { if (self_registration == MDBX_THREAD_RTHC_COUNTED) mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0); - if (rthc_pending == 0) { + if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) { mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", mdbx_thread_self(), ptr, mdbx_getpid()); mdbx_ensure(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); @@ -4307,7 +4430,8 @@ __cold void mdbx_rthc_global_dtor(void) { abstime.tv_sec += 600; #endif - for (unsigned left; (left = rthc_pending) > 0;) { + for (unsigned left; + (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) { mdbx_trace("pid %d, pending %u, wait for...", mdbx_getpid(), left); const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); if (rc && rc != EINTR) @@ -4324,15 +4448,15 @@ __cold void mdbx_rthc_global_dtor(void) { thread_key_delete(key); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { - mdbx_trace("== [%i] = key %" PRIuPTR ", %p ... 
%p, rthc %p (%+i), " - "rthc-pid %i, current-pid %i", - i, (uintptr_t)key, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - __Wpedantic_format_voidptr(rthc), - (int)(rthc - rthc_table[i].begin), rthc->mr_pid, self_pid); - if (rthc->mr_pid == self_pid) { - rthc->mr_pid = 0; + mdbx_trace( + "== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " + "rthc-pid %i, current-pid %i", + i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), + __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), + rthc->mr_pid.weak, self_pid); + if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { + atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } @@ -4420,8 +4544,8 @@ __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { - if (rthc->mr_pid == self_pid) { - rthc->mr_pid = 0; + if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { + atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } @@ -4475,7 +4599,7 @@ static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { int rc; uint64_t bait; if (pending->address) { - bait = pending->lck->mti_bait_uniqueness; + bait = atomic_load64(&pending->lck->mti_bait_uniqueness, mo_AcquireRelease); rc = MDBX_SUCCESS; } else { bait = 0 /* hush MSVC warning */; @@ -4485,7 +4609,8 @@ static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { mdbx_pread(pending->fd, &bait, sizeof(scan->lck->mti_bait_uniqueness), offsetof(MDBX_lockinfo, mti_bait_uniqueness)); } - if (likely(rc == MDBX_SUCCESS) && bait == scan->lck->mti_bait_uniqueness) + if (likely(rc == MDBX_SUCCESS) && + bait == atomic_load64(&scan->lck->mti_bait_uniqueness, mo_AcquireRelease)) rc = MDBX_RESULT_TRUE; 
mdbx_trace("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", @@ -4507,8 +4632,8 @@ static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan, rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)mdbx_getpid()) << 24 | *abra >> 40; - scan->lck->mti_bait_uniqueness = cadabra; - mdbx_flush_incoherent_cpu_writeback(); + atomic_store64(&scan->lck->mti_bait_uniqueness, cadabra, + mo_SequentialConsistency); *abra = *abra * UINT64_C(6364136223846793005) + 1; return uniq_peek(pending, scan); } @@ -4518,7 +4643,8 @@ __cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { uint64_t salt = 0; for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; scan = scan->me_lcklist_next) { - int err = scan->me_lck_mmap.lck->mti_bait_uniqueness + int err = atomic_load64(&scan->me_lck_mmap.lck->mti_bait_uniqueness, + mo_AcquireRelease) ? uniq_peek(pending, &scan->me_lck_mmap) : uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_ENODATA) { @@ -6070,7 +6196,7 @@ static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { /*----------------------------------------------------------------------------*/ static __always_inline size_t dpl2bytes(const ptrdiff_t size) { - assert(size > 2 && size <= MDBX_PGL_LIMIT); + assert(size > 2 && (size_t)size <= MDBX_PGL_LIMIT); size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) + (size + 2) * sizeof(MDBX_dp), MDBX_PNL_GRANULATE * sizeof(void *) * 2) - @@ -6897,9 +7023,9 @@ static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { static void mdbx_dlist_free(MDBX_txn *txn) { MDBX_env *env = txn->mt_env; MDBX_dpl *const dl = txn->tw.dirtylist; - const size_t n = dl->length; + const size_t len = dl->length; - for (size_t i = 1; i <= n; i++) { + for (size_t i = 1; i <= len; i++) { MDBX_page *dp = dl->items[i].ptr; mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? 
dp->mp_pages : 1); } @@ -7716,7 +7842,7 @@ static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta, const int lck_exclusive) { return lck_exclusive ? /* exclusive lock */ meta_bootid_match(meta) : /* db already opened */ env->me_lck && - (env->me_lck->mti_envmode & MDBX_RDONLY) == 0; + (env->me_lck->mti_envmode.weak & MDBX_RDONLY) == 0; } #define METAPAGE(env, n) page_meta(pgno2page(env, n)) @@ -7724,6 +7850,7 @@ static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta, static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta, const bool allow_volatile) { + mdbx_memory_fence(mo_AcquireRelease, false); txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); if (allow_volatile) @@ -7749,6 +7876,7 @@ static __inline void mdbx_meta_update_begin(const MDBX_env *env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; unaligned_poke_u64(4, meta->mm_txnid_b, 0); + mdbx_memory_fence(mo_AcquireRelease, true); unaligned_poke_u64(4, meta->mm_txnid_a, txnid); } @@ -7761,6 +7889,7 @@ static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta, mdbx_jitter4testing(true); memcpy(&meta->mm_bootid, &bootid, 16); unaligned_poke_u64(4, meta->mm_txnid_b, txnid); + mdbx_memory_fence(mo_AcquireRelease, true); } static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, @@ -7920,25 +8049,29 @@ static txnid_t mdbx_find_oldest(const MDBX_txn *txn) { MDBX_lockinfo *const lck = env->me_lck; if (unlikely(lck == NULL /* exclusive mode */)) - return env->me_lckless_stub.oldest = edge; + return atomic_store64(&env->me_lckless_stub.oldest, edge, + mo_AcquireRelease); - const txnid_t last_oldest = lck->mti_oldest_reader; + const txnid_t last_oldest = + atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); mdbx_tassert(txn, edge >= last_oldest); if (likely(last_oldest == edge)) return edge; const uint32_t nothing_changed = 
MDBX_STRING_TETRAD("None"); - const uint32_t snap_readers_refresh_flag = lck->mti_readers_refresh_flag; + const uint32_t snap_readers_refresh_flag = + atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease); mdbx_jitter4testing(false); if (snap_readers_refresh_flag == nothing_changed) return last_oldest; txnid_t oldest = edge; - lck->mti_readers_refresh_flag = nothing_changed; - mdbx_flush_incoherent_cpu_writeback(); - const unsigned snap_nreaders = lck->mti_numreaders; + atomic_store32(&lck->mti_readers_refresh_flag, nothing_changed, + mo_AcquireRelease); + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid) { + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { /* mdbx_jitter4testing(true); */ const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid); if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) { @@ -7951,8 +8084,8 @@ static txnid_t mdbx_find_oldest(const MDBX_txn *txn) { if (oldest != last_oldest) { mdbx_notice("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, oldest); - mdbx_tassert(txn, oldest >= lck->mti_oldest_reader); - lck->mti_oldest_reader = oldest; + mdbx_tassert(txn, oldest >= lck->mti_oldest_reader.weak); + atomic_store64(&lck->mti_oldest_reader, oldest, mo_AcquireRelease); } return oldest; } @@ -7961,19 +8094,24 @@ static txnid_t mdbx_find_oldest(const MDBX_txn *txn) { static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck; if (likely(lck != NULL /* exclusive mode */)) { - const unsigned snap_nreaders = lck->mti_numreaders; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { retry: - if (lck->mti_readers[i].mr_pid) { + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { /* mdbx_jitter4testing(true); */ - const 
pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages_used; + const pgno_t snap_pages = atomic_load32( + &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - mdbx_memory_barrier(); - if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages_used || - snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) + if (unlikely( + snap_pages != + atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used, + mo_AcquireRelease) || + snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) goto retry; if (largest < snap_pages && - lck->mti_oldest_reader <= /* ignore pending updates */ snap_txnid && + atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <= + /* ignore pending updates */ snap_txnid && snap_txnid <= env->me_txn0->mt_txnid) largest = snap_pages; } @@ -8103,7 +8241,7 @@ static __cold int mdbx_set_readahead(MDBX_env *env, const size_t offset, static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, const pgno_t size_pgno, const pgno_t limit_pgno, const bool implicit) { - if ((env->me_flags & MDBX_WRITEMAP) && *env->me_unsynced_pages) { + if ((env->me_flags & MDBX_WRITEMAP) && env->me_unsynced_pages->weak) { int err = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), true); if (unlikely(err != MDBX_SUCCESS)) @@ -8176,11 +8314,12 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, /* looking for readers from this process */ MDBX_lockinfo *const lck = env->me_lck; - const unsigned snap_nreaders = lck->mti_numreaders; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); mapping_can_be_moved = true; for (unsigned i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid == env->me_pid && - lck->mti_readers[i].mr_tid != mdbx_thread_self()) { + if (lck->mti_readers[i].mr_pid.weak == env->me_pid && + lck->mti_readers[i].mr_tid.weak != mdbx_thread_self()) { /* the base 
address of the mapping can't be changed since * the other reader thread from this process exists. */ mdbx_rdt_unlock(env); @@ -8223,8 +8362,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, #endif /* MADV_DONTNEED */ if (unlikely(MDBX_IS_ERROR(rc))) goto bailout; - if (*env->me_discarded_tail > size_pgno) - *env->me_discarded_tail = size_pgno; + if (env->me_discarded_tail->weak > size_pgno) + env->me_discarded_tail->weak = size_pgno; } rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes, @@ -8246,7 +8385,7 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, because it was remapped */ : prev_size; if (size_bytes > readahead_pivot) { - *env->me_discarded_tail = size_pgno; + env->me_discarded_tail->weak = size_pgno; rc = mdbx_set_readahead(env, readahead_pivot, size_bytes - readahead_pivot, true); } @@ -8383,7 +8522,8 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { if (likely(env->me_lck)) /* force oldest refresh */ - env->me_lck->mti_readers_refresh_flag = true; + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + mo_AcquireRelease); return MDBX_SUCCESS; } @@ -8534,8 +8674,9 @@ no_loose: break /* reclaiming is prohibited for now */; /* Prepare to fetch more and coalesce */ - oldest = (flags & MDBX_LIFORECLAIM) ? mdbx_find_oldest(txn) - : *env->me_oldest; + oldest = (flags & MDBX_LIFORECLAIM) + ? 
mdbx_find_oldest(txn) + : atomic_load64(env->me_oldest, mo_AcquireRelease); rc = mdbx_cursor_init(&recur.outer, txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -8640,15 +8781,24 @@ no_loose: goto fail; } const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); - if (flags != MDBX_ALLOC_GC && - unlikely(gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > + if (unlikely(/* resulting list is tool long */ gc_len + + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > env->me_options.rp_augment_limit) && - (pgno_add(txn->mt_next_pgno, num) <= txn->mt_geo.upper || + (((/* not a slot-request from gc-update */ + mp || (flags & MDBX_LIFORECLAIM) == 0 || + (txn->tw.lifo_reclaimed && + MDBX_PNL_SIZE(txn->tw.lifo_reclaimed))) && + /* have enough unallocated space */ pgno_add( + txn->mt_next_pgno, num) <= txn->mt_geo.upper) || gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >= MDBX_PGL_LIMIT / 16 * 15)) { /* Stop reclaiming to avoid overflow the page list. * This is a rare case while search for a continuously multi-page region * in a large database. 
https://github.com/erthink/libmdbx/issues/123 */ + mdbx_debug("stop reclaiming to avoid PNL overflow: %u (current) + %u " + "(chunk) -> %u", + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len, + gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); break; } @@ -8734,8 +8884,10 @@ no_loose: mdbx_meta_txnid_stable(env, steady), mdbx_durable_str(steady), oldest); rc = MDBX_RESULT_TRUE; - const pgno_t autosync_threshold = *env->me_autosync_threshold; - const uint64_t autosync_period = *env->me_autosync_period; + const pgno_t autosync_threshold = + atomic_load32(env->me_autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(env->me_autosync_period, mo_Relaxed); /* wipe the last steady-point if one of: * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted @@ -8754,9 +8906,11 @@ no_loose: mdbx_assert(env, steady != mdbx_meta_steady(env)); } else if ((flags & MDBX_ALLOC_NEW) == 0 || (autosync_threshold && - *env->me_unsynced_pages >= autosync_threshold) || + atomic_load32(env->me_unsynced_pages, mo_Relaxed) >= + autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - *env->me_sync_timestamp >= + mdbx_osal_monotime() - + atomic_load64(env->me_sync_timestamp, mo_Relaxed) >= autosync_period) || next >= txn->mt_geo.upper || (next >= txn->mt_end_pgno && @@ -9115,7 +9269,7 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, bool force, int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; bool need_unlock = false; - if (nonblock && *env->me_unsynced_pages == 0) + if (nonblock && atomic_load32(env->me_unsynced_pages, mo_AcquireRelease) == 0) goto fastpath; const bool outside_txn = (env->me_txn0->mt_owner != mdbx_thread_self()); @@ -9127,13 +9281,17 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, bool force, } const MDBX_meta *head = mdbx_meta_head(env); - pgno_t unsynced_pages = *env->me_unsynced_pages; 
+ pgno_t unsynced_pages = atomic_load32(env->me_unsynced_pages, mo_Relaxed); if (!META_IS_STEADY(head) || unsynced_pages) { - const pgno_t autosync_threshold = *env->me_autosync_threshold; - const uint64_t autosync_period = *env->me_autosync_period; + const pgno_t autosync_threshold = + atomic_load32(env->me_autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(env->me_autosync_period, mo_Relaxed); if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - *env->me_sync_timestamp >= autosync_period)) + mdbx_osal_monotime() - + atomic_load64(env->me_sync_timestamp, mo_Relaxed) >= + autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; if (outside_txn) { @@ -9157,7 +9315,7 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, bool force, /* LY: head and unsynced_pages may be changed. */ head = mdbx_meta_head(env); - unsynced_pages = *env->me_unsynced_pages; + unsynced_pages = atomic_load32(env->me_unsynced_pages, mo_Relaxed); } env->me_txn0->mt_txnid = meta_txnid(env, head, false); mdbx_find_oldest(env->me_txn0); @@ -9185,13 +9343,15 @@ fastpath: * and someone was not synced above. */ if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_NOMETASYNC) != 0) { const txnid_t head_txnid = mdbx_recent_committed_txnid(env); - if (*env->me_meta_sync_txnid != (uint32_t)head_txnid) { + if (atomic_load32(env->me_meta_sync_txnid, mo_Relaxed) != + (uint32_t)head_txnid) { rc = (flags & MDBX_WRITEMAP) ? 
mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), false) : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (likely(rc == MDBX_SUCCESS)) - *env->me_meta_sync_txnid = (uint32_t)head_txnid; + atomic_store32(env->me_meta_sync_txnid, (uint32_t)head_txnid, + mo_Relaxed); } } if (need_unlock) @@ -9203,7 +9363,7 @@ static __inline int check_env(const MDBX_env *env) { if (unlikely(!env)) return MDBX_EINVAL; - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; #if MDBX_ENV_CHECKPID @@ -9332,19 +9492,25 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck; if (likely(lck != NULL /* exclusive mode */)) { - const unsigned snap_nreaders = lck->mti_numreaders; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { retry: - if (lck->mti_readers[i].mr_pid == env->me_pid) { + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) == + env->me_pid) { /* mdbx_jitter4testing(true); */ - const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages_used; + const pgno_t snap_pages = atomic_load32( + &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - mdbx_memory_barrier(); - if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages_used || - snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) + if (unlikely( + snap_pages != + atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used, + mo_AcquireRelease) || + snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) goto retry; if (largest < snap_pages && - lck->mti_oldest_reader <= /* ignore pending updates */ snap_txnid && + atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <= + /* ignore pending 
updates */ snap_txnid && snap_txnid <= MAX_TXNID) largest = snap_pages; } @@ -9437,9 +9603,10 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { result.err = MDBX_SUCCESS; unsigned slot, nreaders; while (1) { - nreaders = env->me_lck->mti_numreaders; + nreaders = atomic_load32(&env->me_lck->mti_numreaders, mo_Relaxed); for (slot = 0; slot < nreaders; slot++) - if (env->me_lck->mti_readers[slot].mr_pid == 0) + if (atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, mo_Relaxed) == + 0) break; if (likely(slot < env->me_maxreaders)) @@ -9460,12 +9627,13 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { * slot, next publish it in lck->mti_numreaders. After * that, it is safe for mdbx_env_close() to touch it. * When it will be closed, we can finally claim it. */ - result.rslot->mr_pid = 0; + atomic_store32(&result.rslot->mr_pid, 0, mo_Relaxed); safe64_reset(&result.rslot->mr_txnid, true); if (slot == nreaders) - env->me_lck->mti_numreaders = ++nreaders; - result.rslot->mr_tid = (env->me_flags & MDBX_NOTLS) ? 0 : tid; - result.rslot->mr_pid = env->me_pid; + atomic_store32(&env->me_lck->mti_numreaders, ++nreaders, mo_Relaxed); + atomic_store64(&result.rslot->mr_tid, (env->me_flags & MDBX_NOTLS) ? 
0 : tid, + mo_Relaxed); + atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_Relaxed); mdbx_rdt_unlock(env); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { @@ -9492,9 +9660,9 @@ __cold int mdbx_thread_register(const MDBX_env *env) { MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r != NULL)) { - mdbx_assert(env, r->mr_pid == env->me_pid); - mdbx_assert(env, r->mr_tid == mdbx_thread_self()); - if (unlikely(r->mr_pid != env->me_pid)) + mdbx_assert(env, r->mr_pid.weak == env->me_pid); + mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + if (unlikely(r->mr_pid.weak != env->me_pid)) return MDBX_BAD_RSLOT; return MDBX_RESULT_TRUE /* already registered */; } @@ -9524,17 +9692,17 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { if (unlikely(r == NULL)) return MDBX_RESULT_TRUE /* not registered */; - mdbx_assert(env, r->mr_pid == env->me_pid); - mdbx_assert(env, r->mr_tid == mdbx_thread_self()); - if (unlikely(r->mr_pid != env->me_pid || r->mr_tid != mdbx_thread_self())) + mdbx_assert(env, r->mr_pid.weak == env->me_pid); + mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + if (unlikely(r->mr_pid.weak != env->me_pid || + r->mr_tid.weak != mdbx_thread_self())) return MDBX_BAD_RSLOT; - if (unlikely(r->mr_txnid.inconsistent < SAFE64_INVALID_THRESHOLD)) + if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) return MDBX_BUSY /* transaction is still active */; - r->mr_pid = 0; - mdbx_compiler_barrier(); - env->me_lck->mti_readers_refresh_flag = true; + atomic_store32(&r->mr_pid, 0, mo_AcquireRelease); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); thread_rthc_set(env->me_txkey, nullptr); return MDBX_SUCCESS; } @@ -9570,18 +9738,18 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { txn->mt_flags = MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP)); MDBX_reader *r = txn->to.reader; - STATIC_ASSERT(sizeof(uintptr_t) == sizeof(r->mr_tid)); + 
STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid)); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { mdbx_assert(env, !(env->me_flags & MDBX_NOTLS)); r = thread_rthc_get(env->me_txkey); if (likely(r)) { - if (unlikely(!r->mr_pid) && + if (unlikely(!r->mr_pid.weak) && (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { thread_rthc_set(env->me_txkey, nullptr); r = nullptr; } else { - mdbx_assert(env, r->mr_pid == env->me_pid); - mdbx_assert(env, r->mr_tid == mdbx_thread_self()); + mdbx_assert(env, r->mr_pid.weak == env->me_pid); + mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); } } } else { @@ -9589,8 +9757,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } if (likely(r)) { - if (unlikely(r->mr_pid != env->me_pid || - r->mr_txnid.inconsistent < SAFE64_INVALID_THRESHOLD)) + if (unlikely(r->mr_pid.weak != env->me_pid || + r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) return MDBX_BAD_RSLOT; } else if (env->me_lck) { bind_rslot_result brs = bind_rslot(env, tid); @@ -9604,9 +9772,9 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { mdbx_assert(env, txn->mt_owner == 0); mdbx_assert(env, txn->mt_numdbs == 0); if (likely(r)) { - mdbx_assert(env, r->mr_snapshot_pages_used == 0); - mdbx_assert(env, r->mr_txnid.inconsistent >= SAFE64_INVALID_THRESHOLD); - r->mr_snapshot_pages_used = 0; + mdbx_assert(env, r->mr_snapshot_pages_used.weak == 0); + mdbx_assert(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + atomic_store32(&r->mr_snapshot_pages_used, 0, mo_Relaxed); } txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; return MDBX_SUCCESS; @@ -9621,19 +9789,20 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { mdbx_jitter4testing(false); if (likely(r)) { safe64_reset(&r->mr_txnid, false); - r->mr_snapshot_pages_used = meta->mm_geo.next; - r->mr_snapshot_pages_retired = - unaligned_peek_u64(4, meta->mm_pages_retired); + atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, + mo_Relaxed); + 
atomic_store64(&r->mr_snapshot_pages_retired, + unaligned_peek_u64(4, meta->mm_pages_retired), + mo_Relaxed); safe64_write(&r->mr_txnid, snap); mdbx_jitter4testing(false); - mdbx_assert(env, r->mr_pid == mdbx_getpid()); + mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); mdbx_assert( - env, r->mr_tid == + env, r->mr_tid.weak == ((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self())); - mdbx_assert(env, r->mr_txnid.inconsistent == snap); - mdbx_compiler_barrier(); - env->me_lck->mti_readers_refresh_flag = true; - mdbx_flush_incoherent_cpu_writeback(); + mdbx_assert(env, r->mr_txnid.weak == snap); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + mo_AcquireRelease); } mdbx_jitter4testing(true); @@ -9644,10 +9813,9 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { txn->mt_canary = meta->mm_canary; /* LY: Retry on a race, ITS#7970. */ - mdbx_compiler_barrier(); if (likely(meta == mdbx_meta_head(env) && snap == mdbx_meta_txnid_fluid(env, meta) && - snap >= *env->me_oldest)) { + snap >= atomic_load64(env->me_oldest, mo_AcquireRelease))) { mdbx_jitter4testing(false); break; } @@ -9660,19 +9828,20 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); txn->mt_canary = meta->mm_canary; if (likely(r)) { - r->mr_snapshot_pages_used = meta->mm_geo.next; - r->mr_snapshot_pages_retired = - unaligned_peek_u64(4, meta->mm_pages_retired); - r->mr_txnid.inconsistent = txn->mt_txnid; + atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, + mo_Relaxed); + atomic_store64(&r->mr_snapshot_pages_retired, + unaligned_peek_u64(4, meta->mm_pages_retired), + mo_Relaxed); + atomic_store64(&r->mr_txnid, txn->mt_txnid, mo_AcquireRelease); mdbx_jitter4testing(false); - mdbx_assert(env, r->mr_pid == mdbx_getpid()); + mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); mdbx_assert( - env, r->mr_tid == + env, r->mr_tid.weak == ((env->me_flags & MDBX_NOTLS) ? 
0 : mdbx_thread_self())); - mdbx_assert(env, r->mr_txnid.inconsistent == txn->mt_txnid); - mdbx_compiler_barrier(); - env->me_lck->mti_readers_refresh_flag = true; - mdbx_flush_incoherent_cpu_writeback(); + mdbx_assert(env, r->mr_txnid.weak == txn->mt_txnid); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + mo_AcquireRelease); } } @@ -9681,10 +9850,11 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { rc = MDBX_CORRUPTED; goto bailout; } - mdbx_assert(env, txn->mt_txnid >= *env->me_oldest); + mdbx_assert(env, txn->mt_txnid >= env->me_oldest->weak); txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ - mdbx_ensure(env, txn->mt_txnid >= - /* paranoia is appropriate here */ *env->me_oldest); + mdbx_ensure(env, + txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_oldest->weak); txn->mt_numdbs = env->me_numdbs; } else { mdbx_assert(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | @@ -9695,10 +9865,13 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { MDBX_lockinfo *const lck = env->me_lck; if (lck && (env->me_flags & MDBX_NOTLS) == 0 && (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { - const unsigned snap_nreaders = lck->mti_numreaders; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid == env->me_pid && - unlikely(lck->mti_readers[i].mr_tid == tid)) { + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) == + env->me_pid && + unlikely(atomic_load64(&lck->mti_readers[i].mr_tid, mo_Relaxed) == + tid)) { const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid); if (txnid >= MIN_TXNID && txnid <= MAX_TXNID) return MDBX_TXN_OVERLAPPING; @@ -9986,8 +10159,10 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_dbiseqs = parent->mt_dbiseqs; rc = mdbx_dpl_alloc(txn); if (likely(rc == MDBX_SUCCESS)) { + const unsigned len = + 
MDBX_PNL_SIZE(parent->tw.reclaimed_pglist) + parent->tw.loose_count; txn->tw.reclaimed_pglist = - mdbx_pnl_alloc(MDBX_PNL_ALLOCLEN(parent->tw.reclaimed_pglist)); + mdbx_pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL); if (unlikely(!txn->tw.reclaimed_pglist)) rc = MDBX_ENOMEM; } @@ -10026,6 +10201,8 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.spill_pages) mdbx_spill_purge(parent); + mdbx_tassert(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= + MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); mdbx_assert(env, mdbx_pnl_check4assert( @@ -10144,29 +10321,34 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_reader_lag = head_txnid - info->txn_id; info->txn_space_dirty = info->txn_space_retired = 0; + uint64_t reader_snapshot_pages_retired; if (txn->to.reader && - head_retired > txn->to.reader->mr_snapshot_pages_retired) { - info->txn_space_dirty = info->txn_space_retired = - pgno2bytes(env, (pgno_t)(head_retired - - txn->to.reader->mr_snapshot_pages_retired)); + head_retired > + (reader_snapshot_pages_retired = atomic_load64( + &txn->to.reader->mr_snapshot_pages_retired, mo_Relaxed))) { + info->txn_space_dirty = info->txn_space_retired = pgno2bytes( + env, (pgno_t)(head_retired - reader_snapshot_pages_retired)); size_t retired_next_reader = 0; MDBX_lockinfo *const lck = env->me_lck; if (scan_rlt && info->txn_reader_lag > 1 && lck) { /* find next more recent reader */ txnid_t next_reader = head_txnid; - const unsigned snap_nreaders = lck->mti_numreaders; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { retry: - if (lck->mti_readers[i].mr_pid) { + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { mdbx_jitter4testing(true); const txnid_t snap_txnid = 
safe64_read(&lck->mti_readers[i].mr_txnid); const uint64_t snap_retired = - lck->mti_readers[i].mr_snapshot_pages_retired; - mdbx_compiler_barrier(); + atomic_load64(&lck->mti_readers[i].mr_snapshot_pages_retired, + mo_AcquireRelease); if (unlikely(snap_retired != - lck->mti_readers[i].mr_snapshot_pages_retired) || + atomic_load64( + &lck->mti_readers[i].mr_snapshot_pages_retired, + mo_Relaxed)) || snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)) goto retry; if (snap_txnid <= txn->mt_txnid) { @@ -10177,7 +10359,9 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { next_reader = snap_txnid; retired_next_reader = pgno2bytes( env, (pgno_t)(snap_retired - - txn->to.reader->mr_snapshot_pages_retired)); + atomic_load64( + &txn->to.reader->mr_snapshot_pages_retired, + mo_Relaxed))); } } } @@ -10197,14 +10381,15 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { MDBX_lockinfo *const lck = env->me_lck; if (scan_rlt && lck) { txnid_t oldest_snapshot = txn->mt_txnid; - const unsigned snap_nreaders = lck->mti_numreaders; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); if (snap_nreaders) { oldest_snapshot = mdbx_find_oldest(txn); if (oldest_snapshot == txn->mt_txnid - 1) { /* check if there is at least one reader */ bool exists = false; for (unsigned i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid && + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) && txn->mt_txnid > safe64_read(&lck->mti_readers[i].mr_txnid)) { exists = true; break; @@ -10222,7 +10407,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { MDBX_env *mdbx_txn_env(const MDBX_txn *txn) { if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE || - txn->mt_env->me_signature != MDBX_ME_SIGNATURE)) + txn->mt_env->me_signature.weak != MDBX_ME_SIGNATURE)) return NULL; return txn->mt_env; } @@ -10300,7 +10485,7 @@ static void dbi_update(MDBX_txn *txn, 
int keep) { char *ptr = env->me_dbxs[i].md_name.iov_base; if (ptr) { env->me_dbxs[i].md_name.iov_len = 0; - mdbx_compiler_barrier(); + mdbx_memory_fence(false, true); mdbx_assert(env, env->me_dbflags[i] == 0); env->me_dbiseqs[i]++; env->me_dbxs[i].md_name.iov_base = NULL; @@ -10416,32 +10601,31 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { txn->mt_dbs[FREE_DBI].md_root); mdbx_ensure(env, txn->mt_txnid >= - /* paranoia is appropriate here */ *env->me_oldest); + /* paranoia is appropriate here */ env->me_oldest->weak); int rc = MDBX_SUCCESS; if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { if (txn->to.reader) { MDBX_reader *slot = txn->to.reader; - mdbx_assert(env, slot->mr_pid == env->me_pid); + mdbx_assert(env, slot->mr_pid.weak == env->me_pid); if (likely(!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED))) { - mdbx_assert(env, txn->mt_txnid == slot->mr_txnid.inconsistent && - slot->mr_txnid.inconsistent >= - env->me_lck->mti_oldest_reader); + mdbx_assert(env, txn->mt_txnid == slot->mr_txnid.weak && + slot->mr_txnid.weak >= + env->me_lck->mti_oldest_reader.weak); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) mdbx_txn_valgrind(env, nullptr); #endif - slot->mr_snapshot_pages_used = 0; + atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed); safe64_reset(&slot->mr_txnid, false); - env->me_lck->mti_readers_refresh_flag = true; - mdbx_flush_incoherent_cpu_writeback(); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + mo_AcquireRelease); } else { - mdbx_assert(env, slot->mr_pid == env->me_pid); - mdbx_assert(env, - slot->mr_txnid.inconsistent >= SAFE64_INVALID_THRESHOLD); + mdbx_assert(env, slot->mr_pid.weak == env->me_pid); + mdbx_assert(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); } if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) - slot->mr_pid = 0; + atomic_store32(&slot->mr_pid, 0, mo_AcquireRelease); txn->to.reader = NULL; } } @@ -10837,8 +11021,8 @@ retry_noaccount: /* LY: cleanup 
reclaimed records. */ do { cleaned_gc_id = txn->tw.lifo_reclaimed[++cleaned_gc_slot]; - mdbx_tassert(txn, - cleaned_gc_slot > 0 && cleaned_gc_id < *env->me_oldest); + mdbx_tassert(txn, cleaned_gc_slot > 0 && + cleaned_gc_id < env->me_oldest->weak); key.iov_base = &cleaned_gc_id; key.iov_len = sizeof(cleaned_gc_id); rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_SET); @@ -10849,7 +11033,7 @@ retry_noaccount: rc = mdbx_prep_backlog(txn, &couple.outer, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); + mdbx_tassert(txn, cleaned_gc_id < env->me_oldest->weak); mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, cleaned_gc_slot, cleaned_gc_id); rc = mdbx_cursor_del(&couple.outer, 0); @@ -10887,7 +11071,7 @@ retry_noaccount: goto bailout; } mdbx_tassert(txn, cleaned_gc_id <= txn->tw.last_reclaimed); - mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); + mdbx_tassert(txn, cleaned_gc_id < env->me_oldest->weak); mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, cleaned_gc_id); rc = mdbx_cursor_del(&couple.outer, 0); @@ -11110,8 +11294,10 @@ retry_noaccount: goto bailout; if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { - if (need_cleanup) + if (need_cleanup) { mdbx_txl_sort(txn->tw.lifo_reclaimed); + cleaned_gc_slot = 0; + } gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { mdbx_tassert(txn, txn->tw.last_reclaimed == 0); @@ -11144,7 +11330,39 @@ retry_noaccount: } mdbx_tassert(txn, gc_rid >= MIN_TXNID && gc_rid <= MAX_TXNID); - rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, --gc_rid); + --gc_rid; + key.iov_base = &gc_rid; + key.iov_len = sizeof(gc_rid); + rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); + if (unlikely(rc == MDBX_SUCCESS)) { + mdbx_debug("%s: GC's id %" PRIaTXN + " is used, continue bottom-up search", + dbg_prefix_mode, gc_rid); + ++gc_rid; + rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_FIRST); + if (rc == MDBX_NOTFOUND) { + 
mdbx_debug("%s: GC is empty", dbg_prefix_mode); + break; + } + if (unlikely(rc != MDBX_SUCCESS || + key.iov_len != sizeof(mdbx_tid_t))) { + rc = MDBX_CORRUPTED; + goto bailout; + } + txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + if (unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) { + rc = MDBX_CORRUPTED; + goto bailout; + } + if (gc_first < 2) { + mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN, + dbg_prefix_mode, gc_rid); + break; + } + gc_rid = gc_first - 1; + } + + rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, gc_rid); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -11255,16 +11473,17 @@ retry_noaccount: } mdbx_tassert(txn, chunk > 0); - mdbx_trace("%s: rc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " + mdbx_trace("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " "%" PRIaTXN, dbg_prefix_mode, gc_rid, reused_gc_slot, reservation_gc_id); mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, env->me_maxgc_ov1page); - mdbx_tassert(txn, reservation_gc_id < *env->me_oldest); + mdbx_tassert(txn, reservation_gc_id < env->me_oldest->weak); if (unlikely(reservation_gc_id < 1 || - reservation_gc_id >= *env->me_oldest)) { + reservation_gc_id >= + atomic_load64(env->me_oldest, mo_Relaxed))) { mdbx_error("%s", "** internal error (reservation_gc_id)"); rc = MDBX_PROBLEM; goto bailout; @@ -11368,7 +11587,7 @@ retry_noaccount: (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); - mdbx_tassert(txn, fill_gc_id > 0 && fill_gc_id < *env->me_oldest); + mdbx_tassert(txn, fill_gc_id > 0 && fill_gc_id < env->me_oldest->weak); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); @@ -11534,7 +11753,7 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { flush_begin = (flush_begin < dp->mp_pgno) ? flush_begin : dp->mp_pgno; flush_end = (flush_end > dp->mp_pgno + npages) ? 
flush_end : dp->mp_pgno + npages; - *env->me_unsynced_pages += npages; + env->me_unsynced_pages->weak += npages; dp->mp_flags -= P_DIRTY; dp->mp_txnid = pp_txnid2chk(txn); @@ -12521,10 +12740,12 @@ static __cold int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, continue; if ((env->me_stuck_meta < 0) - ? mdbx_meta_ot(prefer_steady, env, dest, meta) + ? mdbx_meta_ot(meta_bootid_match(meta) ? prefer_last + : prefer_steady, + env, dest, meta) : (meta_number == (unsigned)env->me_stuck_meta)) { *dest = *meta; - if (!META_IS_STEADY(dest)) + if (!lck_exclusive && !META_IS_STEADY(dest)) loop_limit += 1; /* LY: should re-read to hush race with update */ mdbx_verbose("latch meta[%u]", meta_number); } @@ -12607,6 +12828,18 @@ static MDBX_meta *__cold mdbx_init_metas(const MDBX_env *env, void *buffer) { return page_meta(page2); } +static size_t mdbx_madvise_threshold(const MDBX_env *env, + const size_t largest_bytes) { + /* TODO: use options */ + const unsigned factor = 9; + const size_t threshold = (largest_bytes < (65536ul << factor)) + ? 65536 /* minimal threshold */ + : (largest_bytes > (MEGABYTE * 4 << factor)) + ? 
MEGABYTE * 4 /* maximal threshold */ + : largest_bytes >> factor; + return bytes_align2os_bytes(env, threshold); +} + static int mdbx_sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending) { mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); @@ -12624,11 +12857,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (flags & MDBX_SAFE_NOSYNC) { /* Check auto-sync conditions */ - const pgno_t autosync_threshold = *env->me_autosync_threshold; - const uint64_t autosync_period = *env->me_autosync_period; - if ((autosync_threshold && *env->me_unsynced_pages >= autosync_threshold) || + const pgno_t autosync_threshold = + atomic_load32(env->me_autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(env->me_autosync_period, mo_Relaxed); + if ((autosync_threshold && + atomic_load32(env->me_unsynced_pages, mo_Relaxed) >= + autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - *env->me_sync_timestamp >= autosync_period)) + mdbx_osal_monotime() - + atomic_load64(env->me_sync_timestamp, mo_Relaxed) >= + autosync_period)) flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ } @@ -12652,23 +12891,21 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, #if defined(MADV_DONTNEED) const size_t largest_bytes = pgno2bytes(env, largest_pgno); /* threshold to avoid unreasonable frequent madvise() calls */ - const size_t madvise_threshold = (largest_bytes < 65536 * 256) - ? 65536 - : (largest_bytes > MEGABYTE * 4 * 256) - ? MEGABYTE * 4 - : largest_bytes >> 10; + const size_t madvise_threshold = mdbx_madvise_threshold(env, largest_bytes); const size_t discard_edge_bytes = bytes_align2os_bytes( env, ((MDBX_RDONLY & - (env->me_lck ? env->me_lck->mti_envmode : env->me_flags)) + (env->me_lck ? env->me_lck->mti_envmode.weak : env->me_flags)) ? 
largest_bytes : largest_bytes + madvise_threshold)); const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes); - const pgno_t prev_discarded_pgno = *env->me_discarded_tail; + const pgno_t prev_discarded_pgno = + atomic_load32(env->me_discarded_tail, mo_AcquireRelease); if (prev_discarded_pgno >= discard_edge_pgno + bytes2pgno(env, madvise_threshold)) { - mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", *env->me_discarded_tail, + mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", prev_discarded_pgno, largest_pgno); - *env->me_discarded_tail = discard_edge_pgno; + atomic_store32(env->me_discarded_tail, discard_edge_pgno, + mo_AcquireRelease); const size_t prev_discarded_bytes = ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); mdbx_ensure(env, prev_discarded_bytes > discard_edge_bytes); @@ -12726,7 +12963,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* LY: step#1 - sync previously written/updated data-pages */ rc = MDBX_RESULT_FALSE /* carry steady */; - if (*env->me_unsynced_pages) { + if (atomic_load32(env->me_unsynced_pages, mo_AcquireRelease)) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); enum mdbx_syncmode_bits mode_bits = MDBX_SYNC_NONE; if ((flags & MDBX_SAFE_NOSYNC) == 0) { @@ -12750,9 +12987,9 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* Steady or Weak */ if (rc == MDBX_RESULT_FALSE /* carry steady */) { - *env->me_unsynced_pages = 0; - *env->me_sync_timestamp = mdbx_osal_monotime(); + atomic_store64(env->me_sync_timestamp, mdbx_osal_monotime(), mo_Relaxed); unaligned_poke_u64(4, pending->mm_datasync_sign, mdbx_meta_sign(pending)); + atomic_store32(env->me_unsynced_pages, 0, mo_AcquireRelease); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); unaligned_poke_u64(4, pending->mm_datasync_sign, MDBX_DATASIGN_WEAK); @@ -12841,7 +13078,6 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_canary = pending->mm_canary; 
memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); mdbx_jitter4testing(true); - mdbx_flush_incoherent_cpu_writeback(); /* LY: 'commit' the meta */ mdbx_meta_update_end(env, target, @@ -12895,9 +13131,9 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, } } if (flags & MDBX_NOMETASYNC) - *env->me_unsynced_pages += 1; + env->me_unsynced_pages->weak += 1; else - *env->me_meta_sync_txnid = + env->me_meta_sync_txnid->weak = (uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a); /* LY: shrink datafile if needed */ @@ -12912,7 +13148,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (likely(env->me_lck)) /* toggle oldest refresh */ - env->me_lck->mti_readers_refresh_flag = false; + atomic_store32(&env->me_lck->mti_readers_refresh_flag, false, + mo_AcquireRelease); return MDBX_SUCCESS; @@ -12990,7 +13227,8 @@ __cold int mdbx_env_create(MDBX_env **penv) { goto bailout; } env->me_os_psize = (unsigned)os_psize; - mdbx_setup_pagesize(env, env->me_os_psize); + mdbx_setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? 
env->me_os_psize + : MAX_PAGESIZE); rc = mdbx_fastmutex_init(&env->me_dbi_lock); if (unlikely(rc != MDBX_SUCCESS)) @@ -13017,7 +13255,7 @@ __cold int mdbx_env_create(MDBX_env **penv) { #endif /* Windows */ VALGRIND_CREATE_MEMPOOL(env, 0, 0); - env->me_signature = MDBX_ME_SIGNATURE; + env->me_signature.weak = MDBX_ME_SIGNATURE; *penv = env; return MDBX_SUCCESS; @@ -13320,10 +13558,12 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, /* Check if there are any reading threads that do not use the SRWL */ const size_t CurrentTid = GetCurrentThreadId(); const MDBX_reader *const begin = env->me_lck->mti_readers; - const MDBX_reader *const end = begin + env->me_lck->mti_numreaders; + const MDBX_reader *const end = + begin + + atomic_load32(&env->me_lck->mti_numreaders, mo_AcquireRelease); for (const MDBX_reader *reader = begin; reader < end; ++reader) { - if (reader->mr_pid == env->me_pid && reader->mr_tid && - reader->mr_tid != CurrentTid) { + if (reader->mr_pid.weak == env->me_pid && reader->mr_tid.weak && + reader->mr_tid.weak != CurrentTid) { /* At least one thread may don't use SRWL */ rc = MDBX_EPERM; break; @@ -13627,7 +13867,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { break; if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { - mdbx_assert(env, META_IS_STEADY(&meta) && !META_IS_STEADY(head)); + mdbx_assert(env, META_IS_STEADY(steady) && !META_IS_STEADY(head)); if (meta_bootid_match(head)) { MDBX_meta clone = *head; uint64_t filesize = env->me_dbgeo.now; @@ -13646,7 +13886,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { if (env->me_flags & MDBX_RDONLY) return MDBX_WANNA_RECOVERY /* LY: could not recovery/sync */; meta = clone; - *env->me_unsynced_pages = meta.mm_geo.next; + atomic_store32(env->me_unsynced_pages, meta.mm_geo.next, mo_Relaxed); break; } mdbx_warning("opening after an unclean shutdown, " @@ -13790,13 +14030,14 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const 
int lck_rc) { } } - *env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes); + atomic_store32(env->me_discarded_tail, bytes2pgno(env, used_aligned2os_bytes), + mo_AcquireRelease); if (used_aligned2os_bytes < env->me_dxb_mmap.current) { #if defined(MADV_REMOVE) if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 && /* not recovery mode */ env->me_stuck_meta < 0) { mdbx_notice("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", - *env->me_discarded_tail, + env->me_discarded_tail->weak, bytes2pgno(env, env->me_dxb_mmap.current)); err = madvise(env->me_map + used_aligned2os_bytes, @@ -13808,7 +14049,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { } #endif /* MADV_REMOVE */ #if defined(MADV_DONTNEED) - mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", *env->me_discarded_tail, + mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", env->me_discarded_tail->weak, bytes2pgno(env, env->me_dxb_mmap.current)); err = madvise(env->me_map + used_aligned2os_bytes, @@ -13955,12 +14196,14 @@ static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, const size_t maxreaders = ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); - if (size > 65536 || maxreaders < 2 || maxreaders > MDBX_READERS_LIMIT) { - mdbx_error("lck-size too big (up to %" PRIuPTR " readers)", maxreaders); + if (maxreaders < 4) { + mdbx_error("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); err = MDBX_PROBLEM; goto bailout; } - env->me_maxreaders = (unsigned)maxreaders; + env->me_maxreaders = (maxreaders <= MDBX_READERS_LIMIT) + ? (unsigned)maxreaders + : (unsigned)MDBX_READERS_LIMIT; err = mdbx_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size, @@ -14374,7 +14617,8 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { memset(&dummy_env, 0, sizeof(dummy_env)); dummy_env.me_flags = (mode == MDBX_ENV_ENSURE_UNUSED) ? 
MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; - dummy_env.me_psize = dummy_env.me_os_psize = (unsigned)mdbx_syspagesize(); + dummy_env.me_os_psize = (unsigned)mdbx_syspagesize(); + dummy_env.me_psize = (unsigned)mdbx_default_pagesize(); dummy_env.me_pathname = (char *)pathname; MDBX_handle_env_pathname env_pathname; @@ -14555,7 +14799,8 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, if (env->me_lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { - while (env->me_lck->mti_envmode == MDBX_RDONLY) { + while (atomic_load32(&env->me_lck->mti_envmode, mo_AcquireRelease) == + MDBX_RDONLY) { if (atomic_cas32(&env->me_lck->mti_envmode, MDBX_RDONLY, env->me_flags & mode_flags)) break; @@ -14566,13 +14811,13 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, /* pickup current mode-flags, including MDBX_LIFORECLAIM | * MDBX_COALESCE | MDBX_NORDAHEAD */ const unsigned diff = - (env->me_lck->mti_envmode ^ env->me_flags) & mode_flags; + (env->me_lck->mti_envmode.weak ^ env->me_flags) & mode_flags; mdbx_notice("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, env->me_flags ^ diff); env->me_flags ^= diff; } - if ((env->me_lck->mti_envmode ^ env->me_flags) & rigorous_flags) { + if ((env->me_lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) { mdbx_error("%s", "current mode/flags incompatible with requested"); rc = MDBX_INCOMPATIBLE; goto bailout; @@ -14596,7 +14841,8 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, mdbx_debug("opened dbenv %p", (void *)env); if (env->me_lck) { if (lck_rc == MDBX_RESULT_TRUE) { - env->me_lck->mti_envmode = env->me_flags & (mode_flags | MDBX_RDONLY); + env->me_lck->mti_envmode.weak = + env->me_flags & (mode_flags | MDBX_RDONLY); rc = mdbx_lck_downgrade(env); mdbx_debug("lck-downgrade-%s: rc %i", (env->me_flags & MDBX_EXCLUSIVE) ? 
"partial" : "full", rc); @@ -14761,7 +15007,7 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { if (unlikely(!env)) return MDBX_EINVAL; - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; #if MDBX_ENV_CHECKPID || !(defined(_WIN32) || defined(_WIN64)) @@ -14804,7 +15050,7 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { #endif } - env->me_signature = 0; + mdbx_assert(env, env->me_signature.weak == 0); rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); #if defined(_WIN32) || defined(_WIN64) @@ -14820,7 +15066,7 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { #endif /* MDBX_LOCKING */ while ((dp = env->me_dp_reserve) != NULL) { - ASAN_UNPOISON_MEMORY_REGION(&dp->mp_next, sizeof(dp->mp_next)); + ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); env->me_dp_reserve = dp->mp_next; mdbx_free(dp); @@ -18734,7 +18980,7 @@ bailout: * [out] cdst The cursor to copy to. 
*/ static void cursor_copy_internal(const MDBX_cursor *csrc, MDBX_cursor *cdst) { mdbx_cassert(csrc, - csrc->mc_txn->mt_txnid >= *csrc->mc_txn->mt_env->me_oldest); + csrc->mc_txn->mt_txnid >= csrc->mc_txn->mt_env->me_oldest->weak); cdst->mc_txn = csrc->mc_txn; cdst->mc_dbi = csrc->mc_dbi; cdst->mc_db = csrc->mc_db; @@ -21019,8 +21265,9 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); arg->mi_geo.shrink = pgno2bytes(env, txn_meta->mm_geo.shrink); arg->mi_geo.grow = pgno2bytes(env, txn_meta->mm_geo.grow); - unsynced_pages = *env->me_unsynced_pages + - (*env->me_meta_sync_txnid != (uint32_t)arg->mi_last_pgno); + unsynced_pages = atomic_load32(env->me_unsynced_pages, mo_Relaxed) + + (atomic_load32(env->me_meta_sync_txnid, mo_Relaxed) != + (uint32_t)arg->mi_last_pgno); arg->mi_mapsize = env->me_dxb_mmap.limit; mdbx_compiler_barrier(); @@ -21039,7 +21286,9 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, } arg->mi_maxreaders = env->me_maxreaders; - arg->mi_numreaders = env->me_lck ? env->me_lck->mti_numreaders : INT32_MAX; + arg->mi_numreaders = + env->me_lck ? atomic_load32(&env->me_lck->mti_numreaders, mo_Relaxed) + : INT32_MAX; arg->mi_dxb_pagesize = env->me_psize; arg->mi_sys_pagesize = env->me_os_psize; @@ -21047,18 +21296,20 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, if (likely(bytes > size_before_bootid)) { arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); const uint64_t monotime_now = mdbx_osal_monotime(); - arg->mi_since_sync_seconds16dot16 = - mdbx_osal_monotime_to_16dot16(monotime_now - *env->me_sync_timestamp); + arg->mi_since_sync_seconds16dot16 = mdbx_osal_monotime_to_16dot16( + monotime_now - atomic_load64(env->me_sync_timestamp, mo_Relaxed)); arg->mi_since_reader_check_seconds16dot16 = - lck ? mdbx_osal_monotime_to_16dot16(monotime_now - - lck->mti_reader_check_timestamp) + lck ? 
mdbx_osal_monotime_to_16dot16( + monotime_now - + atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed)) : 0; - arg->mi_autosync_threshold = pgno2bytes(env, *env->me_autosync_threshold); - arg->mi_autosync_period_seconds16dot16 = - mdbx_osal_monotime_to_16dot16(*env->me_autosync_period); + arg->mi_autosync_threshold = + pgno2bytes(env, atomic_load32(env->me_autosync_threshold, mo_Relaxed)); + arg->mi_autosync_period_seconds16dot16 = mdbx_osal_monotime_to_16dot16( + atomic_load64(env->me_autosync_period, mo_Relaxed)); arg->mi_bootid.current.x = bootid.x; arg->mi_bootid.current.y = bootid.y; - arg->mi_mode = lck ? lck->mti_envmode : env->me_flags; + arg->mi_mode = lck ? lck->mti_envmode.weak : env->me_flags; } arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = 0; @@ -21066,7 +21317,8 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = arg->mi_recent_txnid; for (unsigned i = 0; i < arg->mi_numreaders; ++i) { - const uint32_t pid = lck->mti_readers[i].mr_pid; + const uint32_t pid = + atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (pid) { const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid); if (arg->mi_latter_reader_txnid > txnid) @@ -21406,7 +21658,7 @@ static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { env->me_dbflags[dbi] = 0; env->me_dbiseqs[dbi]++; env->me_dbxs[dbi].md_name.iov_len = 0; - mdbx_compiler_barrier(); + mdbx_memory_fence(false, true); env->me_dbxs[dbi].md_name.iov_base = NULL; mdbx_free(ptr); @@ -21654,22 +21906,28 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, rc = MDBX_RESULT_TRUE; int serial = 0; if (likely(env->me_lck)) { - const unsigned snap_nreaders = env->me_lck->mti_numreaders; + const unsigned snap_nreaders = + atomic_load32(&env->me_lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; i++) { const MDBX_reader *r = 
env->me_lck->mti_readers + i; retry_reader:; - const uint32_t pid = r->mr_pid; + const uint32_t pid = atomic_load32(&r->mr_pid, mo_AcquireRelease); if (!pid) continue; txnid_t txnid = safe64_read(&r->mr_txnid); - const size_t tid = r->mr_tid; - const pgno_t pages_used = r->mr_snapshot_pages_used; - const uint64_t reader_pages_retired = r->mr_snapshot_pages_retired; - mdbx_compiler_barrier(); - if (unlikely(tid != r->mr_tid || - pages_used != r->mr_snapshot_pages_used || - reader_pages_retired != r->mr_snapshot_pages_retired || - txnid != safe64_read(&r->mr_txnid) || pid != r->mr_pid)) + const uint64_t tid = atomic_load64(&r->mr_tid, mo_Relaxed); + const pgno_t pages_used = + atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed); + const uint64_t reader_pages_retired = + atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed); + if (unlikely( + txnid != safe64_read(&r->mr_txnid) || + pid != atomic_load32(&r->mr_pid, mo_AcquireRelease) || + tid != atomic_load64(&r->mr_tid, mo_Relaxed) || + pages_used != + atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed) || + reader_pages_retired != + atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed))) goto retry_reader; mdbx_assert(env, txnid > 0); @@ -21770,8 +22028,8 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { return MDBX_SUCCESS; } - lck->mti_reader_check_timestamp = mdbx_osal_monotime(); - const unsigned snap_nreaders = lck->mti_numreaders; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); uint32_t pidsbuf_onstask[142]; uint32_t *const pids = (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask)) @@ -21783,7 +22041,8 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { pids[0] = 0; int count = 0; for (unsigned i = 0; i < snap_nreaders; i++) { - const uint32_t pid = lck->mti_readers[i].mr_pid; + const uint32_t pid = + atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (pid == 0) continue /* skip empty */; if (pid == 
env->me_pid) @@ -21816,7 +22075,7 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { } /* a other process may have clean and reused slot, recheck */ - if (lck->mti_readers[i].mr_pid != pid) + if (lck->mti_readers[i].mr_pid.weak != pid) continue; err = mdbx_rpid_check(env, pid); @@ -21831,17 +22090,20 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { /* clean it */ for (unsigned j = i; j < snap_nreaders; j++) { - if (lck->mti_readers[j].mr_pid == pid) { + if (lck->mti_readers[j].mr_pid.weak == pid) { mdbx_debug("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, - (size_t)pid, lck->mti_readers[j].mr_txnid.inconsistent); - lck->mti_readers[j].mr_pid = 0; - mdbx_compiler_barrier(); - lck->mti_readers_refresh_flag = true; + (size_t)pid, lck->mti_readers[j].mr_txnid.weak); + atomic_store32(&lck->mti_readers[j].mr_pid, 0, mo_AcquireRelease); + atomic_store32(&lck->mti_readers_refresh_flag, true, mo_AcquireRelease); count++; } } } + if (likely(!MDBX_IS_ERROR(rc))) + atomic_store64(&lck->mti_reader_check_timestamp, mdbx_osal_monotime(), + mo_AcquireRelease); + if (rdt_locked < 0) mdbx_rdt_unlock(env); @@ -21882,7 +22144,7 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env, txnid_t oldest = mdbx_recent_steady_txnid(env); mdbx_assert(env, oldest < env->me_txn0->mt_txnid); mdbx_assert(env, oldest >= laggard); - mdbx_assert(env, oldest >= *env->me_oldest); + mdbx_assert(env, oldest >= env->me_oldest->weak); if (oldest == laggard || unlikely(!env->me_lck /* without-LCK mode */)) return oldest; @@ -21892,17 +22154,19 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env, MDBX_reader *asleep = nullptr; MDBX_lockinfo *const lck = env->me_lck; uint64_t oldest_retired = UINT64_MAX; - const unsigned snap_nreaders = lck->mti_numreaders; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (unsigned i = 0; i < snap_nreaders; ++i) { retry: - if 
(lck->mti_readers[i].mr_pid) { + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { /* mdbx_jitter4testing(true); */ - const uint64_t snap_retired = - lck->mti_readers[i].mr_snapshot_pages_retired; + const uint64_t snap_retired = atomic_load64( + &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - mdbx_memory_barrier(); if (unlikely(snap_retired != - lck->mti_readers[i].mr_snapshot_pages_retired || + atomic_load64( + &lck->mti_readers[i].mr_snapshot_pages_retired, + mo_AcquireRelease) || snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) goto retry; if (oldest > snap_txnid && @@ -21923,16 +22187,16 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env, -retry); } mdbx_notice("hsr-kick: update oldest %" PRIaTXN " -> %" PRIaTXN, - *env->me_oldest, oldest); - mdbx_assert(env, *env->me_oldest <= oldest); - return *env->me_oldest = oldest; + env->me_oldest->weak, oldest); + mdbx_assert(env, env->me_oldest->weak <= oldest); + return atomic_store64(env->me_oldest, oldest, mo_AcquireRelease); } if (!env->me_hsr_callback) break; - uint32_t pid = asleep->mr_pid; - size_t tid = asleep->mr_tid; + uint32_t pid = atomic_load32(&asleep->mr_pid, mo_AcquireRelease); + uint64_t tid = asleep->mr_tid.weak; if (safe64_read(&asleep->mr_txnid) != laggard || pid <= 0) continue; @@ -21956,12 +22220,10 @@ static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env, safe64_reset_compare(&asleep->mr_txnid, laggard); } else { safe64_reset(&asleep->mr_txnid, true); - asleep->mr_tid = 0; - asleep->mr_pid = 0; + atomic_store64(&asleep->mr_tid, 0, mo_Relaxed); + atomic_store32(&asleep->mr_pid, 0, mo_Relaxed); } - mdbx_compiler_barrier(); - lck->mti_readers_refresh_flag = true; - mdbx_flush_incoherent_cpu_writeback(); + atomic_store32(&lck->mti_readers_refresh_flag, true, mo_AcquireRelease); } } @@ -21990,7 +22252,7 @@ __cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func 
*hsr) { } MDBX_hsr_func *__cold mdbx_env_get_hsr(const MDBX_env *env) { - return likely(env && env->me_signature == MDBX_ME_SIGNATURE) + return likely(env && env->me_signature.weak == MDBX_ME_SIGNATURE) ? env->me_hsr_callback : NULL; } @@ -23105,7 +23367,7 @@ __cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_max(void) { __cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) { if (pagesize < 1) - pagesize = (intptr_t)mdbx_syspagesize(); + pagesize = (intptr_t)mdbx_default_pagesize(); else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || !is_powerof2((size_t)pagesize))) @@ -23116,7 +23378,7 @@ __cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) { __cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) { if (pagesize < 1) - pagesize = (intptr_t)mdbx_syspagesize(); + pagesize = (intptr_t)mdbx_default_pagesize(); else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || !is_powerof2((size_t)pagesize))) @@ -23130,7 +23392,7 @@ __cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) { __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { if (pagesize < 1) - pagesize = (intptr_t)mdbx_syspagesize(); + pagesize = (intptr_t)mdbx_default_pagesize(); else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || !is_powerof2((size_t)pagesize))) @@ -23398,8 +23660,9 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return MDBX_EPERM; if (sizeof(value) > sizeof(size_t) && unlikely(value != (size_t)value)) return MDBX_TOO_LARGE; - if ((*env->me_autosync_threshold = - bytes2pgno(env, (size_t)value + env->me_psize - 1)) != 0) { + if (atomic_store32(env->me_autosync_threshold, + bytes2pgno(env, (size_t)value + env->me_psize - 1), + mo_Relaxed) != 0) { err = mdbx_env_sync_poll(env); if (unlikely(MDBX_IS_ERROR(err))) return err; @@ -23413,8 +23676,9 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, 
return MDBX_EPERM; if (unlikely(value > UINT32_MAX)) return MDBX_TOO_LARGE; - if ((*env->me_autosync_period = - mdbx_osal_16dot16_to_monotime((uint32_t)value)) != 0) { + if (atomic_store64(env->me_autosync_period, + mdbx_osal_16dot16_to_monotime((uint32_t)value), + mo_Relaxed) != 0) { err = mdbx_env_sync_poll(env); if (unlikely(MDBX_IS_ERROR(err))) return err; @@ -23450,11 +23714,12 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, env->me_options.dp_reserve_limit = (unsigned)value; while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) { mdbx_assert(env, env->me_dp_reserve != NULL); - MDBX_page *mp = env->me_dp_reserve; - ASAN_UNPOISON_MEMORY_REGION(&mp->mp_next, sizeof(mp->mp_next)); - VALGRIND_MAKE_MEM_DEFINED(&mp->mp_next, sizeof(mp->mp_next)); - env->me_dp_reserve = mp->mp_next; - mdbx_free(mp); + MDBX_page *dp = env->me_dp_reserve; + ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); + VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dp_reserve = dp->mp_next; + VALGRIND_MEMPOOL_FREE(env, dp); + mdbx_free(dp); env->me_dp_reserve_len -= 1; } } @@ -23551,13 +23816,15 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, case MDBX_opt_sync_bytes: if (unlikely(!env->me_autosync_threshold)) return MDBX_EPERM; - *value = *env->me_autosync_threshold; + *value = + pgno2bytes(env, atomic_load32(env->me_autosync_threshold, mo_Relaxed)); break; case MDBX_opt_sync_period: if (unlikely(!env->me_autosync_period)) return MDBX_EPERM; - *value = mdbx_osal_monotime_to_16dot16(*env->me_autosync_period); + *value = mdbx_osal_monotime_to_16dot16( + atomic_load64(env->me_autosync_period, mo_Relaxed)); break; case MDBX_opt_max_db: @@ -23979,7 +24246,7 @@ LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev 
<leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -26391,9 +26658,9 @@ __dll_export 0, 9, 2, - 110, - {"2021-01-23T01:13:20+03:00", "8a2017c7d169c0ee1b9b21cc67ea939d7abe5f08", "4e13d1239ab6580ed04cab377b746e7f8c214a9b", - "v0.9.2-110-g4e13d12"}, + 136, + {"2021-01-30T02:28:12+03:00", "1ab2fa795a0dee8678b168ffcd961ca5a0fff7c5", "9f0ff865e860435b243d3d1e936e3067097a294f", + "v0.9.2-136-g9f0ff86"}, sourcery}; __dll_export @@ -26410,7 +26677,7 @@ __dll_export #endif const char *const mdbx_sourcery_anchor = sourcery; /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -26675,23 +26942,25 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { if (env->me_lck) { /* Scan LCK for threads of the current process */ const MDBX_reader *const begin = env->me_lck->mti_readers; - const MDBX_reader *const end = begin + env->me_lck->mti_numreaders; + const MDBX_reader *const end = + begin + atomic_load32(&env->me_lck->mti_numreaders, mo_AcquireRelease); const uintptr_t WriteTxnOwner = env->me_txn0 ? 
env->me_txn0->mt_owner : 0; for (const MDBX_reader *reader = begin; reader < end; ++reader) { - if (reader->mr_pid != env->me_pid || !reader->mr_tid) { + if (reader->mr_pid.weak != env->me_pid || !reader->mr_tid.weak) { skip_lck: continue; } - if (reader->mr_tid == CurrentTid || reader->mr_tid == WriteTxnOwner) + if (reader->mr_tid.weak == CurrentTid || + reader->mr_tid.weak == WriteTxnOwner) goto skip_lck; if (env->me_flags & MDBX_NOTLS) { /* Skip duplicates in no-tls mode */ for (const MDBX_reader *scan = reader; --scan >= begin;) - if (scan->mr_tid == reader->mr_tid) + if (scan->mr_tid.weak == reader->mr_tid.weak) goto skip_lck; } - rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid); + rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak); if (rc != MDBX_SUCCESS) { bailout_lck: (void)mdbx_resume_threads_after_remap(*array); @@ -27009,7 +27278,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, if (env->me_map) mdbx_munmap(&env->me_dxb_mmap); if (env->me_lck) { - const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages == 0; + const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; mdbx_munmap(&env->me_lck_mmap); if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE && mdbx_lck_upgrade(env) == MDBX_SUCCESS) @@ -27250,7 +27519,7 @@ static void mdbx_winnt_import(void) { #endif /* Windows LCK-implementation */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -27746,7 +28015,7 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env, mdbx_assert(env, rc == 0); if (rc == 0) { - const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages == 0; + const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; mdbx_munmap(&env->me_lck_mmap); if (synced) rc = ftruncate(env->me_lfd, 0) ? 
errno : 0; diff --git a/libs/libmdbx/src/mdbx.c++ b/libs/libmdbx/src/mdbx.c++ index 049c93d588..e29b579cef 100644 --- a/libs/libmdbx/src/mdbx.c++ +++ b/libs/libmdbx/src/mdbx.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * <http://www.OpenLDAP.org/license.html>. */ #define MDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 +#define MDBX_BUILD_SOURCERY 37ec6726f69ef69c4e52fcfb87d700f44f37246b1b635f455d2a429776a8eec6_v0_9_2_136_g9f0ff86 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -104,7 +104,7 @@ #include "mdbx.h++" /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -536,7 +536,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -999,15 +999,17 @@ extern void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ /* Atomics */ -#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic)) #include <cstdatomic> -#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ +#define MDBX_HAVE_C11ATOMICS +#elif !defined(__cplusplus) && \ + (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ !(defined(__GNUC__) || defined(__clang__))) #include <stdatomic.h> +#define MDBX_HAVE_C11ATOMICS #elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ #elif defined(_MSC_VER) #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ #pragma warning(disable : 4133) /* 'function': incompatible types - from \ @@ -1043,14 +1045,6 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { _ReadWriteBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ __memory_barrier(); - if (type > MDBX_BARRIER_COMPILER) -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) - _mm_mfence(); -#else -#error "Unknown target for Intel Compiler, please report to us." 
-#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __compiler_barrier(); #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -1065,21 +1059,23 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) - atomic_thread_fence(__ATOMIC_SEQ_CST); +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) +#ifdef __clang__ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#else __atomic_thread_fence(__ATOMIC_SEQ_CST); +#endif #elif defined(__clang__) || defined(__GNUC__) __sync_synchronize(); -#elif defined(_MSC_VER) +#elif defined(_WIN32) || defined(_WIN64) MemoryBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__ia32__) _mm_mfence(); #else -#error "Unknown target for Intel Compiler, please report to us." 
+ __mf(); #endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __machine_rw_barrier(); @@ -1809,6 +1805,31 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /*----------------------------------------------------------------------------*/ /* Basic constants and types */ +typedef union { + volatile uint32_t weak; +#ifdef MDBX_HAVE_C11ATOMICS + volatile _Atomic uint32_t c11a; +#endif /* MDBX_HAVE_C11ATOMICS */ +} MDBX_atomic_uint32_t; + +typedef union { + volatile uint64_t weak; +#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) + volatile _Atomic uint64_t c11a; +#endif +#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC + __anonymous_struct_extension__ struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + MDBX_atomic_uint32_t low, high; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + MDBX_atomic_uint32_t high, low; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + }; +#endif +} MDBX_atomic_uint64_t; + /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the * maximum size of a data item. Data items larger than this size will @@ -1847,6 +1868,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ typedef uint32_t pgno_t; +typedef MDBX_atomic_uint32_t atomic_pgno_t; #define PRIaPGNO PRIu32 #define MAX_PAGENO UINT32_C(0x7FFFffff) #define MIN_PAGENO NUM_METAS @@ -1855,6 +1877,7 @@ typedef uint32_t pgno_t; /* A transaction ID. */ typedef uint64_t txnid_t; +typedef MDBX_atomic_uint64_t atomic_txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) @@ -1881,24 +1904,6 @@ typedef uint16_t indx_t; /* Core structures for database and shared memory (i.e. 
format definition) */ #pragma pack(push, 1) -typedef union mdbx_safe64 { - volatile uint64_t inconsistent; -#if MDBX_64BIT_ATOMIC - volatile uint64_t atomic; -#endif /* MDBX_64BIT_ATOMIC */ - __anonymous_struct_extension__ struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - volatile uint32_t low; - volatile uint32_t high; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - volatile uint32_t high; - volatile uint32_t low; -#else -#error "FIXME: Unsupported byte order" -#endif /* __BYTE_ORDER__ */ - }; -} mdbx_safe64_t; - /* Information about a single database in the environment. */ typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ @@ -2101,7 +2106,7 @@ typedef struct MDBX_reader { * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - mdbx_safe64_t /* txnid_t */ mr_txnid; + MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; /* The information we store in a single slot of the reader table. * In addition to a transaction ID, we also record the process and @@ -2113,23 +2118,18 @@ typedef struct MDBX_reader { * opening the lock file. */ /* The thread ID of the thread owning this txn. */ -#if MDBX_WORDBITS >= 64 - volatile uint64_t mr_tid; -#else - volatile uint32_t mr_tid; - volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch, - unused for now */ -#endif + MDBX_atomic_uint64_t mr_tid; + /* The process ID of the process owning this reader txn. */ - volatile uint32_t mr_pid; + MDBX_atomic_uint32_t mr_pid; /* The number of pages used in the reader's MVCC snapshot, * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ - volatile pgno_t mr_snapshot_pages_used; + atomic_pgno_t mr_snapshot_pages_used; /* Number of retired pages at the time this reader starts transaction. So, * at any time the difference mm_pages_retired - mr_snapshot_pages_retired * will give the number of pages which this reader restraining from reuse. 
*/ - volatile uint64_t mr_snapshot_pages_retired; + MDBX_atomic_uint64_t mr_snapshot_pages_retired; } MDBX_reader; /* The header for the reader table (a memory-mapped lock file). */ @@ -2142,25 +2142,25 @@ typedef struct MDBX_lockinfo { uint32_t mti_os_and_format; /* Flags which environment was opened. */ - volatile uint32_t mti_envmode; + MDBX_atomic_uint32_t mti_envmode; /* Threshold of un-synced-with-disk pages for auto-sync feature, * zero means no-threshold, i.e. auto-sync is disabled. */ - volatile pgno_t mti_autosync_threshold; + atomic_pgno_t mti_autosync_threshold; /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ - volatile uint32_t mti_meta_sync_txnid; + MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. * The time value is represented in a suitable system-dependent form, for * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). * Zero means timed auto-sync is disabled. */ - volatile uint64_t mti_autosync_period; + MDBX_atomic_uint64_t mti_autosync_period; /* Marker to distinguish uniqueness of DB/CLK.*/ - volatile uint64_t mti_bait_uniqueness; + MDBX_atomic_uint64_t mti_bait_uniqueness; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2169,21 +2169,21 @@ typedef struct MDBX_lockinfo { mdbx_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ - volatile txnid_t mti_oldest_reader; + atomic_txnid_t mti_oldest_reader; /* Timestamp of the last steady sync. Value is represented in a suitable * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or * clock_gettime(CLOCK_MONOTONIC). */ - volatile uint64_t mti_sync_timestamp; + MDBX_atomic_uint64_t mti_sync_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. 
*/ - volatile pgno_t mti_unsynced_pages; + atomic_pgno_t mti_unsynced_pages; /* Number of page which was discarded last time by madvise(MADV_FREE). */ - volatile pgno_t mti_discarded_tail; + atomic_pgno_t mti_discarded_tail; /* Timestamp of the last readers check. */ - volatile uint64_t mti_reader_check_timestamp; + MDBX_atomic_uint64_t mti_reader_check_timestamp; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2195,8 +2195,8 @@ typedef struct MDBX_lockinfo { /* The number of slots that have been used in the reader table. * This always records the maximum count, it is not decremented * when readers release their slots. */ - volatile unsigned mti_numreaders; - volatile unsigned mti_readers_refresh_flag; + MDBX_atomic_uint32_t mti_numreaders; + MDBX_atomic_uint32_t mti_readers_refresh_flag; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2251,7 +2251,7 @@ typedef struct MDBX_lockinfo { #if MDBX_WORDBITS >= 64 #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ - ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) + ((MAX_PAGESIZE - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) #define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 @@ -2535,7 +2535,7 @@ typedef struct MDBX_cursor_couple { /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) - uint32_t me_signature; + MDBX_atomic_uint32_t me_signature; /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ @@ -2581,11 +2581,11 @@ struct MDBX_env { mdbx_ipclock_t *me_wlock; #endif /* MDBX_LOCKING > 0 */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ - volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + atomic_txnid_t *me_oldest; /* ID of oldest reader last time we looked */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; /* Number of freelist items that can fit in a single overflow page */ @@ -2593,12 +2593,12 @@ struct MDBX_env { unsigned me_branch_nodemax; /* max size of a branch-node */ uint32_t me_live_reader; /* have liveness lock in reader table */ void *me_userctx; /* User-settable context */ - volatile uint64_t *me_sync_timestamp; - volatile uint64_t *me_autosync_period; - volatile pgno_t *me_unsynced_pages; - volatile pgno_t *me_autosync_threshold; - volatile pgno_t *me_discarded_tail; - volatile uint32_t *me_meta_sync_txnid; + MDBX_atomic_uint64_t *me_sync_timestamp; + MDBX_atomic_uint64_t *me_autosync_period; + atomic_pgno_t *me_unsynced_pages; + atomic_pgno_t *me_autosync_threshold; + atomic_pgno_t *me_discarded_tail; + MDBX_atomic_uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ unsigned me_dp_reserve_len; struct { @@ -2615,13 +2615,13 @@ struct MDBX_env { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; #endif /* MDBX_LOCKING > 0 */ - txnid_t oldest; - uint64_t sync_timestamp; - uint64_t autosync_period; - pgno_t autosync_pending; - pgno_t autosync_threshold; - pgno_t discarded_tail; - uint32_t 
meta_sync_txnid; + atomic_txnid_t oldest; + MDBX_atomic_uint64_t sync_timestamp; + MDBX_atomic_uint64_t autosync_period; + atomic_pgno_t autosync_pending; + atomic_pgno_t autosync_threshold; + atomic_pgno_t discarded_tail; + MDBX_atomic_uint32_t meta_sync_txnid; } me_lckless_stub; #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ @@ -3043,7 +3043,7 @@ static __maybe_unused void static_checks(void) { } #endif // -// Copyright (c) 2020, Leonid Yuriev <leo@yuriev.ru>. +// Copyright (c) 2020-2021, Leonid Yuriev <leo@yuriev.ru>. // SPDX-License-Identifier: Apache-2.0 // // Non-inline part of the libmdbx C++ API (preliminary) @@ -4026,8 +4026,6 @@ template class LIBMDBX_API_TYPE buffer<polymorphic_allocator>; //------------------------------------------------------------------------------ -size_t env::default_pagesize() noexcept { return ::mdbx_syspagesize(); } - static inline MDBX_env_flags_t mode2flags(env::mode mode) { switch (mode) { default: diff --git a/libs/libmdbx/src/mdbx.h b/libs/libmdbx/src/mdbx.h index d5c5ec99ce..9d7dba99e9 100644 --- a/libs/libmdbx/src/mdbx.h +++ b/libs/libmdbx/src/mdbx.h @@ -19,7 +19,7 @@ _The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет \section copyright LICENSE & COPYRIGHT -\authors Copyright (c) 2015-2020, Leonid Yuriev <leo@yuriev.ru> +\authors Copyright (c) 2015-2021, Leonid Yuriev <leo@yuriev.ru> and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file. 
\copyright Redistribution and use in source and binary forms, with or without @@ -1380,7 +1380,7 @@ DEFINE_ENUM_FLAG_OPERATORS(MDBX_db_flags_t) /** \brief Data changing flags * \ingroup c_crud - * \see c_crud_hint + * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" * \see mdbx_put() \see mdbx_cursor_put() \see mdbx_replace() */ enum MDBX_put_flags_t { /** Upsertion by default (without any other flags) */ @@ -2852,6 +2852,12 @@ LIBMDBX_INLINE_API(int, mdbx_env_get_maxdbs, return rc; } +/** \brief Returns the default size of database page for the current system. + * \ingroup c_statinfo + * \details Default size of database page depends on the size of the system + * page and usually exactly match it. */ +MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API size_t mdbx_default_pagesize(void); + /** \brief Get the maximum size of keys can write. * \ingroup c_statinfo * @@ -3823,6 +3829,8 @@ LIBMDBX_API int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, * the count of the number of elements actually written. The `iov_base` of * the second \ref MDBX_val is unused. * + * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" + * * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_THREAD_MISMATCH Given transaction is not owned @@ -3876,6 +3884,8 @@ LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, * combination for selection particular item from * multi-value/duplicates. * + * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" + * * \returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *new_data, MDBX_val *old_data, @@ -3902,6 +3912,8 @@ LIBMDBX_API int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, * This function will return \ref MDBX_NOTFOUND if the specified key/data * pair is not in the database. 
* + * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" + * * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). * \param [in] key The key to delete from the database. @@ -3942,7 +3954,7 @@ LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, LIBMDBX_API MDBX_cursor *mdbx_cursor_create(void *context); /** \brief Set application information associated with the \ref MDBX_cursor. - * \ingroup c_crud + * \ingroup c_cursors * \see mdbx_cursor_get_userctx() * * \param [in] cursor An cursor handle returned by \ref mdbx_cursor_create() @@ -3953,7 +3965,7 @@ LIBMDBX_API MDBX_cursor *mdbx_cursor_create(void *context); LIBMDBX_API int mdbx_cursor_set_userctx(MDBX_cursor *cursor, void *ctx); /** \brief Get the application information associated with the MDBX_cursor. - * \ingroup c_crud + * \ingroup c_cursors * \see mdbx_cursor_set_userctx() * * \param [in] cursor An cursor handle returned by \ref mdbx_cursor_create() @@ -4096,7 +4108,7 @@ LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *cursor); LIBMDBX_API int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest); /** \brief Retrieve by cursor. - * \ingroup c_crud + * \ingroup c_cursors c_crud * * This function retrieves key/data pairs from the database. The address and * length of the key are returned in the object to which key refers (except @@ -4120,7 +4132,7 @@ LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op); /** \brief Store by cursor. - * \ingroup c_crud + * \ingroup c_cursors c_crud * * This function stores key/data pairs into the database. The cursor is * positioned at the new item, or on failure usually near it. @@ -4185,6 +4197,8 @@ LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, * the count of the number of elements actually written. The `iov_base` of * the second \ref MDBX_val is unused. 
* + * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" + * * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_THREAD_MISMATCH Given transaction is not owned @@ -4201,7 +4215,7 @@ LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, const MDBX_val *key, MDBX_val *data, MDBX_put_flags_t flags); /** \brief Delete current key/data pair. - * \ingroup c_crud + * \ingroup c_cursors c_crud * * This function deletes the key/data pair to which the cursor refers. This * does not invalidate the cursor, so operations such as \ref MDBX_NEXT can @@ -4218,6 +4232,8 @@ LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, const MDBX_val *key, * Delete all of the data items for the current key. This flag has effect * only for database(s) was created with \ref MDBX_DUPSORT. * + * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" + * * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_THREAD_MISMATCH Given transaction is not owned @@ -4231,7 +4247,7 @@ LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, const MDBX_val *key, LIBMDBX_API int mdbx_cursor_del(MDBX_cursor *cursor, MDBX_put_flags_t flags); /** \brief Return count of duplicates for current key. - * \ingroup c_crud + * \ingroup c_cursors c_crud * * This call is valid for all databases, but reasonable only for that support * sorted duplicate data items \ref MDBX_DUPSORT. @@ -4803,6 +4819,8 @@ typedef uint_fast64_t mdbx_attr_t; * keys are already known to be in the correct order. Loading unsorted * keys with this flag will cause a \ref MDBX_KEYEXIST error. 
* + * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" + * * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_EKEYMISMATCH @@ -4850,6 +4868,8 @@ LIBMDBX_API int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, * correct order. Loading unsorted keys with this flag will cause * a \ref MDBX_EKEYMISMATCH error. * + * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" + * * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_KEYEXIST diff --git a/libs/libmdbx/src/mdbx.h++ b/libs/libmdbx/src/mdbx.h++ index 29d3f6fdca..5ae3838aeb 100644 --- a/libs/libmdbx/src/mdbx.h++ +++ b/libs/libmdbx/src/mdbx.h++ @@ -1,7 +1,7 @@ /// \file mdbx.h++ /// \brief The libmdbx C++ API header file (preliminary). /// -/// \author Copyright (c) 2020, Leonid Yuriev <leo@yuriev.ru>. +/// \author Copyright (c) 2020-2021, Leonid Yuriev <leo@yuriev.ru>. /// \copyright SPDX-License-Identifier: Apache-2.0 /// /// Tested with: @@ -1838,7 +1838,9 @@ public: bool is_empty() const; /// \brief Returns default page size for current system/platform. - static size_t default_pagesize() noexcept; + static size_t default_pagesize() noexcept { + return ::mdbx_default_pagesize(); + } struct limits { limits() = delete; diff --git a/libs/libmdbx/src/mdbx_chk.c b/libs/libmdbx/src/mdbx_chk.c index 906c187347..a30003be28 100644 --- a/libs/libmdbx/src/mdbx_chk.c +++ b/libs/libmdbx/src/mdbx_chk.c @@ -1,7 +1,7 @@ /* mdbx_chk.c - memory-mapped database check tool */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -22,7 +22,7 @@ #define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 +#define MDBX_BUILD_SOURCERY 37ec6726f69ef69c4e52fcfb87d700f44f37246b1b635f455d2a429776a8eec6_v0_9_2_136_g9f0ff86 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -126,7 +126,7 @@ #include "mdbx.h" /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -558,7 +558,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -1021,15 +1021,17 @@ extern void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ /* Atomics */ -#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic)) #include <cstdatomic> -#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ +#define MDBX_HAVE_C11ATOMICS +#elif !defined(__cplusplus) && \ + (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ !(defined(__GNUC__) || defined(__clang__))) #include <stdatomic.h> +#define MDBX_HAVE_C11ATOMICS #elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ #elif defined(_MSC_VER) #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ #pragma warning(disable : 4133) /* 'function': incompatible types - from \ @@ -1065,14 +1067,6 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { _ReadWriteBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ __memory_barrier(); - if (type > MDBX_BARRIER_COMPILER) -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) - _mm_mfence(); -#else -#error "Unknown target for Intel Compiler, please report to us." 
-#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __compiler_barrier(); #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -1087,21 +1081,23 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) - atomic_thread_fence(__ATOMIC_SEQ_CST); +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) +#ifdef __clang__ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#else __atomic_thread_fence(__ATOMIC_SEQ_CST); +#endif #elif defined(__clang__) || defined(__GNUC__) __sync_synchronize(); -#elif defined(_MSC_VER) +#elif defined(_WIN32) || defined(_WIN64) MemoryBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__ia32__) _mm_mfence(); #else -#error "Unknown target for Intel Compiler, please report to us." 
+ __mf(); #endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __machine_rw_barrier(); @@ -1831,6 +1827,31 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /*----------------------------------------------------------------------------*/ /* Basic constants and types */ +typedef union { + volatile uint32_t weak; +#ifdef MDBX_HAVE_C11ATOMICS + volatile _Atomic uint32_t c11a; +#endif /* MDBX_HAVE_C11ATOMICS */ +} MDBX_atomic_uint32_t; + +typedef union { + volatile uint64_t weak; +#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) + volatile _Atomic uint64_t c11a; +#endif +#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC + __anonymous_struct_extension__ struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + MDBX_atomic_uint32_t low, high; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + MDBX_atomic_uint32_t high, low; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + }; +#endif +} MDBX_atomic_uint64_t; + /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the * maximum size of a data item. Data items larger than this size will @@ -1869,6 +1890,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ typedef uint32_t pgno_t; +typedef MDBX_atomic_uint32_t atomic_pgno_t; #define PRIaPGNO PRIu32 #define MAX_PAGENO UINT32_C(0x7FFFffff) #define MIN_PAGENO NUM_METAS @@ -1877,6 +1899,7 @@ typedef uint32_t pgno_t; /* A transaction ID. */ typedef uint64_t txnid_t; +typedef MDBX_atomic_uint64_t atomic_txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) @@ -1903,24 +1926,6 @@ typedef uint16_t indx_t; /* Core structures for database and shared memory (i.e. 
format definition) */ #pragma pack(push, 1) -typedef union mdbx_safe64 { - volatile uint64_t inconsistent; -#if MDBX_64BIT_ATOMIC - volatile uint64_t atomic; -#endif /* MDBX_64BIT_ATOMIC */ - __anonymous_struct_extension__ struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - volatile uint32_t low; - volatile uint32_t high; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - volatile uint32_t high; - volatile uint32_t low; -#else -#error "FIXME: Unsupported byte order" -#endif /* __BYTE_ORDER__ */ - }; -} mdbx_safe64_t; - /* Information about a single database in the environment. */ typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ @@ -2123,7 +2128,7 @@ typedef struct MDBX_reader { * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - mdbx_safe64_t /* txnid_t */ mr_txnid; + MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; /* The information we store in a single slot of the reader table. * In addition to a transaction ID, we also record the process and @@ -2135,23 +2140,18 @@ typedef struct MDBX_reader { * opening the lock file. */ /* The thread ID of the thread owning this txn. */ -#if MDBX_WORDBITS >= 64 - volatile uint64_t mr_tid; -#else - volatile uint32_t mr_tid; - volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch, - unused for now */ -#endif + MDBX_atomic_uint64_t mr_tid; + /* The process ID of the process owning this reader txn. */ - volatile uint32_t mr_pid; + MDBX_atomic_uint32_t mr_pid; /* The number of pages used in the reader's MVCC snapshot, * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ - volatile pgno_t mr_snapshot_pages_used; + atomic_pgno_t mr_snapshot_pages_used; /* Number of retired pages at the time this reader starts transaction. So, * at any time the difference mm_pages_retired - mr_snapshot_pages_retired * will give the number of pages which this reader restraining from reuse. 
*/ - volatile uint64_t mr_snapshot_pages_retired; + MDBX_atomic_uint64_t mr_snapshot_pages_retired; } MDBX_reader; /* The header for the reader table (a memory-mapped lock file). */ @@ -2164,25 +2164,25 @@ typedef struct MDBX_lockinfo { uint32_t mti_os_and_format; /* Flags which environment was opened. */ - volatile uint32_t mti_envmode; + MDBX_atomic_uint32_t mti_envmode; /* Threshold of un-synced-with-disk pages for auto-sync feature, * zero means no-threshold, i.e. auto-sync is disabled. */ - volatile pgno_t mti_autosync_threshold; + atomic_pgno_t mti_autosync_threshold; /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ - volatile uint32_t mti_meta_sync_txnid; + MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. * The time value is represented in a suitable system-dependent form, for * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). * Zero means timed auto-sync is disabled. */ - volatile uint64_t mti_autosync_period; + MDBX_atomic_uint64_t mti_autosync_period; /* Marker to distinguish uniqueness of DB/CLK.*/ - volatile uint64_t mti_bait_uniqueness; + MDBX_atomic_uint64_t mti_bait_uniqueness; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2191,21 +2191,21 @@ typedef struct MDBX_lockinfo { mdbx_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ - volatile txnid_t mti_oldest_reader; + atomic_txnid_t mti_oldest_reader; /* Timestamp of the last steady sync. Value is represented in a suitable * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or * clock_gettime(CLOCK_MONOTONIC). */ - volatile uint64_t mti_sync_timestamp; + MDBX_atomic_uint64_t mti_sync_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. 
*/ - volatile pgno_t mti_unsynced_pages; + atomic_pgno_t mti_unsynced_pages; /* Number of page which was discarded last time by madvise(MADV_FREE). */ - volatile pgno_t mti_discarded_tail; + atomic_pgno_t mti_discarded_tail; /* Timestamp of the last readers check. */ - volatile uint64_t mti_reader_check_timestamp; + MDBX_atomic_uint64_t mti_reader_check_timestamp; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2217,8 +2217,8 @@ typedef struct MDBX_lockinfo { /* The number of slots that have been used in the reader table. * This always records the maximum count, it is not decremented * when readers release their slots. */ - volatile unsigned mti_numreaders; - volatile unsigned mti_readers_refresh_flag; + MDBX_atomic_uint32_t mti_numreaders; + MDBX_atomic_uint32_t mti_readers_refresh_flag; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2273,7 +2273,7 @@ typedef struct MDBX_lockinfo { #if MDBX_WORDBITS >= 64 #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ - ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) + ((MAX_PAGESIZE - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) #define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 @@ -2557,7 +2557,7 @@ typedef struct MDBX_cursor_couple { /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) - uint32_t me_signature; + MDBX_atomic_uint32_t me_signature; /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ @@ -2603,11 +2603,11 @@ struct MDBX_env { mdbx_ipclock_t *me_wlock; #endif /* MDBX_LOCKING > 0 */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ - volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + atomic_txnid_t *me_oldest; /* ID of oldest reader last time we looked */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; /* Number of freelist items that can fit in a single overflow page */ @@ -2615,12 +2615,12 @@ struct MDBX_env { unsigned me_branch_nodemax; /* max size of a branch-node */ uint32_t me_live_reader; /* have liveness lock in reader table */ void *me_userctx; /* User-settable context */ - volatile uint64_t *me_sync_timestamp; - volatile uint64_t *me_autosync_period; - volatile pgno_t *me_unsynced_pages; - volatile pgno_t *me_autosync_threshold; - volatile pgno_t *me_discarded_tail; - volatile uint32_t *me_meta_sync_txnid; + MDBX_atomic_uint64_t *me_sync_timestamp; + MDBX_atomic_uint64_t *me_autosync_period; + atomic_pgno_t *me_unsynced_pages; + atomic_pgno_t *me_autosync_threshold; + atomic_pgno_t *me_discarded_tail; + MDBX_atomic_uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ unsigned me_dp_reserve_len; struct { @@ -2637,13 +2637,13 @@ struct MDBX_env { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; #endif /* MDBX_LOCKING > 0 */ - txnid_t oldest; - uint64_t sync_timestamp; - uint64_t autosync_period; - pgno_t autosync_pending; - pgno_t autosync_threshold; - pgno_t discarded_tail; - uint32_t 
meta_sync_txnid; + atomic_txnid_t oldest; + MDBX_atomic_uint64_t sync_timestamp; + MDBX_atomic_uint64_t autosync_period; + atomic_pgno_t autosync_pending; + atomic_pgno_t autosync_threshold; + atomic_pgno_t discarded_tail; + MDBX_atomic_uint32_t meta_sync_txnid; } me_lckless_stub; #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ @@ -4695,7 +4695,7 @@ int main(int argc, char *argv[]) { value = envinfo.mi_mapsize / envstat.ms_psize - alloc_pages; print(", remained %" PRIu64 " (%.1f%%)", value, value / percent); - value = alloc_pages - gc_pages; + value = dont_traversal ? alloc_pages - gc_pages : walk.pgcount; print(", used %" PRIu64 " (%.1f%%)", value, value / percent); print(", gc %" PRIu64 " (%.1f%%)", gc_pages, gc_pages / percent); diff --git a/libs/libmdbx/src/mdbx_copy.c b/libs/libmdbx/src/mdbx_copy.c index 0c831b4528..faec17ac15 100644 --- a/libs/libmdbx/src/mdbx_copy.c +++ b/libs/libmdbx/src/mdbx_copy.c @@ -1,7 +1,7 @@ /* mdbx_copy.c - memory-mapped database backup tool */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. 
*/ -#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 +#define MDBX_BUILD_SOURCERY 37ec6726f69ef69c4e52fcfb87d700f44f37246b1b635f455d2a429776a8eec6_v0_9_2_136_g9f0ff86 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -126,7 +126,7 @@ #include "mdbx.h" /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -558,7 +558,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1021,15 +1021,17 @@ extern void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ /* Atomics */ -#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic)) #include <cstdatomic> -#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ +#define MDBX_HAVE_C11ATOMICS +#elif !defined(__cplusplus) && \ + (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ !(defined(__GNUC__) || defined(__clang__))) #include <stdatomic.h> +#define MDBX_HAVE_C11ATOMICS #elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ #elif defined(_MSC_VER) #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ #pragma warning(disable : 4133) /* 'function': incompatible types - from \ @@ -1065,14 +1067,6 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { _ReadWriteBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic 
GCC and MSC */ __memory_barrier(); - if (type > MDBX_BARRIER_COMPILER) -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) - _mm_mfence(); -#else -#error "Unknown target for Intel Compiler, please report to us." -#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __compiler_barrier(); #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -1087,21 +1081,23 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) - atomic_thread_fence(__ATOMIC_SEQ_CST); +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) +#ifdef __clang__ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#else __atomic_thread_fence(__ATOMIC_SEQ_CST); +#endif #elif defined(__clang__) || defined(__GNUC__) __sync_synchronize(); -#elif defined(_MSC_VER) +#elif defined(_WIN32) || defined(_WIN64) MemoryBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__ia32__) _mm_mfence(); #else -#error "Unknown target for Intel Compiler, please report to us." 
+ __mf(); #endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __machine_rw_barrier(); @@ -1831,6 +1827,31 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /*----------------------------------------------------------------------------*/ /* Basic constants and types */ +typedef union { + volatile uint32_t weak; +#ifdef MDBX_HAVE_C11ATOMICS + volatile _Atomic uint32_t c11a; +#endif /* MDBX_HAVE_C11ATOMICS */ +} MDBX_atomic_uint32_t; + +typedef union { + volatile uint64_t weak; +#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) + volatile _Atomic uint64_t c11a; +#endif +#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC + __anonymous_struct_extension__ struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + MDBX_atomic_uint32_t low, high; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + MDBX_atomic_uint32_t high, low; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + }; +#endif +} MDBX_atomic_uint64_t; + /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the * maximum size of a data item. Data items larger than this size will @@ -1869,6 +1890,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ typedef uint32_t pgno_t; +typedef MDBX_atomic_uint32_t atomic_pgno_t; #define PRIaPGNO PRIu32 #define MAX_PAGENO UINT32_C(0x7FFFffff) #define MIN_PAGENO NUM_METAS @@ -1877,6 +1899,7 @@ typedef uint32_t pgno_t; /* A transaction ID. */ typedef uint64_t txnid_t; +typedef MDBX_atomic_uint64_t atomic_txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) @@ -1903,24 +1926,6 @@ typedef uint16_t indx_t; /* Core structures for database and shared memory (i.e. 
format definition) */ #pragma pack(push, 1) -typedef union mdbx_safe64 { - volatile uint64_t inconsistent; -#if MDBX_64BIT_ATOMIC - volatile uint64_t atomic; -#endif /* MDBX_64BIT_ATOMIC */ - __anonymous_struct_extension__ struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - volatile uint32_t low; - volatile uint32_t high; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - volatile uint32_t high; - volatile uint32_t low; -#else -#error "FIXME: Unsupported byte order" -#endif /* __BYTE_ORDER__ */ - }; -} mdbx_safe64_t; - /* Information about a single database in the environment. */ typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ @@ -2123,7 +2128,7 @@ typedef struct MDBX_reader { * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - mdbx_safe64_t /* txnid_t */ mr_txnid; + MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; /* The information we store in a single slot of the reader table. * In addition to a transaction ID, we also record the process and @@ -2135,23 +2140,18 @@ typedef struct MDBX_reader { * opening the lock file. */ /* The thread ID of the thread owning this txn. */ -#if MDBX_WORDBITS >= 64 - volatile uint64_t mr_tid; -#else - volatile uint32_t mr_tid; - volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch, - unused for now */ -#endif + MDBX_atomic_uint64_t mr_tid; + /* The process ID of the process owning this reader txn. */ - volatile uint32_t mr_pid; + MDBX_atomic_uint32_t mr_pid; /* The number of pages used in the reader's MVCC snapshot, * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ - volatile pgno_t mr_snapshot_pages_used; + atomic_pgno_t mr_snapshot_pages_used; /* Number of retired pages at the time this reader starts transaction. So, * at any time the difference mm_pages_retired - mr_snapshot_pages_retired * will give the number of pages which this reader restraining from reuse. 
*/ - volatile uint64_t mr_snapshot_pages_retired; + MDBX_atomic_uint64_t mr_snapshot_pages_retired; } MDBX_reader; /* The header for the reader table (a memory-mapped lock file). */ @@ -2164,25 +2164,25 @@ typedef struct MDBX_lockinfo { uint32_t mti_os_and_format; /* Flags which environment was opened. */ - volatile uint32_t mti_envmode; + MDBX_atomic_uint32_t mti_envmode; /* Threshold of un-synced-with-disk pages for auto-sync feature, * zero means no-threshold, i.e. auto-sync is disabled. */ - volatile pgno_t mti_autosync_threshold; + atomic_pgno_t mti_autosync_threshold; /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ - volatile uint32_t mti_meta_sync_txnid; + MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. * The time value is represented in a suitable system-dependent form, for * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). * Zero means timed auto-sync is disabled. */ - volatile uint64_t mti_autosync_period; + MDBX_atomic_uint64_t mti_autosync_period; /* Marker to distinguish uniqueness of DB/CLK.*/ - volatile uint64_t mti_bait_uniqueness; + MDBX_atomic_uint64_t mti_bait_uniqueness; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2191,21 +2191,21 @@ typedef struct MDBX_lockinfo { mdbx_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ - volatile txnid_t mti_oldest_reader; + atomic_txnid_t mti_oldest_reader; /* Timestamp of the last steady sync. Value is represented in a suitable * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or * clock_gettime(CLOCK_MONOTONIC). */ - volatile uint64_t mti_sync_timestamp; + MDBX_atomic_uint64_t mti_sync_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. 
*/ - volatile pgno_t mti_unsynced_pages; + atomic_pgno_t mti_unsynced_pages; /* Number of page which was discarded last time by madvise(MADV_FREE). */ - volatile pgno_t mti_discarded_tail; + atomic_pgno_t mti_discarded_tail; /* Timestamp of the last readers check. */ - volatile uint64_t mti_reader_check_timestamp; + MDBX_atomic_uint64_t mti_reader_check_timestamp; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2217,8 +2217,8 @@ typedef struct MDBX_lockinfo { /* The number of slots that have been used in the reader table. * This always records the maximum count, it is not decremented * when readers release their slots. */ - volatile unsigned mti_numreaders; - volatile unsigned mti_readers_refresh_flag; + MDBX_atomic_uint32_t mti_numreaders; + MDBX_atomic_uint32_t mti_readers_refresh_flag; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2273,7 +2273,7 @@ typedef struct MDBX_lockinfo { #if MDBX_WORDBITS >= 64 #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ - ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) + ((MAX_PAGESIZE - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) #define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 @@ -2557,7 +2557,7 @@ typedef struct MDBX_cursor_couple { /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) - uint32_t me_signature; + MDBX_atomic_uint32_t me_signature; /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ @@ -2603,11 +2603,11 @@ struct MDBX_env { mdbx_ipclock_t *me_wlock; #endif /* MDBX_LOCKING > 0 */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ - volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + atomic_txnid_t *me_oldest; /* ID of oldest reader last time we looked */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; /* Number of freelist items that can fit in a single overflow page */ @@ -2615,12 +2615,12 @@ struct MDBX_env { unsigned me_branch_nodemax; /* max size of a branch-node */ uint32_t me_live_reader; /* have liveness lock in reader table */ void *me_userctx; /* User-settable context */ - volatile uint64_t *me_sync_timestamp; - volatile uint64_t *me_autosync_period; - volatile pgno_t *me_unsynced_pages; - volatile pgno_t *me_autosync_threshold; - volatile pgno_t *me_discarded_tail; - volatile uint32_t *me_meta_sync_txnid; + MDBX_atomic_uint64_t *me_sync_timestamp; + MDBX_atomic_uint64_t *me_autosync_period; + atomic_pgno_t *me_unsynced_pages; + atomic_pgno_t *me_autosync_threshold; + atomic_pgno_t *me_discarded_tail; + MDBX_atomic_uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ unsigned me_dp_reserve_len; struct { @@ -2637,13 +2637,13 @@ struct MDBX_env { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; #endif /* MDBX_LOCKING > 0 */ - txnid_t oldest; - uint64_t sync_timestamp; - uint64_t autosync_period; - pgno_t autosync_pending; - pgno_t autosync_threshold; - pgno_t discarded_tail; - uint32_t 
meta_sync_txnid; + atomic_txnid_t oldest; + MDBX_atomic_uint64_t sync_timestamp; + MDBX_atomic_uint64_t autosync_period; + atomic_pgno_t autosync_pending; + atomic_pgno_t autosync_threshold; + atomic_pgno_t discarded_tail; + MDBX_atomic_uint32_t meta_sync_txnid; } me_lckless_stub; #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ diff --git a/libs/libmdbx/src/mdbx_dump.c b/libs/libmdbx/src/mdbx_dump.c index 6b7a5aaa40..abf82b0d7c 100644 --- a/libs/libmdbx/src/mdbx_dump.c +++ b/libs/libmdbx/src/mdbx_dump.c @@ -1,7 +1,7 @@ /* mdbx_dump.c - memory-mapped database dump tool */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 +#define MDBX_BUILD_SOURCERY 37ec6726f69ef69c4e52fcfb87d700f44f37246b1b635f455d2a429776a8eec6_v0_9_2_136_g9f0ff86 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -126,7 +126,7 @@ #include "mdbx.h" /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -558,7 +558,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. 
* All rights reserved. * @@ -1021,15 +1021,17 @@ extern void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ /* Atomics */ -#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic)) #include <cstdatomic> -#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ +#define MDBX_HAVE_C11ATOMICS +#elif !defined(__cplusplus) && \ + (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ !(defined(__GNUC__) || defined(__clang__))) #include <stdatomic.h> +#define MDBX_HAVE_C11ATOMICS #elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ #elif defined(_MSC_VER) #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ #pragma warning(disable : 4133) /* 'function': incompatible types - from \ @@ -1065,14 +1067,6 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { _ReadWriteBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ __memory_barrier(); - if (type > MDBX_BARRIER_COMPILER) -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) - _mm_mfence(); -#else -#error "Unknown target for Intel Compiler, please report to us." 
-#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __compiler_barrier(); #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -1087,21 +1081,23 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) - atomic_thread_fence(__ATOMIC_SEQ_CST); +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) +#ifdef __clang__ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#else __atomic_thread_fence(__ATOMIC_SEQ_CST); +#endif #elif defined(__clang__) || defined(__GNUC__) __sync_synchronize(); -#elif defined(_MSC_VER) +#elif defined(_WIN32) || defined(_WIN64) MemoryBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__ia32__) _mm_mfence(); #else -#error "Unknown target for Intel Compiler, please report to us." 
+ __mf(); #endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __machine_rw_barrier(); @@ -1831,6 +1827,31 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /*----------------------------------------------------------------------------*/ /* Basic constants and types */ +typedef union { + volatile uint32_t weak; +#ifdef MDBX_HAVE_C11ATOMICS + volatile _Atomic uint32_t c11a; +#endif /* MDBX_HAVE_C11ATOMICS */ +} MDBX_atomic_uint32_t; + +typedef union { + volatile uint64_t weak; +#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) + volatile _Atomic uint64_t c11a; +#endif +#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC + __anonymous_struct_extension__ struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + MDBX_atomic_uint32_t low, high; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + MDBX_atomic_uint32_t high, low; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + }; +#endif +} MDBX_atomic_uint64_t; + /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the * maximum size of a data item. Data items larger than this size will @@ -1869,6 +1890,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ typedef uint32_t pgno_t; +typedef MDBX_atomic_uint32_t atomic_pgno_t; #define PRIaPGNO PRIu32 #define MAX_PAGENO UINT32_C(0x7FFFffff) #define MIN_PAGENO NUM_METAS @@ -1877,6 +1899,7 @@ typedef uint32_t pgno_t; /* A transaction ID. */ typedef uint64_t txnid_t; +typedef MDBX_atomic_uint64_t atomic_txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) @@ -1903,24 +1926,6 @@ typedef uint16_t indx_t; /* Core structures for database and shared memory (i.e. 
format definition) */ #pragma pack(push, 1) -typedef union mdbx_safe64 { - volatile uint64_t inconsistent; -#if MDBX_64BIT_ATOMIC - volatile uint64_t atomic; -#endif /* MDBX_64BIT_ATOMIC */ - __anonymous_struct_extension__ struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - volatile uint32_t low; - volatile uint32_t high; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - volatile uint32_t high; - volatile uint32_t low; -#else -#error "FIXME: Unsupported byte order" -#endif /* __BYTE_ORDER__ */ - }; -} mdbx_safe64_t; - /* Information about a single database in the environment. */ typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ @@ -2123,7 +2128,7 @@ typedef struct MDBX_reader { * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - mdbx_safe64_t /* txnid_t */ mr_txnid; + MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; /* The information we store in a single slot of the reader table. * In addition to a transaction ID, we also record the process and @@ -2135,23 +2140,18 @@ typedef struct MDBX_reader { * opening the lock file. */ /* The thread ID of the thread owning this txn. */ -#if MDBX_WORDBITS >= 64 - volatile uint64_t mr_tid; -#else - volatile uint32_t mr_tid; - volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch, - unused for now */ -#endif + MDBX_atomic_uint64_t mr_tid; + /* The process ID of the process owning this reader txn. */ - volatile uint32_t mr_pid; + MDBX_atomic_uint32_t mr_pid; /* The number of pages used in the reader's MVCC snapshot, * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ - volatile pgno_t mr_snapshot_pages_used; + atomic_pgno_t mr_snapshot_pages_used; /* Number of retired pages at the time this reader starts transaction. So, * at any time the difference mm_pages_retired - mr_snapshot_pages_retired * will give the number of pages which this reader restraining from reuse. 
*/ - volatile uint64_t mr_snapshot_pages_retired; + MDBX_atomic_uint64_t mr_snapshot_pages_retired; } MDBX_reader; /* The header for the reader table (a memory-mapped lock file). */ @@ -2164,25 +2164,25 @@ typedef struct MDBX_lockinfo { uint32_t mti_os_and_format; /* Flags which environment was opened. */ - volatile uint32_t mti_envmode; + MDBX_atomic_uint32_t mti_envmode; /* Threshold of un-synced-with-disk pages for auto-sync feature, * zero means no-threshold, i.e. auto-sync is disabled. */ - volatile pgno_t mti_autosync_threshold; + atomic_pgno_t mti_autosync_threshold; /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ - volatile uint32_t mti_meta_sync_txnid; + MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. * The time value is represented in a suitable system-dependent form, for * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). * Zero means timed auto-sync is disabled. */ - volatile uint64_t mti_autosync_period; + MDBX_atomic_uint64_t mti_autosync_period; /* Marker to distinguish uniqueness of DB/CLK.*/ - volatile uint64_t mti_bait_uniqueness; + MDBX_atomic_uint64_t mti_bait_uniqueness; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2191,21 +2191,21 @@ typedef struct MDBX_lockinfo { mdbx_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ - volatile txnid_t mti_oldest_reader; + atomic_txnid_t mti_oldest_reader; /* Timestamp of the last steady sync. Value is represented in a suitable * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or * clock_gettime(CLOCK_MONOTONIC). */ - volatile uint64_t mti_sync_timestamp; + MDBX_atomic_uint64_t mti_sync_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. 
*/ - volatile pgno_t mti_unsynced_pages; + atomic_pgno_t mti_unsynced_pages; /* Number of page which was discarded last time by madvise(MADV_FREE). */ - volatile pgno_t mti_discarded_tail; + atomic_pgno_t mti_discarded_tail; /* Timestamp of the last readers check. */ - volatile uint64_t mti_reader_check_timestamp; + MDBX_atomic_uint64_t mti_reader_check_timestamp; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2217,8 +2217,8 @@ typedef struct MDBX_lockinfo { /* The number of slots that have been used in the reader table. * This always records the maximum count, it is not decremented * when readers release their slots. */ - volatile unsigned mti_numreaders; - volatile unsigned mti_readers_refresh_flag; + MDBX_atomic_uint32_t mti_numreaders; + MDBX_atomic_uint32_t mti_readers_refresh_flag; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2273,7 +2273,7 @@ typedef struct MDBX_lockinfo { #if MDBX_WORDBITS >= 64 #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ - ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) + ((MAX_PAGESIZE - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) #define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 @@ -2557,7 +2557,7 @@ typedef struct MDBX_cursor_couple { /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) - uint32_t me_signature; + MDBX_atomic_uint32_t me_signature; /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ @@ -2603,11 +2603,11 @@ struct MDBX_env { mdbx_ipclock_t *me_wlock; #endif /* MDBX_LOCKING > 0 */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ - volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + atomic_txnid_t *me_oldest; /* ID of oldest reader last time we looked */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; /* Number of freelist items that can fit in a single overflow page */ @@ -2615,12 +2615,12 @@ struct MDBX_env { unsigned me_branch_nodemax; /* max size of a branch-node */ uint32_t me_live_reader; /* have liveness lock in reader table */ void *me_userctx; /* User-settable context */ - volatile uint64_t *me_sync_timestamp; - volatile uint64_t *me_autosync_period; - volatile pgno_t *me_unsynced_pages; - volatile pgno_t *me_autosync_threshold; - volatile pgno_t *me_discarded_tail; - volatile uint32_t *me_meta_sync_txnid; + MDBX_atomic_uint64_t *me_sync_timestamp; + MDBX_atomic_uint64_t *me_autosync_period; + atomic_pgno_t *me_unsynced_pages; + atomic_pgno_t *me_autosync_threshold; + atomic_pgno_t *me_discarded_tail; + MDBX_atomic_uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ unsigned me_dp_reserve_len; struct { @@ -2637,13 +2637,13 @@ struct MDBX_env { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; #endif /* MDBX_LOCKING > 0 */ - txnid_t oldest; - uint64_t sync_timestamp; - uint64_t autosync_period; - pgno_t autosync_pending; - pgno_t autosync_threshold; - pgno_t discarded_tail; - uint32_t 
meta_sync_txnid; + atomic_txnid_t oldest; + MDBX_atomic_uint64_t sync_timestamp; + MDBX_atomic_uint64_t autosync_period; + atomic_pgno_t autosync_pending; + atomic_pgno_t autosync_threshold; + atomic_pgno_t discarded_tail; + MDBX_atomic_uint32_t meta_sync_txnid; } me_lckless_stub; #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ diff --git a/libs/libmdbx/src/mdbx_load.c b/libs/libmdbx/src/mdbx_load.c index ad9ddc90e6..fc0021bf1c 100644 --- a/libs/libmdbx/src/mdbx_load.c +++ b/libs/libmdbx/src/mdbx_load.c @@ -1,7 +1,7 @@ /* mdbx_load.c - memory-mapped database load tool */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 +#define MDBX_BUILD_SOURCERY 37ec6726f69ef69c4e52fcfb87d700f44f37246b1b635f455d2a429776a8eec6_v0_9_2_136_g9f0ff86 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -126,7 +126,7 @@ #include "mdbx.h" /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -558,7 +558,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. 
* All rights reserved. * @@ -1021,15 +1021,17 @@ extern void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ /* Atomics */ -#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic)) #include <cstdatomic> -#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ +#define MDBX_HAVE_C11ATOMICS +#elif !defined(__cplusplus) && \ + (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ !(defined(__GNUC__) || defined(__clang__))) #include <stdatomic.h> +#define MDBX_HAVE_C11ATOMICS #elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ #elif defined(_MSC_VER) #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ #pragma warning(disable : 4133) /* 'function': incompatible types - from \ @@ -1065,14 +1067,6 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { _ReadWriteBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ __memory_barrier(); - if (type > MDBX_BARRIER_COMPILER) -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) - _mm_mfence(); -#else -#error "Unknown target for Intel Compiler, please report to us." 
-#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __compiler_barrier(); #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -1087,21 +1081,23 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) - atomic_thread_fence(__ATOMIC_SEQ_CST); +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) +#ifdef __clang__ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#else __atomic_thread_fence(__ATOMIC_SEQ_CST); +#endif #elif defined(__clang__) || defined(__GNUC__) __sync_synchronize(); -#elif defined(_MSC_VER) +#elif defined(_WIN32) || defined(_WIN64) MemoryBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__ia32__) _mm_mfence(); #else -#error "Unknown target for Intel Compiler, please report to us." 
+ __mf(); #endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __machine_rw_barrier(); @@ -1831,6 +1827,31 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /*----------------------------------------------------------------------------*/ /* Basic constants and types */ +typedef union { + volatile uint32_t weak; +#ifdef MDBX_HAVE_C11ATOMICS + volatile _Atomic uint32_t c11a; +#endif /* MDBX_HAVE_C11ATOMICS */ +} MDBX_atomic_uint32_t; + +typedef union { + volatile uint64_t weak; +#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) + volatile _Atomic uint64_t c11a; +#endif +#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC + __anonymous_struct_extension__ struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + MDBX_atomic_uint32_t low, high; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + MDBX_atomic_uint32_t high, low; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + }; +#endif +} MDBX_atomic_uint64_t; + /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the * maximum size of a data item. Data items larger than this size will @@ -1869,6 +1890,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ typedef uint32_t pgno_t; +typedef MDBX_atomic_uint32_t atomic_pgno_t; #define PRIaPGNO PRIu32 #define MAX_PAGENO UINT32_C(0x7FFFffff) #define MIN_PAGENO NUM_METAS @@ -1877,6 +1899,7 @@ typedef uint32_t pgno_t; /* A transaction ID. */ typedef uint64_t txnid_t; +typedef MDBX_atomic_uint64_t atomic_txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) @@ -1903,24 +1926,6 @@ typedef uint16_t indx_t; /* Core structures for database and shared memory (i.e. 
format definition) */ #pragma pack(push, 1) -typedef union mdbx_safe64 { - volatile uint64_t inconsistent; -#if MDBX_64BIT_ATOMIC - volatile uint64_t atomic; -#endif /* MDBX_64BIT_ATOMIC */ - __anonymous_struct_extension__ struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - volatile uint32_t low; - volatile uint32_t high; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - volatile uint32_t high; - volatile uint32_t low; -#else -#error "FIXME: Unsupported byte order" -#endif /* __BYTE_ORDER__ */ - }; -} mdbx_safe64_t; - /* Information about a single database in the environment. */ typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ @@ -2123,7 +2128,7 @@ typedef struct MDBX_reader { * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - mdbx_safe64_t /* txnid_t */ mr_txnid; + MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; /* The information we store in a single slot of the reader table. * In addition to a transaction ID, we also record the process and @@ -2135,23 +2140,18 @@ typedef struct MDBX_reader { * opening the lock file. */ /* The thread ID of the thread owning this txn. */ -#if MDBX_WORDBITS >= 64 - volatile uint64_t mr_tid; -#else - volatile uint32_t mr_tid; - volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch, - unused for now */ -#endif + MDBX_atomic_uint64_t mr_tid; + /* The process ID of the process owning this reader txn. */ - volatile uint32_t mr_pid; + MDBX_atomic_uint32_t mr_pid; /* The number of pages used in the reader's MVCC snapshot, * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ - volatile pgno_t mr_snapshot_pages_used; + atomic_pgno_t mr_snapshot_pages_used; /* Number of retired pages at the time this reader starts transaction. So, * at any time the difference mm_pages_retired - mr_snapshot_pages_retired * will give the number of pages which this reader restraining from reuse. 
*/ - volatile uint64_t mr_snapshot_pages_retired; + MDBX_atomic_uint64_t mr_snapshot_pages_retired; } MDBX_reader; /* The header for the reader table (a memory-mapped lock file). */ @@ -2164,25 +2164,25 @@ typedef struct MDBX_lockinfo { uint32_t mti_os_and_format; /* Flags which environment was opened. */ - volatile uint32_t mti_envmode; + MDBX_atomic_uint32_t mti_envmode; /* Threshold of un-synced-with-disk pages for auto-sync feature, * zero means no-threshold, i.e. auto-sync is disabled. */ - volatile pgno_t mti_autosync_threshold; + atomic_pgno_t mti_autosync_threshold; /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ - volatile uint32_t mti_meta_sync_txnid; + MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. * The time value is represented in a suitable system-dependent form, for * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). * Zero means timed auto-sync is disabled. */ - volatile uint64_t mti_autosync_period; + MDBX_atomic_uint64_t mti_autosync_period; /* Marker to distinguish uniqueness of DB/CLK.*/ - volatile uint64_t mti_bait_uniqueness; + MDBX_atomic_uint64_t mti_bait_uniqueness; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2191,21 +2191,21 @@ typedef struct MDBX_lockinfo { mdbx_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ - volatile txnid_t mti_oldest_reader; + atomic_txnid_t mti_oldest_reader; /* Timestamp of the last steady sync. Value is represented in a suitable * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or * clock_gettime(CLOCK_MONOTONIC). */ - volatile uint64_t mti_sync_timestamp; + MDBX_atomic_uint64_t mti_sync_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. 
*/ - volatile pgno_t mti_unsynced_pages; + atomic_pgno_t mti_unsynced_pages; /* Number of page which was discarded last time by madvise(MADV_FREE). */ - volatile pgno_t mti_discarded_tail; + atomic_pgno_t mti_discarded_tail; /* Timestamp of the last readers check. */ - volatile uint64_t mti_reader_check_timestamp; + MDBX_atomic_uint64_t mti_reader_check_timestamp; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2217,8 +2217,8 @@ typedef struct MDBX_lockinfo { /* The number of slots that have been used in the reader table. * This always records the maximum count, it is not decremented * when readers release their slots. */ - volatile unsigned mti_numreaders; - volatile unsigned mti_readers_refresh_flag; + MDBX_atomic_uint32_t mti_numreaders; + MDBX_atomic_uint32_t mti_readers_refresh_flag; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2273,7 +2273,7 @@ typedef struct MDBX_lockinfo { #if MDBX_WORDBITS >= 64 #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ - ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) + ((MAX_PAGESIZE - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) #define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 @@ -2557,7 +2557,7 @@ typedef struct MDBX_cursor_couple { /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) - uint32_t me_signature; + MDBX_atomic_uint32_t me_signature; /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ @@ -2603,11 +2603,11 @@ struct MDBX_env { mdbx_ipclock_t *me_wlock; #endif /* MDBX_LOCKING > 0 */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ - volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + atomic_txnid_t *me_oldest; /* ID of oldest reader last time we looked */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; /* Number of freelist items that can fit in a single overflow page */ @@ -2615,12 +2615,12 @@ struct MDBX_env { unsigned me_branch_nodemax; /* max size of a branch-node */ uint32_t me_live_reader; /* have liveness lock in reader table */ void *me_userctx; /* User-settable context */ - volatile uint64_t *me_sync_timestamp; - volatile uint64_t *me_autosync_period; - volatile pgno_t *me_unsynced_pages; - volatile pgno_t *me_autosync_threshold; - volatile pgno_t *me_discarded_tail; - volatile uint32_t *me_meta_sync_txnid; + MDBX_atomic_uint64_t *me_sync_timestamp; + MDBX_atomic_uint64_t *me_autosync_period; + atomic_pgno_t *me_unsynced_pages; + atomic_pgno_t *me_autosync_threshold; + atomic_pgno_t *me_discarded_tail; + MDBX_atomic_uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ unsigned me_dp_reserve_len; struct { @@ -2637,13 +2637,13 @@ struct MDBX_env { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; #endif /* MDBX_LOCKING > 0 */ - txnid_t oldest; - uint64_t sync_timestamp; - uint64_t autosync_period; - pgno_t autosync_pending; - pgno_t autosync_threshold; - pgno_t discarded_tail; - uint32_t 
meta_sync_txnid; + atomic_txnid_t oldest; + MDBX_atomic_uint64_t sync_timestamp; + MDBX_atomic_uint64_t autosync_period; + atomic_pgno_t autosync_pending; + atomic_pgno_t autosync_threshold; + atomic_pgno_t discarded_tail; + MDBX_atomic_uint32_t meta_sync_txnid; } me_lckless_stub; #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ diff --git a/libs/libmdbx/src/mdbx_stat.c b/libs/libmdbx/src/mdbx_stat.c index d19c672ff0..30e9e5cc29 100644 --- a/libs/libmdbx/src/mdbx_stat.c +++ b/libs/libmdbx/src/mdbx_stat.c @@ -1,7 +1,7 @@ /* mdbx_stat.c - memory-mapped database status tool */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. */ -#define MDBX_BUILD_SOURCERY 69c11536d7625297e4095e26adca038aaec4105753b20848ae5dfd8bc130d747_v0_9_2_110_g4e13d12 +#define MDBX_BUILD_SOURCERY 37ec6726f69ef69c4e52fcfb87d700f44f37246b1b635f455d2a429776a8eec6_v0_9_2_136_g9f0ff86 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -126,7 +126,7 @@ #include "mdbx.h" /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -558,7 +558,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2020 Leonid Yuriev <leo@yuriev.ru> + * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. 
* All rights reserved. * @@ -1021,15 +1021,17 @@ extern void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ /* Atomics */ -#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>) +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic)) #include <cstdatomic> -#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \ +#define MDBX_HAVE_C11ATOMICS +#elif !defined(__cplusplus) && \ + (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ !(defined(__GNUC__) || defined(__clang__))) #include <stdatomic.h> +#define MDBX_HAVE_C11ATOMICS #elif defined(__GNUC__) || defined(__clang__) -/* LY: nothing required */ #elif defined(_MSC_VER) #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ #pragma warning(disable : 4133) /* 'function': incompatible types - from \ @@ -1065,14 +1067,6 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { _ReadWriteBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ __memory_barrier(); - if (type > MDBX_BARRIER_COMPILER) -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) - _mm_mfence(); -#else -#error "Unknown target for Intel Compiler, please report to us." 
-#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __compiler_barrier(); #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -1087,21 +1081,23 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { } static __maybe_unused __inline void mdbx_memory_barrier(void) { -#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__) - atomic_thread_fence(__ATOMIC_SEQ_CST); +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) +#ifdef __clang__ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#else __atomic_thread_fence(__ATOMIC_SEQ_CST); +#endif #elif defined(__clang__) || defined(__GNUC__) __sync_synchronize(); -#elif defined(_MSC_VER) +#elif defined(_WIN32) || defined(_WIN64) MemoryBarrier(); #elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ -#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__ia32__) _mm_mfence(); #else -#error "Unknown target for Intel Compiler, please report to us." 
+ __mf(); #endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) __machine_rw_barrier(); @@ -1831,6 +1827,31 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /*----------------------------------------------------------------------------*/ /* Basic constants and types */ +typedef union { + volatile uint32_t weak; +#ifdef MDBX_HAVE_C11ATOMICS + volatile _Atomic uint32_t c11a; +#endif /* MDBX_HAVE_C11ATOMICS */ +} MDBX_atomic_uint32_t; + +typedef union { + volatile uint64_t weak; +#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) + volatile _Atomic uint64_t c11a; +#endif +#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC + __anonymous_struct_extension__ struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + MDBX_atomic_uint32_t low, high; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + MDBX_atomic_uint32_t high, low; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + }; +#endif +} MDBX_atomic_uint64_t; + /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the * maximum size of a data item. Data items larger than this size will @@ -1869,6 +1890,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ typedef uint32_t pgno_t; +typedef MDBX_atomic_uint32_t atomic_pgno_t; #define PRIaPGNO PRIu32 #define MAX_PAGENO UINT32_C(0x7FFFffff) #define MIN_PAGENO NUM_METAS @@ -1877,6 +1899,7 @@ typedef uint32_t pgno_t; /* A transaction ID. */ typedef uint64_t txnid_t; +typedef MDBX_atomic_uint64_t atomic_txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) @@ -1903,24 +1926,6 @@ typedef uint16_t indx_t; /* Core structures for database and shared memory (i.e. 
format definition) */ #pragma pack(push, 1) -typedef union mdbx_safe64 { - volatile uint64_t inconsistent; -#if MDBX_64BIT_ATOMIC - volatile uint64_t atomic; -#endif /* MDBX_64BIT_ATOMIC */ - __anonymous_struct_extension__ struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - volatile uint32_t low; - volatile uint32_t high; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - volatile uint32_t high; - volatile uint32_t low; -#else -#error "FIXME: Unsupported byte order" -#endif /* __BYTE_ORDER__ */ - }; -} mdbx_safe64_t; - /* Information about a single database in the environment. */ typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ @@ -2123,7 +2128,7 @@ typedef struct MDBX_reader { * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - mdbx_safe64_t /* txnid_t */ mr_txnid; + MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; /* The information we store in a single slot of the reader table. * In addition to a transaction ID, we also record the process and @@ -2135,23 +2140,18 @@ typedef struct MDBX_reader { * opening the lock file. */ /* The thread ID of the thread owning this txn. */ -#if MDBX_WORDBITS >= 64 - volatile uint64_t mr_tid; -#else - volatile uint32_t mr_tid; - volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch, - unused for now */ -#endif + MDBX_atomic_uint64_t mr_tid; + /* The process ID of the process owning this reader txn. */ - volatile uint32_t mr_pid; + MDBX_atomic_uint32_t mr_pid; /* The number of pages used in the reader's MVCC snapshot, * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ - volatile pgno_t mr_snapshot_pages_used; + atomic_pgno_t mr_snapshot_pages_used; /* Number of retired pages at the time this reader starts transaction. So, * at any time the difference mm_pages_retired - mr_snapshot_pages_retired * will give the number of pages which this reader restraining from reuse. 
*/ - volatile uint64_t mr_snapshot_pages_retired; + MDBX_atomic_uint64_t mr_snapshot_pages_retired; } MDBX_reader; /* The header for the reader table (a memory-mapped lock file). */ @@ -2164,25 +2164,25 @@ typedef struct MDBX_lockinfo { uint32_t mti_os_and_format; /* Flags which environment was opened. */ - volatile uint32_t mti_envmode; + MDBX_atomic_uint32_t mti_envmode; /* Threshold of un-synced-with-disk pages for auto-sync feature, * zero means no-threshold, i.e. auto-sync is disabled. */ - volatile pgno_t mti_autosync_threshold; + atomic_pgno_t mti_autosync_threshold; /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ - volatile uint32_t mti_meta_sync_txnid; + MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. * The time value is represented in a suitable system-dependent form, for * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). * Zero means timed auto-sync is disabled. */ - volatile uint64_t mti_autosync_period; + MDBX_atomic_uint64_t mti_autosync_period; /* Marker to distinguish uniqueness of DB/CLK.*/ - volatile uint64_t mti_bait_uniqueness; + MDBX_atomic_uint64_t mti_bait_uniqueness; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2191,21 +2191,21 @@ typedef struct MDBX_lockinfo { mdbx_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ - volatile txnid_t mti_oldest_reader; + atomic_txnid_t mti_oldest_reader; /* Timestamp of the last steady sync. Value is represented in a suitable * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or * clock_gettime(CLOCK_MONOTONIC). */ - volatile uint64_t mti_sync_timestamp; + MDBX_atomic_uint64_t mti_sync_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. 
*/ - volatile pgno_t mti_unsynced_pages; + atomic_pgno_t mti_unsynced_pages; /* Number of page which was discarded last time by madvise(MADV_FREE). */ - volatile pgno_t mti_discarded_tail; + atomic_pgno_t mti_discarded_tail; /* Timestamp of the last readers check. */ - volatile uint64_t mti_reader_check_timestamp; + MDBX_atomic_uint64_t mti_reader_check_timestamp; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ @@ -2217,8 +2217,8 @@ typedef struct MDBX_lockinfo { /* The number of slots that have been used in the reader table. * This always records the maximum count, it is not decremented * when readers release their slots. */ - volatile unsigned mti_numreaders; - volatile unsigned mti_readers_refresh_flag; + MDBX_atomic_uint32_t mti_numreaders; + MDBX_atomic_uint32_t mti_readers_refresh_flag; #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) @@ -2273,7 +2273,7 @@ typedef struct MDBX_lockinfo { #if MDBX_WORDBITS >= 64 #define MAX_MAPSIZE MAX_MAPSIZE64 #define MDBX_READERS_LIMIT \ - ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) + ((MAX_PAGESIZE - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) #define MDBX_PGL_LIMIT MAX_PAGENO #else #define MDBX_READERS_LIMIT 1024 @@ -2557,7 +2557,7 @@ typedef struct MDBX_cursor_couple { /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) - uint32_t me_signature; + MDBX_atomic_uint32_t me_signature; /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ @@ -2603,11 +2603,11 @@ struct MDBX_env { mdbx_ipclock_t *me_wlock; #endif /* MDBX_LOCKING > 0 */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ - volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + atomic_txnid_t *me_oldest; /* ID of oldest reader last time we looked */ + MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */ /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; /* Number of freelist items that can fit in a single overflow page */ @@ -2615,12 +2615,12 @@ struct MDBX_env { unsigned me_branch_nodemax; /* max size of a branch-node */ uint32_t me_live_reader; /* have liveness lock in reader table */ void *me_userctx; /* User-settable context */ - volatile uint64_t *me_sync_timestamp; - volatile uint64_t *me_autosync_period; - volatile pgno_t *me_unsynced_pages; - volatile pgno_t *me_autosync_threshold; - volatile pgno_t *me_discarded_tail; - volatile uint32_t *me_meta_sync_txnid; + MDBX_atomic_uint64_t *me_sync_timestamp; + MDBX_atomic_uint64_t *me_autosync_period; + atomic_pgno_t *me_unsynced_pages; + atomic_pgno_t *me_autosync_threshold; + atomic_pgno_t *me_discarded_tail; + MDBX_atomic_uint32_t *me_meta_sync_txnid; MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ unsigned me_dp_reserve_len; struct { @@ -2637,13 +2637,13 @@ struct MDBX_env { #if MDBX_LOCKING > 0 mdbx_ipclock_t wlock; #endif /* MDBX_LOCKING > 0 */ - txnid_t oldest; - uint64_t sync_timestamp; - uint64_t autosync_period; - pgno_t autosync_pending; - pgno_t autosync_threshold; - pgno_t discarded_tail; - uint32_t 
meta_sync_txnid; + atomic_txnid_t oldest; + MDBX_atomic_uint64_t sync_timestamp; + MDBX_atomic_uint64_t autosync_period; + atomic_pgno_t autosync_pending; + atomic_pgno_t autosync_threshold; + atomic_pgno_t discarded_tail; + MDBX_atomic_uint32_t meta_sync_txnid; } me_lckless_stub; #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ |