diff options
author | George Hazan <ghazan@miranda.im> | 2020-01-07 22:56:32 +0300 |
---|---|---|
committer | George Hazan <ghazan@miranda.im> | 2020-01-07 22:56:39 +0300 |
commit | 7524acda0a8778572262429f27b359d7ee91fc90 (patch) | |
tree | 6443228a439b83c5bc6d15923a9ccd91fa4718e3 /libs/libmdbx | |
parent | 62c842df372abe0349f025ac4cbbc8426526ac8b (diff) |
libmdbx: update to 0.5.0
Diffstat (limited to 'libs/libmdbx')
45 files changed, 2841 insertions, 1795 deletions
diff --git a/libs/libmdbx/src/CMakeLists.txt b/libs/libmdbx/src/CMakeLists.txt index 46f827ebc2..3fb33d3f31 100644 --- a/libs/libmdbx/src/CMakeLists.txt +++ b/libs/libmdbx/src/CMakeLists.txt @@ -326,6 +326,7 @@ mark_as_advanced(MDBX_LOCKING) add_mdbx_option(MDBX_TRUST_RTC "Does a system have battery-backed Real-Time Clock or just a fake." AUTO) mark_as_advanced(MDBX_TRUST_RTC) option(MDBX_ENABLE_TESTS "Build MDBX tests." ${BUILD_TESTING}) +option(MDBX_FORCE_ASSERTIONS "Force enable assertion checking." OFF) ################################################################################ ################################################################################ diff --git a/libs/libmdbx/src/README.md b/libs/libmdbx/src/README.md index 5c32049f79..e578f893ed 100644 --- a/libs/libmdbx/src/README.md +++ b/libs/libmdbx/src/README.md @@ -1,45 +1,71 @@ -### The [repository now only mirrored on the Github](https://abf.io/erthink/libmdbx) due to illegal discriminatory restrictions for Russian Crimea and for sovereign crimeans. -<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences --> ------ - libmdbx -====================================== +======= _libmdbx_ is an extremely fast, compact, powerful, embedded -transactional [key-value -store](https://en.wikipedia.org/wiki/Key-value_database) -database, with permissive [OpenLDAP Public License](LICENSE). -_libmdbx_ has a specific set of properties and capabilities, -focused on creating unique lightweight solutions with -extraordinary performance. - -The next version is under active non-public development and will be +transactional [key-value store](https://en.wikipedia.org/wiki/Key-value_database) +database, with [permissive license](LICENSE). +_MDBX_ has a specific set of properties and capabilities, +focused on creating unique lightweight solutions with extraordinary performance. + +1. 
Allows **swarm of multi-threaded processes to [ACID](https://en.wikipedia.org/wiki/ACID)ly read and update** several key-value [maps](https://en.wikipedia.org/wiki/Associative_array) and [multimaps](https://en.wikipedia.org/wiki/Multimap) in a locally-shared database. + +2. Provides **extraordinary performance**, minimal overhead through [Memory-Mapping](https://en.wikipedia.org/wiki/Memory-mapped_file) and `Olog(N)` operations costs by virtue of [B+ tree](https://en.wikipedia.org/wiki/B%2B_tree). + +3. Requires **no maintenance and no crash recovery** since it doesn't use [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging), but that might be a caveat for some workloads. + +4. **Compact and friendly for full embedding**. Only 25KLOC of `C11`, 64K x86 binary code, +no internal threads nor processes, but implements a simplified variant of the +[Berkeley DB](https://en.wikipedia.org/wiki/Berkeley_DB) and +[dbm](https://en.wikipedia.org/wiki/DBM_(computing)) API. + +5. Enforces [serializability](https://en.wikipedia.org/wiki/Serializability) for +writers just by single +[mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) and affords +[wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom) +for parallel readers without atomic/interlocked operations, while +**writing and reading transactions do not block each other**. + +6. **Guarantees data integrity** after crash unless this was explicitly +neglected in favour of write performance. + +7. Supports Linux, Windows, MacOS, FreeBSD, DragonFly, Solaris, +OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other systems compliant with +**POSIX.1-2008**. + +Historically, _MDBX_ is deeply revised and extended descendant of amazing +[Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). +_MDBX_ inherits all benefits from _LMDB_, but resolves some issues and adds a set of improvements.
+ +The next version is under active non-public development from scratch and will be released as **_MithrilDB_** and `libmithrildb` for libraries & packages. Admittedly mythical [Mithril](https://en.wikipedia.org/wiki/Mithril) is resembling silver but being stronger and lighter than steel. Therefore _MithrilDB_ is rightly relevant name. -> _MithrilDB_ will be radically different from _libmdbx_ by the new -> database format and API based on C++17, as well as the [Apache 2.0 -> License](https://www.apache.org/licenses/LICENSE-2.0). The goal of this -> revolution is to provide a clearer and robust API, add more features and -> new valuable properties of database. - -*The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.* + > _MithrilDB_ will be radically different from _libmdbx_ by the new + > database format and API based on C++17, as well as the [Apache 2.0 + > License](https://www.apache.org/licenses/LICENSE-2.0). The goal of this + > revolution is to provide a clearer and robust API, add more features and + > new valuable properties of database. [](https://travis-ci.org/leo-yuriev/libmdbx) [](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master) [](https://scan.coverity.com/projects/reopen-libmdbx) +*The Future will (be) [Positive](https://www.ptsecurity.com). 
Всё будет хорошо.* + +----- + ## Table of Contents - [Overview](#overview) + - [Features](#features) + - [Limitations](#limitations) + - [Caveats & Gotchas](#caveats--gotchas) - [Comparison with other databases](#comparison-with-other-databases) + - [Improvements beyond LMDB](#improvements-beyond-lmdb) - [History & Acknowledgments](#history) -- [Description](#description) - - [Key features](#key-features) - - [Improvements over LMDB](#improvements-over-lmdb) - - [Gotchas](#gotchas) - [Usage](#usage) - [Building](#building) + - [API description](#api-description) - [Bindings](#bindings) - [Performance comparison](#performance-comparison) - [Integral performance](#integral-performance) @@ -49,202 +75,179 @@ _MithrilDB_ is rightly relevant name. - [Async-write mode](#async-write-mode) - [Cost comparison](#cost-comparison) ------ +# Overview -## Overview +## Features -_libmdbx_ is revised and extended descendant of amazing [Lightning -Memory-Mapped -Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). -_libmdbx_ inherits all features and characteristics from -[LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database), -but resolves some issues and adds several features. +- Key-value data model, keys are always sorted. - - _libmdbx_ guarantee data integrity after crash unless this was explicitly -neglected in favour of write performance. +- Fully [ACID](https://en.wikipedia.org/wiki/ACID)-compliant, through to +[MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control) +and [CoW](https://en.wikipedia.org/wiki/Copy-on-write). - - _libmdbx_ allows multiple processes to read and update several key-value -tables concurrently, while being -[ACID](https://en.wikipedia.org/wiki/ACID)-compliant, with minimal -overhead and Olog(N) operation cost. +- Multiple key-value sub-databases within a single datafile. 
- - _libmdbx_ enforce -[serializability](https://en.wikipedia.org/wiki/Serializability) for -writers by single -[mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) and affords -[wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom) -for parallel readers without atomic/interlocked operations, while -writing and reading transactions do not block each other. +- Range lookups, including range query estimation. - - _libmdbx_ uses [B+Trees](https://en.wikipedia.org/wiki/B%2B_tree) and -[Memory-Mapping](https://en.wikipedia.org/wiki/Memory-mapped_file), -doesn't use [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) -which might be a caveat for some workloads. +- Efficient support for short fixed length keys, including native 32/64-bit integers. - - _libmdbx_ implements a simplified variant of the [Berkeley -DB](https://en.wikipedia.org/wiki/Berkeley_DB) and/or -[dbm](https://en.wikipedia.org/wiki/DBM_(computing)) API. +- Ultra-efficient support for [multimaps](https://en.wikipedia.org/wiki/Multimap). Multi-values sorted, searchable and iterable. Keys stored without duplication. - - _libmdbx_ supports Linux, Windows, MacOS, FreeBSD, DragonFly, Solaris, -OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other systems compliant with -POSIX.1-2008. +- Data is [memory-mapped](https://en.wikipedia.org/wiki/Memory-mapped_file) and accessible directly/zero-copy. Traversal of database records is extremely-fast. -### Comparison with other databases -For now please refer to [chapter of "BoltDB comparison with other -databases"](https://github.com/coreos/bbolt#comparison-with-other-databases) -which is also (mostly) applicable to _libmdbx_. +- Transactions for readers and writers, ones do not block others. -### History -At first the development was carried out within the -[ReOpenLDAP](https://github.com/leo-yuriev/ReOpenLDAP) project. 
About a -year later _libmdbx_ was separated into standalone project, which was -[presented at Highload++ 2015 -conference](http://www.highload.ru/2015/abstracts/1831.html). +- Writes are strongly serialized. No transactions conflicts nor deadlocks. -Since 2017 _libmdbx_ is used in [Fast Positive Tables](https://github.com/leo-yuriev/libfpta), -and development is funded by [Positive Technologies](https://www.ptsecurity.com). +- Readers are [non-blocking](https://en.wikipedia.org/wiki/Non-blocking_algorithm), notwithstanding [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation). -### Acknowledgments -Howard Chu <hyc@openldap.org> is the author of LMDB, from which -originated the MDBX in 2015. +- Nested write transactions. -Martin Hedenfalk <martin@bzero.se> is the author of `btree.c` code, which -was used for begin development of LMDB. +- Reads scales linearly across CPUs. ------ +- Continuous zero-overhead database compactification. -Description -=========== +- Automatic on-the-fly database size adjustment. -## Key features +- Customizable database page size. -1. Key-value pairs are stored in ordered map(s), keys are always sorted, -range lookups are supported. +- `Olog(N)` cost of lookup, insert, update, and delete operations by virtue of [B+ tree characteristics](https://en.wikipedia.org/wiki/B%2B_tree#Characteristics). -2. Data is [memory-mapped](https://en.wikipedia.org/wiki/Memory-mapped_file) -into each worker DB process, and could be accessed zero-copy from transactions. +- Online hot backup. -3. Transactions are -[ACID](https://en.wikipedia.org/wiki/ACID)-compliant, through to -[MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control) -and [CoW](https://en.wikipedia.org/wiki/Copy-on-write). Writes are -strongly serialized and aren't blocked by reads, transactions can't -conflict with each other. 
Reads are guaranteed to get only commited data -([relaxing serializability](https://en.wikipedia.org/wiki/Serializability#Relaxing_serializability)). +- Append operation for efficient bulk insertion of pre-sorted data. + +- No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) nor any +transaction journal. No crash recovery needed. No maintenance is required. -4. Read transactions are -[non-blocking](https://en.wikipedia.org/wiki/Non-blocking_algorithm), -don't use [atomic operations](https://en.wikipedia.org/wiki/Linearizability#High-level_atomic_operations). -Readers don't block each other and aren't blocked by writers. Read -performance scales linearly with CPU core count. - > Nonetheless, "connect to DB" (starting the first read transaction in a thread) and - > "disconnect from DB" (closing DB or thread termination) requires a lock - > acquisition to register/unregister at the "readers table". +- No internal cache and/or memory management, all done by basic OS services. -5. Keys with multiple values are stored efficiently without key -duplication, sorted by value, including integers (valuable for -secondary indexes). +## Limitations -6. Efficient operation on short fixed length keys, -including 32/64-bit integer types. +- **Page size**: a power of 2, maximum `65536` bytes, default `4096` bytes. +- **Key size**: minimum 0, maximum ≈¼ pagesize (`1300` bytes for default 4K pagesize, `21780` bytes for 64K pagesize). +- **Value size**: minimum 0, maximum `2146435072` (`0x7FF00000`) bytes for maps, ≈¼ pagesize for multimaps (`1348` bytes for default 4K pagesize, `21828` bytes for 64K pagesize). +- **Write transaction size**: up to `4194301` (`0x3FFFFD`) pages (16 [GiB](https://en.wikipedia.org/wiki/Gibibyte) for default 4K pagesize, 256 [GiB](https://en.wikipedia.org/wiki/Gibibyte) for 64K pagesize).
+- **Database size**: up to `2147483648` pages (8 [TiB](https://en.wikipedia.org/wiki/Tebibyte) for default 4K pagesize, 128 [TiB](https://en.wikipedia.org/wiki/Tebibyte) for 64K pagesize). +- **Maximum sub-databases**: `32765`. -7. [WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write -Amplification Factor) и RAF (Read Amplification Factor) are Olog(N). +## Caveats & Gotchas -8. No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) and -transaction journal. In case of a crash no recovery needed. No need for -regular maintenance. Backups can be made on the fly on working DB -without freezing writers. +1. There cannot be more than one writer at a time, i.e. no more than one write transaction at a time. -9. No additional memory management, all done by basic OS services. +2. MDBX is based on [B+ tree](https://en.wikipedia.org/wiki/B%2B_tree), so access to database pages is mostly random. +Thus SSDs provide a significant performance boost over spinning disks for large databases. +3. MDBX uses [shadow paging](https://en.wikipedia.org/wiki/Shadow_paging) instead of [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging). Thus syncing data to disk might be bottleneck for write intensive workload. -## Improvements over LMDB +4. MDBX uses [copy-on-write](https://en.wikipedia.org/wiki/Copy-on-write) for [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation) during updates, but read transactions prevents recycling an old retired/freed pages, since it read ones. Thus altering of data during a parallel +long-lived read operation will increase the process work set, may exhaust entire free database space, +the database can grow quickly, and result in performance degradation. +Try to avoid long running read transactions. -_libmdbx_ is superior to _legendary [LMDB](https://symas.com/lmdb/)_ in +5. 
MDBX is extraordinarily fast and provides minimal overhead for data access, +so you should reconsider about use brute force techniques and double check your code. +On the one hand, in the case of MDBX, a simple linear search may be more profitable than complex indexes. +On the other hand, if you make something suboptimally, you can notice a detrimentally only on sufficiently large data. + +### Comparison with other databases +For now please refer to [chapter of "BoltDB comparison with other +databases"](https://github.com/coreos/bbolt#comparison-with-other-databases) +which is also (mostly) applicable to _libmdbx_. + +Improvements beyond LMDB +======================== + +_libmdbx_ is superior to legendary _[LMDB](https://symas.com/lmdb/)_ in terms of features and reliability, not inferior in performance. In -comparison to LMDB, _libmdbx_ make things "just work" perfectly and +comparison to _LMDB_, _libmdbx_ make things "just work" perfectly and out-of-the-box, not silently and catastrophically break down. The list below is pruned down to the improvements most notable and obvious from the user's point of view. -1. Automatic on-the-fly database size control by preset parameters, both -reduction and increment. +### Added Features: + +1. Keys could be more than 2 times longer than _LMDB_. + > For DB with default page size _libmdbx_ support keys up to 1300 bytes + > and up to 21780 bytes for 64K page size. _LMDB_ allows key size up to + > 511 bytes and may silently loses data with large values. + +2. Up to 20% faster than _LMDB_ in [CRUD](https://en.wikipedia.org/wiki/Create,_read,_update_and_delete) benchmarks. + > Benchmarks of the in-[tmpfs](https://en.wikipedia.org/wiki/Tmpfs) scenarios, + > that tests the speed of engine itself, shown that _libmdbx_ 10-20% faster than _LMDB_. 
+ > These and other results could be easily reproduced with [ioArena](https://github.com/pmwkaa/ioarena) just by `make bench-quartet`, + > including comparisons with [RockDB](https://en.wikipedia.org/wiki/RocksDB) + > and [WiredTiger](https://en.wikipedia.org/wiki/WiredTiger). + +3. Automatic on-the-fly database size adjustment, both increment and reduction. > _libmdbx_ manage the database size according to parameters specified > by `mdbx_env_set_geometry()` function, > ones include the growth step and the truncation threshold. -2. Automatic continuous zero-overhead database compactification. - > _libmdbx_ logically move as possible a freed pages - > at end of allocation area into unallocated space, - > and then release such space if a lot of. +4. Automatic continuous zero-overhead database compactification. + > During each commit _libmdbx_ merges suitable freeing pages into unallocated area + > at the end of file, and then truncate unused space when a lot enough of. + +5. The same database format for 32- and 64-bit builds. + > _libmdbx_ database format depends only on the [endianness](https://en.wikipedia.org/wiki/Endianness) but not on the [bitness](https://en.wiktionary.org/wiki/bitness). -3. LIFO policy for recycling a Garbage Collection items. On systems with a disk -write-back cache, this can significantly increase write performance, up to -several times in a best case scenario. - > LIFO means that for reuse pages will be taken which became unused the lastest. +6. LIFO policy for Garbage Collection recycling. This can significantly increase write performance due write-back disk cache up to several times in a best case scenario. + > LIFO means that for reuse will be taken latest became unused pages. > Therefore the loop of database pages circulation becomes as short as possible. - > In other words, the number of pages, that are overwritten in memory - > and on disk during a series of write transactions, will be as small as possible. 
- > Thus creates ideal conditions for the efficient operation of the disk write-back cache. + > In other words, the set of pages, that are (over)written in memory and on disk during a series of write transactions, will be as small as possible. + > Thus creates ideal conditions for the battery-backed or flash-backed disk cache efficiency. -4. Fast estimation of range query result volume, i.e. how many items can +7. Fast estimation of range query result volume, i.e. how many items can be found between a `KEY1` and a `KEY2`. This is prerequisite for build and/or optimize query execution plans. - > _libmdbx_ performs a rough estimate based only on b-tree pages that - > are common for the both stacks of cursors that were set to corresponing - > keys. + > _libmdbx_ performs a rough estimate based on common B-tree pages of the paths from root to corresponding keys. -5. `mdbx_chk` tool for database integrity check. +8. `mdbx_chk` tool for database integrity check. -6. Guarantee of database integrity even in asynchronous unordered write-to-disk mode. - > _libmdbx_ propose additional trade-off by implementing append-like manner for updates - > in `NOSYNC` and `MAPASYNC` modes, that avoid database corruption after a system crash - > contrary to LMDB. Nevertheless, the `MDBX_UTTERLY_NOSYNC` mode available to match LMDB behaviour, - > and for a special use-cases. +9. Automated steady sync-to-disk upon several thresholds and/or timeout via cheap polling. + +10. Sequence generation and three persistent 64-bit markers. -7. Automated steady flush to disk upon volume of changes and/or by -timeout via cheap polling. +11. Callback for lack-of-space condition of database that allows you to control and/or resolve such situations. -8. Sequence generation and three cheap persistent 64-bit markers with ACID. +12. Support for opening database in the exclusive mode, including on a network share. -9. Support for keys and values of zero length, including multi-values -(aka sorted duplicates). 
+### Added Abilities: -10. The handler of lack-of-space condition with a callback, -that allow you to control and resolve such situations. +1. Zero-length for keys and values. -11. Support for opening a database in the exclusive mode, including on a network share. +2. Ability to determine whether the particular data is on a dirty page +or not, that allows to avoid copy-out before updates. -12. Extended transaction info, including dirty and leftover space info -for a write transaction, reading lag and hold over space for read -transactions. +3. Ability to determine whether the cursor is pointed to a key-value +pair, to the first, to the last, or not set to anything. -13. Extended whole-database info (aka environment) and reader enumeration. +4. Extended information of whole-database, sub-databases, transactions, readers enumeration. + > _libmdbx_ provides a lot of information, including dirty and leftover pages + > for a write transaction, reading lag and holdover space for read transactions. -14. Extended update or delete, _at once_ with getting previous value -and addressing the particular item from multi-value with the same key. +5. Extended update and delete operations. + > _libmdbx_ allows ones _at once_ with getting previous value + > and addressing the particular item from multi-value with the same key. -15. Support for explicitly updating the existing record, not insertion a new one. +### Other fixes and specifics: -16. All cursors are uniformly, can be reused and should be closed explicitly, -regardless ones were opened within write or read transaction. +1. Fixed more than 10 significant errors, in particular: page leaks, wrong sub-database statistics, segfault in several conditions, unoptimal page merge strategy, updating an existing record with a change in data size (including for multimap), etc. -17. Correct update of current record with `MDBX_CURRENT` flag when size -of key or data was changed, including sorted duplicated. +2. 
All cursors can be reused and should be closed explicitly, regardless ones were opened within write or read transaction. -18. Opening database handles is spared from race conditions and +3. Opening database handles are spared from race conditions and pre-opening is not needed. -19. Ability to determine whether the particular data is on a dirty page -or not, that allows to avoid copy-out before updates. - -20. Ability to determine whether the cursor is pointed to a key-value -pair, to the first, to the last, or not set to anything. +4. Returning `MDBX_EMULTIVAL` error in case of ambiguous update or delete. -21. Returning `MDBX_EMULTIVAL` error in case of ambiguous update or delete. +5. Guarantee of database integrity even in asynchronous unordered write-to-disk mode. + > _libmdbx_ propose additional trade-off by implementing append-like manner for updates + > in `MDBX_SAFE_NOSYNC` and `MDBX_WRITEMAP|MDBX_MAPASYNC` modes, that avoid database corruption after a system crash + > contrary to LMDB. Nevertheless, the `MDBX_UTTERLY_NOSYNC` mode available to match LMDB behaviour, + > and for a special use-cases. -22. On **MacOS** the `fcntl(F_FULLFSYNC)` syscall is used _by +6. On **MacOS** the `fcntl(F_FULLFSYNC)` syscall is used _by default_ to synchronize data with the disk, as this is [the only way to guarantee data durability](https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fsync.2.html) @@ -254,59 +257,29 @@ compared to LMDB, where the `fsync()` syscall is used. Therefore, _libmdbx_ allows you to override this behavior by defining the `MDBX_OSX_SPEED_INSTEADOF_DURABILITY=1` option while build the library. -23. On **Windows** the `LockFileEx()` syscall is used for locking, since +7. On **Windows** the `LockFileEx()` syscall is used for locking, since it allows place the database on network drives, and provides protection against incompetent user actions (aka [poka-yoke](https://en.wikipedia.org/wiki/Poka-yoke)). 
Therefore _libmdbx_ may be a little lag in performance tests from LMDB where a named mutexes are used. +### History +At first the development was carried out within the +[ReOpenLDAP](https://github.com/leo-yuriev/ReOpenLDAP) project. About a +year later _libmdbx_ was separated into standalone project, which was +[presented at Highload++ 2015 +conference](http://www.highload.ru/2015/abstracts/1831.html). + +Since 2017 _libmdbx_ is used in [Fast Positive Tables](https://github.com/leo-yuriev/libfpta), +and development is funded by [Positive Technologies](https://www.ptsecurity.com). -## Gotchas - -1. There cannot be more than one writer at a time. - > On the other hand, this allows serialize an updates and eliminate any - > possibility of conflicts, deadlocks or logical errors. - -2. No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) means -relatively big [WAF](https://en.wikipedia.org/wiki/Write_amplification) -(Write Amplification Factor). Because of this syncing data to disk might -be quite resource intensive and be main performance bottleneck during -intensive write workload. - > As compromise _libmdbx_ allows several modes of lazy and/or periodic - > syncing, including `MAPASYNC` mode, which modificate data in memory and - > asynchronously syncs data to disk, moment to sync is picked by OS. - > - > Although this should be used with care, synchronous transactions in a DB - > with transaction journal will require 2 IOPS minimum (probably 3-4 in - > practice) because of filesystem overhead, overhead depends on - > filesystem, not on record count or record size. In _libmdbx_ IOPS count - > will grow logarithmically depending on record count in DB (height of B+ - > tree) and will require at least 2 IOPS per transaction too. - -3. 
[CoW](https://en.wikipedia.org/wiki/Copy-on-write) for -[MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control) -is done on memory page level with -[B+trees](https://ru.wikipedia.org/wiki/B-%D0%B4%D0%B5%D1%80%D0%B5%D0%B2%D0%BE). -Therefore altering data requires to copy about Olog(N) memory pages, -which uses [memory bandwidth](https://en.wikipedia.org/wiki/Memory_bandwidth) and is main -performance bottleneck in `MDBX_MAPASYNC` mode. - > This is unavoidable, but isn't that bad. Syncing data to disk requires - > much more similar operations which will be done by OS, therefore this is - > noticeable only if data sync to persistent storage is fully disabled. - > _libmdbx_ allows to safely save data to persistent storage with minimal - > performance overhead. If there is no need to save data to persistent - > storage then it's much more preferable to use `std::map`. - -4. Massive altering of data during a parallel long read operation will -increase the process work set, may exhaust entire free database space and -result in subsequent write performance degradation. - > _libmdbx_ mostly solve this issue by lack-of-space callback and `MDBX_LIFORECLAIM` mode. - > See [`mdbx.h`](mdbx.h) with API description for details. - > The "next" version of libmdbx (MithrilDB) will completely solve this. - -5. There are no built-in checksums or digests to verify database integrity. - > The "next" version of _libmdbx_ (MithrilDB) will solve this issue employing [Merkle Tree](https://en.wikipedia.org/wiki/Merkle_tree). +### Acknowledgments +Howard Chu <hyc@openldap.org> is the author of LMDB, from which +originated the MDBX in 2015. + +Martin Hedenfalk <martin@bzero.se> is the author of `btree.c` code, which +was used for begin development of LMDB. 
-------------------------------------------------------------------------------- @@ -427,20 +400,23 @@ will need to install the current (not outdated) version of recommend that you install [Homebrew](https://brew.sh/) and then execute `brew install bash`. +## API description +For more information and API description see the [mdbx.h](mdbx.h) header. + ## Bindings | Runtime | GitHub | Author | | -------- | ------ | ------ | | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | | .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | - + | Rust | [mdbx-rs](https://github.com/Kerollmops/mdbx-rs) | [Clément Renault](https://github.com/Kerollmops) | -------------------------------------------------------------------------------- Performance comparison ====================== -All benchmarks were done by [IOArena](https://github.com/pmwkaa/ioarena) +All benchmarks were done in 2015 by [IOArena](https://github.com/pmwkaa/ioarena) and multiple [scripts](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015) runs on Lenovo Carbon-2 laptop, i7-4600U 2.1 GHz, 8 Gb RAM, SSD SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Gb. @@ -557,7 +533,7 @@ and after full run the database contains 10,000 small key-value records. Summary of used resources during lazy-write mode benchmarks: - - Read and write IOPS; + - Read and write IOPs; - Sum of user CPU time and sys CPU time; @@ -566,7 +542,7 @@ Summary of used resources during lazy-write mode benchmarks: compactification, etc). _ForestDB_ is excluded because benchmark showed it's resource -consumption for each resource (CPU, IOPS) much higher than other engines +consumption for each resource (CPU, IOPs) much higher than other engines which prevents to meaningfully compare it with them. All benchmark data is gathered by @@ -577,16 +553,5 @@ syscall and by scanning data directory. 
-------------------------------------------------------------------------------- -``` -$ objdump -f -h -j .text libmdbx.so - -libmdbx.so: file format elf64-x86-64 -architecture: i386:x86-64, flags 0x00000150: -HAS_SYMS, DYNAMIC, D_PAGED -start address 0x0000000000003710 - -Sections: -Idx Name Size VMA LMA File off Algn - 11 .text 00015eff 0000000000003710 0000000000003710 00003710 2**4 - CONTENTS, ALLOC, LOAD, READONLY, CODE -``` +### The [repository now only mirrored on the Github](https://abf.io/erthink/libmdbx) due to illegal discriminatory restrictions for Russian Crimea and for sovereign crimeans. +<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences --> diff --git a/libs/libmdbx/src/appveyor.yml b/libs/libmdbx/src/appveyor.yml index f42224d582..0cf8f8633f 100644 --- a/libs/libmdbx/src/appveyor.yml +++ b/libs/libmdbx/src/appveyor.yml @@ -1,4 +1,4 @@ -version: 0.4.0.{build} +version: 0.5.0.{build} environment: matrix: diff --git a/libs/libmdbx/src/mdbx.h b/libs/libmdbx/src/mdbx.h index dcbe608b29..a522501a65 100644 --- a/libs/libmdbx/src/mdbx.h +++ b/libs/libmdbx/src/mdbx.h @@ -1,4 +1,4 @@ -/**** BRIEFLY ****************************************************************** +/**** BRIEFLY ****************************************************************** * * libmdbx is superior to LMDB (https://bit.ly/26ts7tL) in terms of features * and reliability, not inferior in performance. In comparison to LMDB, libmdbx @@ -55,10 +55,10 @@ * transaction logs or append-only data writes, MDBX requires no maintenance * during operation. Both write-ahead loggers and append-only databases require * periodic checkpointing and/or compaction of their log or database files - * otherwise they grow without bound. MDBX tracks free pages within the database - * and re-uses them for new write operations, so the database size does not grow - * without bound in normal use. 
It is worth noting that the "next" version - * libmdbx (MithrilDB) will solve this problem. + * otherwise they grow without bound. MDBX tracks retired/freed pages within the + * database and re-uses them for new write operations, so the database size does + * not grow without bound in normal use. It is worth noting that the "next" + * version libmdbx (MithrilDB) will solve this problem. * * The memory map can be used as a read-only or read-write map. It is read-only * by default as this provides total immunity to corruption. Using read-write @@ -403,17 +403,16 @@ * the lock was restored - we have to wait until such a process releases the * database, and so on. * - * - Avoid long-lived transactions, especially in the scenarios with a high - * rate of write transactions. Read transactions prevent reuse of pages - * freed by newer write transactions, thus the database can grow quickly. - * Write transactions prevent other write transactions, since writes are - * serialized. + * - Avoid long-lived read transactions, especially in the scenarios with a + * high rate of write transactions. Long-lived read transactions prevents + * recycling pages retired/freed by newer write transactions, thus the + * database can grow quickly. * * Understanding the problem of long-lived read transactions requires some * explanation, but can be difficult for quick perception. So is is * reasonable to simplify this as follows: * 1. Garbage collection problem exists in all databases one way or - * another, e.g. VACUUM in PostgreSQL. But in _libmdbx_ it's even more + * another, e.g. VACUUM in PostgreSQL. But in MDBX it's even more * discernible because of high transaction rate and intentional * internals simplification in favor of performance. 
* @@ -644,7 +643,7 @@ typedef pthread_t mdbx_tid_t; /*----------------------------------------------------------------------------*/ #define MDBX_VERSION_MAJOR 0 -#define MDBX_VERSION_MINOR 4 +#define MDBX_VERSION_MINOR 5 #ifndef LIBMDBX_API #if defined(LIBMDBX_EXPORTS) @@ -826,7 +825,9 @@ typedef struct iovec MDBX_val; #define MDBX_DBG_ASSERT 1 /* Enable assertion checks */ #define MDBX_DBG_AUDIT 2 /* Enable pages usage audit at commit transactions */ #define MDBX_DBG_JITTER 4 /* Enable small random delays in critical points */ -#define MDBX_DBG_DUMP 8 /* Include or not database(s) in coredump files */ +#define MDBX_DBG_DUMP /* Include or not meta-pages in coredump files, MAY \ + affect performance in MDBX_WRITEMAP mode */ \ + 8 #define MDBX_DBG_LEGACY_MULTIOPEN 16 /* Enable multi-opening environment(s) */ /* A debug-logger callback function, @@ -1076,11 +1077,11 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, * conditions for the efficient operation of the disk write-back cache. * * MDBX_LIFORECLAIM is compatible with all no-sync flags (i.e. MDBX_NOMETASYNC, - * MDBX_NOSYNC, MDBX_UTTERLY_NOSYNC, MDBX_MAPASYNC), but gives no noticeable - * impact in combination with MDB_NOSYNC and MDX_MAPASYNC. Because MDBX will - * not reused paged from the last "steady" MVCC-snapshot and later, i.e. the - * loop length of database pages circulation will be mostly defined by frequency - * of calling mdbx_env_sync() rather than LIFO and FIFO difference. + * MDBX_SAFE_NOSYNC, MDBX_UTTERLY_NOSYNC, MDBX_MAPASYNC), but gives no + * noticeable impact in combination with MDBX_SAFE_NOSYNC. Because MDBX will + * reused pages only before the last "steady" MVCC-snapshot, i.e. the loop + * length of database pages circulation will be mostly defined by frequency of + * calling mdbx_env_sync() rather than LIFO and FIFO difference. * * This flag may be changed at any time using mdbx_env_set_flags(). 
*/ #define MDBX_LIFORECLAIM 0x4000000u @@ -1089,12 +1090,12 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, #define MDBX_PAGEPERTURB 0x8000000u /**** SYNC MODES *************************************************************** - * (!!!) Using any combination of MDBX_NOSYNC, MDBX_NOMETASYNC, MDBX_MAPASYNC - * and especially MDBX_UTTERLY_NOSYNC is always a deal to reduce durability - * for gain write performance. You must know exactly what you are doing and - * what risks you are taking! + * (!!!) Using any combination of MDBX_SAFE_NOSYNC, MDBX_NOMETASYNC, + * MDBX_MAPASYNC and especially MDBX_UTTERLY_NOSYNC is always a deal to reduce + * durability for gain write performance. You must know exactly what you are + * doing and what risks you are taking! * - * NOTE for LMDB users: MDBX_NOSYNC is NOT similar to LMDB_NOSYNC, but + * NOTE for LMDB users: MDBX_SAFE_NOSYNC is NOT similar to LMDB_NOSYNC, but * MDBX_UTTERLY_NOSYNC is exactly match LMDB_NOSYNC. * See details below. * @@ -1184,24 +1185,25 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, * for particular write transaction. * * - * MDBX_NOSYNC = don't sync anything but keep previous steady commits. + * MDBX_SAFE_NOSYNC = don't sync anything but keep previous steady commits. * - * Like MDBX_UTTERLY_NOSYNC the MDBX_NOSYNC flag similarly disable flush - * system buffers to disk when committing a transaction. But there is a - * huge difference in how are recycled the MVCC snapshots corresponding - * to previous "steady" transactions (see below). + * Like MDBX_UTTERLY_NOSYNC the MDBX_SAFE_NOSYNC flag similarly disable + * flush system buffers to disk when committing a transaction. But there + * is a huge difference in how are recycled the MVCC snapshots + * corresponding to previous "steady" transactions (see below). * - * Depending on the platform and hardware, with MDBX_NOSYNC you may get - * a multiple increase of write performance, even 10 times or more. 
- * NOTE that (MDBX_NOSYNC | MDBX_WRITEMAP) leaves the system with no hint - * for when to write transactions to disk. Therefore the (MDBX_MAPASYNC | - * MDBX_WRITEMAP) may be preferable, but without MDBX_NOSYNC because - * the (MDBX_MAPASYNC | MDBX_NOSYNC) actually gives MDBX_UTTERLY_NOSYNC. + * Depending on the platform and hardware, with MDBX_SAFE_NOSYNC you may + * get a multiple increase of write performance, even 10 times or more. + * NOTE that (MDBX_SAFE_NOSYNC | MDBX_WRITEMAP) leaves the system with no + * hint for when to write transactions to disk. Therefore the + * (MDBX_MAPASYNC | MDBX_WRITEMAP) may be preferable, but without + * MDBX_SAFE_NOSYNC because the (MDBX_MAPASYNC | MDBX_SAFE_NOSYNC) actually + * gives MDBX_UTTERLY_NOSYNC. * - * In contrast to MDBX_UTTERLY_NOSYNC mode, with MDBX_NOSYNC flag MDBX will - * keeps untouched pages within B-tree of the last transaction "steady" - * which was synced to disk completely. This has big implications for both - * data durability and (unfortunately) performance: + * In contrast to MDBX_UTTERLY_NOSYNC mode, with MDBX_SAFE_NOSYNC flag MDBX + * will keeps untouched pages within B-tree of the last transaction + * "steady" which was synced to disk completely. This has big implications + * for both data durability and (unfortunately) performance: * - a system crash can't corrupt the database, but you will lose the * last transactions; because MDBX will rollback to last steady commit * since it kept explicitly. @@ -1214,22 +1216,22 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, * insufficient space and before increasing the size of the file on * disk. * - * In other words, with MDBX_NOSYNC flag MDBX insures you from the whole - * database corruption, at the cost increasing database size and/or number - * of disk IOPS. So, MDBX_NOSYNC flag could be used with mdbx_env_synv() - * as alternatively for batch committing or nested transaction (in some - * cases). 
As well, auto-sync feature exposed by mdbx_env_set_syncbytes() - * and mdbx_env_set_syncperiod() functions could be very usefull with - * MDBX_NOSYNC flag. - * - * The number and volume of of disk IOPS with MDBX_NOSYNC flag will - * exactly the as without any no-sync flags. However, you should expect - * a larger process's work set (https://bit.ly/2kA2tFX) and significantly - * worse a locality of reference (https://bit.ly/2mbYq2J), due to the - * more intensive allocation of previously unused pages and increase the - * size of the database. - * - * MDBX_NOSYNC flag may be changed at any time using + * In other words, with MDBX_SAFE_NOSYNC flag MDBX insures you from the + * whole database corruption, at the cost increasing database size and/or + * number of disk IOPS. So, MDBX_SAFE_NOSYNC flag could be used with + * mdbx_env_synv() as alternatively for batch committing or nested + * transaction (in some cases). As well, auto-sync feature exposed by + * mdbx_env_set_syncbytes() and mdbx_env_set_syncperiod() functions could + * be very usefull with MDBX_SAFE_NOSYNC flag. + * + * The number and volume of of disk IOPS with MDBX_SAFE_NOSYNC flag will + * exactly the as without any no-sync flags. However, you should expect a + * larger process's work set (https://bit.ly/2kA2tFX) and significantly + * worse a locality of reference (https://bit.ly/2mbYq2J), due to the more + * intensive allocation of previously unused pages and increase the size of + * the database. + * + * MDBX_SAFE_NOSYNC flag may be changed at any time using * mdbx_env_set_flags() or by passing to mdbx_txn_begin() for particular * write transaction. * @@ -1237,12 +1239,13 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, * MDBX_MAPASYNC = use asynchronous msync when MDBX_WRITEMAP is used. 
* * MDBX_MAPASYNC meaningful and give effect only in conjunction - * with MDBX_WRITEMAP or MDBX_NOSYNC: - * - with MDBX_NOSYNC actually gives MDBX_UTTERLY_NOSYNC, which + * with MDBX_WRITEMAP or MDBX_SAFE_NOSYNC: + * - with MDBX_SAFE_NOSYNC actually gives MDBX_UTTERLY_NOSYNC, which * wipe previous steady commits for reuse pages as described above. - * - with MDBX_WRITEMAP but without MDBX_NOSYNC instructs MDBX to use + * - with MDBX_WRITEMAP but without MDBX_SAFE_NOSYNC instructs MDBX to use * asynchronous mmap-flushes to disk as described below. - * - with both MDBX_WRITEMAP and MDBX_NOSYNC you get the both effects. + * - with both MDBX_WRITEMAP and MDBX_SAFE_NOSYNC you get the both + * effects. * * Asynchronous mmap-flushes means that actually all writes will scheduled * and performed by operation system on it own manner, i.e. unordered. @@ -1250,10 +1253,10 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, * data to disk, but no more. * * With MDBX_MAPASYNC flag, but without MDBX_UTTERLY_NOSYNC (i.e. without - * OR'ing with MDBX_NOSYNC) MDBX will keeps untouched pages within B-tree - * of the last transaction "steady" which was synced to disk completely. - * So, this makes exactly the same "long-lived" impact and the same - * consequences as described above for MDBX_NOSYNC flag. + * OR'ing with MDBX_SAFE_NOSYNC) MDBX will keeps untouched pages within + * B-tree of the last transaction "steady" which was synced to disk + * completely. So, this makes exactly the same "long-lived" impact and the + * same consequences as described above for MDBX_SAFE_NOSYNC flag. * * Depending on the platform and hardware, with combination of * MDBX_WRITEMAP and MDBX_MAPASYNC you may get a multiple increase of write @@ -1271,18 +1274,18 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, * * (!) 
don't combine this flag with MDBX_MAPASYNC * since you will got MDBX_UTTERLY_NOSYNC in that way (see below) */ -#define MDBX_NOSYNC 0x10000u +#define MDBX_SAFE_NOSYNC 0x10000u /* Use asynchronous msync when MDBX_WRITEMAP is used, * see description in the "SYNC MODES" section above. * - * (!) don't combine this flag with MDBX_NOSYNC + * (!) don't combine this flag with MDBX_SAFE_NOSYNC * since you will got MDBX_UTTERLY_NOSYNC in that way (see below) */ #define MDBX_MAPASYNC 0x100000u /* Don't sync anything and wipe previous steady commits, * see description in the "SYNC MODES" section above. */ -#define MDBX_UTTERLY_NOSYNC (MDBX_NOSYNC | MDBX_MAPASYNC) +#define MDBX_UTTERLY_NOSYNC (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC) /**** DATABASE FLAGS **********************************************************/ /* Use reverse string keys */ @@ -1519,14 +1522,14 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); * MDBX_NORDAHEAD, MDBX_NOMEMINIT, MDBX_COALESCE, MDBX_LIFORECLAIM. * See "ENVIRONMENT FLAGS" section above. * - * - MDBX_NOMETASYNC, MDBX_NOSYNC, MDBX_UTTERLY_NOSYNC, MDBX_MAPASYNC. + * - MDBX_NOMETASYNC, MDBX_SAFE_NOSYNC, MDBX_UTTERLY_NOSYNC, MDBX_MAPASYNC. * See "SYNC MODES" section above. * * NOTE: MDB_NOLOCK flag don't supported by MDBX, * try use MDBX_EXCLUSIVE as a replacement. * * NOTE: MDBX don't allow to mix processes with different MDBX_WRITEMAP, - * MDBX_NOSYNC, MDBX_NOMETASYNC, MDBX_MAPASYNC flags onthe same + * MDBX_SAFE_NOSYNC, MDBX_NOMETASYNC, MDBX_MAPASYNC flags on the same * environment. In such case MDBX_INCOMPATIBLE will be returned. * * If the database is already exist and parameters specified early by @@ -1552,7 +1555,7 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); * more than once. * - MDBX_INCOMPATIBLE = Environment is already opened by another process, * but with different set of MDBX_WRITEMAP, - * MDBX_NOSYNC, MDBX_NOMETASYNC, MDBX_MAPASYNC + * MDBX_SAFE_NOSYNC, MDBX_NOMETASYNC, MDBX_MAPASYNC * flags. 
* Or if the database is already exist and * parameters specified early by @@ -1727,7 +1730,7 @@ __deprecated LIBMDBX_API int mdbx_env_info(MDBX_env *env, MDBX_envinfo *info, /* Flush the environment data buffers to disk. * * Unless the environment was opened with no-sync flags (MDBX_NOMETASYNC, - * MDBX_NOSYNC, MDBX_UTTERLY_NOSYNC and MDBX_MAPASYNC), then data is always + * MDBX_SAFE_NOSYNC, MDBX_UTTERLY_NOSYNC and MDBX_MAPASYNC), then data is always * written an flushed to disk when mdbx_txn_commit() is called. Otherwise * mdbx_env_sync() may be called to manually write and flush unsynced data to * disk. @@ -1763,14 +1766,15 @@ LIBMDBX_API int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock); LIBMDBX_API int mdbx_env_sync(MDBX_env *env); LIBMDBX_API int mdbx_env_sync_poll(MDBX_env *env); -/* Sets threshold to force flush the data buffers to disk, even of MDBX_NOSYNC, - * MDBX_NOMETASYNC and MDBX_MAPASYNC flags in the environment. The threshold - * value affects all processes which operates with given environment until the - * last process close environment or a new value will be settled. +/* Sets threshold to force flush the data buffers to disk, even of + * MDBX_SAFE_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC flags in the environment. + * The threshold value affects all processes which operates with given + * environment until the last process close environment or a new value will be + * settled. * * Data is always written to disk when mdbx_txn_commit() is called, but the * operating system may keep it buffered. MDBX always flushes the OS buffers - * upon commit as well, unless the environment was opened with MDBX_NOSYNC, + * upon commit as well, unless the environment was opened with MDBX_SAFE_NOSYNC, * MDBX_MAPASYNC or in part MDBX_NOMETASYNC. 
* * The default is 0, than mean no any threshold checked, and no additional @@ -1784,14 +1788,14 @@ LIBMDBX_API int mdbx_env_sync_poll(MDBX_env *env); LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold); /* Sets relative period since the last unsteay commit to force flush the data - * buffers to disk, even of MDBX_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC flags - * in the environment. The relative period value affects all processes which - * operates with given environment until the last process close environment or a - * new value will be settled. + * buffers to disk, even of MDBX_SAFE_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC + * flags in the environment. The relative period value affects all processes + * which operates with given environment until the last process close + * environment or a new value will be settled. * * Data is always written to disk when mdbx_txn_commit() is called, but the * operating system may keep it buffered. MDBX always flushes the OS buffers - * upon commit as well, unless the environment was opened with MDBX_NOSYNC, + * upon commit as well, unless the environment was opened with MDBX_SAFE_NOSYNC, * MDBX_MAPASYNC or in part MDBX_NOMETASYNC. * * Settled period don't checked asynchronously, but only by the @@ -2223,9 +2227,9 @@ LIBMDBX_API void *mdbx_env_get_userctx(MDBX_env *env); * - MDBX_TRYTXN * Do not block when starting a write transaction. * - * - MDBX_NOSYNC, MDBX_NOMETASYNC or MDBX_MAPASYNC + * - MDBX_SAFE_NOSYNC, MDBX_NOMETASYNC or MDBX_MAPASYNC * Do not sync data to disk corresponding to MDBX_NOMETASYNC - * or MDBX_NOSYNC description (see abobe). + * or MDBX_SAFE_NOSYNC description (see abobe). * * [out] txn Address where the new MDBX_txn handle will be stored * @@ -2299,7 +2303,7 @@ typedef struct MDBX_txn_info { * [in] txn A transaction handle returned by mdbx_txn_begin(). * [out] stat The address of an MDBX_txn_info structure * where the information will be copied. 
- * [in[ scan_rlt The boolean flag controls the scan of the read lock table to + * [in] scan_rlt The boolean flag controls the scan of the read lock table to * provide complete information. Such scan is relatively * expensive and you can avoid it if corresponding fields are * not needed (see description of MDBX_txn_info above). @@ -3517,16 +3521,10 @@ LIBMDBX_API int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, #endif /* MDBX_NEXENTA_ATTRS */ /******************************************************************************* - * LY: temporary workaround for Elbrus's memcmp() bug. */ -#ifndef __GLIBC_PREREQ -#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) -#define __GLIBC_PREREQ(maj, min) \ - ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min)) -#else -#define __GLIBC_PREREQ(maj, min) (0) -#endif -#endif /* __GLIBC_PREREQ */ -#if defined(__e2k__) && !__GLIBC_PREREQ(2, 24) + * Workaround for mmaped-lookahead-cross-page-boundary bug + * in an obsolete versions of Elbrus's libc and kernels. */ +#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ + MDBX_E2K_MLHCPB_WORKAROUND LIBMDBX_API int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, size_t n); LIBMDBX_API int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2); @@ -3549,8 +3547,7 @@ LIBMDBX_API size_t mdbx_e2k_strnlen_bug_workaround(const char *s, #define strlen mdbx_e2k_strlen_bug_workaround #undef strnlen #define strnlen mdbx_e2k_strnlen_bug_workaround - -#endif /* Elbrus's memcmp() bug. */ +#endif /* MDBX_E2K_MLHCPB_WORKAROUND */ #ifdef __cplusplus } diff --git a/libs/libmdbx/src/src/alloy.c b/libs/libmdbx/src/src/alloy.c index 98f3aac3a9..c40446732b 100644 --- a/libs/libmdbx/src/src/alloy.c +++ b/libs/libmdbx/src/src/alloy.c @@ -11,9 +11,8 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. 
*/ -/* Amalgamated build */ -#define MDBX_ALLOY 1 -#include "elements/internals.h" /* must be included fisrt */ +#define MDBX_ALLOY 1 /* amalgamated build */ +#include "elements/internals.h" /* must be included first */ #include "elements/core.c" #include "elements/osal.c" diff --git a/libs/libmdbx/src/src/elements/config.h.in b/libs/libmdbx/src/src/elements/config.h.in index 1079899d9a..b8776d98a2 100644 --- a/libs/libmdbx/src/src/elements/config.h.in +++ b/libs/libmdbx/src/src/elements/config.h.in @@ -12,6 +12,7 @@ #cmakedefine ENABLE_GPROF #cmakedefine ENABLE_GCOV #cmakedefine ENABLE_ASAN +#cmakedefine MDBX_FORCE_ASSERTIONS /* Common */ #cmakedefine01 MDBX_TXN_CHECKOWNER diff --git a/libs/libmdbx/src/src/elements/core.c b/libs/libmdbx/src/src/elements/core.c index b45db5178b..b5a27ddccf 100644 --- a/libs/libmdbx/src/src/elements/core.c +++ b/libs/libmdbx/src/src/elements/core.c @@ -40,12 +40,12 @@ /*------------------------------------------------------------------------------ * Internal inlines */ -static __pure_function __inline bool is_powerof2(size_t x) { +static __pure_function __always_inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -static __pure_function __inline size_t roundup_powerof2(size_t value, - size_t granularity) { +static __pure_function __always_inline size_t +roundup_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return (value + granularity - 1) & ~(granularity - 1); } @@ -70,22 +70,26 @@ static __pure_function unsigned log2n(size_t value) { /*------------------------------------------------------------------------------ * Unaligned access */ -static __pure_function __maybe_unused __inline unsigned +static __pure_function __maybe_unused __always_inline unsigned field_alignment(unsigned alignment_baseline, size_t field_offset) { unsigned merge = alignment_baseline | (unsigned)field_offset; return merge & -(int)merge; } /* read-thunk for UB-sanitizer */ -static __pure_function __inline uint8_t 
peek_u8(const uint8_t *ptr) { +static __pure_function __always_inline uint8_t +peek_u8(const uint8_t *const __restrict ptr) { return *ptr; } /* write-thunk for UB-sanitizer */ -static __inline void poke_u8(uint8_t *ptr, const uint8_t v) { *ptr = v; } +static __always_inline void poke_u8(uint8_t *const __restrict ptr, + const uint8_t v) { + *ptr = v; +} -static __pure_function __inline uint16_t -unaligned_peek_u16(const unsigned expected_alignment, const void *ptr) { +static __pure_function __always_inline uint16_t +unaligned_peek_u16(const unsigned expected_alignment, const void *const ptr) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint16_t)) == 0) return *(const uint16_t *)ptr; @@ -96,8 +100,9 @@ unaligned_peek_u16(const unsigned expected_alignment, const void *ptr) { } } -static __inline void unaligned_poke_u16(const unsigned expected_alignment, - void *ptr, const uint16_t v) { +static __always_inline void +unaligned_poke_u16(const unsigned expected_alignment, + void *const __restrict ptr, const uint16_t v) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0) *(uint16_t *)ptr = v; @@ -105,8 +110,8 @@ static __inline void unaligned_poke_u16(const unsigned expected_alignment, memcpy(ptr, &v, sizeof(v)); } -static __pure_function __inline uint32_t -unaligned_peek_u32(const unsigned expected_alignment, const void *ptr) { +static __pure_function __always_inline uint32_t unaligned_peek_u32( + const unsigned expected_alignment, const void *const __restrict ptr) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint32_t)) == 0) return *(const uint32_t *)ptr; @@ -123,8 +128,9 @@ unaligned_peek_u32(const unsigned expected_alignment, const void *ptr) { } } -static __inline void unaligned_poke_u32(const unsigned expected_alignment, - void *ptr, const uint32_t v) { +static __always_inline void 
+unaligned_poke_u32(const unsigned expected_alignment, + void *const __restrict ptr, const uint32_t v) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0) *(uint32_t *)ptr = v; @@ -136,8 +142,8 @@ static __inline void unaligned_poke_u32(const unsigned expected_alignment, memcpy(ptr, &v, sizeof(v)); } -static __pure_function __inline uint64_t -unaligned_peek_u64(const unsigned expected_alignment, const void *ptr) { +static __pure_function __always_inline uint64_t unaligned_peek_u64( + const unsigned expected_alignment, const void *const __restrict ptr) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint64_t)) == 0) return *(const uint64_t *)ptr; @@ -154,8 +160,9 @@ unaligned_peek_u64(const unsigned expected_alignment, const void *ptr) { } } -static __inline void unaligned_poke_u64(const unsigned expected_alignment, - void *ptr, const uint64_t v) { +static __always_inline void +unaligned_poke_u64(const unsigned expected_alignment, + void *const __restrict ptr, const uint64_t v) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0) *(uint64_t *)ptr = v; @@ -188,7 +195,8 @@ static __inline void unaligned_poke_u64(const unsigned expected_alignment, unaligned_poke_u64(1, (char *)(ptr) + offsetof(struct, field), value) /* Get the page number pointed to by a branch node */ -static __pure_function __inline pgno_t node_pgno(const MDBX_node *node) { +static __pure_function __always_inline pgno_t +node_pgno(const MDBX_node *const __restrict node) { pgno_t pgno = UNALIGNED_PEEK_32(node, MDBX_node, mn_pgno32); if (sizeof(pgno) > 4) pgno |= ((uint64_t)UNALIGNED_PEEK_8(node, MDBX_node, mn_extra)) << 32; @@ -196,7 +204,8 @@ static __pure_function __inline pgno_t node_pgno(const MDBX_node *node) { } /* Set the page number in a branch node */ -static __inline void node_set_pgno(MDBX_node *node, pgno_t 
pgno) { +static __always_inline void node_set_pgno(MDBX_node *const __restrict node, + pgno_t pgno) { assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO); UNALIGNED_POKE_32(node, MDBX_node, mn_pgno32, (uint32_t)pgno); @@ -206,32 +215,38 @@ static __inline void node_set_pgno(MDBX_node *node, pgno_t pgno) { } /* Get the size of the data in a leaf node */ -static __pure_function __inline size_t node_ds(const MDBX_node *node) { +static __pure_function __always_inline size_t +node_ds(const MDBX_node *const __restrict node) { return UNALIGNED_PEEK_32(node, MDBX_node, mn_dsize); } /* Set the size of the data for a leaf node */ -static __inline void node_set_ds(MDBX_node *node, size_t size) { +static __always_inline void node_set_ds(MDBX_node *const __restrict node, + size_t size) { assert(size < INT_MAX); UNALIGNED_POKE_32(node, MDBX_node, mn_dsize, (uint32_t)size); } /* The size of a key in a node */ -static __pure_function __inline size_t node_ks(const MDBX_node *node) { +static __pure_function __always_inline size_t +node_ks(const MDBX_node *const __restrict node) { return UNALIGNED_PEEK_16(node, MDBX_node, mn_ksize); } /* Set the size of the key for a leaf node */ -static __inline void node_set_ks(MDBX_node *node, size_t size) { +static __always_inline void node_set_ks(MDBX_node *const __restrict node, + size_t size) { assert(size < INT16_MAX); UNALIGNED_POKE_16(node, MDBX_node, mn_ksize, (uint16_t)size); } -static __pure_function __inline uint8_t node_flags(const MDBX_node *node) { +static __pure_function __always_inline uint8_t +node_flags(const MDBX_node *const __restrict node) { return UNALIGNED_PEEK_8(node, MDBX_node, mn_flags); } -static __inline void node_set_flags(MDBX_node *node, uint8_t flags) { +static __always_inline void node_set_flags(MDBX_node *const __restrict node, + uint8_t flags) { UNALIGNED_POKE_8(node, MDBX_node, mn_flags, flags); } @@ -239,27 +254,30 @@ static __inline void node_set_flags(MDBX_node *node, uint8_t flags) { #define NODESIZE 
offsetof(MDBX_node, mn_data) /* Address of the key for the node */ -static __pure_function __inline void *node_key(const MDBX_node *node) { +static __pure_function __always_inline void * +node_key(const MDBX_node *const __restrict node) { return (char *)node + NODESIZE; } /* Address of the data for a node */ -static __pure_function __inline void *node_data(const MDBX_node *node) { +static __pure_function __always_inline void * +node_data(const MDBX_node *const __restrict node) { return (char *)node_key(node) + node_ks(node); } /* Size of a node in a leaf page with a given key and data. * This is node header plus key plus data size. */ -static __pure_function __inline size_t node_size_len(const size_t key_len, - const size_t value_len) { +static __pure_function __always_inline size_t +node_size_len(const size_t key_len, const size_t value_len) { return NODESIZE + EVEN(key_len + value_len); } -static __pure_function __inline size_t node_size(const MDBX_val *key, - const MDBX_val *value) { +static __pure_function __always_inline size_t node_size(const MDBX_val *key, + const MDBX_val *value) { return node_size_len(key ? key->iov_len : 0, value ? 
value->iov_len : 0); } -static __pure_function __inline pgno_t peek_pgno(const void *ptr) { +static __pure_function __always_inline pgno_t +peek_pgno(const void *const __restrict ptr) { if (sizeof(pgno_t) == sizeof(uint32_t)) return (pgno_t)unaligned_peek_u32(1, ptr); else if (sizeof(pgno_t) == sizeof(uint64_t)) @@ -271,7 +289,8 @@ static __pure_function __inline pgno_t peek_pgno(const void *ptr) { } } -static __inline void poke_pgno(void *ptr, const pgno_t pgno) { +static __always_inline void poke_pgno(void *const __restrict ptr, + const pgno_t pgno) { if (sizeof(pgno) == sizeof(uint32_t)) unaligned_poke_u32(1, ptr, pgno); else if (sizeof(pgno) == sizeof(uint64_t)) @@ -280,8 +299,8 @@ static __inline void poke_pgno(void *ptr, const pgno_t pgno) { memcpy(ptr, &pgno, sizeof(pgno)); } -static __pure_function __inline pgno_t -node_largedata_pgno(const MDBX_node *node) { +static __pure_function __always_inline pgno_t +node_largedata_pgno(const MDBX_node *const __restrict node) { assert(node_flags(node) & F_BIGDATA); return peek_pgno(node_data(node)); } @@ -399,8 +418,9 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, unsigned flags) { * size will only include the key and not the data. Sizes are always * rounded up to an even number of bytes, to guarantee 2-byte alignment * of the MDBX_node headers. */ -static __pure_function __inline size_t -leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { +static __pure_function __always_inline size_t leaf_size(const MDBX_env *env, + const MDBX_val *key, + const MDBX_val *data) { size_t node_bytes = node_size(key, data); /* NOTE: The actual limit is LEAF_NODEMAX(env->me_psize), but it reasonable to * use env->me_branch_nodemax (which is 3 times less) as the treshold because: @@ -442,8 +462,8 @@ leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { * [in] key The key for the node. * * Returns The number of bytes needed to store the node. 
*/ -static __pure_function __inline size_t branch_size(const MDBX_env *env, - const MDBX_val *key) { +static __pure_function __always_inline size_t branch_size(const MDBX_env *env, + const MDBX_val *key) { /* Size of a node in a branch page with a given key. * This is just the node header plus the key, there is no data. */ size_t node_bytes = node_size(key, nullptr); @@ -460,69 +480,72 @@ static __pure_function __inline size_t branch_size(const MDBX_env *env, /*----------------------------------------------------------------------------*/ -static __pure_function __inline size_t pgno2bytes(const MDBX_env *env, - pgno_t pgno) { +static __pure_function __always_inline size_t pgno2bytes(const MDBX_env *env, + pgno_t pgno) { mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize); return ((size_t)pgno) << env->me_psize2log; } -static __pure_function __inline MDBX_page *pgno2page(const MDBX_env *env, - pgno_t pgno) { +static __pure_function __always_inline MDBX_page *pgno2page(const MDBX_env *env, + pgno_t pgno) { return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); } -static __pure_function __inline pgno_t bytes2pgno(const MDBX_env *env, - size_t bytes) { +static __pure_function __always_inline pgno_t bytes2pgno(const MDBX_env *env, + size_t bytes) { mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1); return (pgno_t)(bytes >> env->me_psize2log); } -static __pure_function __inline size_t pgno_align2os_bytes(const MDBX_env *env, - pgno_t pgno) { +static __pure_function size_t pgno_align2os_bytes(const MDBX_env *env, + pgno_t pgno) { return roundup_powerof2(pgno2bytes(env, pgno), env->me_os_psize); } -static __pure_function __inline pgno_t pgno_align2os_pgno(const MDBX_env *env, - pgno_t pgno) { +static __pure_function pgno_t pgno_align2os_pgno(const MDBX_env *env, + pgno_t pgno) { return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); } -static __pure_function __inline size_t bytes_align2os_bytes(const MDBX_env *env, - size_t bytes) { +static 
__pure_function size_t bytes_align2os_bytes(const MDBX_env *env, + size_t bytes) { return roundup_powerof2(roundup_powerof2(bytes, env->me_psize), env->me_os_psize); } /* Address of first usable data byte in a page, after the header */ -static __pure_function __inline void *page_data(const MDBX_page *mp) { +static __pure_function __always_inline void *page_data(const MDBX_page *mp) { return (char *)mp + PAGEHDRSZ; } -static __pure_function __inline const MDBX_page *data_page(const void *data) { +static __pure_function __always_inline const MDBX_page * +data_page(const void *data) { return container_of(data, MDBX_page, mp_ptrs); } -static __pure_function __inline MDBX_meta *page_meta(MDBX_page *mp) { +static __pure_function __always_inline MDBX_meta *page_meta(MDBX_page *mp) { return (MDBX_meta *)page_data(mp); } /* Number of nodes on a page */ -static __pure_function __inline unsigned page_numkeys(const MDBX_page *mp) { +static __pure_function __always_inline unsigned +page_numkeys(const MDBX_page *mp) { return mp->mp_lower >> 1; } /* The amount of space remaining in the page */ -static __pure_function __inline unsigned page_room(const MDBX_page *mp) { +static __pure_function __always_inline unsigned page_room(const MDBX_page *mp) { return mp->mp_upper - mp->mp_lower; } -static __pure_function __inline unsigned page_space(const MDBX_env *env) { +static __pure_function __always_inline unsigned +page_space(const MDBX_env *env) { STATIC_ASSERT(PAGEHDRSZ % 2 == 0); return env->me_psize - PAGEHDRSZ; } -static __pure_function __inline unsigned page_used(const MDBX_env *env, - const MDBX_page *mp) { +static __pure_function __always_inline unsigned page_used(const MDBX_env *env, + const MDBX_page *mp) { return page_space(env) - page_room(mp); } @@ -540,14 +563,14 @@ page_fill_enough(const MDBX_page *mp, unsigned spaceleft_threshold, } /* The number of overflow pages needed to store the given size. 
*/ -static __pure_function __inline pgno_t number_of_ovpages(const MDBX_env *env, - size_t bytes) { +static __pure_function __always_inline pgno_t +number_of_ovpages(const MDBX_env *env, size_t bytes) { return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1; } /* Address of node i in page p */ -static __pure_function __inline MDBX_node *page_node(const MDBX_page *mp, - unsigned i) { +static __pure_function __always_inline MDBX_node *page_node(const MDBX_page *mp, + unsigned i) { assert((mp->mp_flags & (P_LEAF2 | P_OVERFLOW | P_META)) == 0); assert(page_numkeys(mp) > (unsigned)(i)); assert(mp->mp_ptrs[i] % 2 == 0); @@ -557,7 +580,7 @@ static __pure_function __inline MDBX_node *page_node(const MDBX_page *mp, /* The address of a key in a LEAF2 page. * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. * There are no node headers, keys are stored contiguously. */ -static __pure_function __inline void * +static __pure_function __always_inline void * page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) { assert(mp->mp_leaf2_ksize == keysize); (void)keysize; @@ -565,22 +588,23 @@ page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) { } /* Set the node's key into keyptr. */ -static __inline void get_key(const MDBX_node *node, MDBX_val *keyptr) { +static __always_inline void get_key(const MDBX_node *node, MDBX_val *keyptr) { keyptr->iov_len = node_ks(node); keyptr->iov_base = node_key(node); } /* Set the node's key into keyptr, if requested. */ -static __inline void get_key_optional(const MDBX_node *node, - MDBX_val *keyptr /* __may_null */) { +static __always_inline void +get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) { if (keyptr) get_key(node, keyptr); } /*------------------------------------------------------------------------------ - * LY: temporary workaround for Elbrus's memcmp() bug. 
*/ - -#if defined(__e2k__) && !__GLIBC_PREREQ(2, 24) + * Workaround for mmaped-lookahead-cross-page-boundary bug + * in an obsolete versions of Elbrus's libc and kernels. */ +#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ + MDBX_E2K_MLHCPB_WORKAROUND int __hot mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, size_t n) { if (unlikely(n > 42 @@ -688,12 +712,12 @@ size_t __hot mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { } return n; } -#endif /* Elbrus's memcmp() bug. */ +#endif /* MDBX_E2K_MLHCPB_WORKAROUND */ /*------------------------------------------------------------------------------ * safe read/write volatile 64-bit fields on 32-bit architectures. */ -static __inline void atomic_yield(void) { +static __always_inline void atomic_yield(void) { #if defined(_WIN32) || defined(_WIN64) YieldProcessor(); #elif defined(__x86_64__) || defined(__i386__) || defined(__e2k__) @@ -723,8 +747,8 @@ static __inline void atomic_yield(void) { } #if MDBX_64BIT_CAS -static __inline bool atomic_cas64(volatile uint64_t *p, uint64_t c, - uint64_t v) { +static __always_inline bool atomic_cas64(volatile uint64_t *p, uint64_t c, + uint64_t v) { #if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_LLONG_LOCK_FREE) STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); STATIC_ASSERT(atomic_is_lock_free(p)); @@ -742,8 +766,8 @@ static __inline bool atomic_cas64(volatile uint64_t *p, uint64_t c, } #endif /* MDBX_64BIT_CAS */ -static __inline bool atomic_cas32(volatile uint32_t *p, uint32_t c, - uint32_t v) { +static __always_inline bool atomic_cas32(volatile uint32_t *p, uint32_t c, + uint32_t v) { #if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); STATIC_ASSERT(atomic_is_lock_free(p)); @@ -760,7 +784,7 @@ static __inline bool atomic_cas32(volatile uint32_t *p, uint32_t c, #endif } -static __inline uint32_t atomic_add32(volatile uint32_t *p, uint32_t v) { +static __always_inline uint32_t 
atomic_add32(volatile uint32_t *p, uint32_t v) { #if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); STATIC_ASSERT(atomic_is_lock_free(p)); @@ -779,7 +803,7 @@ static __inline uint32_t atomic_add32(volatile uint32_t *p, uint32_t v) { #define atomic_sub32(p, v) atomic_add32(p, 0 - (v)) -static __maybe_unused __inline bool safe64_is_valid(uint64_t v) { +static __maybe_unused __always_inline bool safe64_is_valid(uint64_t v) { #if MDBX_WORDBITS >= 64 return v < SAFE64_INVALID_THRESHOLD; #else @@ -787,7 +811,7 @@ static __maybe_unused __inline bool safe64_is_valid(uint64_t v) { #endif /* MDBX_WORDBITS */ } -static __maybe_unused __inline bool +static __maybe_unused __always_inline bool safe64_is_valid_ptr(const mdbx_safe64_t *ptr) { mdbx_compiler_barrier(); #if MDBX_64BIT_ATOMIC @@ -797,7 +821,7 @@ safe64_is_valid_ptr(const mdbx_safe64_t *ptr) { #endif /* MDBX_64BIT_ATOMIC */ } -static __inline uint64_t safe64_txnid_next(uint64_t txnid) { +static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) { txnid += MDBX_TXNID_STEP; #if !MDBX_64BIT_CAS /* avoid overflow of low-part in safe64_reset() */ @@ -806,7 +830,8 @@ static __inline uint64_t safe64_txnid_next(uint64_t txnid) { return txnid; } -static __inline void safe64_reset(mdbx_safe64_t *ptr, bool single_writer) { +static __always_inline void safe64_reset(mdbx_safe64_t *ptr, + bool single_writer) { mdbx_compiler_barrier(); #if !MDBX_64BIT_CAS if (!single_writer) { @@ -824,14 +849,15 @@ static __inline void safe64_reset(mdbx_safe64_t *ptr, bool single_writer) { ptr->atomic = UINT64_MAX; #else /* atomically make value >= SAFE64_INVALID_THRESHOLD */ - ptr->high = ptr->low = UINT32_MAX; + ptr->high = UINT32_MAX; #endif /* MDBX_64BIT_ATOMIC */ assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); mdbx_flush_incoherent_cpu_writeback(); mdbx_jitter4testing(true); } -static __inline bool safe64_reset_compare(mdbx_safe64_t *ptr, txnid_t compare) { +static 
__always_inline bool safe64_reset_compare(mdbx_safe64_t *ptr, + txnid_t compare) { mdbx_compiler_barrier(); /* LY: This function is used to reset `mr_txnid` from OOM-kick in case * the asynchronously cancellation of read transaction. Therefore, @@ -859,7 +885,7 @@ static __inline bool safe64_reset_compare(mdbx_safe64_t *ptr, txnid_t compare) { return rc; } -static __inline void safe64_write(mdbx_safe64_t *ptr, const uint64_t v) { +static __always_inline void safe64_write(mdbx_safe64_t *ptr, const uint64_t v) { mdbx_compiler_barrier(); assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); #if MDBX_64BIT_ATOMIC @@ -867,7 +893,7 @@ static __inline void safe64_write(mdbx_safe64_t *ptr, const uint64_t v) { #else /* MDBX_64BIT_ATOMIC */ /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ ptr->low = (uint32_t)v; - // assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); + assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); mdbx_flush_incoherent_cpu_writeback(); mdbx_jitter4testing(true); /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */ @@ -900,7 +926,8 @@ static __always_inline uint64_t safe64_read(const mdbx_safe64_t *ptr) { return v; } -static __inline void safe64_update(mdbx_safe64_t *ptr, const uint64_t v) { +static __always_inline void safe64_update(mdbx_safe64_t *ptr, + const uint64_t v) { safe64_reset(ptr, true); safe64_write(ptr, v); } @@ -1604,7 +1631,7 @@ static int lcklist_detach_locked(MDBX_env *env) { /*----------------------------------------------------------------------------*/ -static __inline size_t pnl2bytes(const size_t size) { +static __always_inline size_t pnl2bytes(const size_t size) { assert(size > 0 && size <= MDBX_PNL_MAX * 2); size_t bytes = roundup_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2), @@ -1613,7 +1640,7 @@ static __inline size_t pnl2bytes(const size_t size) { return bytes; } -static __inline pgno_t bytes2pnl(const size_t bytes) { +static __always_inline pgno_t bytes2pnl(const 
size_t bytes) { size_t size = bytes / sizeof(pgno_t); assert(size > 2 && size <= MDBX_PNL_MAX * 2); return (pgno_t)size - 2; @@ -1688,8 +1715,8 @@ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { } /* Make room for num additional elements in an PNL */ -static __inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl, - size_t num) { +static __always_inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl, + size_t num) { assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PNL_MAX && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); assert(num <= MDBX_PNL_MAX); @@ -1699,7 +1726,7 @@ static __inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl, : mdbx_pnl_reserve(ppl, wanna); } -static __inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) { +static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) { assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); if (mdbx_audit_enabled()) { for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i) @@ -1789,7 +1816,7 @@ static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, return MDBX_SUCCESS; } -static bool __hot mdbx_pnl_check(const MDBX_PNL pl, const pgno_t limit) { +static bool mdbx_pnl_check(const MDBX_PNL pl, const pgno_t limit) { assert(limit >= MIN_PAGENO && limit <= MAX_PAGENO + 1); if (likely(MDBX_PNL_SIZE(pl))) { assert(MDBX_PNL_LEAST(pl) >= MIN_PAGENO); @@ -1812,8 +1839,8 @@ static bool __hot mdbx_pnl_check(const MDBX_PNL pl, const pgno_t limit) { return true; } -static __inline bool mdbx_pnl_check4assert(const MDBX_PNL pl, - const pgno_t limit) { +static __always_inline bool mdbx_pnl_check4assert(const MDBX_PNL pl, + const pgno_t limit) { if (unlikely(pl == nullptr)) return true; assert(MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl)); @@ -1871,7 +1898,7 @@ static __hot unsigned mdbx_pnl_exist(MDBX_PNL pnl, pgno_t id) { /*----------------------------------------------------------------------------*/ -static __inline size_t txl2bytes(const size_t size) { +static __always_inline size_t 
txl2bytes(const size_t size) { assert(size > 0 && size <= MDBX_TXL_MAX * 2); size_t bytes = roundup_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2), @@ -1880,7 +1907,7 @@ static __inline size_t txl2bytes(const size_t size) { return bytes; } -static __inline size_t bytes2txl(const size_t bytes) { +static __always_inline size_t bytes2txl(const size_t bytes) { size_t size = bytes / sizeof(txnid_t); assert(size > 2 && size <= MDBX_TXL_MAX * 2); return size - 2; @@ -1933,8 +1960,8 @@ static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { return MDBX_ENOMEM; } -static __inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl, - size_t num) { +static __always_inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl, + size_t num) { assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); assert(num <= MDBX_PNL_MAX); @@ -1944,7 +1971,7 @@ static __inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl, : mdbx_txl_reserve(ptl, wanna); } -static __inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) { +static __always_inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) { assert(MDBX_PNL_SIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); MDBX_PNL_SIZE(tl) += 1; MDBX_PNL_LAST(tl) = id; @@ -1970,7 +1997,7 @@ static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { #define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) SORT_IMPL(dp_sort, MDBX_DP, DP_SORT_CMP) -static __inline MDBX_DPL mdbx_dpl_sort(MDBX_DPL dl) { +static __always_inline MDBX_DPL mdbx_dpl_sort(MDBX_DPL dl) { assert(dl->length <= MDBX_DPL_TXNFULL); assert(dl->sorted <= dl->length); if (dl->sorted != dl->length) { @@ -2043,7 +2070,7 @@ static unsigned __hot mdbx_dpl_search(MDBX_DPL dl, pgno_t pgno) { return (unsigned)(it - dl); } -static __inline MDBX_page *mdbx_dpl_find(MDBX_DPL dl, pgno_t pgno) { +static __always_inline MDBX_page *mdbx_dpl_find(MDBX_DPL dl, pgno_t pgno) { const unsigned i = mdbx_dpl_search(dl, pgno); 
assert((int)i > 0); return (i <= dl->length && dl[i].pgno == pgno) ? dl[i].ptr : nullptr; @@ -2065,9 +2092,8 @@ static __hot MDBX_page *mdbx_dpl_remove(MDBX_DPL dl, pgno_t prno) { return mp; } -static __inline int __must_check_result mdbx_dpl_append(MDBX_DPL dl, - pgno_t pgno, - MDBX_page *page) { +static __always_inline int __must_check_result +mdbx_dpl_append(MDBX_DPL dl, pgno_t pgno, MDBX_page *page) { assert(dl->length <= MDBX_DPL_TXNFULL); if (mdbx_audit_enabled()) { for (unsigned i = dl->length; i > 0; --i) { @@ -2090,7 +2116,7 @@ static __inline int __must_check_result mdbx_dpl_append(MDBX_DPL dl, return MDBX_SUCCESS; } -static __inline void mdbx_dpl_clear(MDBX_DPL dl) { +static __always_inline void mdbx_dpl_clear(MDBX_DPL dl) { dl->sorted = dl->length = 0; } @@ -2196,7 +2222,7 @@ static int __must_check_result mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp); static int __must_check_result mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, bool dont_filter_gc); -static __maybe_unused __inline int __must_check_result +static __maybe_unused __always_inline int __must_check_result mdbx_audit(MDBX_txn *txn) { return mdbx_audit_ex(txn, 0, (txn->mt_flags & MDBX_RDONLY) != 0); } @@ -2664,7 +2690,7 @@ static void mdbx_dlist_free(MDBX_txn *txn) { mdbx_dpl_clear(dl); } -static __inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) { +static __always_inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) { mdbx_cassert(mc, (mc->mc_flags & C_SUB) != 0); MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); @@ -2823,7 +2849,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) { most -= 1; } const unsigned refunded = txn->mt_next_pgno - most; - mdbx_verbose("refund-sorted %u pages %" PRIaPGNO " -> %" PRIaPGNO, + mdbx_verbose("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, most, txn->mt_next_pgno); txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; @@ -2863,7 +2889,7 @@ 
static void mdbx_refund_loose(MDBX_txn *txn) { while (dl->length && dl[dl->length].pgno == txn->mt_next_pgno - 1 && dl[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) { MDBX_page *dp = dl[dl->length].ptr; - mdbx_verbose("refund-unsorted page %" PRIaPGNO, dp->mp_pgno); + mdbx_verbose("refund-sorted page %" PRIaPGNO, dp->mp_pgno); mdbx_tassert(txn, dp->mp_pgno == dl[dl->length].pgno); dl->length -= 1; } @@ -2932,10 +2958,10 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno, mdbx_assert(env, pgno >= NUM_METAS && npages); if (IS_DIRTY(mp) || (env->me_flags & MDBX_WRITEMAP)) { const size_t bytes = pgno2bytes(env, npages); - memset(mp, 0, bytes); + memset(mp, -1, bytes); mp->mp_pgno = pgno; if ((env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_pwrite(env->me_fd, mp, bytes, pgno2bytes(env, pgno)); + mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); } else { struct iovec iov[MDBX_COMMIT_PAGES]; iov[0].iov_len = env->me_psize; @@ -2945,13 +2971,13 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno, while (--npages) { iov[n] = iov[0]; if (++n == MDBX_COMMIT_PAGES) { - mdbx_pwritev(env->me_fd, iov, MDBX_COMMIT_PAGES, iov_off, + mdbx_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, pgno2bytes(env, MDBX_COMMIT_PAGES)); iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); n = 0; } } - mdbx_pwritev(env->me_fd, iov, n, iov_off, pgno2bytes(env, n)); + mdbx_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); } } @@ -2966,10 +2992,11 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno, * If the page wasn't dirtied in this txn, just add it * to this txn's free list. */ -static __hot int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp) { +static int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp) { const unsigned npages = IS_OVERFLOW(mp) ? 
mp->mp_pages : 1; const pgno_t pgno = mp->mp_pgno; + mp->mp_txnid = INVALID_TXNID; if (txn->mt_parent) { mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0); mdbx_tassert(txn, mp != pgno2page(txn->mt_env, pgno)); @@ -3046,7 +3073,7 @@ static __hot int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp) { return MDBX_SUCCESS; } -static __hot int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { +static int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; const pgno_t pgno = mp->mp_pgno; MDBX_txn *const txn = mc->mc_txn; @@ -3095,8 +3122,8 @@ static __hot int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { return rc; } -static __must_check_result __inline int mdbx_retire_pgno(MDBX_cursor *mc, - const pgno_t pgno) { +static __must_check_result __always_inline int +mdbx_retire_pgno(MDBX_cursor *mc, const pgno_t pgno) { MDBX_page *mp; int rc = mdbx_page_get(mc, pgno, &mp, NULL); if (likely(rc == MDBX_SUCCESS)) @@ -3296,7 +3323,7 @@ bailout: /*----------------------------------------------------------------------------*/ -static __inline bool meta_bootid_match(const MDBX_meta *meta) { +static __always_inline bool meta_bootid_match(const MDBX_meta *meta) { return meta->mm_bootid.x == bootid.x && meta->mm_bootid.y == bootid.y && (bootid.x | bootid.y) != 0; } @@ -3458,15 +3485,15 @@ mdbx_meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) { return head; } -static __hot MDBX_meta *mdbx_meta_steady(const MDBX_env *env) { +static MDBX_meta *mdbx_meta_steady(const MDBX_env *env) { return mdbx_meta_mostrecent(prefer_steady, env); } -static __hot MDBX_meta *mdbx_meta_head(const MDBX_env *env) { +static MDBX_meta *mdbx_meta_head(const MDBX_env *env) { return mdbx_meta_mostrecent(prefer_last, env); } -static __hot txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { +static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { while (true) { const MDBX_meta *head = 
mdbx_meta_head(env); const txnid_t recent = mdbx_meta_txnid_fluid(env, head); @@ -3477,7 +3504,7 @@ static __hot txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { } } -static __hot txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) { +static txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) { while (true) { const MDBX_meta *head = mdbx_meta_steady(env); const txnid_t recent = mdbx_meta_txnid_fluid(env, head); @@ -3571,6 +3598,8 @@ static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { /* Add a page to the txn's dirty list */ static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { + mp->mp_txnid = INVALID_TXNID; + mp->mp_flags |= P_DIRTY; const int rc = mdbx_dpl_append(txn->tw.dirtylist, mp->mp_pgno, mp); if (unlikely(rc != MDBX_SUCCESS)) { txn->mt_flags |= MDBX_TXN_ERROR; @@ -3584,7 +3613,7 @@ static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { } #if !(defined(_WIN32) || defined(_WIN64)) -static __inline __maybe_unused int ignore_enosys(int err) { +static __always_inline __maybe_unused int ignore_enosys(int err) { #ifdef ENOSYS if (err == ENOSYS) return MDBX_RESULT_TRUE; @@ -3619,7 +3648,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset, bytes2pgno(env, offset), bytes2pgno(env, offset + length)); #if defined(F_RDAHEAD) - if (unlikely(fcntl(env->me_fd, F_RDAHEAD, enable) == -1)) + if (unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) return errno; #endif /* F_RDAHEAD */ @@ -3629,7 +3658,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset, hint.ra_offset = offset; hint.ra_count = length; (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( - env->me_fd, F_RDADVISE, &hint); + env->me_lazy_fd, F_RDADVISE, &hint); #endif /* F_RDADVISE */ #if defined(MADV_WILLNEED) int err = madvise(env->me_map + offset, length, MADV_WILLNEED) @@ -3651,7 +3680,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t 
offset, } #elif defined(POSIX_FADV_WILLNEED) int err = ignore_enosys( - posix_fadvise(env->me_fd, offset, length, POSIX_FADV_WILLNEED)); + posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED)); if (unlikely(MDBX_IS_ERROR(err))) return err; #endif /* MADV_WILLNEED */ @@ -3669,7 +3698,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset, return err; #elif defined(POSIX_FADV_RANDOM) int err = ignore_enosys( - posix_fadvise(env->me_fd, offset, length, POSIX_FADV_RANDOM)); + posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_RANDOM)); if (unlikely(MDBX_IS_ERROR(err))) return err; #endif /* MADV_RANDOM */ @@ -3767,8 +3796,9 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, POSIX_MADV_DONTNEED)); #elif defined(POSIX_FADV_DONTNEED) if (rc == MDBX_RESULT_TRUE) - rc = ignore_enosys(posix_fadvise( - env->me_fd, size_bytes, prev_size - size_bytes, POSIX_FADV_DONTNEED)); + rc = ignore_enosys(posix_fadvise(env->me_lazy_fd, size_bytes, + prev_size - size_bytes, + POSIX_FADV_DONTNEED)); #endif /* MADV_DONTNEED */ if (unlikely(MDBX_IS_ERROR(rc))) goto bailout; @@ -3866,7 +3896,7 @@ static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, if (env->me_flags & MDBX_WRITEMAP) meta->mm_datasync_sign = wipe; else - return mdbx_pwrite(env->me_fd, &wipe, sizeof(meta->mm_datasync_sign), + return mdbx_pwrite(env->me_lazy_fd, &wipe, sizeof(meta->mm_datasync_sign), (uint8_t *)&meta->mm_datasync_sign - env->me_map); } return MDBX_SUCCESS; @@ -3885,20 +3915,27 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { if (env->me_flags & MDBX_WRITEMAP) { mdbx_flush_incoherent_cpu_writeback(); - return mdbx_msync(&env->me_dxb_mmap, 0, pgno2bytes(env, NUM_METAS), false); - } - -#if defined(__linux__) || defined(__gnu_linux__) - if (sync_file_range(env->me_fd, 0, pgno2bytes(env, NUM_METAS), - SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) - err = errno; + err = 
mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } else { +#if (defined(__linux__) || defined(__gnu_linux__)) && !defined(MDBX_SAFE4QEMU) + if (sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS), + SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) + err = errno; #else - err = mdbx_filesync(env->me_fd, MDBX_SYNC_DATA); + err = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA); #endif - if (unlikely(err != MDBX_SUCCESS)) - return err; - mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), - env->me_os_psize); + if (unlikely(err != MDBX_SUCCESS)) + return err; + mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); + } + + if (likely(env->me_lck)) + /* force oldest refresh */ + env->me_lck->mti_readers_refresh_flag = true; return MDBX_SUCCESS; } @@ -3925,8 +3962,8 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { #define MDBX_ALLOC_NEW 4 #define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW) -static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, - MDBX_page **const mp, int flags) { +__hot static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, + MDBX_page **const mp, int flags) { int rc; MDBX_txn *txn = mc->mc_txn; MDBX_env *env = txn->mt_env; @@ -3963,6 +4000,8 @@ static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, mdbx_ensure(env, np->mp_pgno >= NUM_METAS); VALGRIND_MAKE_MEM_UNDEFINED(page_data(np), page_space(txn->mt_env)); ASAN_UNPOISON_MEMORY_REGION(page_data(np), page_space(txn->mt_env)); + np->mp_flags = P_DIRTY; + np->mp_txnid = INVALID_TXNID; *mp = np; return MDBX_SUCCESS; } @@ -4034,7 +4073,7 @@ skip_cache: goto fail; if (flags & MDBX_LIFORECLAIM) { /* Begin from oldest reader if any */ - if (oldest > 2) { + if (oldest > MIN_TXNID) { last = oldest - 1; op = MDBX_SET_RANGE; } @@ -4241,15 +4280,17 @@ skip_cache: rc = MDBX_RESULT_TRUE; 
const pgno_t autosync_threshold = *env->me_autosync_threshold; const uint64_t autosync_period = *env->me_autosync_period; - /* wipe the last steady-point if: + /* wipe the last steady-point if one of: * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified - * otherwise, make a new steady-point if: + * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted + * otherwise, make a new steady-point if one of: * - auto-sync threshold is specified and reached; - * - OR upper limit of database size is reached; - * - OR database is full (with the current file size) + * - upper limit of database size is reached; + * - database is full (with the current file size) * AND auto-sync threshold it NOT specified */ if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && - (autosync_threshold | autosync_period) == 0) { + ((autosync_threshold | autosync_period) == 0 || + next >= steady->mm_geo.now)) { /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode * without any auto-sync treshold(s). */ rc = mdbx_wipe_steady(env, oldest); @@ -4312,9 +4353,10 @@ skip_cache: goto done; } - mdbx_warning("unable growth datafile to %" PRIaPGNO - " pages (+%" PRIaPGNO "), errcode %d", - aligned, aligned - txn->mt_end_pgno, rc); + mdbx_error("unable growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO + "), errcode %d", + aligned, aligned - txn->mt_end_pgno, rc); + rc = (rc == MDBX_RESULT_TRUE) ? MDBX_MAP_FULL : rc; } else { mdbx_debug("gc-alloc: next %u > upper %u", next, txn->mt_geo.upper); } @@ -4416,9 +4458,8 @@ __hot static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, * [in] mp the page being referenced. It must not be dirty. * [out] ret the writable page, if any. * ret is unchanged if mp wasn't spilled. 
*/ -__hot static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, - MDBX_page *mp, - MDBX_page **ret) { +static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, + MDBX_page **ret) { MDBX_env *env = txn->mt_env; pgno_t pgno = mp->mp_pgno, pn = pgno << 1; @@ -4453,10 +4494,8 @@ __hot static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, * page remains spilled until child commits */ int rc = mdbx_page_dirty(txn, np); - if (likely(rc == MDBX_SUCCESS)) { - np->mp_flags |= P_DIRTY; + if (likely(rc == MDBX_SUCCESS)) *ret = np; - } return rc; } return MDBX_SUCCESS; @@ -4540,6 +4579,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { mdbx_page_copy(np, mp, txn->mt_env->me_psize); np->mp_pgno = pgno; + np->mp_txnid = INVALID_TXNID; np->mp_flags |= P_DIRTY; done: @@ -4620,7 +4660,7 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { if (outside_txn) { if (unsynced_pages > /* FIXME: define threshold */ 16 && - (flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0) { + (flags & (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) == 0) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); @@ -4629,7 +4669,7 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { /* LY: pre-sync without holding lock to reduce latency for writer(s) */ int err = (flags & MDBX_WRITEMAP) ? 
mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false) - : mdbx_filesync(env->me_fd, MDBX_SYNC_DATA); + : mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -4647,7 +4687,7 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { } if (!META_IS_STEADY(head) || - ((flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0 && unsynced_pages)) { + ((flags & (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) == 0 && unsynced_pages)) { mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, data_page(head)->mp_pgno, mdbx_durable_str(head), unsynced_pages); @@ -4668,10 +4708,11 @@ fastpath: if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_NOMETASYNC) != 0) { const txnid_t head_txnid = mdbx_recent_committed_txnid(env); if (*env->me_meta_sync_txnid != (uint32_t)head_txnid) { - rc = (flags & MDBX_WRITEMAP) - ? mdbx_msync(&env->me_dxb_mmap, 0, pgno2bytes(env, NUM_METAS), - false) - : mdbx_filesync(env->me_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = + (flags & MDBX_WRITEMAP) + ? mdbx_msync(&env->me_dxb_mmap, 0, + pgno_align2os_bytes(env, NUM_METAS), false) + : mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (likely(rc == MDBX_SUCCESS)) *env->me_meta_sync_txnid = (uint32_t)head_txnid; } @@ -4843,7 +4884,7 @@ static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { mdbx_txn_unlock(env); } } -#endif /* MDBX_USE_VALGRIND */ +#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). 
*/ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { @@ -4873,7 +4914,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { mdbx_assert(env, (flags & ~(MDBX_TXN_BEGIN_FLAGS | MDBX_TXN_SPILLS | MDBX_WRITEMAP)) == 0); if (flags & MDBX_RDONLY) { - txn->mt_flags = MDBX_RDONLY | (env->me_flags & MDBX_NOTLS); + txn->mt_flags = + MDBX_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP)); MDBX_reader *r = txn->to.reader; STATIC_ASSERT(sizeof(size_t) == sizeof(r->mr_tid)); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { @@ -5004,6 +5046,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ mdbx_ensure(env, txn->mt_txnid >= /* paranoia is appropriate here */ *env->me_oldest); + txn->mt_numdbs = env->me_numdbs; } else { /* Not yet touching txn == env->me_txn0, it may be active */ mdbx_jitter4testing(false); @@ -5046,7 +5089,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { if (txn->tw.lifo_reclaimed) MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0; env->me_txn = txn; - memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned)); + txn->mt_numdbs = env->me_numdbs; + memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned)); /* Copy the DB info and flags */ memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); /* Moved to here to avoid a data race in read TXNs */ @@ -5055,7 +5099,6 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } /* Setup db info */ - txn->mt_numdbs = env->me_numdbs; mdbx_compiler_barrier(); for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { unsigned x = env->me_dbflags[i]; @@ -5092,9 +5135,13 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_flags |= MDBX_SHRINK_ALLOWED; mdbx_srwlock_AcquireShared(&env->me_remap_guard); } -#endif +#endif /* Windows */ } else { env->me_dxb_mmap.current = size; +#if defined(_WIN32) || defined(_WIN64) + env->me_dxb_mmap.filesize = + 
(env->me_dxb_mmap.filesize < size) ? size : env->me_dxb_mmap.filesize; +#endif /* Windows */ } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) mdbx_txn_valgrind(env, txn); @@ -5318,9 +5365,10 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, if (txn != env->me_txn0) mdbx_free(txn); } else { - mdbx_assert(env, (txn->mt_flags & - ~(MDBX_RDONLY | MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | - MDBX_NOMETASYNC | MDBX_NOSYNC | MDBX_MAPASYNC)) == 0); + mdbx_assert(env, + (txn->mt_flags & ~(MDBX_NOTLS | MDBX_RDONLY | MDBX_WRITEMAP | + MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | + MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) == 0); txn->mt_signature = MDBX_MT_SIGNATURE; *ret = txn; mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO @@ -5811,43 +5859,44 @@ static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, return MDBX_PROBLEM; } -static __inline unsigned backlog_size(MDBX_txn *txn) { +static __always_inline unsigned backlog_size(MDBX_txn *txn) { return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; } -static __inline unsigned gctree_backlog(MDBX_txn *txn) { - return /* for split upto root page */ txn->mt_dbs[FREE_DBI].md_depth + - /* for rebalance */ 2 + /* for grow */ 1; -} - /* LY: Prepare a backlog of pages to modify GC itself, * while reclaiming is prohibited. It should be enough to prevent search * in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor, const size_t pnl_bytes) { - const unsigned linear = number_of_ovpages( - txn->mt_env, - pnl_bytes ? pnl_bytes : MDBX_PNL_SIZEOF(txn->tw.retired_pages)); - const unsigned backlog = linear + gctree_backlog(txn); - - if (likely( - linear == 1 && - backlog_size(txn) > - (pnl_bytes - ? 
backlog - : backlog + /* for COW */ txn->mt_dbs[FREE_DBI].md_depth))) + const unsigned linear4list = number_of_ovpages(txn->mt_env, pnl_bytes); + const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; + const unsigned backlog4rebalance = backlog4cow + 1; + + if (likely(linear4list == 1 && + backlog_size(txn) > (pnl_bytes + ? backlog4rebalance + : (backlog4cow + backlog4rebalance)))) return MDBX_SUCCESS; - gc_cursor->mc_flags &= ~C_RECLAIMING; + mdbx_trace(">> pnl_bytes %zu, backlog %u, 4list %u, 4cow %u, 4rebalance %u", + pnl_bytes, backlog_size(txn), linear4list, backlog4cow, + backlog4rebalance); + gc_cursor->mc_flags &= ~C_RECLAIMING; int err = mdbx_cursor_touch(gc_cursor); - if (err == MDBX_SUCCESS && linear > 1) - err = mdbx_page_alloc(gc_cursor, linear, nullptr, MDBX_ALLOC_ALL); + mdbx_trace("== after-touch, backlog %u, err %d", backlog_size(txn), err); + + if (linear4list > 1 && err == MDBX_SUCCESS) { + err = mdbx_page_alloc(gc_cursor, linear4list, nullptr, + MDBX_ALLOC_GC | MDBX_ALLOC_CACHE); + mdbx_trace("== after-4linear, backlog %u, err %d", backlog_size(txn), err); + } - while (err == MDBX_SUCCESS && backlog_size(txn) < backlog) + while (backlog_size(txn) < backlog4cow + linear4list && err == MDBX_SUCCESS) err = mdbx_page_alloc(gc_cursor, 1, NULL, MDBX_ALLOC_GC); gc_cursor->mc_flags |= C_RECLAIMING; + mdbx_trace("<< backlog %u, err %d", backlog_size(txn), err); return (err != MDBX_NOTFOUND) ? 
err : MDBX_SUCCESS; } @@ -5897,6 +5946,10 @@ retry: goto bailout; } + rc = mdbx_prep_backlog(txn, &mc, MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + unsigned settled = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, filled_gc_slot = ~0u; txnid_t cleaned_gc_id = 0, gc_rid = txn->tw.last_reclaimed; @@ -6104,8 +6157,11 @@ retry: mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); mdbx_debug_extra_print("%s", "\n"); } - if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) + if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { + mdbx_trace("%s.reclaimed-list changed %u -> %u, retry", dbg_prefix_mode, + amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); goto retry /* rare case, but avoids GC fragmentation and one loop. */; + } continue; } @@ -6521,7 +6577,7 @@ static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov, unsigned iov_items, size_t iov_off, size_t iov_bytes) { MDBX_env *const env = txn->mt_env; - int rc = mdbx_pwritev(env->me_fd, iov, iov_items, iov_off, iov_bytes); + int rc = mdbx_pwritev(env->me_lazy_fd, iov, iov_items, iov_off, iov_bytes); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_error("Write error: %s", mdbx_strerror(rc)); txn->mt_flags |= MDBX_TXN_ERROR; @@ -6538,7 +6594,7 @@ static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov, * [in] txn the transaction that's being committed * [in] keep number of initial pages in dirtylist to keep dirty. * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { +__hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { struct iovec iov[MDBX_COMMIT_PAGES]; const MDBX_DPL dl = (keep || txn->tw.loose_count > 1) ? 
mdbx_dpl_sort(txn->tw.dirtylist) @@ -6557,11 +6613,15 @@ static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { mdbx_tassert(txn, dp->mp_flags & P_DIRTY); /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE | P_KEEP)) { + if (dp->mp_flags & P_KEEP) { dp->mp_flags &= ~P_KEEP; dl[++w] = dl[r]; continue; } + if (dp->mp_flags & P_LOOSE) { + dl[++w] = dl[r]; + continue; + } const unsigned npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1; flush_begin = (flush_begin < dp->mp_pgno) ? flush_begin : dp->mp_pgno; @@ -6569,7 +6629,7 @@ static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { (flush_end > dp->mp_pgno + npages) ? flush_end : dp->mp_pgno + npages; *env->me_unsynced_pages += npages; dp->mp_flags &= ~P_DIRTY; - dp->mp_validator = 0 /* TODO */; + dp->mp_txnid = txn->mt_txnid; if ((env->me_flags & MDBX_WRITEMAP) == 0) { const size_t size = pgno2bytes(env, npages); @@ -6658,8 +6718,8 @@ static __cold bool mdbx_txn_import_dbi(MDBX_txn *txn, MDBX_dbi dbi) { } /* Check txn and dbi arguments to a function */ -static __inline bool TXN_DBI_EXIST(MDBX_txn *txn, MDBX_dbi dbi, - unsigned validity) { +static __always_inline bool mdbx_txn_dbi_exists(MDBX_txn *txn, MDBX_dbi dbi, + unsigned validity) { if (likely(dbi < txn->mt_numdbs && (txn->mt_dbflags[dbi] & validity))) return true; @@ -6957,6 +7017,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { for (int i = txn->mt_numdbs; --i >= 0;) mdbx_tassert(txn, (txn->mt_dbflags[i] & DB_DIRTY) == 0); + rc = MDBX_SUCCESS; goto done; } @@ -7029,8 +7090,6 @@ int mdbx_txn_commit(MDBX_txn *txn) { goto fail; } - if (likely(env->me_lck)) - env->me_lck->mti_readers_refresh_flag = false; end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: @@ -7128,7 +7187,7 @@ static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize; if (used_bytes > *filesize) { /* Here 
could be a race with DB-shrinking performed by other process */ - int err = mdbx_filesize(env->me_fd, filesize); + int err = mdbx_filesize(env->me_lazy_fd, filesize); if (unlikely(err != MDBX_SUCCESS)) return err; if (used_bytes > *filesize) { @@ -7230,7 +7289,7 @@ static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest, uint64_t *filesize, const int lck_exclusive) { - int rc = mdbx_filesize(env->me_fd, filesize); + int rc = mdbx_filesize(env->me_lazy_fd, filesize); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -7256,7 +7315,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest, while (1) { mdbx_trace("reading meta[%d]: offset %u, bytes %u, retry-left %u", meta_number, offset, MIN_PAGESIZE, retryleft); - int err = mdbx_pread(env->me_fd, buffer, MIN_PAGESIZE, offset); + int err = mdbx_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); if (err != MDBX_SUCCESS) { if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && *filesize == 0 && (env->me_flags & MDBX_RDONLY) == 0) @@ -7269,7 +7328,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest, } char again[MIN_PAGESIZE]; - err = mdbx_pread(env->me_fd, again, MIN_PAGESIZE, offset); + err = mdbx_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); if (err != MDBX_SUCCESS) { mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, mdbx_strerror(err)); @@ -7385,7 +7444,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); - if (flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) { + if (flags & (MDBX_SAFE_NOSYNC | MDBX_MAPASYNC)) { /* Check auto-sync conditions */ const pgno_t autosync_threshold = *env->me_autosync_threshold; const uint64_t autosync_period = *env->me_autosync_period; @@ -7411,23 +7470,31 @@ static int mdbx_sync_locked(MDBX_env *env, 
unsigned flags, ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, largest_pgno), pgno2bytes(env, edge - largest_pgno)); } -#endif /* MDBX_USE_VALGRIND */ +#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ #if defined(MADV_DONTNEED) - const size_t largest_aligned2os_bytes = - pgno_align2os_bytes(env, largest_pgno); - const pgno_t largest_aligned2os_pgno = - bytes2pgno(env, largest_aligned2os_bytes); + const size_t largest_bytes = pgno2bytes(env, largest_pgno); + const size_t madvise_gap = (largest_bytes < 65536 * 256) + ? 65536 + : (largest_bytes > MEGABYTE * 4 * 256) + ? MEGABYTE * 4 + : largest_bytes >> 8; + const size_t discard_edge_bytes = bytes_align2os_bytes( + env, + (MDBX_RDONLY & (env->me_lck ? env->me_lck->mti_envmode : env->me_flags)) + ? largest_bytes + : largest_bytes + madvise_gap); + const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes); const pgno_t prev_discarded_pgno = *env->me_discarded_tail; if (prev_discarded_pgno > - largest_aligned2os_pgno + - /* 1M threshold to avoid unreasonable madvise() call */ - bytes2pgno(env, MEGABYTE)) { + discard_edge_pgno + + /* threshold to avoid unreasonable frequent madvise() calls */ + bytes2pgno(env, madvise_gap)) { mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", *env->me_discarded_tail, largest_pgno); - *env->me_discarded_tail = largest_aligned2os_pgno; + *env->me_discarded_tail = discard_edge_pgno; const size_t prev_discarded_bytes = pgno2bytes(env, prev_discarded_pgno) & ~(env->me_os_psize - 1); - mdbx_ensure(env, prev_discarded_bytes > largest_aligned2os_bytes); + mdbx_ensure(env, prev_discarded_bytes > discard_edge_bytes); int advise = MADV_DONTNEED; #if defined(MADV_FREE) && \ 0 /* MADV_FREE works for only anonymous vma at the moment */ @@ -7435,8 +7502,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_linux_kernel_version > 0x04050000) advise = MADV_FREE; #endif /* MADV_FREE */ - int err = madvise(env->me_map + largest_aligned2os_bytes, - prev_discarded_bytes - 
largest_aligned2os_bytes, advise) + int err = madvise(env->me_map + discard_edge_bytes, + prev_discarded_bytes - discard_edge_bytes, advise) ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) @@ -7477,25 +7544,29 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* LY: step#1 - sync previously written/updated data-pages */ int rc = *env->me_unsynced_pages ? MDBX_RESULT_TRUE /* carry non-steady */ : MDBX_RESULT_FALSE /* carry steady */; - if (rc != MDBX_RESULT_FALSE && (flags & MDBX_NOSYNC) == 0) { + if (rc != MDBX_RESULT_FALSE && (flags & MDBX_SAFE_NOSYNC) == 0) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); MDBX_meta *const recent_steady_meta = mdbx_meta_steady(env); if (flags & MDBX_WRITEMAP) { - const size_t usedbytes = pgno_align2os_bytes(env, pending->mm_geo.next); - rc = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, flags & MDBX_MAPASYNC); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; + const size_t begin = pgno2bytes(env, NUM_METAS) & ~(env->me_os_psize - 1); + const size_t end = pgno_align2os_bytes(env, pending->mm_geo.next); + if (end > begin) { + rc = mdbx_msync(&env->me_dxb_mmap, begin, end - begin, + flags & MDBX_MAPASYNC); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } rc = MDBX_RESULT_TRUE /* carry non-steady */; if ((flags & MDBX_MAPASYNC) == 0) { if (unlikely(pending->mm_geo.next > recent_steady_meta->mm_geo.now)) { - rc = mdbx_filesync(env->me_fd, MDBX_SYNC_SIZE); + rc = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_SIZE); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } rc = MDBX_RESULT_FALSE /* carry steady */; } } else { - rc = mdbx_filesync(env->me_fd, + rc = mdbx_filesync(env->me_lazy_fd, (pending->mm_geo.next > recent_steady_meta->mm_geo.now) ? 
MDBX_SYNC_DATA | MDBX_SYNC_SIZE : MDBX_SYNC_DATA); @@ -7571,7 +7642,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); mdbx_ensure(env, target == head || mdbx_meta_txnid_stable(env, target) < pending->mm_txnid_a.inconsistent); - if (env->me_flags & MDBX_WRITEMAP) { + if (flags & MDBX_WRITEMAP) { mdbx_jitter4testing(true); if (likely(target != head)) { /* LY: 'invalidate' the meta. */ @@ -7614,40 +7685,50 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_datasync_sign = pending->mm_datasync_sign; mdbx_flush_incoherent_cpu_writeback(); mdbx_jitter4testing(true); + if ((flags & MDBX_SAFE_NOSYNC) == 0) { + /* sync meta-pages */ + const bool weak = (flags & (MDBX_MAPASYNC | MDBX_NOMETASYNC)) != 0; + rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + weak); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + if (!weak) { +#if defined(__APPLE__) && \ + MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY + rc = likely(fcntl(env->me_lazy_fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS + : errno; + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; +#endif /* MacOS */ + *env->me_meta_sync_txnid = pending->mm_txnid_a.low; + } + } } else { - rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), + const MDBX_meta undo_meta = *target; + const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) + ? env->me_dsync_fd + : env->me_lazy_fd; + rc = mdbx_pwrite(fd, pending, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); if (unlikely(rc != MDBX_SUCCESS)) { undo: mdbx_debug("%s", "write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. 
*/ - mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta), + mdbx_pwrite(fd, &undo_meta, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); goto fail; } mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); - } - - /* LY: step#3 - sync meta-pages. */ - mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - if (flags & MDBX_WRITEMAP) { - const size_t offset = (uint8_t *)data_page(head) - env->me_dxb_mmap.dxb; - const size_t paged_offset = offset & ~(env->me_os_psize - 1); - const size_t paged_length = roundup_powerof2( - env->me_psize + offset - paged_offset, env->me_os_psize); - rc = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length, - flags & MDBX_MAPASYNC); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } else { - rc = mdbx_filesync(env->me_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - if (rc != MDBX_SUCCESS) - goto undo; + if ((flags & (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { + /* sync meta-pages */ + if (fd == env->me_lazy_fd) { + rc = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (rc != MDBX_SUCCESS) + goto undo; + } + *env->me_meta_sync_txnid = pending->mm_txnid_a.low; } - *env->me_meta_sync_txnid = (uint32_t)pending->mm_txnid_a.inconsistent; } /* LY: shrink datafile if needed */ @@ -7660,6 +7741,10 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, goto fail; } + if (likely(env->me_lck)) + /* toggle oldest refresh */ + env->me_lck->mti_readers_refresh_flag = false; + return MDBX_SUCCESS; fail: @@ -7691,12 +7776,10 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { mdbx_ensure(env, branch_nodemax > 42 && branch_nodemax < (int)UINT16_MAX && branch_nodemax % 2 == 0); env->me_branch_nodemax = (unsigned)branch_nodemax; - env->me_maxkey_nd = (uint16_t)mdbx_limits_keysize_max(env->me_psize, 0); - env->me_maxkey_ds = - 
(uint16_t)mdbx_limits_keysize_max(env->me_psize, MDBX_DUPSORT); - env->me_maxval_nd = (unsigned)mdbx_limits_valsize_max(env->me_psize, 0); - env->me_maxval_ds = - (unsigned)mdbx_limits_valsize_max(env->me_psize, MDBX_DUPSORT); + env->me_maxkey_nd = (uint16_t)mdbx_limits_keysize_max(pagesize, 0); + env->me_maxkey_ds = (uint16_t)mdbx_limits_keysize_max(pagesize, MDBX_DUPSORT); + env->me_maxval_nd = (unsigned)mdbx_limits_valsize_max(pagesize, 0); + env->me_maxval_ds = (unsigned)mdbx_limits_valsize_max(pagesize, MDBX_DUPSORT); mdbx_ensure(env, env->me_maxkey_nd == env->me_branch_nodemax - NODESIZE - sizeof(pgno_t)); mdbx_ensure(env, env->me_maxkey_ds == @@ -7714,7 +7797,8 @@ int __cold mdbx_env_create(MDBX_env **penv) { env->me_maxreaders = DEFAULT_READERS; env->me_maxdbs = env->me_numdbs = CORE_DBS; - env->me_fd = INVALID_HANDLE_VALUE; + env->me_lazy_fd = INVALID_HANDLE_VALUE; + env->me_dsync_fd = INVALID_HANDLE_VALUE; env->me_lfd = INVALID_HANDLE_VALUE; env->me_pid = mdbx_getpid(); @@ -7906,22 +7990,24 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, goto bailout; } - size_lower = roundup_powerof2(size_lower, env->me_os_psize); - size_upper = roundup_powerof2(size_upper, env->me_os_psize); - size_now = roundup_powerof2(size_now, env->me_os_psize); + const size_t unit = + (env->me_os_psize > (size_t)pagesize) ? 
env->me_os_psize : pagesize; + size_lower = roundup_powerof2(size_lower, unit); + size_upper = roundup_powerof2(size_upper, unit); + size_now = roundup_powerof2(size_now, unit); /* LY: подбираем значение size_upper: - * - кратное размеру системной страницы + * - кратное размеру страницы * - без нарушения MAX_MAPSIZE и MAX_PAGENO */ while (unlikely((size_t)size_upper > MAX_MAPSIZE || (uint64_t)size_upper / pagesize > MAX_PAGENO)) { - if ((size_t)size_upper < env->me_os_psize + MIN_MAPSIZE || - (size_t)size_upper < env->me_os_psize * (MIN_PAGENO + 1)) { + if ((size_t)size_upper < unit + MIN_MAPSIZE || + (size_t)size_upper < (size_t)pagesize * (MIN_PAGENO + 1)) { /* паранойа на случай переполнения при невероятных значениях */ rc = MDBX_EINVAL; goto bailout; } - size_upper -= env->me_os_psize; + size_upper -= unit; if ((size_t)size_upper < (size_t)size_lower) size_lower = size_upper; } @@ -7943,13 +8029,13 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, } if (growth_step == 0 && shrink_threshold > 0) growth_step = 1; - growth_step = roundup_powerof2(growth_step, env->me_os_psize); + growth_step = roundup_powerof2(growth_step, unit); if (bytes2pgno(env, growth_step) > UINT16_MAX) growth_step = pgno2bytes(env, UINT16_MAX); if (shrink_threshold < 0) shrink_threshold = growth_step + growth_step; - shrink_threshold = roundup_powerof2(shrink_threshold, env->me_os_psize); + shrink_threshold = roundup_powerof2(shrink_threshold, unit); if (bytes2pgno(env, shrink_threshold) > UINT16_MAX) shrink_threshold = pgno2bytes(env, UINT16_MAX); @@ -8162,12 +8248,12 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { return MDBX_ENOMEM; meta = *mdbx_init_metas(env, buffer); - err = mdbx_pwrite(env->me_fd, buffer, env->me_psize * NUM_METAS, 0); + err = mdbx_pwrite(env->me_lazy_fd, buffer, env->me_psize * NUM_METAS, 0); mdbx_free(buffer); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_ftruncate(env->me_fd, filesize_before = 
env->me_dbgeo.now); + err = mdbx_ftruncate(env->me_lazy_fd, filesize_before = env->me_dbgeo.now); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -8309,27 +8395,28 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { if (unlikely(err != MDBX_SUCCESS)) return err; -#if defined(MADV_DODUMP) && defined(MADV_DONTDUMP) - const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); - err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) +#if defined(MADV_DONTDUMP) + err = madvise(env->me_map, env->me_dxb_mmap.limit, MADV_DONTDUMP) ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; - err = madvise(env->me_map + meta_length_aligned2os, - env->me_dxb_mmap.current - meta_length_aligned2os, - (mdbx_runtime_flags & MDBX_DBG_DUMP) ? MADV_DODUMP - : MADV_DONTDUMP) - ? ignore_enosys(errno) - : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#endif +#endif /* MADV_DONTDUMP */ +#if defined(MADV_DODUMP) + if (mdbx_runtime_flags & MDBX_DBG_DUMP) { + const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); + err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } +#endif /* MADV_DODUMP */ #ifdef MDBX_USE_VALGRIND env->me_valgrind_handle = VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); -#endif +#endif /* MDBX_USE_VALGRIND */ mdbx_assert(env, used_bytes >= pgno2bytes(env, NUM_METAS) && used_bytes <= env->me_dxb_mmap.limit); @@ -8339,7 +8426,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { ASAN_POISON_MEMORY_REGION(env->me_map + used_bytes, env->me_dxb_mmap.limit - used_bytes); env->me_poison_edge = bytes2pgno(env, env->me_dxb_mmap.limit); -#endif /* MDBX_USE_VALGRIND */ +#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ /* NOTE: AddressSanitizer (at least GCC 7.x, 8.x) could generate * false-positive alarm here. 
I have no other explanation for this @@ -8425,7 +8512,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { MDBX_meta rollback = *head; mdbx_meta_set_txnid(env, &rollback, undo_txnid); rollback.mm_datasync_sign = MDBX_DATASIGN_WEAK; - err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), + err = mdbx_pwrite(env->me_lazy_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); } if (err) { @@ -8536,7 +8623,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { return err; #elif defined(POSIX_FADV_DONTNEED) err = ignore_enosys(posix_fadvise( - env->me_fd, used_aligned2os_bytes, + env->me_lazy_fd, used_aligned2os_bytes, env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED)); if (unlikely(MDBX_IS_ERROR(err))) return err; @@ -8558,11 +8645,10 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { /* Open and/or initialize the lock region for the environment. */ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, mode_t mode) { - mdbx_assert(env, env->me_fd != INVALID_HANDLE_VALUE); + mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE); - int err = mdbx_openfile(lck_pathname, O_RDWR | O_CREAT, mode, &env->me_lfd, - (env->me_flags & MDBX_EXCLUSIVE) ? 
true : false); + int err = mdbx_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); if (err != MDBX_SUCCESS) { if (!(err == MDBX_ENOFILE && (env->me_flags & MDBX_EXCLUSIVE)) && !((err == MDBX_EROFS || err == MDBX_EACCESS || err == MDBX_EPERM) && @@ -8570,7 +8656,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, return err; /* ensure the file system is read-only */ - err = mdbx_check_fs_rdonly(env->me_fd, lck_pathname, err); + err = mdbx_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); if (err != MDBX_SUCCESS) return err; @@ -8862,7 +8948,7 @@ __cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. */ #define CHANGEABLE \ - (MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC | MDBX_NOMEMINIT | \ + (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC | MDBX_NOMEMINIT | \ MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) #define CHANGELESS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ @@ -8872,9 +8958,9 @@ __cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { #error "Persistent DB flags & env flags overlap, but both go in mm_flags" #endif -int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, +int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags, mode_t mode) { - if (unlikely(!env || !path)) + if (unlikely(!env || !pathname)) return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) @@ -8883,11 +8969,11 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, if (flags & ~(CHANGEABLE | CHANGELESS)) return MDBX_EINVAL; - if (env->me_fd != INVALID_HANDLE_VALUE || + if (env->me_lazy_fd != INVALID_HANDLE_VALUE || (env->me_flags & MDBX_ENV_ACTIVE) != 0) return MDBX_EPERM; - size_t len_full, len = strlen(path); + size_t len_full, len = strlen(pathname); if (flags & MDBX_NOSUBDIR) 
{ len_full = len + sizeof(MDBX_LOCK_SUFFIX) + len + 1; } else { @@ -8900,12 +8986,12 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, char *dxb_pathname; if (flags & MDBX_NOSUBDIR) { dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCK_SUFFIX); - sprintf(lck_pathname, "%s" MDBX_LOCK_SUFFIX, path); - strcpy(dxb_pathname, path); + sprintf(lck_pathname, "%s" MDBX_LOCK_SUFFIX, pathname); + strcpy(dxb_pathname, pathname); } else { dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCKNAME); - sprintf(lck_pathname, "%s" MDBX_LOCKNAME, path); - sprintf(dxb_pathname, "%s" MDBX_DATANAME, path); + sprintf(lck_pathname, "%s" MDBX_LOCKNAME, pathname); + sprintf(dxb_pathname, "%s" MDBX_DATANAME, pathname); } int rc = MDBX_SUCCESS; @@ -8913,24 +8999,25 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, if (flags & MDBX_RDONLY) { /* LY: silently ignore irrelevant flags when * we're only getting read access */ - flags &= ~(MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NOSYNC | MDBX_NOMETASYNC | - MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_NOMEMINIT | MDBX_ACCEDE); + flags &= + ~(MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | + MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_NOMEMINIT | MDBX_ACCEDE); } else { -#ifdef __OpenBSD__ - /* Temporary `workaround` for OpenBSD kernel's bug. +#if MDBX_MMAP_INCOHERENT_FILE_WRITE + /* Temporary `workaround` for OpenBSD kernel's flaw. * See https://github.com/leo-yuriev/libmdbx/issues/67 */ if ((flags & MDBX_WRITEMAP) == 0) { if (flags & MDBX_ACCEDE) flags |= MDBX_WRITEMAP; else { mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, - "OpenBSD requires MDBX_WRITEMAP because of an internal " - "bug(s) in a file/buffer/page cache.\n"); + "System (i.e. 
OpenBSD) requires MDBX_WRITEMAP because " + "of an internal flaw(s) in a file/buffer/page cache.\n"); rc = 42 /* ENOPROTOOPT */; goto bailout; } } -#endif /* __OpenBSD__ */ +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ env->me_dirtylist = mdbx_calloc(MDBX_DPL_TXNFULL + 1, sizeof(MDBX_DP)); if (!env->me_dirtylist) rc = MDBX_ENOMEM; @@ -8941,7 +9028,7 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, if (rc) goto bailout; - env->me_path = mdbx_strdup(path); + env->me_path = mdbx_strdup(pathname); env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); env->me_dbiseqs = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); @@ -8952,40 +9039,50 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_align4; /* aligned MDBX_INTEGERKEY */ - int oflags; - if (F_ISSET(flags, MDBX_RDONLY)) - oflags = O_RDONLY; - else if (mode != 0) { - if ((flags & MDBX_NOSUBDIR) == 0) { + if ((flags & (MDBX_RDONLY | MDBX_NOSUBDIR)) == 0 && mode != 0) { #if defined(_WIN32) || defined(_WIN64) - if (!CreateDirectoryA(path, nullptr)) { - rc = GetLastError(); - if (rc != ERROR_ALREADY_EXISTS) - goto bailout; - } + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) { + rc = ERROR_INVALID_NAME; + goto bailout; + } + if (!CreateDirectoryW(pathnameW, nullptr)) { + rc = GetLastError(); + if (rc != ERROR_ALREADY_EXISTS) + goto bailout; + } #else - const mode_t dir_mode = - (/* inherit read/write permissions for group and others */ mode & - (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | - /* always add read/write/search for owner */ S_IRWXU | - ((mode & S_IRGRP) ? 
/* +search if readable by group */ S_IXGRP : 0) | - ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0); - if (mkdir(path, dir_mode)) { - rc = errno; - if (rc != EEXIST) - goto bailout; - } -#endif + const mode_t dir_mode = + (/* inherit read/write permissions for group and others */ mode & + (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | + /* always add read/write/search for owner */ S_IRWXU | + ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) | + ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0); + if (mkdir(pathname, dir_mode)) { + rc = errno; + if (rc != EEXIST) + goto bailout; } - oflags = O_RDWR | O_CREAT; - } else - oflags = O_RDWR; +#endif + } - rc = mdbx_openfile(dxb_pathname, oflags, mode, &env->me_fd, - (env->me_flags & MDBX_EXCLUSIVE) ? true : false); + rc = mdbx_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ + : MDBX_OPEN_DXB_LAZY, + env, dxb_pathname, &env->me_lazy_fd, mode); if (rc != MDBX_SUCCESS) goto bailout; + mdbx_assert(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); + if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { + rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, dxb_pathname, + &env->me_dsync_fd, 0); + mdbx_ensure(env, (rc != MDBX_SUCCESS) == + (env->me_dsync_fd == INVALID_HANDLE_VALUE)); + } + #if MDBX_LOCKING == MDBX_LOCKING_SYSV env->me_sysv_ipc.key = ftok(dxb_pathname, 42); if (env->me_sysv_ipc.key == -1) { @@ -8997,13 +9094,13 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, #if !(defined(_WIN32) || defined(_WIN64)) if (mode == 0) { struct stat st; - if (fstat(env->me_fd, &st)) { + if (fstat(env->me_lazy_fd, &st)) { rc = errno; goto bailout; } mode = st.st_mode; } -#endif +#endif /* !Windows */ const int lck_rc = mdbx_setup_lck(env, lck_pathname, mode); if (MDBX_IS_ERROR(lck_rc)) { @@ -9011,7 +9108,8 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, goto bailout; } - const unsigned rigorous_flags = 
MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_MAPASYNC; + const unsigned rigorous_flags = + MDBX_WRITEMAP | MDBX_SAFE_NOSYNC | MDBX_MAPASYNC; const unsigned mode_flags = rigorous_flags | MDBX_NOMETASYNC | MDBX_LIFORECLAIM | MDBX_COALESCE | MDBX_NORDAHEAD; @@ -9072,27 +9170,32 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, } if ((flags & MDBX_RDONLY) == 0) { - rc = MDBX_ENOMEM; - MDBX_txn *txn; - int tsize = sizeof(MDBX_txn), - size = - tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + - sizeof(unsigned) + 1); - if ((env->me_pbuf = mdbx_calloc( - 1 /* page buffer */ + 1 /* page killer bufer */, env->me_psize)) && - (txn = mdbx_calloc(1, size))) { - txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); - txn->mt_dbflags = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); - txn->mt_env = env; - txn->mt_dbxs = env->me_dbxs; - txn->mt_flags = MDBX_TXN_FINISHED; - env->me_txn0 = txn; - txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL); - if (txn->tw.retired_pages && txn->tw.reclaimed_pglist) - rc = MDBX_SUCCESS; + const size_t tsize = sizeof(MDBX_txn), + size = tsize + env->me_maxdbs * + (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + + sizeof(unsigned) + 1); + rc = mdbx_memalign_alloc( + env->me_os_psize, + env->me_psize * (1 /* page buffer */ + 1 /* page killer bufer */), + &env->me_pbuf); + if (rc == MDBX_SUCCESS) { + memset(env->me_pbuf, -1, env->me_psize * 2); + MDBX_txn *txn = mdbx_calloc(1, size); + if (txn) { + txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); + txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_env = env; + txn->mt_dbxs = env->me_dbxs; + txn->mt_flags = 
MDBX_TXN_FINISHED; + env->me_txn0 = txn; + txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + if (!txn->tw.retired_pages || !txn->tw.reclaimed_pglist) + rc = MDBX_ENOMEM; + } else + rc = MDBX_ENOMEM; } } @@ -9116,6 +9219,9 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, #endif bailout: +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + mdbx_txn_valgrind(env, nullptr); +#endif if (rc) { rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; env->me_flags = saved_me_flags | MDBX_FATAL_ERROR; @@ -9153,9 +9259,15 @@ static int __cold mdbx_env_close0(MDBX_env *env) { env->me_valgrind_handle = -1; #endif } - if (env->me_fd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_fd); - env->me_fd = INVALID_HANDLE_VALUE; + + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { + (void)mdbx_closefile(env->me_dsync_fd); + env->me_dsync_fd = INVALID_HANDLE_VALUE; + } + + if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { + (void)mdbx_closefile(env->me_lazy_fd); + env->me_lazy_fd = INVALID_HANDLE_VALUE; } if (env->me_lck) @@ -9167,11 +9279,11 @@ static int __cold mdbx_env_close0(MDBX_env *env) { } if (env->me_dbxs) { - for (unsigned i = env->me_maxdbs; --i >= CORE_DBS;) + for (unsigned i = env->me_numdbs; --i >= CORE_DBS;) mdbx_free(env->me_dbxs[i].md_name.iov_base); mdbx_free(env->me_dbxs); } - mdbx_free(env->me_pbuf); + mdbx_memalign_free(env->me_pbuf); mdbx_free(env->me_dbiseqs); mdbx_free(env->me_dbflags); mdbx_free(env->me_path); @@ -9219,7 +9331,7 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { rc = (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; #else struct stat st; - if (unlikely(fstat(env->me_fd, &st))) + if (unlikely(fstat(env->me_lazy_fd, &st))) rc = errno; else if (st.st_nlink > 0 /* don't sync deleted files */) { rc = mdbx_env_sync_ex(env, true, true); @@ -9499,14 +9611,14 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, MDBX_txn *txn = mc->mc_txn; if (unlikely(pgno >= txn->mt_next_pgno)) { mdbx_debug("page %" PRIaPGNO " not found", pgno); - txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PAGE_NOTFOUND; + goto corrupted; } - MDBX_env *env = txn->mt_env; + MDBX_env *const env = txn->mt_env; MDBX_page *p = NULL; int level; - if ((txn->mt_flags & (MDBX_RDONLY | MDBX_WRITEMAP)) == 0) { + mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + if (unlikely((txn->mt_flags & (MDBX_RDONLY | MDBX_WRITEMAP)) == 0)) { level = 1; do { /* Spilled pages were dirtied in this txn and flushed @@ -9514,35 +9626,45 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). 
*/ if (txn->tw.spill_pages && mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) - goto mapped; + goto spilled; p = mdbx_dpl_find(txn->tw.dirtylist, pgno); if (p) - goto done; + goto dirty; level++; } while ((txn = txn->mt_parent) != NULL); } level = 0; -mapped: +spilled: p = pgno2page(env, pgno); -done: - txn = nullptr /* avoid future use */; +dirty: if (unlikely(p->mp_pgno != pgno)) { mdbx_error("mismatch pgno %" PRIaPGNO " (actual) != %" PRIaPGNO " (expected)", p->mp_pgno, pgno); - return MDBX_CORRUPTED; + goto corrupted; + } + + if (unlikely((p->mp_flags & (P_LOOSE | P_SUBP | P_META | P_DIRTY)) != 0 || + p->mp_txnid > mc->mc_txn->mt_txnid)) { + if (unlikely((mc->mc_txn->mt_flags & MDBX_RDONLY) != 0 || + (p->mp_flags & (P_LOOSE | P_SUBP | P_META | P_DIRTY)) != + P_DIRTY)) { + mdbx_error("invalid page's flags (0x%x) or txnid %" PRIaTXN + " > (actual) %" PRIaTXN " (expected)", + p->mp_flags, p->mp_txnid, mc->mc_txn->mt_txnid); + goto corrupted; + } } - if (unlikely(p->mp_upper < p->mp_lower || ((p->mp_lower | p->mp_upper) & 1) || - PAGEHDRSZ + p->mp_upper > env->me_psize) && - !IS_OVERFLOW(p)) { + if (unlikely(!IS_OVERFLOW(p) && (p->mp_upper < p->mp_lower || + ((p->mp_lower | p->mp_upper) & 1) != 0 || + PAGEHDRSZ + p->mp_upper > env->me_psize))) { mdbx_error("invalid page lower(%u)/upper(%u), pg-limit %u", p->mp_lower, p->mp_upper, page_space(env)); - return MDBX_CORRUPTED; + goto corrupted; } - /* TODO: more checks here, including p->mp_validator */ if (mdbx_audit_enabled()) { int err = mdbx_page_check(env, p, true); @@ -9554,6 +9676,10 @@ done: if (lvl) *lvl = level; return MDBX_SUCCESS; + +corrupted: + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_CORRUPTED; } /* Finish mdbx_page_search() / mdbx_page_search_lowest(). @@ -9760,8 +9886,8 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { * [out] data Updated to point to the node's data. * * Returns 0 on success, non-zero on failure. 
*/ -static __inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *node, - MDBX_val *data) { +static __always_inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *node, + MDBX_val *data) { data->iov_len = node_ds(node); data->iov_base = node_data(node); if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { @@ -9789,7 +9915,7 @@ int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; MDBX_cursor_couple cx; @@ -9813,7 +9939,7 @@ int mdbx_get_nearest(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) @@ -9853,7 +9979,7 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; MDBX_cursor_couple cx; @@ -10133,8 +10259,8 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } /* Set the cursor on a specific data item. 
*/ -__hot static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op, int *exactp) { +static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op, int *exactp) { int rc; MDBX_page *mp; MDBX_node *node = NULL; @@ -10813,6 +10939,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, insert_key = insert_data = (rc != MDBX_SUCCESS); uint16_t fp_flags = P_LEAF | P_DIRTY; MDBX_page *fp = env->me_pbuf; + fp->mp_txnid = INVALID_TXNID; if (insert_key) { /* The key does not exist */ mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); @@ -10831,7 +10958,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { char *ptr; unsigned ksize = mc->mc_db->md_xsize; - if (key->iov_len != ksize) + if (unlikely(key->iov_len != ksize)) return MDBX_BAD_VALSIZE; ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); memcpy(ptr, key->iov_base, ksize); @@ -10893,13 +11020,14 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* LY: add configurable threshold to keep reserve space */ dpages) { if (!IS_DIRTY(omp) && (level || (env->me_flags & MDBX_WRITEMAP))) { - rc = mdbx_page_unspill(mc->mc_txn, omp, &omp); - if (unlikely(rc)) - return rc; + rc2 = mdbx_page_unspill(mc->mc_txn, omp, &omp); + if (unlikely(rc2)) + return rc2; level = 0; /* dirty in this txn or clean */ } /* Is it dirty? */ if (IS_DIRTY(omp)) { + mdbx_cassert(mc, omp->mp_txnid > SAFE64_INVALID_THRESHOLD); /* yes, overwrite it. Note in this case we don't * bother to try shrinking the page if the new data * is smaller than the overflow threshold. 
*/ @@ -11031,6 +11159,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_CURRENT | MDBX_NODUPDATA: case MDBX_CURRENT: fp->mp_flags |= P_DIRTY; + fp->mp_txnid = INVALID_TXNID; fp->mp_pgno = mp->mp_pgno; mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; flags |= F_DUPDATA; @@ -11060,8 +11189,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, nested_dupdb.md_entries = page_numkeys(fp); xdata.iov_len = sizeof(nested_dupdb); xdata.iov_base = &nested_dupdb; - if ((rc = mdbx_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) - return rc; + if ((rc2 = mdbx_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) + return rc2; mc->mc_db->md_leaf_pages += 1; mdbx_cassert(mc, env->me_psize > olddata.iov_len); offset = env->me_psize - (unsigned)olddata.iov_len; @@ -11072,6 +11201,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } if (mp != fp) { mp->mp_flags = fp_flags | P_DIRTY; + mp->mp_txnid = INVALID_TXNID; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_lower = fp->mp_lower; mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); @@ -11419,6 +11549,7 @@ static int mdbx_page_new(MDBX_cursor *mc, unsigned flags, unsigned num, mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); np->mp_flags = (uint16_t)(flags | P_DIRTY); + np->mp_txnid = INVALID_TXNID; np->mp_lower = 0; np->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); @@ -11865,7 +11996,7 @@ int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_VALID))) return MDBX_EINVAL; if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_RDONLY))) @@ -11907,7 +12038,7 @@ int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))) + if 
(unlikely(!mdbx_txn_dbi_exists(txn, mc->mc_dbi, DB_VALID))) return MDBX_EINVAL; if (unlikely(mc->mc_backup)) @@ -13239,7 +13370,7 @@ int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { if (unlikely(!key)) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; if (unlikely(txn->mt_flags & (MDBX_RDONLY | MDBX_TXN_BLOCKED))) @@ -13445,6 +13576,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, } copy->mp_pgno = mp->mp_pgno; copy->mp_flags = mp->mp_flags; + copy->mp_txnid = INVALID_TXNID; copy->mp_lower = 0; copy->mp_upper = (indx_t)page_space(env); @@ -13780,7 +13912,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_RESERVE | @@ -13850,22 +13982,26 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { mdbx_copy *my = arg; uint8_t *ptr; int toggle = 0; - int rc; mdbx_condmutex_lock(&my->mc_condmutex); while (!my->mc_error) { - while (!my->mc_new) - mdbx_condmutex_wait(&my->mc_condmutex); + while (!my->mc_new && !my->mc_error) { + int err = mdbx_condmutex_wait(&my->mc_condmutex); + if (err != MDBX_SUCCESS) { + my->mc_error = err; + goto bailout; + } + } if (my->mc_new == 0 + MDBX_EOF) /* 0 buffers, just EOF */ break; size_t wsize = my->mc_wlen[toggle]; ptr = my->mc_wbuf[toggle]; again: if (wsize > 0 && !my->mc_error) { - rc = mdbx_write(my->mc_fd, ptr, wsize); - if (rc != MDBX_SUCCESS) { - my->mc_error = rc; - break; + int err = mdbx_write(my->mc_fd, ptr, wsize); + if (err != MDBX_SUCCESS) { + my->mc_error = err; + goto bailout; } } @@ -13882,6 +14018,7 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { 
my->mc_new--; mdbx_condmutex_signal(&my->mc_condmutex); } +bailout: mdbx_condmutex_unlock(&my->mc_condmutex); return (THREAD_RESULT)0; } @@ -13894,8 +14031,11 @@ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { mdbx_condmutex_lock(&my->mc_condmutex); my->mc_new += (short)adjust; mdbx_condmutex_signal(&my->mc_condmutex); - while (my->mc_new & 2) /* both buffers in use */ - mdbx_condmutex_wait(&my->mc_condmutex); + while (!my->mc_error && (my->mc_new & 2) /* both buffers in use */) { + int err = mdbx_condmutex_wait(&my->mc_condmutex); + if (err != MDBX_SUCCESS) + my->mc_error = err; + } mdbx_condmutex_unlock(&my->mc_condmutex); my->mc_toggle ^= (adjust & 1); @@ -13979,6 +14119,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); memcpy(mo, omp, my->mc_env->me_psize); mo->mp_pgno = my->mc_next_pgno; + mo->mp_txnid = MIN_TXNID; my->mc_next_pgno += omp->mp_pages; my->mc_wlen[toggle] += my->mc_env->me_psize; if (omp->mp_pages > 1) { @@ -14043,6 +14184,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { } mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); mdbx_page_copy(mo, mp, my->mc_env->me_psize); + mo->mp_txnid = MIN_TXNID; mo->mp_pgno = my->mc_next_pgno++; my->mc_wlen[toggle] += my->mc_env->me_psize; if (mc.mc_top) { @@ -14263,10 +14405,10 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, buffer + roundup_powerof2(meta_bytes, env->me_os_psize); for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) { if (dest_is_pipe) { -#if defined(__linux__) || defined(__gnu_linux__) +#if defined(__linux__) || defined(__gnu_linux__) && !defined(MDBX_SAFE4QEMU) off_t in_offset = offset; const intptr_t written = - sendfile(fd, env->me_fd, &in_offset, used_size - offset); + sendfile(fd, env->me_lazy_fd, &in_offset, used_size - offset); if (unlikely(written <= 0)) { rc = written ? 
errno : MDBX_ENODATA; break; @@ -14275,10 +14417,10 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, continue; #endif } else { -#if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE) +#if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE) && !defined(MDBX_SAFE4QEMU) off_t in_offset = offset, out_offset = offset; ssize_t bytes_copied = copy_file_range( - env->me_fd, &in_offset, fd, &out_offset, used_size - offset, 0); + env->me_lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0); if (unlikely(bytes_copied <= 0)) { rc = bytes_copied ? errno : MDBX_ENODATA; break; @@ -14392,49 +14534,28 @@ int __cold mdbx_env_copy(MDBX_env *env, const char *dest_path, unsigned flags) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - char *dxb_pathname; - mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE; - - if (env->me_flags & MDBX_NOSUBDIR) { - dxb_pathname = (char *)dest_path; - } else { - size_t len = strlen(dest_path); - len += sizeof(MDBX_DATANAME); - dxb_pathname = mdbx_malloc(len); - if (!dxb_pathname) - return MDBX_ENOMEM; - sprintf(dxb_pathname, "%s" MDBX_DATANAME, dest_path); - } - /* The destination path must exist, but the destination file must not. * We don't want the OS to cache the writes, since the source data is * already in the OS cache. 
*/ - int rc = mdbx_openfile(dxb_pathname, O_WRONLY | O_CREAT | O_EXCL, 0640, - &newfd, true); - if (rc == MDBX_SUCCESS) { - if (env->me_psize >= env->me_os_psize) { -#ifdef F_NOCACHE /* __APPLE__ */ - (void)fcntl(newfd, F_NOCACHE, 1); -#elif defined(O_DIRECT) && defined(F_GETFL) - /* Set O_DIRECT if the file system supports it */ - if ((rc = fcntl(newfd, F_GETFL)) != -1) - (void)fcntl(newfd, F_SETFL, rc | O_DIRECT); + mdbx_filehandle_t newfd; + int rc = mdbx_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, +#if defined(_WIN32) || defined(_WIN64) + (mode_t)-1 +#else + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP #endif - } + ); + if (rc == MDBX_SUCCESS) rc = mdbx_env_copy2fd(env, newfd, flags); - } if (newfd != INVALID_HANDLE_VALUE) { int err = mdbx_closefile(newfd); if (rc == MDBX_SUCCESS && err != rc) rc = err; if (rc != MDBX_SUCCESS) - (void)mdbx_removefile(dxb_pathname); + (void)mdbx_removefile(dest_path); } - if (dxb_pathname != dest_path) - mdbx_free(dxb_pathname); - return rc; } @@ -14529,7 +14650,7 @@ int __cold mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *arg) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - *arg = env->me_fd; + *arg = env->me_lazy_fd; return MDBX_SUCCESS; } @@ -14977,7 +15098,7 @@ int __cold mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, if (unlikely(!dest)) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_VALID))) return MDBX_EINVAL; const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); @@ -14997,8 +15118,9 @@ int __cold mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, } static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { - if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) - return MDBX_EINVAL; + mdbx_assert(env, dbi >= CORE_DBS); + if (unlikely(dbi >= env->me_numdbs)) + return MDBX_BAD_DBI; char *ptr = env->me_dbxs[dbi].md_name.iov_base; /* If there was no name, this was already 
closed */ @@ -15041,7 +15163,7 @@ int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, if (unlikely(!flags || !state)) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_VALID))) return MDBX_EINVAL; *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; @@ -15155,7 +15277,7 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { if (unlikely(1 < (unsigned)del)) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; if (unlikely(TXN_DBI_CHANGED(txn, dbi))) @@ -15166,7 +15288,7 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) { + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) { rc = MDBX_EINVAL; goto bailout; } @@ -15223,7 +15345,7 @@ int mdbx_set_compare(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; txn->mt_dbxs[dbi].md_cmp = cmp; @@ -15235,7 +15357,7 @@ int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; txn->mt_dbxs[dbi].md_dcmp = cmp; @@ -15475,36 +15597,6 @@ int __cold mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { flags &= MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN; #endif -#if defined(__linux__) || defined(__gnu_linux__) - if ((mdbx_runtime_flags ^ flags) & MDBX_DBG_DUMP) { - /* http://man7.org/linux/man-pages/man5/core.5.html */ - const unsigned long dump_bits = - 1 << 3 /* Dump file-backed shared mappings 
*/ - | 1 << 6 /* Dump shared huge pages */ - | 1 << 8 /* Dump shared DAX pages */; - const int core_filter_fd = - open("/proc/self/coredump_filter", O_TRUNC | O_RDWR); - if (core_filter_fd != -1) { - char buf[32]; - intptr_t bytes = pread(core_filter_fd, buf, sizeof(buf), 0); - if (bytes > 0 && (size_t)bytes < sizeof(buf)) { - buf[bytes] = 0; - const unsigned long present_mask = strtoul(buf, NULL, 16); - const unsigned long wanna_mask = (flags & MDBX_DBG_DUMP) - ? present_mask | dump_bits - : present_mask & ~dump_bits; - if (wanna_mask != present_mask) { - bytes = snprintf(buf, sizeof(buf), "0x%lx\n", wanna_mask); - if (bytes > 0 && (size_t)bytes < sizeof(buf)) { - bytes = pwrite(core_filter_fd, buf, bytes, 0); - (void)bytes; - } - } - } - close(core_filter_fd); - } - } -#endif /* Linux */ mdbx_runtime_flags = (uint8_t)flags; } @@ -16204,9 +16296,8 @@ __hot static ptrdiff_t estimate(const MDBX_db *db, } } -__hot int mdbx_estimate_distance(const MDBX_cursor *first, - const MDBX_cursor *last, - ptrdiff_t *distance_items) { +int mdbx_estimate_distance(const MDBX_cursor *first, const MDBX_cursor *last, + ptrdiff_t *distance_items) { if (unlikely(first == NULL || last == NULL || distance_items == NULL)) return MDBX_EINVAL; @@ -16310,7 +16401,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON)) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; MDBX_cursor_couple begin; @@ -16482,7 +16573,7 @@ int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *new_data, if (unlikely(new_data == NULL && !(flags & MDBX_CURRENT))) return MDBX_EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_RESERVE | @@ -16677,7 
+16768,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + if (unlikely(!mdbx_txn_dbi_exists(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; if (unlikely(TXN_DBI_CHANGED(txn, dbi))) @@ -17024,6 +17115,9 @@ __dll_export #ifdef MDBX_USE_VALGRIND " MDBX_USE_VALGRIND=YES" #endif /* MDBX_USE_VALGRIND */ +#ifdef MDBX_FORCE_ASSERTIONS + " MDBX_FORCE_ASSERTIONS=YES" +#endif /* MDBX_FORCE_ASSERTIONS */ #ifdef _GNU_SOURCE " _GNU_SOURCE=YES" #else diff --git a/libs/libmdbx/src/src/elements/internals.h b/libs/libmdbx/src/src/elements/internals.h index 6a39460ab2..6c6a9d4ac6 100644 --- a/libs/libmdbx/src/src/elements/internals.h +++ b/libs/libmdbx/src/src/elements/internals.h @@ -211,6 +211,7 @@ typedef uint32_t pgno_t; typedef uint64_t txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) +#define INVALID_TXNID UINT64_MAX /* LY: for testing non-atomic 64-bit txnid on 32-bit arches. 
* #define MDBX_TXNID_STEP (UINT32_MAX / 3) */ #ifndef MDBX_TXNID_STEP @@ -256,7 +257,7 @@ typedef union mdbx_safe64 { typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ uint16_t md_depth; /* depth of this tree */ - uint32_t md_xsize; /* also ksize for LEAF2 pages */ + uint32_t md_xsize; /* key-size for MDBX_DUPFIXED (LEAF2 pages) */ pgno_t md_root; /* the root page of this tree */ pgno_t md_branch_pages; /* number of internal pages */ pgno_t md_leaf_pages; /* number of leaf pages */ @@ -347,8 +348,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_validator; /* checksum of page content or a txnid during - * which the page has been updated */ + uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -703,7 +703,8 @@ struct MDBX_txn { /* Transaction Flags */ /* mdbx_txn_begin() flags */ #define MDBX_TXN_BEGIN_FLAGS \ - (MDBX_NOMETASYNC | MDBX_NOSYNC | MDBX_MAPASYNC | MDBX_RDONLY | MDBX_TRYTXN) + (MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | MDBX_MAPASYNC | MDBX_RDONLY | \ + MDBX_TRYTXN) /* internal txn flags */ #define MDBX_TXN_FINISHED 0x01 /* txn is finished or never began */ #define MDBX_TXN_ERROR 0x02 /* txn is unusable after an error */ @@ -864,7 +865,8 @@ struct MDBX_env { size_t me_signature; mdbx_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb -#define me_fd me_dxb_mmap.fd +#define me_lazy_fd me_dxb_mmap.fd + mdbx_filehandle_t me_dsync_fd; mdbx_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck @@ -947,7 +949,7 @@ struct MDBX_env { #endif #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif +#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ MDBX_env *me_lcklist_next; /* struct me_dbgeo used for accepting db-geo 
params from user for the new @@ -1008,7 +1010,7 @@ MDBX_INTERNAL_FUNC void mdbx_panic(const char *fmt, ...) __printf_args(1, 2); #define mdbx_audit_enabled() (0) -#if !defined(NDEBUG) || defined(MDBX_FORCE_ASSERT) +#if !defined(NDEBUG) || defined(MDBX_FORCE_ASSERTIONS) #define mdbx_assert_enabled() (1) #else #define mdbx_assert_enabled() (0) diff --git a/libs/libmdbx/src/src/elements/lck-posix.c b/libs/libmdbx/src/src/elements/lck-posix.c index ce2c0c3f23..84652d7adf 100644 --- a/libs/libmdbx/src/src/elements/lck-posix.c +++ b/libs/libmdbx/src/src/elements/lck-posix.c @@ -220,8 +220,67 @@ MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc) { } #endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */ +static int check_fstat(MDBX_env *env) { + struct stat st; + + int rc = MDBX_SUCCESS; + if (fstat(env->me_lazy_fd, &st)) { + rc = errno; + mdbx_error("fstat(%s), err %d", "DXB", rc); + return rc; + } + + if (!S_ISREG(st.st_mode) || st.st_nlink < 1) { +#ifdef EBADFD + rc = EBADFD; +#else + rc = EPERM; +#endif + mdbx_error("%s %s, err %d", "DXB", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", + rc); + return rc; + } + + if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) { + mdbx_verbose("dxb-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); + rc = MDBX_RESULT_TRUE; + } + + //---------------------------------------------------------------------------- + + if (fstat(env->me_lfd, &st)) { + rc = errno; + mdbx_error("fstat(%s), err %d", "LCK", rc); + return rc; + } + + if (!S_ISREG(st.st_mode) || st.st_nlink < 1) { +#ifdef EBADFD + rc = EBADFD; +#else + rc = EPERM; +#endif + mdbx_error("%s %s, err %d", "LCK", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", + rc); + return rc; + } + + /* Checking file size for detect the situation when we got the shared lock + * immediately after mdbx_lck_destroy(). 
*/ + if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) { + mdbx_verbose("lck-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); + rc = MDBX_RESULT_TRUE; + } + + return rc; +} + MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) { - assert(env->me_fd != INVALID_HANDLE_VALUE); + assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); if (unlikely(mdbx_getpid() != env->me_pid)) return MDBX_PANIC; #if MDBX_USE_OFDLOCKS @@ -229,43 +288,68 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) { choice_fcntl(); #endif /* MDBX_USE_OFDLOCKS */ - int rc; + int rc = MDBX_SUCCESS; if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ rc = - lck_op(env->me_fd, op_setlk, + lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); if (rc != MDBX_SUCCESS) { - mdbx_error("%s(%s) failed: errcode %u", __func__, "without-lck", rc); + mdbx_error("%s, err %u", "without-lck", rc); mdbx_assert(env, MDBX_IS_ERROR(rc)); return rc; } return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; } +#if defined(_POSIX_PRIORITY_SCHEDULING) && _POSIX_PRIORITY_SCHEDULING > 0 + sched_yield(); +#endif + +retry: + if (rc == MDBX_RESULT_TRUE) { + rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1); + if (rc != MDBX_SUCCESS) { + mdbx_error("%s, err %u", "unlock-before-retry", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + } -retry_exclusive: /* Firstly try to get exclusive locking. */ rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1); if (rc == MDBX_SUCCESS) { + rc = check_fstat(env); + if (MDBX_IS_ERROR(rc)) + return rc; + continue_dxb_exclusive: rc = - lck_op(env->me_fd, op_setlk, + lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); if (rc == MDBX_SUCCESS) return MDBX_RESULT_TRUE /* Done: return with exclusive locking. 
*/; + int err = check_fstat(env); + if (MDBX_IS_ERROR(err)) + return err; + /* the cause may be a collision with POSIX's file-lock recovery. */ if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s(%s) failed: errcode %u", __func__, "dxb-exclusive", rc); + mdbx_error("%s, err %u", "dxb-exclusive", rc); mdbx_assert(env, MDBX_IS_ERROR(rc)); return rc; } /* Fallback to lck-shared */ + } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || + rc == EWOULDBLOCK || rc == EDEADLK)) { + mdbx_error("%s, err %u", "try-exclusive", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; } - /* Here could be one of two:: + /* Here could be one of two: * - mdbx_lck_destroy() from the another process was hold the lock * during a destruction. * - either mdbx_lck_seize() from the another process was got the exclusive @@ -277,11 +361,19 @@ retry_exclusive: * competing process doesn't call lck_downgrade(). */ rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s(%s) failed: errcode %u", __func__, "try-shared", rc); + mdbx_error("%s, err %u", "try-shared", rc); mdbx_assert(env, MDBX_IS_ERROR(rc)); return rc; } + rc = check_fstat(env); + if (rc == MDBX_RESULT_TRUE) + goto retry; + if (rc != MDBX_SUCCESS) { + mdbx_error("%s, err %u", "lck_fstat", rc); + return rc; + } + /* got shared, retry exclusive */ rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1); if (rc == MDBX_SUCCESS) @@ -289,39 +381,17 @@ retry_exclusive: if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s(%s) failed: errcode %u", __func__, "try-exclusive", rc); + mdbx_error("%s, err %u", "try-exclusive", rc); mdbx_assert(env, MDBX_IS_ERROR(rc)); return rc; } - /* Checking file size for detect the situation when we got the shared lock - * immediately after mdbx_lck_destroy(). 
*/ - struct stat st; - if (fstat(env->me_lfd, &st)) { - rc = errno; - mdbx_error("%s(%s) failed: errcode %u", __func__, "check-filesize", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); - return rc; - } - if (st.st_size < (unsigned)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) { - mdbx_verbose("lck-file is too short (%u), retry exclusive-lock", - (unsigned)st.st_size); - rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1); - if (rc != MDBX_SUCCESS) { - mdbx_error("%s(%s) failed: errcode %u", __func__, "retry-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); - return rc; - } - goto retry_exclusive; - } - /* Lock against another process operating in without-lck or exclusive mode. */ rc = - lck_op(env->me_fd, op_setlk, + lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s(%s) failed: errcode %u", __func__, - "lock-against-without-lck", rc); + mdbx_error("%s, err %u", "lock-against-without-lck", rc); mdbx_assert(env, MDBX_IS_ERROR(rc)); return rc; } @@ -337,15 +407,15 @@ MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { int rc = MDBX_SUCCESS; if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { - rc = lck_op(env->me_fd, op_setlk, F_UNLCK, 0, env->me_pid); + rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid); if (rc == MDBX_SUCCESS) - rc = lck_op(env->me_fd, op_setlk, F_UNLCK, env->me_pid + 1, + rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, env->me_pid + 1, OFF_T_MAX - env->me_pid - 1); } if (rc == MDBX_SUCCESS) rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1); if (unlikely(rc != 0)) { - mdbx_error("%s(%s) failed: errcode %u", __func__, "lck", rc); + mdbx_error("%s, err %u", "lck", rc); assert(MDBX_IS_ERROR(rc)); } return rc; @@ -357,15 +427,18 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env, return MDBX_PANIC; int rc = MDBX_SUCCESS; + struct stat lck_info; if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && env->me_lck && /* try 
get exclusive access */ lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 && - lck_op(env->me_fd, op_setlk, + /* if LCK was not removed */ + fstat(env->me_lfd, &lck_info) == 0 && lck_info.st_nlink > 0 && + lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX) == 0) { - mdbx_verbose("%s: got exclusive, drown locks", __func__); + mdbx_verbose("%p got exclusive, drown locks", (void *)env); #if MDBX_LOCKING == MDBX_LOCKING_SYSV if (env->me_sysv_ipc.semid != -1) rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; @@ -392,14 +465,19 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env, * locks should be released here explicitly with properly order. */ /* close dxb and restore lock */ - if (env->me_fd != INVALID_HANDLE_VALUE) { - if (unlikely(close(env->me_fd) != 0) && rc == MDBX_SUCCESS) + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->me_dsync_fd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->me_dsync_fd = INVALID_HANDLE_VALUE; + } + if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->me_lazy_fd) != 0) && rc == MDBX_SUCCESS) rc = errno; - env->me_fd = INVALID_HANDLE_VALUE; + env->me_lazy_fd = INVALID_HANDLE_VALUE; if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { /* restore file-lock */ rc = lck_op( - inprocess_neighbor->me_fd, F_SETLKW, + inprocess_neighbor->me_lazy_fd, F_SETLKW, (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? 
0 @@ -439,7 +517,7 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env, int semid = -1; if (global_uniqueness_flag) { struct stat st; - if (fstat(env->me_fd, &st)) + if (fstat(env->me_lazy_fd, &st)) return errno; sysv_retry_create: semid = semget(env->me_sysv_ipc.key, 2, @@ -705,7 +783,7 @@ MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); mdbx_trace("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) - mdbx_panic("%s() failed: errcode %d\n", __func__, rc); + mdbx_panic("%s() failed: err %d\n", __func__, rc); mdbx_jitter4testing(true); } @@ -722,6 +800,6 @@ void mdbx_txn_unlock(MDBX_env *env) { int rc = mdbx_ipclock_unlock(env, env->me_wlock); mdbx_trace("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) - mdbx_panic("%s() failed: errcode %d\n", __func__, rc); + mdbx_panic("%s() failed: err %d\n", __func__, rc); mdbx_jitter4testing(true); } diff --git a/libs/libmdbx/src/src/elements/lck-windows.c b/libs/libmdbx/src/src/elements/lck-windows.c index 5d74bb8ccd..56d77f5ebf 100644 --- a/libs/libmdbx/src/src/elements/lck-windows.c +++ b/libs/libmdbx/src/src/elements/lck-windows.c @@ -151,7 +151,7 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } if ((env->me_flags & MDBX_EXCLUSIVE) || - flock(env->me_fd, + flock(env->me_lazy_fd, dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) : (LCK_EXCLUSIVE | LCK_WAITFOR), LCK_BODY)) @@ -162,11 +162,12 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } void mdbx_txn_unlock(MDBX_env *env) { - int rc = - (env->me_flags & MDBX_EXCLUSIVE) ? TRUE : funlock(env->me_fd, LCK_BODY); + int rc = (env->me_flags & MDBX_EXCLUSIVE) + ? 
TRUE + : funlock(env->me_lazy_fd, LCK_BODY); LeaveCriticalSection(&env->me_windowsbug_lock); if (!rc) - mdbx_panic("%s failed: errcode %u", __func__, GetLastError()); + mdbx_panic("%s failed: err %u", __func__, GetLastError()); } /*----------------------------------------------------------------------------*/ @@ -200,7 +201,7 @@ MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { /* transite from S-E (locked) to S-? (used), e.g. unlock upper-part */ if ((env->me_flags & MDBX_EXCLUSIVE) == 0 && !funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s failed: errcode %u", __func__, GetLastError()); + mdbx_panic("%s failed: err %u", __func__, GetLastError()); } mdbx_srwlock_ReleaseShared(&env->me_remap_guard); } @@ -385,24 +386,24 @@ static void lck_unlock(MDBX_env *env) { SetLastError(ERROR_SUCCESS); } - if (env->me_fd != INVALID_HANDLE_VALUE) { + if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { /* explicitly unlock to avoid latency for other processes (windows kernel * releases such locks via deferred queues) */ - while (funlock(env->me_fd, LCK_BODY)) + while (funlock(env->me_lazy_fd, LCK_BODY)) ; rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); - while (funlock(env->me_fd, LCK_META)) + while (funlock(env->me_lazy_fd, LCK_META)) ; rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); - while (funlock(env->me_fd, LCK_WHOLE)) + while (funlock(env->me_lazy_fd, LCK_WHOLE)) ; rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); @@ -446,8 +447,7 @@ static int internal_seize_lck(HANDLE lfd) { mdbx_jitter4testing(false); if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { rc = GetLastError() /* 2) something went wrong, give up */; - mdbx_error("%s(%s) failed: errcode %u", __func__, - "?-?(free) >> ?-E(middle)", rc); + mdbx_error("%s, err %u", "?-?(free) >> ?-E(middle)", rc); return rc; } @@ -462,8 +462,8 @@ static int internal_seize_lck(HANDLE lfd) { if (rc != ERROR_SHARING_VIOLATION && rc 
!= ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ if (!funlock(lfd, LCK_UPPER)) - mdbx_panic("%s(%s) failed: errcode %u", __func__, - "?-E(middle) >> ?-?(free)", GetLastError()); + mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)", + GetLastError()); return rc; } @@ -474,13 +474,12 @@ static int internal_seize_lck(HANDLE lfd) { mdbx_jitter4testing(false); if (rc != MDBX_RESULT_FALSE) - mdbx_error("%s(%s) failed: errcode %u", __func__, - "?-E(middle) >> S-E(locked)", rc); + mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc); /* 8) now on S-E (locked) or still on ?-E (middle), * transite to S-? (used) or ?-? (free) */ if (!funlock(lfd, LCK_UPPER)) - mdbx_panic("%s(%s) failed: errcode %u", __func__, + mdbx_panic("%s(%s) failed: err %u", __func__, "X-E(locked/middle) >> X-?(used/free)", GetLastError()); /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */ @@ -490,7 +489,7 @@ static int internal_seize_lck(HANDLE lfd) { MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { int rc; - assert(env->me_fd != INVALID_HANDLE_VALUE); + assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_RESULT_TRUE /* nope since files were must be opened non-shareable */ @@ -499,9 +498,9 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. 
on read-only filesystem) */ mdbx_jitter4testing(false); - if (!flock(env->me_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { + if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { rc = GetLastError(); - mdbx_error("%s(%s) failed: errcode %u", __func__, "without-lck", rc); + mdbx_error("%s, err %u", "without-lck", rc); return rc; } return MDBX_RESULT_FALSE; @@ -516,16 +515,15 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { * - we need an exclusive lock for do so; * - we can't lock meta-pages, otherwise other process could get an error * while opening db in valid (non-conflict) mode. */ - if (!flock(env->me_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { + if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { rc = GetLastError(); - mdbx_error("%s(%s) failed: errcode %u", __func__, - "lock-against-without-lck", rc); + mdbx_error("%s, err %u", "lock-against-without-lck", rc); mdbx_jitter4testing(false); lck_unlock(env); } else { mdbx_jitter4testing(false); - if (!funlock(env->me_fd, LCK_BODY)) - mdbx_panic("%s(%s) failed: errcode %u", __func__, + if (!funlock(env->me_lazy_fd, LCK_BODY)) + mdbx_panic("%s(%s) failed: err %u", __func__, "unlock-against-without-lck", GetLastError()); } } @@ -535,7 +533,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { /* Transite from exclusive state (E-?) to used (S-?) */ - assert(env->me_fd != INVALID_HANDLE_VALUE); + assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); #if 1 @@ -547,7 +545,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { if (env->me_flags & MDBX_EXCLUSIVE) { /* transite from E-E to E_? (exclusive-read) */ if (!funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s(%s) failed: errcode %u", __func__, + mdbx_panic("%s(%s) failed: err %u", __func__, "E-E(exclusive-write) >> E-?(exclusive-read)", GetLastError()); return MDBX_SUCCESS /* 2) now at E-? 
(exclusive-read), done */; } @@ -555,21 +553,20 @@ MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { /* 3) now at E-E (exclusive-write), transite to ?_E (middle) */ if (!funlock(env->me_lfd, LCK_LOWER)) - mdbx_panic("%s(%s) failed: errcode %u", __func__, + mdbx_panic("%s(%s) failed: err %u", __func__, "E-E(exclusive-write) >> ?-E(middle)", GetLastError()); /* 4) now at ?-E (middle), transite to S-E (locked) */ if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { int rc = GetLastError() /* 5) something went wrong, give up */; - mdbx_error("%s(%s) failed: errcode %u", __func__, - "?-E(middle) >> S-E(locked)", rc); + mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc); return rc; } /* 6) got S-E (locked), continue transition to S-? (used) */ if (!funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s(%s) failed: errcode %u", __func__, - "S-E(locked) >> S-?(used)", GetLastError()); + mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)", + GetLastError()); return MDBX_SUCCESS /* 7) now at S-? (used), done */; } @@ -613,6 +610,10 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { case WAIT_OBJECT_0: /* process just exited */ return MDBX_RESULT_FALSE; + case ERROR_ACCESS_DENIED: + /* The ERROR_ACCESS_DENIED would be returned for CSRSS-processes, etc. 
+ * assume pid exists */ + return MDBX_RESULT_TRUE; case WAIT_TIMEOUT: /* pid running */ return MDBX_RESULT_TRUE; diff --git a/libs/libmdbx/src/src/elements/osal.c b/libs/libmdbx/src/src/elements/osal.c index ae99fa05bb..118e57a191 100644 --- a/libs/libmdbx/src/src/elements/osal.c +++ b/libs/libmdbx/src/src/elements/osal.c @@ -371,11 +371,11 @@ MDBX_INTERNAL_FUNC int mdbx_condmutex_init(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) int rc = MDBX_SUCCESS; condmutex->event = NULL; - condmutex->mutex = CreateMutex(NULL, FALSE, NULL); + condmutex->mutex = CreateMutexW(NULL, FALSE, NULL); if (!condmutex->mutex) return GetLastError(); - condmutex->event = CreateEvent(NULL, FALSE, FALSE, NULL); + condmutex->event = CreateEventW(NULL, TRUE, FALSE, NULL); if (!condmutex->event) { rc = GetLastError(); (void)CloseHandle(condmutex->mutex); @@ -459,8 +459,11 @@ MDBX_INTERNAL_FUNC int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) DWORD code = SignalObjectAndWait(condmutex->mutex, condmutex->event, INFINITE, FALSE); - if (code == WAIT_OBJECT_0) + if (code == WAIT_OBJECT_0) { code = WaitForSingleObject(condmutex->mutex, INFINITE); + if (code == WAIT_OBJECT_0) + return ResetEvent(condmutex->event) ? MDBX_SUCCESS : GetLastError(); + } return waitstatus2errcode(code); #else return pthread_cond_wait(&condmutex->cond, &condmutex->mutex); @@ -509,108 +512,157 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { #if defined(_WIN32) || defined(_WIN64) - return DeleteFileA(pathname) ? MDBX_SUCCESS : GetLastError(); + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + return DeleteFileW(pathnameW) ? 
MDBX_SUCCESS : GetLastError(); #else return unlink(pathname) ? errno : MDBX_SUCCESS; #endif } -MDBX_INTERNAL_FUNC int mdbx_openfile(const char *pathname, int flags, - mode_t mode, mdbx_filehandle_t *fd, - bool exclusive) { +MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, + const MDBX_env *env, const char *pathname, + mdbx_filehandle_t *fd, + mode_t unix_mode_bits) { *fd = INVALID_HANDLE_VALUE; + #if defined(_WIN32) || defined(_WIN64) - (void)mode; - size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) return ERROR_INVALID_NAME; wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) return ERROR_INVALID_NAME; - DWORD DesiredAccess, ShareMode; - DWORD FlagsAndAttributes = FILE_ATTRIBUTE_NORMAL; - switch (flags & (O_RDONLY | O_WRONLY | O_RDWR)) { + DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING; + DWORD FlagsAndAttributes = + FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; + DWORD DesiredAccess = FILE_READ_ATTRIBUTES; + DWORD ShareMode = (env->me_flags & MDBX_EXCLUSIVE) + ? 0 + : (FILE_SHARE_READ | FILE_SHARE_WRITE); + + switch (purpose) { default: return ERROR_INVALID_PARAMETER; - case O_RDONLY: - DesiredAccess = GENERIC_READ; - ShareMode = - exclusive ? 
FILE_SHARE_READ : (FILE_SHARE_READ | FILE_SHARE_WRITE); + case MDBX_OPEN_LCK: + CreationDisposition = OPEN_ALWAYS; + DesiredAccess |= GENERIC_READ | GENERIC_WRITE; + FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY; break; - case O_WRONLY: /* assume for MDBX_env_copy() and friends output */ - DesiredAccess = GENERIC_WRITE; - ShareMode = 0; - FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; + case MDBX_OPEN_DXB_READ: + CreationDisposition = OPEN_EXISTING; + DesiredAccess |= GENERIC_READ; + ShareMode |= FILE_SHARE_READ; break; - case O_RDWR: - DesiredAccess = GENERIC_READ | GENERIC_WRITE; - ShareMode = exclusive ? 0 : (FILE_SHARE_READ | FILE_SHARE_WRITE); + case MDBX_OPEN_DXB_LAZY: + DesiredAccess |= GENERIC_READ | GENERIC_WRITE; break; - } - - DWORD CreationDisposition; - switch (flags & (O_EXCL | O_CREAT)) { - default: - return ERROR_INVALID_PARAMETER; - case 0: + case MDBX_OPEN_DXB_DSYNC: CreationDisposition = OPEN_EXISTING; + DesiredAccess |= GENERIC_WRITE; + FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH; break; - case O_EXCL | O_CREAT: + case MDBX_OPEN_COPY: CreationDisposition = CREATE_NEW; - FlagsAndAttributes |= FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; - break; - case O_CREAT: - CreationDisposition = OPEN_ALWAYS; - FlagsAndAttributes |= FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; + ShareMode = 0; + DesiredAccess |= GENERIC_WRITE; + FlagsAndAttributes |= + (env->me_psize < env->me_os_psize) ? 
0 : FILE_FLAG_NO_BUFFERING; break; } *fd = CreateFileW(pathnameW, DesiredAccess, ShareMode, NULL, CreationDisposition, FlagsAndAttributes, NULL); - if (*fd == INVALID_HANDLE_VALUE) return GetLastError(); - if ((flags & O_CREAT) && GetLastError() != ERROR_ALREADY_EXISTS) { - /* set FILE_ATTRIBUTE_NOT_CONTENT_INDEXED for new file */ - DWORD FileAttributes = GetFileAttributesA(pathname); - if (FileAttributes == INVALID_FILE_ATTRIBUTES || - !SetFileAttributesA(pathname, FileAttributes | - FILE_ATTRIBUTE_NOT_CONTENT_INDEXED)) { - int rc = GetLastError(); - CloseHandle(*fd); - *fd = INVALID_HANDLE_VALUE; - return rc; - } + + BY_HANDLE_FILE_INFORMATION info; + if (!GetFileInformationByHandle(*fd, &info)) { + int err = GetLastError(); + CloseHandle(*fd); + *fd = INVALID_HANDLE_VALUE; + return err; } + const DWORD AttributesDiff = + (info.dwFileAttributes ^ FlagsAndAttributes) & + (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED | + FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED); + if (AttributesDiff) + (void)SetFileAttributesW(pathnameW, info.dwFileAttributes ^ AttributesDiff); + #else - (void)exclusive; + int flags = unix_mode_bits ? 
O_CREAT : 0; + switch (purpose) { + default: + return EINVAL; + case MDBX_OPEN_LCK: + flags |= O_RDWR; + break; + case MDBX_OPEN_DXB_READ: + flags = O_RDONLY; + break; + case MDBX_OPEN_DXB_LAZY: + flags |= O_RDWR; + break; + case MDBX_OPEN_COPY: + flags = O_CREAT | O_WRONLY | O_EXCL; + break; + case MDBX_OPEN_DXB_DSYNC: + flags |= O_WRONLY; +#if defined(O_DSYNC) + flags |= O_DSYNC; +#elif defined(O_SYNC) + flags |= O_SYNC; +#elif defined(O_FSYNC) + flags |= O_FSYNC; +#endif + break; + } + + const bool direct_nocache_for_copy = + env->me_psize >= env->me_os_psize && purpose == MDBX_OPEN_COPY; + if (direct_nocache_for_copy) { +#if defined(O_DIRECT) + flags |= O_DIRECT; +#endif /* O_DIRECT */ +#if defined(O_NOCACHE) + flags |= O_NOCACHE; +#endif /* O_NOCACHE */ + } + #ifdef O_CLOEXEC flags |= O_CLOEXEC; #endif /* O_CLOEXEC */ - *fd = open(pathname, flags, mode); + + *fd = open(pathname, flags, unix_mode_bits); +#if defined(O_DIRECT) + if (*fd < 0 && (flags & O_DIRECT) && + (errno == EINVAL || errno == EAFNOSUPPORT)) { + flags &= ~(O_DIRECT | O_EXCL); + *fd = open(pathname, flags, unix_mode_bits); + } +#endif /* O_DIRECT */ if (*fd < 0) return errno; #if defined(FD_CLOEXEC) && !defined(O_CLOEXEC) - int fd_flags = fcntl(*fd, F_GETFD); + const int fd_flags = fcntl(*fd, F_GETFD); if (fd_flags != -1) (void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC); #endif /* FD_CLOEXEC && !O_CLOEXEC */ - if ((flags & (O_RDONLY | O_WRONLY | O_RDWR)) == O_WRONLY) { - /* assume for MDBX_env_copy() and friends output */ -#if defined(O_DIRECT) - int fd_flags = fcntl(*fd, F_GETFD); - if (fd_flags != -1) - (void)fcntl(*fd, F_SETFL, fd_flags | O_DIRECT); -#endif /* O_DIRECT */ -#if defined(F_NOCACHE) + if (direct_nocache_for_copy) { +#if defined(F_NOCACHE) && !defined(O_NOCACHE) (void)fcntl(*fd, F_NOCACHE, 1); #endif /* F_NOCACHE */ } -#endif +#endif return MDBX_SUCCESS; } @@ -914,11 +966,6 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, #endif /* Linux */ const int mode = 
async ? MS_ASYNC : MS_SYNC; int rc = (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno; -#if defined(__APPLE__) && \ - MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY - if (rc == MDBX_SUCCESS && mode == MS_SYNC) - rc = likely(fcntl(map->fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno; -#endif /* MacOS */ return rc; #endif } @@ -1334,10 +1381,14 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, #endif + VALGRIND_MAKE_MEM_DEFINED(map->address, map->current); + ASAN_UNPOISON_MEMORY_REGION(map->address, map->current); return MDBX_SUCCESS; } MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { + VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + ASAN_POISON_MEMORY_REGION(map->address, map->current); #if defined(_WIN32) || defined(_WIN64) if (map->section) NtClose(map->section); @@ -1369,20 +1420,22 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, /* growth rw-section */ SectionSize.QuadPart = size; status = NtExtendSection(map->section, &SectionSize); - if (NT_SUCCESS(status)) { - map->current = size; - if (map->filesize < size) - map->filesize = size; - } - return ntstatus2errcode(status); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + map->current = size; + if (map->filesize < size) + map->filesize = size; + return MDBX_SUCCESS; } if (limit > map->limit) { - /* check ability of address space for growth before umnap */ + /* check ability of address space for growth before unmap */ PVOID BaseAddress = (PBYTE)map->address + map->limit; SIZE_T RegionSize = limit - map->limit; status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0, &RegionSize, MEM_RESERVE, PAGE_NOACCESS); + if (status == /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) + return MDBX_RESULT_TRUE; if (!NT_SUCCESS(status)) return ntstatus2errcode(status); @@ -1411,9 +1464,13 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, bailout: map->address = NULL; map->current 
= map->limit = 0; - if (ReservedAddress) - (void)NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, - &ReservedSize, MEM_RELEASE); + if (ReservedAddress) { + ReservedSize = 0; + status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, + &ReservedSize, MEM_RELEASE); + assert(NT_SUCCESS(status)); + (void)status; + } return err; } @@ -1464,6 +1521,7 @@ retry_file_and_section: if (ReservedAddress) { /* release reserved address space */ + ReservedSize = 0; status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, &ReservedSize, MEM_RELEASE); ReservedAddress = NULL; @@ -1528,15 +1586,17 @@ retry_mapview:; } if (limit != map->limit) { -#if defined(_GNU_SOURCE) && (defined(__linux__) || defined(__gnu_linux__)) - void *ptr = mremap(map->address, map->limit, limit, - /* LY: in case changing the mapping size calling code - must guarantees the absence of competing threads, - and a willingness to another base address */ - MREMAP_MAYMOVE); +#if defined(MREMAP_MAYMOVE) + void *ptr = mremap(map->address, map->limit, limit, 0); if (ptr == MAP_FAILED) { rc = errno; - return (rc == EAGAIN || rc == ENOMEM) ? MDBX_RESULT_TRUE : rc; + switch (rc) { + case EAGAIN: + case ENOMEM: + case EFAULT /* MADV_DODUMP / MADV_DONTDUMP are mixed for mmap-range */: + rc = MDBX_RESULT_TRUE; + } + return rc; } map->address = ptr; map->limit = limit; @@ -1544,15 +1604,17 @@ retry_mapview:; #ifdef MADV_DONTFORK if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) return errno; -#endif +#endif /* MADV_DONTFORK */ #ifdef MADV_NOHUGEPAGE (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); -#endif +#endif /* MADV_NOHUGEPAGE */ -#else +#else /* MREMAP_MAYMOVE */ + /* TODO: Perhaps here it is worth to implement suspend/resume threads + * and perform unmap/map as like for Windows. 
*/ rc = MDBX_RESULT_TRUE; -#endif /* _GNU_SOURCE && __linux__ */ +#endif /* !MREMAP_MAYMOVE */ } #endif return rc; diff --git a/libs/libmdbx/src/src/elements/osal.h b/libs/libmdbx/src/src/elements/osal.h index 815aae185f..c590ee51c9 100644 --- a/libs/libmdbx/src/src/elements/osal.h +++ b/libs/libmdbx/src/src/elements/osal.h @@ -566,13 +566,23 @@ enum mdbx_syncmode_bits { }; MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd, - enum mdbx_syncmode_bits mode_bits); + const enum mdbx_syncmode_bits mode_bits); MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); -MDBX_INTERNAL_FUNC int mdbx_openfile(const char *pathname, int flags, - mode_t mode, mdbx_filehandle_t *fd, - bool exclusive); + +enum mdbx_openfile_purpose { + MDBX_OPEN_DXB_READ = 0, + MDBX_OPEN_DXB_LAZY = 1, + MDBX_OPEN_DXB_DSYNC = 2, + MDBX_OPEN_LCK = 3, + MDBX_OPEN_COPY = 4 +}; + +MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, + const MDBX_env *env, const char *pathname, + mdbx_filehandle_t *fd, + mode_t unix_mode_bits); MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); diff --git a/libs/libmdbx/src/src/elements/version.c b/libs/libmdbx/src/src/elements/version.c index 741fe7b541..58234230c4 100644 --- a/libs/libmdbx/src/src/elements/version.c +++ b/libs/libmdbx/src/src/elements/version.c @@ -4,7 +4,7 @@ #include "internals.h" #if MDBX_VERSION_MAJOR != 0 || \ - MDBX_VERSION_MINOR != 4 + MDBX_VERSION_MINOR != 5 #error "API version mismatch! Had `git fetch --tags` done?" 
#endif @@ -12,35 +12,35 @@ static const char sourcery[] = STRINGIFY(MDBX_BUILD_SOURCERY); __dll_export #ifdef __attribute_used__ -__attribute_used__ + __attribute_used__ #elif defined(__GNUC__) || __has_attribute(__used__) -__attribute__((__used__)) + __attribute__((__used__)) #endif #ifdef __attribute_externally_visible__ -__attribute_externally_visible__ + __attribute_externally_visible__ #elif (defined(__GNUC__) && !defined(__clang__)) || \ __has_attribute(__externally_visible__) -__attribute__((__externally_visible__)) + __attribute__((__externally_visible__)) #endif -const mdbx_version_info mdbx_version = { - 0, - 4, - 0, - 1692, - {"2019-12-05T01:43:57+03:00", "456d64170249e6108e90da02648de8c7707e2ed0", "3dccbb25a7710d4eff5fb136182b7bb0abfc4914", - "v0.4.0-12-g3dccbb2"}, - sourcery }; + const mdbx_version_info mdbx_version = { + 0, + 5, + 0, + 1879, + {"2020-01-07T22:30:21+03:00", "f982437718fa106bde7985a976752c08e95599a8", "3ffcc1722de83ce4cd5a37772d3b41e8aea02042", + "v0.5.0-10-g3ffcc17"}, + sourcery}; __dll_export #ifdef __attribute_used__ -__attribute_used__ + __attribute_used__ #elif defined(__GNUC__) || __has_attribute(__used__) -__attribute__((__used__)) + __attribute__((__used__)) #endif #ifdef __attribute_externally_visible__ -__attribute_externally_visible__ + __attribute_externally_visible__ #elif (defined(__GNUC__) && !defined(__clang__)) || \ __has_attribute(__externally_visible__) -__attribute__((__externally_visible__)) + __attribute__((__externally_visible__)) #endif -const char *const mdbx_sourcery_anchor = sourcery; + const char *const mdbx_sourcery_anchor = sourcery; diff --git a/libs/libmdbx/src/src/tools/mdbx_chk.c b/libs/libmdbx/src/src/tools/mdbx_chk.c index ce1a4a0efd..61fb74c7e5 100644 --- a/libs/libmdbx/src/src/tools/mdbx_chk.c +++ b/libs/libmdbx/src/src/tools/mdbx_chk.c @@ -93,7 +93,7 @@ size_t userdb_count, skipped_subdb; uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages, unused_pages, backed_pages; unsigned 
verbose; -bool ignore_wrong_order, quiet; +bool ignore_wrong_order, quiet, dont_traversal; const char *only_subdb; struct problem { @@ -626,22 +626,23 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, error("too many DBIs or out of memory\n"); return MDBX_ENOMEM; } - const uint64_t subtotal_pages = - ms.ms_branch_pages + ms.ms_leaf_pages + ms.ms_overflow_pages; - if (subtotal_pages != dbi->pages.total) - error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "subtotal", - subtotal_pages, dbi->pages.total); - if (ms.ms_branch_pages != dbi->pages.branch) - error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "branch", - ms.ms_branch_pages, dbi->pages.branch); - const uint64_t allleaf_pages = dbi->pages.leaf + dbi->pages.leaf_dupfixed; - if (ms.ms_leaf_pages != allleaf_pages) - error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "all-leaf", - ms.ms_leaf_pages, allleaf_pages); - if (ms.ms_overflow_pages != dbi->pages.large_volume) - error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", - "large/overlow", ms.ms_overflow_pages, dbi->pages.large_volume); - + if (!dont_traversal) { + const uint64_t subtotal_pages = + ms.ms_branch_pages + ms.ms_leaf_pages + ms.ms_overflow_pages; + if (subtotal_pages != dbi->pages.total) + error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", + "subtotal", subtotal_pages, dbi->pages.total); + if (ms.ms_branch_pages != dbi->pages.branch) + error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "branch", + ms.ms_branch_pages, dbi->pages.branch); + const uint64_t allleaf_pages = dbi->pages.leaf + dbi->pages.leaf_dupfixed; + if (ms.ms_leaf_pages != allleaf_pages) + error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", + "all-leaf", ms.ms_leaf_pages, allleaf_pages); + if (ms.ms_overflow_pages != dbi->pages.large_volume) + error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", + "large/overlow", ms.ms_overflow_pages, 
dbi->pages.large_volume); + } rc = mdbx_cursor_open(txn, dbi_handle, &mc); if (rc) { error("mdbx_cursor_open failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -758,18 +759,19 @@ bailout: } static void usage(char *prog) { - fprintf(stderr, - "usage: %s [-V] [-v] [-n] [-q] [-c] [-w] [-d] [-i] [-s subdb] dbpath\n" - " -V\t\tprint version and exit\n" - " -v\t\tmore verbose, could be used multiple times\n" - " -n\t\tNOSUBDIR mode for open\n" - " -q\t\tbe quiet\n" - " -c\t\tforce cooperative mode (don't try exclusive)\n" - " -w\t\tlock DB for writing while checking\n" - " -d\t\tdisable page-by-page traversal of B-tree\n" - " -i\t\tignore wrong order errors (for custom comparators case)\n" - " -s subdb\tprocess a specific subdatabase only\n", - prog); + fprintf( + stderr, + "usage: %s [-V] [-v] [-n] [-q] [-c] [-w] [-d] [-i] [-s subdb] dbpath\n" + " -V\t\tprint version and exit\n" + " -v\t\tmore verbose, could be used multiple times\n" + " -n\t\tNOSUBDIR mode for open\n" + " -q\t\tbe quiet\n" + " -c\t\tforce cooperative mode (don't try exclusive)\n" + " -w\t\tlock DB for writing while checking\n" + " -d\t\tdisable page-by-page traversal of B-tree\n" + " -i\t\tignore wrong order errors (for custom comparators case)\n" + " -s subdb\tprocess a specific subdatabase only\n", + prog); exit(EXIT_INTERRUPTED); } @@ -912,7 +914,6 @@ int main(int argc, char *argv[]) { char *prog = argv[0]; char *envname; int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; - bool dont_traversal = false; bool locked = false; double elapsed; @@ -1171,6 +1172,14 @@ int main(int argc, char *argv[]) { } printf(", %" PRIu64 " pages\n", envinfo.mi_geo.current / envinfo.mi_dxb_pagesize); +#if defined(_WIN32) || defined(_WIN64) + if (envinfo.mi_geo.shrink && envinfo.mi_geo.current != envinfo.mi_geo.upper) + print(" WARNING: Due Windows system limitations a " + "file couldn't\n be truncated while database " + "is opened. 
So, the size of\n database file " + "may by large than the database itself,\n " + "until it will be closed or reopened in read-write mode.\n"); +#endif print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 ", lag %" PRIi64 "\n", envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid, diff --git a/libs/libmdbx/src/src/tools/mdbx_stat.c b/libs/libmdbx/src/src/tools/mdbx_stat.c index bd70eb7006..461f569afb 100644 --- a/libs/libmdbx/src/src/tools/mdbx_stat.c +++ b/libs/libmdbx/src/src/tools/mdbx_stat.c @@ -72,7 +72,7 @@ static int reader_list_func(void *ctx, int num, int slot, mdbx_pid_t pid, size_t bytes_used, size_t bytes_retained) { (void)ctx; if (num == 1) - printf("Reader Table Status\n" + printf("Reader Table\n" " #\tslot\t%6s %*s %20s %10s %13s %13s\n", "pid", (int)sizeof(size_t) * 2, "thread", "txnid", "lag", "used", "retained"); @@ -212,21 +212,28 @@ int main(int argc, char *argv[]) { mei.mi_geo.shrink, mei.mi_geo.lower / mst.ms_psize, mei.mi_geo.upper / mst.ms_psize, mei.mi_geo.grow / mst.ms_psize, mei.mi_geo.shrink / mst.ms_psize); + printf(" Current mapsize: %" PRIu64 " bytes, %" PRIu64 " pages \n", + mei.mi_mapsize, mei.mi_mapsize / mst.ms_psize); printf(" Current datafile: %" PRIu64 " bytes, %" PRIu64 " pages\n", mei.mi_geo.current, mei.mi_geo.current / mst.ms_psize); +#if defined(_WIN32) || defined(_WIN64) + if (mei.mi_geo.shrink && mei.mi_geo.current != mei.mi_geo.upper) + printf(" WARNING: Due Windows system limitations a " + "file couldn't\n be truncated while database " + "is opened. 
So, the size of\n database file " + "may by large than the database itself,\n " + "until it will be closed or reopened in read-write mode.\n"); +#endif } else { printf(" Fixed datafile: %" PRIu64 " bytes, %" PRIu64 " pages\n", mei.mi_geo.current, mei.mi_geo.current / mst.ms_psize); } - printf(" Current mapsize: %" PRIu64 " bytes, %" PRIu64 " pages \n", - mei.mi_mapsize, mei.mi_mapsize / mst.ms_psize); - printf(" Number of pages used: %" PRIu64 "\n", mei.mi_last_pgno + 1); printf(" Last transaction ID: %" PRIu64 "\n", mei.mi_recent_txnid); - printf(" Tail transaction ID: %" PRIu64 " (%" PRIi64 ")\n", + printf(" Latter reader transaction ID: %" PRIu64 " (%" PRIi64 ")\n", mei.mi_latter_reader_txnid, mei.mi_latter_reader_txnid - mei.mi_recent_txnid); printf(" Max readers: %u\n", mei.mi_maxreaders); - printf(" Number of readers used: %u\n", mei.mi_numreaders); + printf(" Number of reader slots uses: %u\n", mei.mi_numreaders); } else { /* LY: zap warnings from gcc */ memset(&mst, 0, sizeof(mst)); @@ -262,7 +269,7 @@ int main(int argc, char *argv[]) { pgno_t pages = 0, *iptr; pgno_t reclaimable = 0; - printf("Freelist Status\n"); + printf("Garbage Collection\n"); dbi = 0; rc = mdbx_cursor_open(txn, dbi, &cursor); if (rc) { @@ -343,20 +350,23 @@ int main(int argc, char *argv[]) { if (envinfo) { uint64_t value = mei.mi_mapsize / mst.ms_psize; double percent = value / 100.0; - printf("Page Allocation Info\n"); - printf(" Max pages: %" PRIu64 " 100%%\n", value); + printf("Page Usage\n"); + printf(" Total: %" PRIu64 " 100%%\n", value); + + value = mei.mi_geo.current / mst.ms_psize; + printf(" Backed: %" PRIu64 " %.1f%%\n", value, value / percent); value = mei.mi_last_pgno + 1; - printf(" Pages used: %" PRIu64 " %.1f%%\n", value, value / percent); + printf(" Allocated: %" PRIu64 " %.1f%%\n", value, value / percent); value = mei.mi_mapsize / mst.ms_psize - (mei.mi_last_pgno + 1); printf(" Remained: %" PRIu64 " %.1f%%\n", value, value / percent); value = mei.mi_last_pgno + 1 - 
pages; - printf(" Used now: %" PRIu64 " %.1f%%\n", value, value / percent); + printf(" Used: %" PRIu64 " %.1f%%\n", value, value / percent); value = pages; - printf(" Unallocated: %" PRIu64 " %.1f%%\n", value, value / percent); + printf(" GC: %" PRIu64 " %.1f%%\n", value, value / percent); value = pages - reclaimable; printf(" Detained: %" PRIu64 " %.1f%%\n", value, value / percent); @@ -368,7 +378,7 @@ int main(int argc, char *argv[]) { mei.mi_mapsize / mst.ms_psize - (mei.mi_last_pgno + 1) + reclaimable; printf(" Available: %" PRIu64 " %.1f%%\n", value, value / percent); } else - printf(" Free pages: %" PRIaPGNO "\n", pages); + printf(" GC: %" PRIaPGNO " pages\n", pages); } rc = mdbx_dbi_open(txn, subname, 0, &dbi); diff --git a/libs/libmdbx/src/test/CMakeLists.txt b/libs/libmdbx/src/test/CMakeLists.txt index bb4abd5d6d..0cc22182c9 100644 --- a/libs/libmdbx/src/test/CMakeLists.txt +++ b/libs/libmdbx/src/test/CMakeLists.txt @@ -1,10 +1,10 @@ -set(TARGET mdbx_test) -project(${TARGET}) +if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(TEST_OSAL windows) +else() + set(TEST_OSAL unix) +endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-declarations") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-cast-qual") - -add_executable(${TARGET} +add_executable(mdbx_test base.h cases.cc chrono.cc @@ -21,7 +21,7 @@ add_executable(${TARGET} log.h main.cc osal.h - osal-unix.cc + osal-${TEST_OSAL}.cc test.cc test.h try.cc @@ -29,9 +29,25 @@ add_executable(${TARGET} utils.h append.cc ttl.cc + nested.cc ) -target_link_libraries(${TARGET} - mdbx - ) +set_target_properties(mdbx_test PROPERTIES + INTERPROCEDURAL_OPTIMIZATION $<BOOL:${INTERPROCEDURAL_OPTIMIZATION}> + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON) + +if(CC_HAS_FASTMATH) + target_compile_options(mdbx_test PRIVATE "-ffast-math") +endif() +if(CC_HAS_VISIBILITY AND (LTO_ENABLED OR INTERPROCEDURAL_OPTIMIZATION)) + set_target_properties(mdbx_test PROPERTIES LINK_FLAGS "-fvisibility=hidden") +endif() + 
+target_link_libraries(mdbx_test mdbx ${LIB_MATH} ${CMAKE_THREAD_LIBS_INIT}) +if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + target_link_libraries(mdbx_test winmm.lib) +endif() +if(UNIX AND NOT SUBPROJECT) + add_subdirectory(pcrf) +endif() diff --git a/libs/libmdbx/src/test/append.cc b/libs/libmdbx/src/test/append.cc index 273f68b810..a30351dcde 100644 --- a/libs/libmdbx/src/test/append.cc +++ b/libs/libmdbx/src/test/append.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -15,7 +15,6 @@ #include "test.h" bool testcase_append::run() { - MDBX_dbi dbi; int err = db_open__begin__table_create_open_clean(dbi); if (unlikely(err != MDBX_SUCCESS)) { log_notice("append: bailout-prepare due '%s'", mdbx_strerror(err)); @@ -54,7 +53,7 @@ bool testcase_append::run() { } log_trace("append: append-a %" PRIu64, serial); - generate_pair(serial, key, data); + generate_pair(serial); int cmp = inserted_number ? 
mdbx_cmp(txn_guard.get(), dbi, &key->value, &last_key->value) : 1; diff --git a/libs/libmdbx/src/test/base.h b/libs/libmdbx/src/test/base.h index 5ca134a81d..05dfd1c4e3 100644 --- a/libs/libmdbx/src/test/base.h +++ b/libs/libmdbx/src/test/base.h @@ -80,14 +80,19 @@ #include <unordered_set> #include <vector> +#define MDBX_INTERNAL_FUNC +#define MDBX_INTERNAL_VAR extern +#define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ #include "../mdbx.h" -#include "../src/defs.h" -#include "../src/osal.h" +#include "../src/elements/defs.h" +#include "../src/elements/osal.h" #if !defined(__thread) && (defined(_MSC_VER) || defined(__DMC__)) #define __thread __declspec(thread) #endif /* __thread */ +#include "../src/elements/options.h" + #ifdef _MSC_VER #pragma warning(pop) #pragma warning(disable : 4201) /* nonstandard extension used : \ diff --git a/libs/libmdbx/src/test/cases.cc b/libs/libmdbx/src/test/cases.cc index a98834a457..b01169004b 100644 --- a/libs/libmdbx/src/test/cases.cc +++ b/libs/libmdbx/src/test/cases.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -17,7 +17,6 @@ void configure_actor(unsigned &last_space_id, const actor_testcase testcase, const char *space_id_cstr, const actor_params ¶ms) { unsigned wait4id = 0; - if (params.waitfor_nops) { for (auto i = global::actors.rbegin(); i != global::actors.rend(); ++i) { if (i->is_waitable(params.waitfor_nops)) { @@ -61,15 +60,15 @@ void testcase_setup(const char *casename, actor_params ¶ms, unsigned &last_space_id) { if (strcmp(casename, "basic") == 0) { log_notice(">>> testcase_setup(%s)", casename); - configure_actor(last_space_id, ac_jitter, nullptr, params); - configure_actor(last_space_id, ac_hill, nullptr, params); - configure_actor(last_space_id, ac_ttl, nullptr, params); - configure_actor(last_space_id, ac_jitter, nullptr, params); + configure_actor(last_space_id, ac_nested, nullptr, params); configure_actor(last_space_id, ac_hill, nullptr, params); configure_actor(last_space_id, ac_ttl, nullptr, params); - configure_actor(last_space_id, ac_try, nullptr, params); configure_actor(last_space_id, ac_copy, nullptr, params); configure_actor(last_space_id, ac_append, nullptr, params); + configure_actor(last_space_id, ac_jitter, nullptr, params); + configure_actor(last_space_id, ac_try, nullptr, params); + configure_actor(last_space_id, ac_jitter, nullptr, params); + configure_actor(last_space_id, ac_try, nullptr, params); log_notice("<<< testcase_setup(%s): done", casename); } else { failure("unknown testcase `%s`", casename); diff --git a/libs/libmdbx/src/test/chrono.cc b/libs/libmdbx/src/test/chrono.cc index 38cb321a81..315d379088 100644 --- a/libs/libmdbx/src/test/chrono.cc +++ b/libs/libmdbx/src/test/chrono.cc @@ -16,7 +16,10 @@ namespace chrono { +#ifndef NSEC_PER_SEC #define NSEC_PER_SEC 1000000000u +#endif /* NSEC_PER_SEC */ + uint32_t ns2fractional(uint32_t ns) { assert(ns < NSEC_PER_SEC); /* LY: здесь и далее используется "длинное деление", которое @@ -30,7 +33,9 @@ uint32_t fractional2ns(uint32_t fractional) { return (fractional * 
(uint64_t)NSEC_PER_SEC) >> 32; } +#ifndef USEC_PER_SEC #define USEC_PER_SEC 1000000u +#endif /* USEC_PER_SEC */ uint32_t us2fractional(uint32_t us) { assert(us < USEC_PER_SEC); return ((uint64_t)us << 32) / USEC_PER_SEC; @@ -40,7 +45,9 @@ uint32_t fractional2us(uint32_t fractional) { return (fractional * (uint64_t)USEC_PER_SEC) >> 32; } +#ifndef MSEC_PER_SEC #define MSEC_PER_SEC 1000u +#endif /* MSEC_PER_SEC */ uint32_t ms2fractional(uint32_t ms) { assert(ms < MSEC_PER_SEC); return ((uint64_t)ms << 32) / MSEC_PER_SEC; diff --git a/libs/libmdbx/src/test/chrono.h b/libs/libmdbx/src/test/chrono.h index 11675195ac..07cdef66cf 100644 --- a/libs/libmdbx/src/test/chrono.h +++ b/libs/libmdbx/src/test/chrono.h @@ -15,7 +15,6 @@ #pragma once #include "base.h" -#include "log.h" #include "utils.h" namespace chrono { @@ -24,7 +23,7 @@ namespace chrono { typedef union time { uint64_t fixedpoint; - struct { + __anonymous_struct_extension__ struct { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ uint32_t fractional; union { diff --git a/libs/libmdbx/src/test/config.cc b/libs/libmdbx/src/test/config.cc index dd150e9a91..eedfd7b2d4 100644 --- a/libs/libmdbx/src/test/config.cc +++ b/libs/libmdbx/src/test/config.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -294,14 +294,21 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, //----------------------------------------------------------------------------- -const struct option_verb mode_bits[] = { - {"rdonly", MDBX_RDONLY}, {"mapasync", MDBX_MAPASYNC}, - {"utterly", MDBX_UTTERLY_NOSYNC}, {"nosubdir", MDBX_NOSUBDIR}, - {"nosync", MDBX_NOSYNC}, {"nometasync", MDBX_NOMETASYNC}, - {"writemap", MDBX_WRITEMAP}, {"notls", MDBX_NOTLS}, - {"nordahead", MDBX_NORDAHEAD}, {"nomeminit", MDBX_NOMEMINIT}, - {"coalesce", MDBX_COALESCE}, {"lifo", MDBX_LIFORECLAIM}, - {"perturb", MDBX_PAGEPERTURB}, {nullptr, 0}}; +const struct option_verb mode_bits[] = {{"rdonly", MDBX_RDONLY}, + {"mapasync", MDBX_MAPASYNC}, + {"nosync-utterly", MDBX_UTTERLY_NOSYNC}, + {"nosubdir", MDBX_NOSUBDIR}, + {"nosync-safe", MDBX_SAFE_NOSYNC}, + {"nometasync", MDBX_NOMETASYNC}, + {"writemap", MDBX_WRITEMAP}, + {"notls", MDBX_NOTLS}, + {"nordahead", MDBX_NORDAHEAD}, + {"nomeminit", MDBX_NOMEMINIT}, + {"coalesce", MDBX_COALESCE}, + {"lifo", MDBX_LIFORECLAIM}, + {"perturb", MDBX_PAGEPERTURB}, + {"accede", MDBX_ACCEDE}, + {nullptr, 0}}; const struct option_verb table_bits[] = { {"key.reverse", MDBX_REVERSEKEY}, @@ -314,7 +321,7 @@ const struct option_verb table_bits[] = { static void dump_verbs(const char *caption, size_t bits, const struct option_verb *verbs) { - log_info("%s: 0x%" PRIx64 " = ", caption, (uint64_t)bits); + log_verbose("%s: 0x%" PRIx64 " = ", caption, (uint64_t)bits); const char *comma = ""; while (verbs->mask && bits) { @@ -330,7 +337,7 @@ static void dump_verbs(const char *caption, size_t bits, } static void dump_duration(const char *caption, unsigned duration) { - log_info("%s: ", caption); + log_verbose("%s: ", caption); if (duration) { if (duration > 24 * 3600) logging::feed("%u_", duration / (24 * 3600)); @@ -347,84 +354,92 @@ void dump(const char *title) { logging::local_suffix indent(title); for (auto i = global::actors.begin(); i != global::actors.end(); ++i) { 
- log_info("#%u, testcase %s, space_id/table %u\n", i->actor_id, - testcase2str(i->testcase), i->space_id); + log_verbose("#%u, testcase %s, space_id/table %u\n", i->actor_id, + testcase2str(i->testcase), i->space_id); indent.push(); if (i->params.loglevel) { - log_info("log: level %u, %s\n", i->params.loglevel, - i->params.pathname_log.empty() ? "console" - : i->params.pathname_log.c_str()); + log_verbose("log: level %u, %s\n", i->params.loglevel, + i->params.pathname_log.empty() + ? "console" + : i->params.pathname_log.c_str()); } - log_info("database: %s, size %" PRIuPTR "[%" PRIiPTR "..%" PRIiPTR - ", %i %i, %i]\n", - i->params.pathname_db.c_str(), i->params.size_now, - i->params.size_lower, i->params.size_upper, - i->params.shrink_threshold, i->params.growth_step, - i->params.pagesize); + log_verbose("database: %s, size %" PRIuPTR "[%" PRIiPTR "..%" PRIiPTR + ", %i %i, %i]\n", + i->params.pathname_db.c_str(), i->params.size_now, + i->params.size_lower, i->params.size_upper, + i->params.shrink_threshold, i->params.growth_step, + i->params.pagesize); dump_verbs("mode", i->params.mode_flags, mode_bits); dump_verbs("table", i->params.table_flags, table_bits); if (i->params.test_nops) - log_info("iterations/records %u\n", i->params.test_nops); + log_verbose("iterations/records %u\n", i->params.test_nops); else dump_duration("duration", i->params.test_duration); if (i->params.nrepeat) - log_info("repeat %u\n", i->params.nrepeat); + log_verbose("repeat %u\n", i->params.nrepeat); else - log_info("repeat ETERNALLY\n"); + log_verbose("repeat ETERNALLY\n"); - log_info("threads %u\n", i->params.nthreads); + log_verbose("threads %u\n", i->params.nthreads); - log_info( + log_verbose( "keygen.params: case %s, width %u, mesh %u, rotate %u, offset %" PRIu64 ", split %u/%u\n", keygencase2str(i->params.keygen.keycase), i->params.keygen.width, i->params.keygen.mesh, i->params.keygen.rotate, i->params.keygen.offset, i->params.keygen.split, i->params.keygen.width - 
i->params.keygen.split); - log_info("keygen.seed: %u\n", i->params.keygen.seed); - log_info("key: minlen %u, maxlen %u\n", i->params.keylen_min, - i->params.keylen_max); - log_info("data: minlen %u, maxlen %u\n", i->params.datalen_min, - i->params.datalen_max); + log_verbose("keygen.seed: %u\n", i->params.keygen.seed); + log_verbose("key: minlen %u, maxlen %u\n", i->params.keylen_min, + i->params.keylen_max); + log_verbose("data: minlen %u, maxlen %u\n", i->params.datalen_min, + i->params.datalen_max); - log_info("batch: read %u, write %u\n", i->params.batch_read, - i->params.batch_write); + log_verbose("batch: read %u, write %u\n", i->params.batch_read, + i->params.batch_write); if (i->params.waitfor_nops) - log_info("wait: actor %u for %u ops\n", i->wait4id, - i->params.waitfor_nops); + log_verbose("wait: actor %u for %u ops\n", i->wait4id, + i->params.waitfor_nops); else if (i->params.delaystart) dump_duration("delay", i->params.delaystart); else - log_info("no-delay\n"); + log_verbose("no-delay\n"); if (i->params.inject_writefaultn) - log_info("inject-writefault on %u ops\n", i->params.inject_writefaultn); + log_verbose("inject-writefault on %u ops\n", + i->params.inject_writefaultn); else - log_info("no-inject-writefault\n"); + log_verbose("no-inject-writefault\n"); - log_info("limits: readers %u, tables %u\n", i->params.max_readers, - i->params.max_tables); + log_verbose("limits: readers %u, tables %u, txn-bytes %zu\n", + i->params.max_readers, i->params.max_tables, + mdbx_limits_txnsize_max(i->params.pagesize)); + + log_verbose("drop table: %s\n", i->params.drop_table ? "Yes" : "No"); + log_verbose("ignore MDBX_MAP_FULL error: %s\n", + i->params.ignore_dbfull ? "Yes" : "No"); + log_verbose("verifying by speculum: %s\n", + i->params.speculum ? "Yes" : "No"); - log_info("drop table: %s\n", i->params.drop_table ? "Yes" : "No"); - log_info("ignore MDBX_MAP_FULL error: %s\n", - i->params.ignore_dbfull ? 
"Yes" : "No"); indent.pop(); } dump_duration("timeout", global::config::timeout_duration_seconds); - log_info("cleanup: before %s, after %s\n", - global::config::cleanup_before ? "Yes" : "No", - global::config::cleanup_after ? "Yes" : "No"); - - log_info("failfast: %s\n", global::config::failfast ? "Yes" : "No"); - log_info("progress indicator: %s\n", - global::config::progress_indicator ? "Yes" : "No"); + log_verbose("cleanup: before %s, after %s\n", + global::config::cleanup_before ? "Yes" : "No", + global::config::cleanup_after ? "Yes" : "No"); + + log_verbose("failfast: %s\n", global::config::failfast ? "Yes" : "No"); + log_verbose("progress indicator: %s\n", + global::config::progress_indicator ? "Yes" : "No"); + log_verbose("console mode: %s\n", + global::config::console_mode ? "Yes" : "No"); } } /* namespace config */ @@ -452,26 +467,31 @@ const std::string actor_config::serialize(const char *prefix) const { checksum.push(params.pathname_db); result.append(params.pathname_db); - result.append("|"); + result.push_back('|'); checksum.push(params.pathname_log); result.append(params.pathname_log); - result.append("|"); + result.push_back('|'); static_assert(std::is_pod<actor_params_pod>::value, "actor_params_pod should by POD"); result.append(data2hex(static_cast<const actor_params_pod *>(¶ms), sizeof(actor_params_pod), checksum)); - result.append("|"); + result.push_back('|'); static_assert(std::is_pod<actor_config_pod>::value, "actor_config_pod should by POD"); result.append(data2hex(static_cast<const actor_config_pod *>(this), sizeof(actor_config_pod), checksum)); - result.append("|"); + result.push_back('|'); + result.push_back(global::config::progress_indicator ? 'Y' : 'N'); + checksum.push(global::config::progress_indicator); + result.push_back(global::config::console_mode ? 
'Y' : 'N'); + checksum.push(global::config::console_mode); + result.push_back('|'); result.append(osal_serialize(checksum)); - result.append("|"); + result.push_back('|'); result.append(std::to_string(checksum.value)); return result; @@ -535,6 +555,20 @@ bool actor_config::deserialize(const char *str, actor_config &config) { TRACE("<< actor_config::deserialize: slash-5\n"); return false; } + if ((str[0] == 'Y' || str[0] == 'N') && (str[1] == 'Y' || str[1] == 'N')) { + global::config::progress_indicator = str[0] == 'Y'; + checksum.push(global::config::progress_indicator); + global::config::console_mode = str[1] == 'Y'; + checksum.push(global::config::console_mode); + str = slash + 1; + + slash = strchr(str, '|'); + if (!slash) { + TRACE("<< actor_config::deserialize: slash-6\n"); + return false; + } + } + if (!config.osal_deserialize(str, slash, checksum)) { TRACE("<< actor_config::deserialize: osal\n"); return false; @@ -556,10 +590,7 @@ unsigned actor_params::mdbx_keylen_min() const { } unsigned actor_params::mdbx_keylen_max() const { - return (table_flags & MDBX_INTEGERKEY) - ? 8 - : std::min((unsigned)mdbx_limits_keysize_max(pagesize), - (unsigned)UINT16_MAX); + return (unsigned)mdbx_limits_keysize_max(pagesize, table_flags); } unsigned actor_params::mdbx_datalen_min() const { @@ -567,10 +598,6 @@ unsigned actor_params::mdbx_datalen_min() const { } unsigned actor_params::mdbx_datalen_max() const { - return (table_flags & MDBX_INTEGERDUP) - ? 8 - : std::min((table_flags & MDBX_DUPSORT) - ? 
(unsigned)mdbx_limits_keysize_max(pagesize) - : (unsigned)MDBX_MAXDATASIZE, - (unsigned)UINT16_MAX); + return std::min((unsigned)UINT16_MAX, + (unsigned)mdbx_limits_valsize_max(pagesize, table_flags)); } diff --git a/libs/libmdbx/src/test/config.h b/libs/libmdbx/src/test/config.h index 89889d8eb6..9e083f3f22 100644 --- a/libs/libmdbx/src/test/config.h +++ b/libs/libmdbx/src/test/config.h @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -29,12 +29,13 @@ enum actor_testcase { ac_try, ac_copy, ac_append, - ac_ttl + ac_ttl, + ac_nested }; enum actor_status { as_unknown, - as_debuging, + as_debugging, as_running, as_successful, as_killed, @@ -123,8 +124,6 @@ inline bool parse_option_intptr(int argc, char *const argv[], int &narg, #pragma pack(push, 1) struct keygen_params_pod { - keygen_case keycase; - /* Параметры генератора пар key-value. * * Ключи и значения генерируются по задаваемым параметрам на основе "плоской" @@ -231,11 +230,10 @@ struct keygen_params_pod { uint8_t split; uint32_t seed; uint64_t offset; + keygen_case keycase; }; struct actor_params_pod { - unsigned loglevel; - unsigned mode_flags; unsigned table_flags; intptr_t size_lower; @@ -264,8 +262,10 @@ struct actor_params_pod { unsigned max_tables; keygen_params_pod keygen; + uint8_t loglevel; bool drop_table; bool ignore_dbfull; + bool speculum; }; struct actor_config_pod { diff --git a/libs/libmdbx/src/test/copy.cc b/libs/libmdbx/src/test/copy.cc index e239d41e34..ff53153e1a 100644 --- a/libs/libmdbx/src/test/copy.cc +++ b/libs/libmdbx/src/test/copy.cc @@ -1,4 +1,4 @@ -#include "test.h" +#include "test.h" void testcase_copy::copy_db(const bool with_compaction) { int err = osal_removefile(copy_pathname); diff --git a/libs/libmdbx/src/test/dead.cc b/libs/libmdbx/src/test/dead.cc index a1a8b5f9de..8f83bbeb2e 100644 --- a/libs/libmdbx/src/test/dead.cc +++ b/libs/libmdbx/src/test/dead.cc @@ -1,4 +1,4 
@@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. diff --git a/libs/libmdbx/src/test/hill.cc b/libs/libmdbx/src/test/hill.cc index 1b03ddf0fc..9d9890951a 100644 --- a/libs/libmdbx/src/test/hill.cc +++ b/libs/libmdbx/src/test/hill.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -15,12 +15,13 @@ #include "test.h" bool testcase_hill::run() { - MDBX_dbi dbi; int err = db_open__begin__table_create_open_clean(dbi); if (unlikely(err != MDBX_SUCCESS)) { log_notice("hill: bailout-prepare due '%s'", mdbx_strerror(err)); - return true; + return false; } + speculum.clear(); + speculum_commited.clear(); /* LY: тест "холмиком": * - сначала наполняем таблицу циклическими CRUD-манипуляциями, @@ -63,6 +64,7 @@ bool testcase_hill::run() { uint64_t commited_serial = serial_count; unsigned txn_nops = 0; + bool rc = false; while (should_continue()) { const keygen::serial_t a_serial = serial_count; if (unlikely(!keyvalue_maker.increment(serial_count, 1))) { @@ -78,53 +80,74 @@ bool testcase_hill::run() { log_trace("uphill: insert-a (age %" PRIu64 ") %" PRIu64, age_shift, a_serial); generate_pair(a_serial, a_key, a_data_1, age_shift); - err = mdbx_put(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, - insert_flags); + + err = insert(a_key, a_data_1, insert_flags); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { log_notice("uphill: bailout at insert-a due '%s'", mdbx_strerror(err)); txn_restart(true, false); serial_count = commited_serial; + speculum = speculum_commited; break; } failure_perror("mdbx_put(insert-a.1)", err); } + if (!speculum_verify()) { + log_notice("uphill: bailout after insert-a, before commit"); + goto bailout; + } if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); if (unlikely(err != 
MDBX_SUCCESS)) { log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err)); serial_count = commited_serial; + speculum = speculum_commited; break; } + speculum_commited = speculum; commited_serial = a_serial; txn_nops = 0; + if (!speculum_verify()) { + log_notice("uphill: bailout after insert-a, after commit"); + goto bailout; + } } // создаем вторую запись из пары log_trace("uphill: insert-b %" PRIu64, b_serial); generate_pair(b_serial, b_key, b_data, 0); - err = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, - insert_flags); + err = insert(b_key, b_data, insert_flags); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { log_notice("uphill: bailout at insert-b due '%s'", mdbx_strerror(err)); txn_restart(true, false); serial_count = commited_serial; + speculum = speculum_commited; break; } failure_perror("mdbx_put(insert-b)", err); } + if (!speculum_verify()) { + log_notice("uphill: bailout after insert-b, before commit"); + goto bailout; + } if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); if (unlikely(err != MDBX_SUCCESS)) { log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err)); serial_count = commited_serial; + speculum = speculum_commited; break; } + speculum_commited = speculum; commited_serial = a_serial; txn_nops = 0; + if (!speculum_verify()) { + log_notice("uphill: bailout after insert-b, after commit"); + goto bailout; + } } // обновляем данные в первой записи @@ -132,52 +155,73 @@ bool testcase_hill::run() { a_serial); generate_pair(a_serial, a_key, a_data_0, 0); checkdata("uphill: update-a", dbi, a_key->value, a_data_1->value); - err = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_0->value, - &a_data_1->value, update_flags); + err = replace(a_key, a_data_0, a_data_1, update_flags); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { log_notice("uphill: bailout at update-a due '%s'", 
mdbx_strerror(err)); txn_restart(true, false); serial_count = commited_serial; + speculum = speculum_commited; break; } failure_perror("mdbx_replace(update-a: 1->0)", err); } + if (!speculum_verify()) { + log_notice("uphill: bailout after update-a, before commit"); + goto bailout; + } if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); if (unlikely(err != MDBX_SUCCESS)) { log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err)); serial_count = commited_serial; + speculum = speculum_commited; break; } + speculum_commited = speculum; commited_serial = a_serial; txn_nops = 0; + if (!speculum_verify()) { + log_notice("uphill: bailout after update-a, after commit"); + goto bailout; + } } // удаляем вторую запись log_trace("uphill: delete-b %" PRIu64, b_serial); checkdata("uphill: delete-b", dbi, b_key->value, b_data->value); - err = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); + err = remove(b_key, b_data); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { log_notice("uphill: bailout at delete-b due '%s'", mdbx_strerror(err)); txn_restart(true, false); serial_count = commited_serial; + speculum = speculum_commited; break; } failure_perror("mdbx_del(b)", err); } + if (!speculum_verify()) { + log_notice("uphill: bailout after delete-b, before commit"); + goto bailout; + } if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); if (unlikely(err != MDBX_SUCCESS)) { log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err)); serial_count = commited_serial; + speculum = speculum_commited; break; } + speculum_commited = speculum; commited_serial = a_serial; txn_nops = 0; + if (!speculum_verify()) { + log_notice("uphill: bailout after delete-b, after commit"); + goto bailout; + } } report(1); @@ -204,101 +248,145 @@ bool testcase_hill::run() { generate_pair(a_serial, a_key, a_data_0, 0); generate_pair(a_serial, a_key, a_data_1, age_shift); 
checkdata("downhill: update-a", dbi, a_key->value, a_data_0->value); - err = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, - &a_data_0->value, update_flags); + err = replace(a_key, a_data_1, a_data_0, update_flags); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { log_notice("downhill: bailout at update-a due '%s'", mdbx_strerror(err)); txn_end(true); + speculum = speculum_commited; break; } failure_perror("mdbx_put(update-a: 0->1)", err); } + if (!speculum_verify()) { + log_notice("downhill: bailout after update-a, before commit"); + break; + } if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); if (unlikely(err != MDBX_SUCCESS)) { log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err)); + speculum = speculum_commited; break; } + speculum_commited = speculum; txn_nops = 0; + if (!speculum_verify()) { + log_notice("downhill: bailout after update-a, after commit"); + break; + } } // создаем вторую запись из пары log_trace("downhill: insert-b %" PRIu64, b_serial); generate_pair(b_serial, b_key, b_data, 0); - err = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, - insert_flags); + err = insert(b_key, b_data, insert_flags); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { log_notice("downhill: bailout at insert-a due '%s'", mdbx_strerror(err)); txn_end(true); + speculum = speculum_commited; break; } failure_perror("mdbx_put(insert-b)", err); } + if (!speculum_verify()) { + log_notice("downhill: bailout after insert-b, before commit"); + break; + } if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); if (unlikely(err != MDBX_SUCCESS)) { log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err)); + speculum = speculum_commited; break; } + speculum_commited = speculum; txn_nops = 0; + if (!speculum_verify()) { + log_notice("downhill: bailout after insert-b, after commit"); + 
break; + } } // удаляем первую запись log_trace("downhill: delete-a (age %" PRIu64 ") %" PRIu64, age_shift, a_serial); checkdata("downhill: delete-a", dbi, a_key->value, a_data_1->value); - err = mdbx_del(txn_guard.get(), dbi, &a_key->value, &a_data_1->value); + err = remove(a_key, a_data_1); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { log_notice("downhill: bailout at delete-a due '%s'", mdbx_strerror(err)); txn_end(true); + speculum = speculum_commited; break; } failure_perror("mdbx_del(a)", err); } + if (!speculum_verify()) { + log_notice("downhill: bailout after delete-a, before commit"); + break; + } if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); if (unlikely(err != MDBX_SUCCESS)) { log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err)); + speculum = speculum_commited; break; } + speculum_commited = speculum; txn_nops = 0; + if (!speculum_verify()) { + log_notice("downhill: bailout after delete-a, after commit"); + break; + } } // удаляем вторую запись log_trace("downhill: delete-b %" PRIu64, b_serial); checkdata("downhill: delete-b", dbi, b_key->value, b_data->value); - err = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); + err = remove(b_key, b_data); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { log_notice("downhill: bailout at delete-b due '%s'", mdbx_strerror(err)); txn_end(true); + speculum = speculum_commited; break; } failure_perror("mdbx_del(b)", err); } + if (!speculum_verify()) { + log_notice("downhill: bailout after delete-b, before commit"); + break; + } if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); if (unlikely(err != MDBX_SUCCESS)) { log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err)); + speculum = speculum_commited; break; } + speculum_commited = speculum; txn_nops = 0; + if (!speculum_verify()) { + log_notice("downhill: bailout after 
delete-b, after commit"); + goto bailout; + } } report(1); } + rc = speculum_verify(); +bailout: if (txn_guard) { err = breakable_commit(); if (unlikely(err != MDBX_SUCCESS)) @@ -312,10 +400,10 @@ bool testcase_hill::run() { err = breakable_commit(); if (unlikely(err != MDBX_SUCCESS)) { log_notice("hill: bailout-clean due '%s'", mdbx_strerror(err)); - return true; + return rc; } } else db_table_close(dbi); } - return true; + return rc; } diff --git a/libs/libmdbx/src/test/jitter.cc b/libs/libmdbx/src/test/jitter.cc index 82d1d764ff..c7b8bccb51 100644 --- a/libs/libmdbx/src/test/jitter.cc +++ b/libs/libmdbx/src/test/jitter.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -15,10 +15,26 @@ #include "test.h" bool testcase_jitter::run() { + int err; + size_t upper_limit = config.params.size_upper; + if (upper_limit < 1) + upper_limit = config.params.size_now * 2; + while (should_continue()) { jitter_delay(); db_open(); + if (upper_limit < 1) { + MDBX_envinfo info; + err = mdbx_env_info_ex(db_guard.get(), txn_guard.get(), &info, + sizeof(info)); + if (err) + failure_perror("mdbx_env_info_ex()", err); + upper_limit = (info.mi_geo.upper < INTPTR_MAX) + ? (intptr_t)info.mi_geo.upper + : INTPTR_MAX; + } + if (flipcoin()) { jitter_delay(); txn_begin(true); @@ -27,19 +43,29 @@ bool testcase_jitter::run() { txn_end(flipcoin()); } + const bool coin4size = flipcoin(); jitter_delay(); txn_begin(mode_readonly()); jitter_delay(); if (!mode_readonly()) { fetch_canary(); update_canary(1); - /* TODO: - * - db_setsize() - * ... - */ + err = mdbx_env_set_geometry( + db_guard.get(), -1, -1, + coin4size ? 
upper_limit * 2 / 3 : upper_limit * 3 / 2, -1, -1, -1); + if (err != MDBX_SUCCESS && err != MDBX_RESULT_TRUE && + err != MDBX_MAP_FULL && err != MDBX_TOO_LARGE) + failure_perror("mdbx_env_set_geometry-1", err); } txn_end(flipcoin()); + err = mdbx_env_set_geometry( + db_guard.get(), -1, -1, + !coin4size ? upper_limit * 2 / 3 : upper_limit * 3 / 2, -1, -1, -1); + if (err != MDBX_SUCCESS && err != MDBX_RESULT_TRUE && + err != MDBX_MAP_FULL && err != MDBX_TOO_LARGE) + failure_perror("mdbx_env_set_geometry-2", err); + if (flipcoin()) { jitter_delay(); txn_begin(true); @@ -48,6 +74,12 @@ bool testcase_jitter::run() { } jitter_delay(); + err = + mdbx_env_set_geometry(db_guard.get(), -1, -1, upper_limit, -1, -1, -1); + if (err != MDBX_SUCCESS && err != MDBX_RESULT_TRUE && + err != MDBX_MAP_FULL && err != MDBX_TOO_LARGE) + failure_perror("mdbx_env_set_geometry-3", err); + db_close(); /* just 'align' nops with other tests with batching */ diff --git a/libs/libmdbx/src/test/keygen.cc b/libs/libmdbx/src/test/keygen.cc index 0110b049bf..c2098e6ef9 100644 --- a/libs/libmdbx/src/test/keygen.cc +++ b/libs/libmdbx/src/test/keygen.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -72,7 +72,7 @@ serial_t injective(const serial_t serial, } void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, - serial_t value_age) { + serial_t value_age, const bool keylen_changeable) { assert(mapping.width >= serial_minwith && mapping.width <= serial_maxwith); assert(mapping.split <= mapping.width); assert(mapping.mesh <= mapping.width); @@ -131,14 +131,65 @@ void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, log_trace("keygen-pair: key %" PRIu64 ", value %" PRIu64, key_serial, value_serial); - mk(key_serial, key_essentials, *key); - mk(value_serial, value_essentials, *value); + mk_begin(key_serial, key_essentials, *key); + mk_begin(value_serial, value_essentials, *value); + +#if 0 /* unused for now */ + if (key->value.iov_len + value->value.iov_len > pair_maxlen) { + unsigned extra = key->value.iov_len + value->value.iov_len - pair_maxlen; + if (keylen_changeable && + key->value.iov_len > std::max(8u, (unsigned)key_essentials.minlen)) { +#if defined(__GNUC__) || defined(__clang__) + const bool coin = __builtin_parityll(serial) != 0; +#else + const bool coin = INT64_C(0xF2CEECA9989BD96A) * int64_t(serial) < 0; +#endif + if (coin) { + const unsigned gap = + key->value.iov_len - std::max(8u, (unsigned)key_essentials.minlen); + const unsigned chop = std::min(gap, extra); + log_trace("keygen-pair: chop %u key-len %u -> %u", chop, + (unsigned)key->value.iov_len, + (unsigned)key->value.iov_len - chop); + key->value.iov_len -= chop; + extra -= chop; + } + } + if (extra && value->value.iov_len > + std::max(8u, (unsigned)value_essentials.minlen)) { + const unsigned gap = value->value.iov_len - + std::max(8u, (unsigned)value_essentials.minlen); + const unsigned chop = std::min(gap, extra); + log_trace("keygen-pair: chop %u value-len %u -> %u", chop, + (unsigned)value->value.iov_len, + (unsigned)value->value.iov_len - chop); + value->value.iov_len -= chop; + extra -= chop; + } + if (keylen_changeable && extra && + 
key->value.iov_len > std::max(8u, (unsigned)key_essentials.minlen)) { + const unsigned gap = + key->value.iov_len - std::max(8u, (unsigned)key_essentials.minlen); + const unsigned chop = std::min(gap, extra); + log_trace("keygen-pair: chop %u key-len %u -> %u", chop, + (unsigned)key->value.iov_len, + (unsigned)key->value.iov_len - chop); + key->value.iov_len -= chop; + extra -= chop; + } + } +#else + (void)keylen_changeable; +#endif /* unused for now */ + + mk_continue(key_serial, key_essentials, *key); + mk_continue(value_serial, value_essentials, *value); if (log_enabled(logging::trace)) { - char dump_key[128], dump_value[128]; + char dump_key[4096], dump_value[4096]; log_trace("keygen-pair: key %s, value %s", - mdbx_dkey(&key->value, dump_key, sizeof(dump_key)), - mdbx_dkey(&value->value, dump_value, sizeof(dump_value))); + mdbx_dump_val(&key->value, dump_key, sizeof(dump_key)), + mdbx_dump_val(&value->value, dump_value, sizeof(dump_value))); } } @@ -146,19 +197,22 @@ void maker::setup(const config::actor_params_pod &actor, unsigned actor_id, unsigned thread_number) { key_essentials.flags = actor.table_flags & (MDBX_INTEGERKEY | MDBX_REVERSEKEY | MDBX_DUPSORT); - assert(actor.keylen_min <= UINT8_MAX); - key_essentials.minlen = (uint8_t)actor.keylen_min; - assert(actor.keylen_max <= UINT16_MAX); - key_essentials.maxlen = (uint16_t)actor.keylen_max; + assert(actor.keylen_min <= UINT16_MAX); + key_essentials.minlen = (uint16_t)actor.keylen_min; + assert(actor.keylen_max <= UINT32_MAX); + key_essentials.maxlen = std::min( + (uint32_t)actor.keylen_max, + (uint32_t)mdbx_limits_keysize_max(actor.pagesize, key_essentials.flags)); value_essentials.flags = actor.table_flags & (MDBX_INTEGERDUP | MDBX_REVERSEDUP); - assert(actor.datalen_min <= UINT8_MAX); - value_essentials.minlen = (uint8_t)actor.datalen_min; - assert(actor.datalen_max <= UINT16_MAX); - value_essentials.maxlen = (uint16_t)actor.datalen_max; + assert(actor.datalen_min <= UINT16_MAX); + 
value_essentials.minlen = (uint16_t)actor.datalen_min; + assert(actor.datalen_max <= UINT32_MAX); + value_essentials.maxlen = std::min( + (uint32_t)actor.datalen_max, + (uint32_t)mdbx_limits_valsize_max(actor.pagesize, key_essentials.flags)); - assert(thread_number < 2); (void)thread_number; mapping = actor.keygen; salt = (actor.keygen.seed + actor_id) * UINT64_C(14653293970879851569); @@ -226,18 +280,25 @@ buffer alloc(size_t limit) { return buffer(ptr); } -void __hot maker::mk(const serial_t serial, const essentials ¶ms, - result &out) { +void __hot maker::mk_begin(const serial_t serial, const essentials ¶ms, + result &out) { assert(out.limit >= params.maxlen); assert(params.maxlen >= params.minlen); assert(params.maxlen >= length(serial)); - out.value.iov_base = out.bytes; out.value.iov_len = (params.maxlen > params.minlen) ? params.minlen + serial % (params.maxlen - params.minlen) : params.minlen; + if ((params.flags & (MDBX_INTEGERKEY | MDBX_INTEGERDUP)) == 0 && + out.value.iov_len < 8) + out.value.iov_len = std::max(length(serial), out.value.iov_len); +} + +void __hot maker::mk_continue(const serial_t serial, const essentials ¶ms, + result &out) { + out.value.iov_base = out.bytes; if (params.flags & (MDBX_INTEGERKEY | MDBX_INTEGERDUP)) { assert(params.maxlen == params.minlen); assert(params.minlen == 4 || params.minlen == 8); @@ -251,17 +312,13 @@ void __hot maker::mk(const serial_t serial, const essentials ¶ms, unaligned::store(out.bytes + out.value.iov_len - 8, htobe64(serial)); } else { out.u64 = htobe64(serial); - if (out.value.iov_len < 8) { - out.value.iov_len = std::max(length(serial), out.value.iov_len); + if (out.value.iov_len < 8) out.value.iov_base = out.bytes + 8 - out.value.iov_len; - } } } else { out.u64 = htole64(serial); if (out.value.iov_len > 8) memset(out.bytes + 8, '\0', out.value.iov_len - 8); - else - out.value.iov_len = std::max(length(serial), out.value.iov_len); } assert(out.value.iov_len >= params.minlen); diff --git 
a/libs/libmdbx/src/test/keygen.h b/libs/libmdbx/src/test/keygen.h index d0299e1e7d..d25b88349f 100644 --- a/libs/libmdbx/src/test/keygen.h +++ b/libs/libmdbx/src/test/keygen.h @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -93,7 +93,7 @@ struct result { //----------------------------------------------------------------------------- -struct buffer_deleter : public std::unary_function<void, result *> { +struct buffer_deleter /* : public std::unary_function<void, result *> */ { void operator()(result *buffer) const { free(buffer); } }; @@ -107,18 +107,25 @@ class maker { serial_t salt; struct essentials { - uint8_t minlen; - uint8_t flags; - uint16_t maxlen; + uint16_t minlen; + uint16_t flags; + uint32_t maxlen; } key_essentials, value_essentials; - static void mk(const serial_t serial, const essentials ¶ms, result &out); + static void mk_begin(const serial_t serial, const essentials ¶ms, + result &out); + static void mk_continue(const serial_t serial, const essentials ¶ms, + result &out); + static void mk(const serial_t serial, const essentials ¶ms, result &out) { + mk_begin(serial, params, out); + mk_continue(serial, params, out); + } public: maker() { memset(this, 0, sizeof(*this)); } void pair(serial_t serial, const buffer &key, buffer &value, - serial_t value_age); + serial_t value_age, const bool keylen_changeable); void setup(const config::actor_params_pod &actor, unsigned actor_id, unsigned thread_number); void make_ordered(); diff --git a/libs/libmdbx/src/test/log.cc b/libs/libmdbx/src/test/log.cc index 79544e11bb..2b22e28785 100644 --- a/libs/libmdbx/src/test/log.cc +++ b/libs/libmdbx/src/test/log.cc @@ -20,7 +20,7 @@ void failure(const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); fflushall(); - logging::output(logging::failure, fmt, ap); + logging::output_nocheckloglevel_ap(logging::failure, fmt, ap); va_end(ap); fflushall(); exit(EXIT_FAILURE); @@ -37,29 +37,18 @@ void __noreturn failure_perror(const char *what, int errnum) { //----------------------------------------------------------------------------- -static void mdbx_logger(int type, const char *function, int line, +static void mdbx_logger(int priority, const char *function, int line, const char *msg, va_list args) { - logging::loglevel level = logging::info; - if (type & MDBX_DBG_EXTRA) - level = logging::extra; - if (type & MDBX_DBG_TRACE) - level = logging::trace; - if (type & MDBX_DBG_PRINT) - level = logging::verbose; - if (!function) function = "unknown"; - if (type & MDBX_DBG_ASSERT) { - log_error("mdbx: assertion failure: %s, %d", function, line); - level = logging::failure; - } - if (logging::output( - level, - strncmp(function, "mdbx_", 5) == 0 ? "%s: " : "mdbx: %s: ", function)) - logging::feed_ap(msg, args); - if (type & MDBX_DBG_ASSERT) - abort(); + if (priority == MDBX_LOG_FATAL) + log_error("mdbx: fatal failure: %s, %d", function, line); + + logging::output_nocheckloglevel( + logging::loglevel(priority), + strncmp(function, "mdbx_", 5) == 0 ? "%s: " : "mdbx %s: ", function); + logging::feed_ap(msg, args); } namespace logging { @@ -69,19 +58,16 @@ static std::string suffix; static loglevel level; static FILE *last; -void setlevel(loglevel _level) { - level = (_level > error) ? 
failure : _level; - int mdbx_dbg_opts = MDBX_DBG_ASSERT | MDBX_DBG_JITTER | MDBX_DBG_DUMP; - if (level <= trace) - mdbx_dbg_opts |= MDBX_DBG_TRACE; - if (level <= verbose) - mdbx_dbg_opts |= MDBX_DBG_PRINT; - int rc = mdbx_setup_debug(mdbx_dbg_opts, mdbx_logger); +void setlevel(loglevel priority) { + level = priority; + int rc = mdbx_setup_debug(priority, + MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER, + mdbx_logger); log_trace("set mdbx debug-opts: 0x%02x", rc); } -void setup(loglevel _level, const std::string &_prefix) { - setlevel(_level); +void setup(loglevel priority, const std::string &_prefix) { + setlevel(priority); prefix = _prefix; } @@ -95,10 +81,10 @@ const char *level2str(const loglevel alevel) { return "extra"; case trace: return "trace"; + case debug: + return "debug"; case verbose: return "verbose"; - case info: - return "info"; case notice: return "notice"; case warning: @@ -111,26 +97,28 @@ const char *level2str(const loglevel alevel) { } bool output(const loglevel priority, const char *format, ...) 
{ - if (priority < level) + if (lower(priority, level)) return false; va_list ap; va_start(ap, format); - output(priority, format, ap); + output_nocheckloglevel_ap(priority, format, ap); va_end(ap); return true; } -bool output(const logging::loglevel priority, const char *format, va_list ap) { +void output_nocheckloglevel_ap(const logging::loglevel priority, + const char *format, va_list ap) { if (last) { putc('\n', last); fflush(last); + if (last == stderr) { + putc('\n', stdout); + fflush(stdout); + } last = nullptr; } - if (priority < level) - return false; - chrono::time now = chrono::now_realtime(); struct tm tm; #ifdef _MSC_VER @@ -144,14 +132,14 @@ bool output(const logging::loglevel priority, const char *format, va_list ap) { last = stdout; fprintf(last, - "[ %02d%02d%02d-%02d:%02d:%02d.%06d_%05u %-10s %.4s ] %s" /* TODO */, + "[ %02d%02d%02d-%02d:%02d:%02d.%06d_%05lu %-10s %.4s ] %s" /* TODO */, tm.tm_year - 100, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, - tm.tm_sec, chrono::fractional2us(now.fractional), osal_getpid(), + tm.tm_sec, chrono::fractional2us(now.fractional), (long)osal_getpid(), prefix.c_str(), level2str(priority), suffix.c_str()); va_list ones; memset(&ones, 0, sizeof(ones)) /* zap MSVC and other stupid compilers */; - if (priority >= error) + if (same_or_higher(priority, error)) va_copy(ones, ap); vfprintf(last, format, ap); @@ -178,29 +166,36 @@ bool output(const logging::loglevel priority, const char *format, va_list ap) { break; } - if (priority >= error) { + if (same_or_higher(priority, error)) { if (last != stderr) { - fprintf(stderr, "[ %05u %-10s %.4s ] %s", osal_getpid(), prefix.c_str(), - level2str(priority), suffix.c_str()); + fprintf(stderr, "[ %05lu %-10s %.4s ] %s", (long)osal_getpid(), + prefix.c_str(), level2str(priority), suffix.c_str()); vfprintf(stderr, format, ones); - if (end != '\n') - putc('\n', stderr); - fflush(stderr); + if (end == '\n') + fflush(stderr); + else + last = stderr; } va_end(ones); } - - return 
true; } bool feed_ap(const char *format, va_list ap) { if (!last) return false; + if (last == stderr) { + va_list ones; + va_copy(ones, ap); + vfprintf(stdout, format, ones); + va_end(ones); + } vfprintf(last, format, ap); size_t len = strlen(format); if (len && format[len - 1] == '\n') { fflush(last); + if (last == stderr) + fflush(stdout); last = nullptr; } return true; @@ -242,73 +237,123 @@ void local_suffix::pop() { local_suffix::~local_suffix() { suffix.erase(trim_pos); } +void progress_canary(bool active) { + static chrono::time progress_timestamp; + chrono::time now = chrono::now_motonic(); + + if (now.fixedpoint - progress_timestamp.fixedpoint < + chrono::from_ms(42).fixedpoint) + return; + + if (osal_progress_push(active)) { + progress_timestamp = now; + return; + } + + if (progress_timestamp.fixedpoint == 0) { + putc('>', stderr); + progress_timestamp = now; + } else if (global::config::console_mode) { + if (active) { + static int last_point = -1; + int point = (now.fixedpoint >> 29) & 3; + if (point != last_point) { + progress_timestamp = now; + fprintf(stderr, "%c\b", "-\\|/"[last_point = point]); + } + } else if (now.fixedpoint - progress_timestamp.fixedpoint > + chrono::from_seconds(2).fixedpoint) { + progress_timestamp = now; + fprintf(stderr, "%c\b", "@*"[now.utc & 1]); + } + } else { + static int count; + if (active && now.fixedpoint - progress_timestamp.fixedpoint > + chrono::from_seconds(1).fixedpoint) { + putc('.', stderr); + progress_timestamp = now; + ++count; + } else if (now.fixedpoint - progress_timestamp.fixedpoint > + chrono::from_seconds(5).fixedpoint) { + putc("@*"[now.utc & 1], stderr); + progress_timestamp = now; + ++count; + } + if (count == 60) { + count = 0; + putc('\n', stderr); + } + } + fflush(stderr); +} + } // namespace logging void log_extra(const char *msg, ...) 
{ - if (logging::extra >= logging::level) { + if (logging::same_or_higher(logging::extra, logging::level)) { va_list ap; va_start(ap, msg); - logging::output(logging::extra, msg, ap); + logging::output_nocheckloglevel_ap(logging::extra, msg, ap); va_end(ap); } else logging::last = nullptr; } void log_trace(const char *msg, ...) { - if (logging::trace >= logging::level) { + if (logging::same_or_higher(logging::trace, logging::level)) { va_list ap; va_start(ap, msg); - logging::output(logging::trace, msg, ap); + logging::output_nocheckloglevel_ap(logging::trace, msg, ap); va_end(ap); } else logging::last = nullptr; } -void log_verbose(const char *msg, ...) { - if (logging::verbose >= logging::level) { +void log_debug(const char *msg, ...) { + if (logging::same_or_higher(logging::debug, logging::level)) { va_list ap; va_start(ap, msg); - logging::output(logging::verbose, msg, ap); + logging::output_nocheckloglevel_ap(logging::debug, msg, ap); va_end(ap); } else logging::last = nullptr; } -void log_info(const char *msg, ...) { - if (logging::info >= logging::level) { +void log_verbose(const char *msg, ...) { + if (logging::same_or_higher(logging::verbose, logging::level)) { va_list ap; va_start(ap, msg); - logging::output(logging::info, msg, ap); + logging::output_nocheckloglevel_ap(logging::verbose, msg, ap); va_end(ap); } else logging::last = nullptr; } void log_notice(const char *msg, ...) { - if (logging::notice >= logging::level) { + if (logging::same_or_higher(logging::notice, logging::level)) { va_list ap; va_start(ap, msg); - logging::output(logging::notice, msg, ap); + logging::output_nocheckloglevel_ap(logging::notice, msg, ap); va_end(ap); } else logging::last = nullptr; } void log_warning(const char *msg, ...) 
{ - if (logging::warning >= logging::level) { + if (logging::same_or_higher(logging::warning, logging::level)) { va_list ap; va_start(ap, msg); - logging::output(logging::warning, msg, ap); + logging::output_nocheckloglevel_ap(logging::warning, msg, ap); va_end(ap); } else logging::last = nullptr; } void log_error(const char *msg, ...) { - if (logging::error >= logging::level) { + if (logging::same_or_higher(logging::error, logging::level)) { va_list ap; va_start(ap, msg); - logging::output(logging::error, msg, ap); + logging::output_nocheckloglevel_ap(logging::error, msg, ap); va_end(ap); } else logging::last = nullptr; @@ -319,7 +364,7 @@ void log_trouble(const char *where, const char *what, int errnum) { } bool log_enabled(const logging::loglevel priority) { - return (priority >= logging::level); + return logging::same_or_higher(priority, logging::level); } void log_flush(void) { fflushall(); } diff --git a/libs/libmdbx/src/test/log.h b/libs/libmdbx/src/test/log.h index 7d6b4012f1..bb8f997b36 100644 --- a/libs/libmdbx/src/test/log.h +++ b/libs/libmdbx/src/test/log.h @@ -15,6 +15,7 @@ #pragma once #include "base.h" +#include "chrono.h" void __noreturn usage(void); void __noreturn __printf_args(1, 2) failure(const char *fmt, ...); @@ -24,27 +25,47 @@ const char *test_strerror(int errnum); namespace logging { enum loglevel { - extra, - trace, - verbose, - info, - notice, - warning, - error, - failure, + extra = MDBX_LOG_EXTRA, + trace = MDBX_LOG_TRACE, + debug = MDBX_LOG_DEBUG, + verbose = MDBX_LOG_VERBOSE, + notice = MDBX_LOG_NOTICE, + warning = MDBX_LOG_WARN, + error = MDBX_LOG_ERROR, + failure = MDBX_LOG_FATAL }; +inline bool lower(loglevel left, loglevel right) { + static_assert(MDBX_LOG_EXTRA > MDBX_LOG_FATAL, "WTF?"); + return left > right; +} + +inline bool same_or_higher(loglevel left, loglevel right) { + return left <= right; +} + const char *level2str(const loglevel level); -void setup(loglevel level, const std::string &prefix); +void setup(loglevel 
priority, const std::string &prefix); void setup(const std::string &prefix); -void setlevel(loglevel level); +void setlevel(loglevel priority); -bool output(const loglevel priority, const char *format, va_list ap); +void output_nocheckloglevel_ap(const loglevel priority, const char *format, + va_list ap); bool __printf_args(2, 3) output(const loglevel priority, const char *format, ...); bool feed_ap(const char *format, va_list ap); bool __printf_args(1, 2) feed(const char *format, ...); +void inline __printf_args(2, 3) + output_nocheckloglevel(const loglevel priority, const char *format, ...) { + va_list ap; + va_start(ap, format); + output_nocheckloglevel_ap(priority, format, ap); + va_end(ap); +} + +void progress_canary(bool active); + class local_suffix { protected: size_t trim_pos; @@ -66,8 +87,8 @@ public: void __printf_args(1, 2) log_extra(const char *msg, ...); void __printf_args(1, 2) log_trace(const char *msg, ...); +void __printf_args(1, 2) log_debug(const char *msg, ...); void __printf_args(1, 2) log_verbose(const char *msg, ...); -void __printf_args(1, 2) log_info(const char *msg, ...); void __printf_args(1, 2) log_notice(const char *msg, ...); void __printf_args(1, 2) log_warning(const char *msg, ...); void __printf_args(1, 2) log_error(const char *msg, ...); diff --git a/libs/libmdbx/src/test/long_stochastic.sh b/libs/libmdbx/src/test/long_stochastic.sh index 954d18268a..58ec6d5c6e 100644 --- a/libs/libmdbx/src/test/long_stochastic.sh +++ b/libs/libmdbx/src/test/long_stochastic.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash if ! which make cc c++ tee lz4 >/dev/null; then echo "Please install the following prerequisites: make cc c++ tee lz4" >&2 exit 1 @@ -7,19 +7,42 @@ fi set -euo pipefail UNAME="$(uname -s 2>/dev/null || echo Unknown)" + +## NOTE: Valgrind could produce some false-positive warnings +## in multi-process environment with shared memory. 
+## For instance, when the process "A" explicitly marks a memory +## region as "undefined", the process "B" fill it, +## and after this process "A" read such region, etc. +#VALGRIND="valgrind --trace-children=yes --log-file=valgrind-%p.log --leak-check=full --track-origins=yes --error-exitcode=42 --suppressions=test/valgrind_suppress.txt" + +############################################################################### +# 1. clean data from prev runs and examine available RAM + +if [[ -v VALGRIND && ! -z "$VALGRIND" ]]; then + rm -f valgrind-*.log +else + VALGRIND=time +fi + +WANNA_MOUNT=0 case ${UNAME} in Linux) MAKE=make if [[ ! -v TESTDB_DIR || -z "$TESTDB_DIR" ]]; then + for old_test_dir in $(ls -d /dev/shm/mdbx-test.[0-9]*); do + rm -rf $old_test_dir + done TESTDB_DIR="/dev/shm/mdbx-test.$$" fi mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/* + if LC_ALL=C free | grep -q -i available; then ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s [:blank:] ' ' | cut -d ' ' -f 7) / 1024)) else ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s [:blank:] ' ' | cut -d ' ' -f 4) / 1024)) fi ;; + FreeBSD) MAKE=gmake if [[ ! -v TESTDB_DIR || -z "$TESTDB_DIR" ]]; then @@ -27,18 +50,46 @@ case ${UNAME} in umount $old_test_dir && rm -r $old_test_dir done TESTDB_DIR="/tmp/mdbx-test.$$" - rm -rf $TESTDB_DIR && mkdir -p $TESTDB_DIR && mount -t tmpfs tmpfs $TESTDB_DIR + rm -rf $TESTDB_DIR && mkdir -p $TESTDB_DIR + WANNA_MOUNT=1 else mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/* fi + ram_avail_mb=$(($(LC_ALL=C vmstat -s | grep -ie '[0-9] pages free$' | cut -d p -f 1) * ($(LC_ALL=C vmstat -s | grep -ie '[0-9] bytes per page$' | cut -d b -f 1) / 1024) / 1024)) ;; + + Darwin) + MAKE=make + if [[ ! 
-v TESTDB_DIR || -z "$TESTDB_DIR" ]]; then + for vol in $(ls -d /Volumes/mdx[0-9]*[0-9]tst); do + disk=$(mount | grep $vol | cut -d ' ' -f 1) + echo "umount: volume $vol disk $disk" + hdiutil unmount $vol -force + hdiutil detach $disk + done + TESTDB_DIR="/Volumes/mdx$$tst" + WANNA_MOUNT=1 + else + mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/* + fi + + pagesize=$(($(LC_ALL=C vm_stat | grep -o 'page size of [0-9]\+ bytes' | cut -d' ' -f 4) / 1024)) + freepages=$(LC_ALL=C vm_stat | grep '^Pages free:' | grep -o '[0-9]\+\.$' | cut -d'.' -f 1) + ram_avail_mb=$((pagesize * freepages / 1024)) + echo "pagesize ${pagesize}K, freepages ${freepages}, ram_avail_mb ${ram_avail_mb}" + + ;; + *) echo "FIXME: ${UNAME} not supported by this script" exit 2 ;; esac +############################################################################### +# 2. estimate reasonable RAM space for test-db + echo "=== ${ram_avail_mb}M RAM available" ram_reserve4logs_mb=1234 if [ $ram_avail_mb -lt $ram_reserve4logs_mb ]; then @@ -70,16 +121,47 @@ if [ $db_size_mb -gt 3072 ]; then fi echo "=== use ${db_size_mb}M for DB" -${MAKE} TESTDB=${TESTDB_DIR}/smoke.db TESTLOG=${TESTDB_DIR}/smoke.log check +############################################################################### +# 3. Create test-directory in ramfs/tmpfs, i.e. create/format/mount if required +case ${UNAME} in + Linux) + ;; + + FreeBSD) + if [[ WANNA_MOUNT ]]; then + mount -t tmpfs tmpfs $TESTDB_DIR + fi + ;; + + Darwin) + if [[ WANNA_MOUNT ]]; then + ramdisk_size_mb=$((42 + db_size_mb * 2 + ram_reserve4logs_mb)) + number_of_sectors=$((ramdisk_size_mb * 2048)) + ramdev=$(hdiutil attach -nomount ram://${number_of_sectors}) + diskutil erasevolume ExFAT "mdx$$tst" ${ramdev} + fi + ;; + + *) + echo "FIXME: ${UNAME} not supported by this script" + exit 2 + ;; +esac + +############################################################################### +# 4. Run basic test, i.e. 
`make check` + +${MAKE} TEST_DB=${TESTDB_DIR}/smoke.db TEST_LOG=${TESTDB_DIR}/smoke.log check rm -f ${TESTDB_DIR}/* ############################################################################### +# 5. run stochastic iterations function rep9 { printf "%*s" $1 '' | tr ' ' '9'; } function join { local IFS="$1"; shift; echo "$*"; } function bit2option { local -n arr=$1; (( ($2&(1<<$3)) != 0 )) && echo -n '+' || echo -n '-'; echo "${arr[$3]}"; } -options=(writemap coalesce lifo) +options=(writemap coalesce lifo notls) function bits2list { local -n arr=$1 @@ -95,13 +177,13 @@ function probe { echo "=============================================== $(date)" echo "${caption}: $*" rm -f ${TESTDB_DIR}/* \ - && ./mdbx_test --ignore-dbfull --repeat=42 --pathname=${TESTDB_DIR}/long.db "$@" | lz4 > ${TESTDB_DIR}/long.log.lz4 \ - && ./mdbx_chk -nvvv ${TESTDB_DIR}/long.db | tee ${TESTDB_DIR}/long-chk.log \ - && ([ ! -e ${TESTDB_DIR}/long.db-copy ] || ./mdbx_chk -nvvv ${TESTDB_DIR}/long.db-copy | tee ${TESTDB_DIR}/long-chk-copy.log) \ + && ${VALGRIND} ./mdbx_test --ignore-dbfull --repeat=42 --pathname=${TESTDB_DIR}/long.db "$@" | lz4 > ${TESTDB_DIR}/long.log.lz4 \ + && ${VALGRIND} ./mdbx_chk -nvvv ${TESTDB_DIR}/long.db | tee ${TESTDB_DIR}/long-chk.log \ + && ([ ! -e ${TESTDB_DIR}/long.db-copy ] || ${VALGRIND} ./mdbx_chk -nvvv ${TESTDB_DIR}/long.db-copy | tee ${TESTDB_DIR}/long-chk-copy.log) \ || (echo "FAILED"; exit 1) } -############################################################################### +#------------------------------------------------------------------------------ count=0 for nops in $(seq 2 6); do diff --git a/libs/libmdbx/src/test/main.cc b/libs/libmdbx/src/test/main.cc index 959359a515..c00837fa72 100644 --- a/libs/libmdbx/src/test/main.cc +++ b/libs/libmdbx/src/test/main.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -14,9 +14,100 @@ #include "test.h" +#if !(defined(_WIN32) || defined(_WIN64)) +#include <sys/resource.h> +#include <sys/time.h> +#endif /* !Windows */ + void __noreturn usage(void) { - printf("usage:\n" - "\tFIXME\n"); + puts( + "usage:\n" + " --help or -h Show this text\n" + "Common parameters:\n" + " --pathname=... Path and/or name of database files\n" + " --repeat=N Set repeat counter\n" + " --threads=N Number of thread (unsunpported for now)\n" + " --timeout=N[s|m|h|d] Set timeout in seconds/minutes/hours/days\n" + " --failfast[=YES/no] Lill all actors on first failure/error\n" + " --max-readers=N See mdbx_env_set_maxreaders() description\n" + " --max-tables=N Se mdbx_env_set_maxdbs() description\n" + " --dump-config[=YES/no] Dump entire test config before run\n" + " --progress[=YES/no] Enable/disable progress `canary`\n" + " --console[=yes/no] Enable/disable console-like output\n" + " --cleanup-before[=YES/no] Cleanup/remove and re-create database\n" + " --cleanup-after[=YES/no] Cleanup/remove database after completion\n" + "Database size control:\n" + " --pagesize=... Database page size: min, max, 256..65536\n" + " --size-lower=N[K|M|G|T] Lower-bound of size in Kb/Mb/Gb/Tb\n" + " --size-upper Upper-bound of size in Kb/Mb/Gb/Tb\n" + " --size Initial size in Kb/Mb/Gb/Tb\n" + " --shrink-threshold Shrink threshold in Kb/Mb/Gb/Tb\n" + " --growth-step Grow step in Kb/Mb/Gb/Tb\n" + "Predefined complext scenarios/cases:\n" + " --case=... 
Only `basic` scenario implemented for now\n" + " basic == Simultaneous multi-process execution\n" + " of test-actors: nested,hill,ttl,copy,append,jitter,try\n" + "Test actors:\n" + " --hill Fill-up and empty-down\n" + " by CRUD-operation quads\n" + " --ttl Stochastic time-to-live simulation\n" + " --nested Nested transactionы\n" + " with stochastic-size bellows\n" + " --jitter Jitter/delays simulation\n" + " --try Try write-transaction, no more\n" + " --copy Online copy/backup\n" + " --append Append-mode insertions\n" + " --dead.reader Dead-reader simulator\n" + " --dead.writer Dead-writer simulator\n" + "Actor options:\n" + " --batch.read=N Read-operations batch size\n" + " --batch.write=N Write-operations batch size\n" + " --delay=N | --no-delay (no)Delay test-actor before start\n" + " --wait4ops=N | --no-wait4ops (no)Wait for previous test-actor\n" + " completes # ops before start\n" + " --duration=N[s|m|h|d] Define running duration\n" + " --nops=N[K|M|G|T] Define number of operations/steps\n" + " --inject-writefault[=yes|NO] TBD (see the source code)\n" + " --drop[=yes|NO] Drop key-value space/table on " + "completion\n" + " --ignore-dbfull[=yes|NO] Ignore MDBX_MAP_FULL error\n" + " --speculum[=yes|NO] Use internal `speculum` to check " + "dataset\n" + "Keys and Value:\n" + " --keylen.min=N Minimal keys length\n" + " --keylen.max=N Miximal keys length\n" + " --datalen.min=N Minimal data length\n" + " --datalen.max=N Miximal data length\n" + " --keygen.width=N TBD (see the source code)\n" + " --keygen.mesh=N TBD (see the source code)\n" + " --keygen.seed=N TBD (see the source code)\n" + " --keygen.split=N TBD (see the source code)\n" + " --keygen.rotate=N TBD (see the source code)\n" + " --keygen.offset=N TBD (see the source code)\n" + " --keygen.case=random Generator case (only `random` for now)\n" + "Database operation mode:\n" + " --mode={[+-]FLAG}[,[+-]FLAG]...\n" + " nosubdir == MDBX_NOSUBDIR\n" + " rdonly == MDBX_RDONLY\n" + " nometasync == 
MDBX_NOMETASYNC\n" + " lifo == MDBX_LIFORECLAIM\n" + " coalesce == MDBX_COALESCE\n" + " nosync-safe == MDBX_SAFE_NOSYNC\n" + " writemap == MDBX_WRITEMAP\n" + " mapasync == MDBX_MAPASYNC\n" + " nosync-utterly == MDBX_UTTERLY_NOSYNC\n" + " perturb == MDBX_PAGEPERTURB\n" + " notls == MDBX_NOTLS\n" + " nordahead == MDBX_NORDAHEAD\n" + " nomeminit == MDBX_NOMEMINIT\n" + "Key-value space/table options:\n" + " --table={[+-]FLAG}[,[+-]FLAG]...\n" + " key.reverse == MDBX_REVERSEKEY\n" + " key.integer == MDBX_INTEGERKEY\n" + " data.dups == MDBX_DUPSORT\n" + " data.integer == MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT\n" + " data.fixed == MDBX_DUPFIXED | MDBX_DUPSORT\n" + " data.reverse == MDBX_REVERSEDUP | MDBX_DUPSORT\n"); exit(EXIT_FAILURE); } @@ -25,17 +116,15 @@ void __noreturn usage(void) { void actor_params::set_defaults(const std::string &tmpdir) { pathname_log = ""; loglevel = -#ifdef NDEBUG - logging::info; -#elif defined(_WIN32) || defined(_WIN64) +#if defined(NDEBUG) || defined(_WIN32) || defined(_WIN64) logging::verbose; #else logging::trace; #endif pathname_db = tmpdir + "mdbx-test.db"; - mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NORDAHEAD | - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM; + mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NOMEMINIT | + MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_ACCEDE; table_flags = MDBX_DUPSORT; size_lower = -1; @@ -73,6 +162,7 @@ void actor_params::set_defaults(const std::string &tmpdir) { drop_table = false; ignore_dbfull = false; + speculum = false; max_readers = 42; max_tables = 42; @@ -82,7 +172,8 @@ void actor_params::set_defaults(const std::string &tmpdir) { global::config::cleanup_before = true; global::config::cleanup_after = true; global::config::failfast = true; - global::config::progress_indicator = osal_istty(STDERR_FILENO); + global::config::progress_indicator = true; + global::config::console_mode = osal_istty(STDERR_FILENO); } namespace global { @@ -103,6 +194,7 @@ 
bool cleanup_before; bool cleanup_after; bool failfast; bool progress_indicator; +bool console_mode; } /* namespace config */ } /* namespace global */ @@ -130,7 +222,7 @@ int main(int argc, char *const argv[]) { #endif /* _DEBUG */ if (argc < 2) - failure("No parameters given\n"); + failure("No parameters given. Try --help\n"); if (argc == 2 && strncmp(argv[1], global::thunk_param_prefix, strlen(global::thunk_param_prefix)) == 0) @@ -139,6 +231,10 @@ int main(int argc, char *const argv[]) { ? EXIT_SUCCESS : EXIT_FAILURE; + if (argc == 2 && + (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0)) + usage(); + actor_params params; params.set_defaults(osal_tempdir()); global::config::dump_config = true; @@ -168,8 +264,8 @@ int main(int argc, char *const argv[]) { } if (config::parse_option(argc, argv, narg, "pagesize", params.pagesize, - mdbx_limits_pgsize_min(), - mdbx_limits_pgsize_max())) { + int(mdbx_limits_pgsize_min()), + int(mdbx_limits_pgsize_max()))) { const unsigned keylen_max = params.mdbx_keylen_max(); if (params.keylen_min > keylen_max) params.keylen_min = keylen_max; @@ -182,6 +278,17 @@ int main(int argc, char *const argv[]) { params.datalen_max = datalen_max; continue; } + if (config::parse_option(argc, argv, narg, "repeat", params.nrepeat, + config::no_scale)) + continue; + if (config::parse_option(argc, argv, narg, "threads", params.nthreads, + config::no_scale, 1, 64)) + continue; + if (config::parse_option(argc, argv, narg, "timeout", + global::config::timeout_duration_seconds, + config::duration, 1)) + continue; + if (config::parse_option_intptr(argc, argv, narg, "size-lower", params.size_lower, mdbx_limits_dbsize_min(params.pagesize), @@ -231,17 +338,6 @@ int main(int argc, char *const argv[]) { keycase_setup(value, params); continue; } - - if (config::parse_option(argc, argv, narg, "repeat", params.nrepeat, - config::no_scale)) - continue; - if (config::parse_option(argc, argv, narg, "threads", params.nthreads, - config::no_scale, 1, 
64)) - continue; - if (config::parse_option(argc, argv, narg, "timeout", - global::config::timeout_duration_seconds, - config::duration, 1)) - continue; if (config::parse_option(argc, argv, narg, "keylen.min", params.keylen_min, config::no_scale, params.mdbx_keylen_min(), params.mdbx_keylen_max())) { @@ -296,6 +392,8 @@ int main(int argc, char *const argv[]) { if (config::parse_option(argc, argv, narg, "ignore-dbfull", params.ignore_dbfull)) continue; + if (config::parse_option(argc, argv, narg, "speculum", params.speculum)) + continue; if (config::parse_option(argc, argv, narg, "dump-config", global::config::dump_config)) continue; @@ -316,7 +414,7 @@ int main(int argc, char *const argv[]) { params.delaystart = 0; continue; } - if (config::parse_option(argc, argv, narg, "no-wait", nullptr)) { + if (config::parse_option(argc, argv, narg, "no-wait4ops", nullptr)) { params.waitfor_nops = 0; continue; } @@ -346,6 +444,10 @@ int main(int argc, char *const argv[]) { configure_actor(last_space_id, ac_deadwrite, value, params); continue; } + if (config::parse_option(argc, argv, narg, "try", nullptr)) { + configure_actor(last_space_id, ac_try, value, params); + continue; + } if (config::parse_option(argc, argv, narg, "copy", nullptr)) { configure_actor(last_space_id, ac_copy, value, params); continue; @@ -358,17 +460,24 @@ int main(int argc, char *const argv[]) { configure_actor(last_space_id, ac_ttl, value, params); continue; } + if (config::parse_option(argc, argv, narg, "nested", nullptr)) { + configure_actor(last_space_id, ac_nested, value, params); + continue; + } if (config::parse_option(argc, argv, narg, "failfast", global::config::failfast)) continue; if (config::parse_option(argc, argv, narg, "progress", global::config::progress_indicator)) continue; + if (config::parse_option(argc, argv, narg, "console", + global::config::console_mode)) + continue; if (*argv[narg] != '-') testcase_setup(argv[narg], params, last_space_id); else - failure("Unknown option '%s'\n", 
argv[narg]); + failure("Unknown option '%s'. Try --help\n", argv[narg]); } if (global::config::dump_config) @@ -456,8 +565,8 @@ int main(int argc, char *const argv[]) { if (!actor) continue; - log_info("actor #%u, id %d, pid %u: %s\n", actor->actor_id, - actor->space_id, pid, status2str(status)); + log_verbose("actor #%u, id %d, pid %ld: %s\n", actor->actor_id, + actor->space_id, (long)pid, status2str(status)); if (status > as_running) { left -= 1; if (status != as_successful) { @@ -480,9 +589,37 @@ int main(int argc, char *const argv[]) { log_notice("RESULT: %s\n", failed ? "Failed" : "Successful"); if (global::config::cleanup_before) { if (failed) - log_info("skip cleanup"); + log_verbose("skip cleanup"); else cleanup(); } + +#if !(defined(_WIN32) || defined(_WIN64)) + struct rusage spent; + if (!getrusage(global::singlemode ? RUSAGE_SELF : RUSAGE_CHILDREN, &spent)) { + log_notice("%6s: user %f, system %f", "CPU", + spent.ru_utime.tv_sec + spent.ru_utime.tv_usec * 1e-6, + spent.ru_stime.tv_sec + spent.ru_stime.tv_usec * 1e-6); +#if defined(__linux__) || defined(__gnu_linux__) || defined(__FreeBSD__) || \ + defined(__NetBSD__) || defined(__OpenBSD__) || defined(__BSD__) || \ + defined(__bsdi__) || defined(__DragonFly__) || defined(__APPLE__) || \ + defined(__MACH__) || defined(__sun) + log_notice("%6s: read %ld, write %ld", "IOPs", spent.ru_inblock, + spent.ru_oublock); + if (spent.ru_maxrss > 0) + log_notice("%6s: %ld Kb", "RAM", + spent.ru_maxrss +#if defined(__sun) + * getpagesize() / 1024u +#elif defined(__APPLE__) + / 1024u +#endif + ); + log_notice("%6s: reclaims %ld, faults %ld, swaps %ld", "Paging", + spent.ru_minflt, spent.ru_majflt, spent.ru_nswap); +#endif /* Linux */ + } +#endif /* !Windows */ + return failed ? 
EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/libs/libmdbx/src/test/osal-unix.cc b/libs/libmdbx/src/test/osal-unix.cc index 0157bace23..4b8694c708 100644 --- a/libs/libmdbx/src/test/osal-unix.cc +++ b/libs/libmdbx/src/test/osal-unix.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -21,112 +21,266 @@ #include <sys/wait.h> #include <unistd.h> -#ifdef __APPLE__ +#ifndef MDBX_LOCKING +#error "Opps, MDBX_LOCKING is undefined!" +#endif + +#if defined(__APPLE__) && (MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008) #include "darwin/pthread_barrier.c" +#endif /* __APPLE__ && MDBX_LOCKING >= MDBX_LOCKING_POSIX2001 */ + +#if MDBX_LOCKING == MDBX_LOCKING_SYSV +#include <sys/ipc.h> +#include <sys/sem.h> +#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + +#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 +#include <semaphore.h> + +#if __cplusplus >= 201103L +#include <atomic> +static __inline __maybe_unused int atomic_decrement(std::atomic_int *p) { + return std::atomic_fetch_sub(p, 1) - 1; +} +#else +static __inline __maybe_unused int atomic_decrement(volatile int *p) { +#if defined(__GNUC__) || defined(__clang__) + return __sync_sub_and_fetch(p, 1); +#elif defined(_MSC_VER) + STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile int)); + return _InterlockedDecrement((volatile long *)p); +#elif defined(__APPLE__) + return OSAtomicDecrement32Barrier((volatile int *)p); +#else +#error FIXME: Unsupported compiler #endif +} +#endif /* C++11 */ +#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX1988 */ + +#if MDBX_LOCKING == MDBX_LOCKING_SYSV +static int ipc; +static pid_t ipc_overlord_pid; +static void ipc_remove(void) { + if (ipc_overlord_pid == getpid()) + semctl(ipc, 0, IPC_RMID, nullptr); +} +#else struct shared_t { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 pthread_barrier_t barrier; pthread_mutex_t 
mutex; - size_t conds_size; - pthread_cond_t conds[1]; + size_t count; + pthread_cond_t events[1]; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + struct { +#if __cplusplus >= 201103L + std::atomic_int countdown; +#else + volatile int countdown; +#endif /* C++11 */ + sem_t sema; + } barrier; + size_t count; + sem_t events[1]; +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ }; - static shared_t *shared; +#endif /* MDBX_LOCKING != MDBX_LOCKING_SYSV */ void osal_wait4barrier(void) { +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + struct sembuf op; + op.sem_num = 0; + op.sem_op = -1; + op.sem_flg = IPC_NOWAIT; + if (semop(ipc, &op, 1)) + failure_perror("semop(dec)", errno); + op.sem_op = 0; + op.sem_flg = 0; + if (semop(ipc, &op, 1)) + failure_perror("semop(wait)", errno); +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 assert(shared != nullptr && shared != MAP_FAILED); - int rc = pthread_barrier_wait(&shared->barrier); - if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { - failure_perror("pthread_barrier_wait(shared)", rc); - } + int err = pthread_barrier_wait(&shared->barrier); + if (err != 0 && err != PTHREAD_BARRIER_SERIAL_THREAD) + failure_perror("pthread_barrier_wait(shared)", err); +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + assert(shared != nullptr && shared != MAP_FAILED); + int err = (atomic_decrement(&shared->barrier.countdown) > 0 && + sem_wait(&shared->barrier.sema)) + ? 
errno + : 0; + if (err != 0) + failure_perror("sem_wait(shared)", err); + if (sem_post(&shared->barrier.sema)) + failure_perror("sem_post(shared)", errno); +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ } void osal_setup(const std::vector<actor_config> &actors) { +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + if (ipc_overlord_pid) + failure("ipc already created by %ld pid", (long)ipc_overlord_pid); + ipc_overlord_pid = getpid(); +#ifndef SEM_A +#define SEM_A S_IRUSR +#endif +#ifndef SEM_R +#define SEM_R S_IWUSR +#endif + ipc = semget(IPC_PRIVATE, actors.size() + 2, IPC_CREAT | SEM_A | SEM_R); + if (ipc < 0) + failure_perror("semget(IPC_PRIVATE, shared_sems)", errno); + if (atexit(ipc_remove)) + failure_perror("atexit(ipc_remove)", errno); + if (semctl(ipc, 0, SETVAL, (int)(actors.size() + 1))) + failure_perror("semctl(SETVAL.0, shared_sems)", errno); + for (size_t i = 1; i < actors.size() + 2; ++i) + if (semctl(ipc, i, SETVAL, 1)) + failure_perror("semctl(SETVAL.N, shared_sems)", errno); +#else assert(shared == nullptr); - - pthread_mutexattr_t mutexattr; - int rc = pthread_mutexattr_init(&mutexattr); - if (rc) - failure_perror("pthread_mutexattr_init()", rc); - rc = pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED); - if (rc) - failure_perror("pthread_mutexattr_setpshared()", rc); - - pthread_barrierattr_t barrierattr; - rc = pthread_barrierattr_init(&barrierattr); - if (rc) - failure_perror("pthread_barrierattr_init()", rc); - rc = pthread_barrierattr_setpshared(&barrierattr, PTHREAD_PROCESS_SHARED); - if (rc) - failure_perror("pthread_barrierattr_setpshared()", rc); - - pthread_condattr_t condattr; - rc = pthread_condattr_init(&condattr); - if (rc) - failure_perror("pthread_condattr_init()", rc); - rc = pthread_condattr_setpshared(&condattr, PTHREAD_PROCESS_SHARED); - if (rc) - failure_perror("pthread_condattr_setpshared()", rc); - shared = (shared_t *)mmap( - nullptr, sizeof(shared_t) + actors.size() * sizeof(pthread_cond_t), - PROT_READ | 
PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + nullptr, sizeof(shared_t) + actors.size() * sizeof(shared->events[0]), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS +#ifdef MAP_HASSEMAPHORE + | MAP_HASSEMAPHORE +#endif + , + -1, 0); if (MAP_FAILED == (void *)shared) failure_perror("mmap(shared_conds)", errno); - rc = pthread_mutex_init(&shared->mutex, &mutexattr); - if (rc) - failure_perror("pthread_mutex_init(shared)", rc); + shared->count = actors.size() + 1; - rc = pthread_barrier_init(&shared->barrier, &barrierattr, actors.size() + 1); - if (rc) - failure_perror("pthread_barrier_init(shared)", rc); +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + pthread_barrierattr_t barrierattr; + int err = pthread_barrierattr_init(&barrierattr); + if (err) + failure_perror("pthread_barrierattr_init()", err); + err = pthread_barrierattr_setpshared(&barrierattr, PTHREAD_PROCESS_SHARED); + if (err) + failure_perror("pthread_barrierattr_setpshared()", err); + + err = pthread_barrier_init(&shared->barrier, &barrierattr, shared->count); + if (err) + failure_perror("pthread_barrier_init(shared)", err); + pthread_barrierattr_destroy(&barrierattr); - const size_t n = actors.size() + 1; - for (size_t i = 0; i < n; ++i) { - pthread_cond_t *event = &shared->conds[i]; - rc = pthread_cond_init(event, &condattr); - if (rc) - failure_perror("pthread_cond_init(shared)", rc); + pthread_mutexattr_t mutexattr; + err = pthread_mutexattr_init(&mutexattr); + if (err) + failure_perror("pthread_mutexattr_init()", err); + err = pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED); + if (err) + failure_perror("pthread_mutexattr_setpshared()", err); + + pthread_condattr_t condattr; + err = pthread_condattr_init(&condattr); + if (err) + failure_perror("pthread_condattr_init()", err); + err = pthread_condattr_setpshared(&condattr, PTHREAD_PROCESS_SHARED); + if (err) + failure_perror("pthread_condattr_setpshared()", err); + + err = 
pthread_mutex_init(&shared->mutex, &mutexattr); + if (err) + failure_perror("pthread_mutex_init(shared)", err); + + for (size_t i = 0; i < shared->count; ++i) { + pthread_cond_t *event = &shared->events[i]; + err = pthread_cond_init(event, &condattr); + if (err) + failure_perror("pthread_cond_init(shared)", err); log_trace("osal_setup: event(shared pthread_cond) %" PRIuPTR " -> %p", i, - event); + __Wpedantic_format_voidptr(event)); } - shared->conds_size = actors.size() + 1; - - pthread_barrierattr_destroy(&barrierattr); pthread_condattr_destroy(&condattr); pthread_mutexattr_destroy(&mutexattr); +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + shared->barrier.countdown = shared->count; + if (sem_init(&shared->barrier.sema, true, 1)) + failure_perror("sem_init(shared.barrier)", errno); + for (size_t i = 0; i < shared->count; ++i) { + sem_t *event = &shared->events[i]; + if (sem_init(event, true, 0)) + failure_perror("sem_init(shared.event)", errno); + log_trace("osal_setup: event(shared sem_init) %" PRIuPTR " -> %p", i, + __Wpedantic_format_voidptr(event)); + } +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING != MDBX_LOCKING_SYSV */ } void osal_broadcast(unsigned id) { - assert(shared != nullptr && shared != MAP_FAILED); log_trace("osal_broadcast: event %u", id); - if (id >= shared->conds_size) +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + if (semctl(ipc, id + 1, SETVAL, 0)) + failure_perror("semctl(SETVAL)", errno); +#else + assert(shared != nullptr && shared != MAP_FAILED); + if (id >= shared->count) failure("osal_broadcast: id > limit"); - int rc = pthread_cond_broadcast(shared->conds + id); - if (rc) - failure_perror("sem_post(shared)", rc); +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + int err = pthread_cond_broadcast(shared->events + id); + if (err) + failure_perror("pthread_cond_broadcast(shared)", err); +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + if (sem_post(shared->events + id)) + 
failure_perror("sem_post(shared)", errno); +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING != MDBX_LOCKING_SYSV */ } int osal_waitfor(unsigned id) { - assert(shared != nullptr && shared != MAP_FAILED); - log_trace("osal_waitfor: event %u", id); - if (id >= shared->conds_size) +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + struct sembuf op; + memset(&op, 0, sizeof(op)); + op.sem_num = (short)(id + 1); + int rc = semop(ipc, &op, 1) ? errno : MDBX_SUCCESS; +#else + assert(shared != nullptr && shared != MAP_FAILED); + if (id >= shared->count) failure("osal_waitfor: id > limit"); +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 int rc = pthread_mutex_lock(&shared->mutex); if (rc != 0) failure_perror("pthread_mutex_lock(shared)", rc); - rc = pthread_cond_wait(shared->conds + id, &shared->mutex); + rc = pthread_cond_wait(shared->events + id, &shared->mutex); if (rc && rc != EINTR) failure_perror("pthread_cond_wait(shared)", rc); rc = pthread_mutex_unlock(&shared->mutex); if (rc != 0) failure_perror("pthread_mutex_unlock(shared)", rc); +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + int rc = sem_wait(shared->events + id) ? errno : 0; + if (rc == 0 && sem_post(shared->events + id)) + failure_perror("sem_post(shared)", errno); +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ +#endif /* MDBX_LOCKING != MDBX_LOCKING_SYSV */ return (rc == 0) ? true : false; } @@ -149,21 +303,69 @@ bool actor_config::osal_deserialize(const char *str, const char *end, //----------------------------------------------------------------------------- +static pid_t overlord_pid; + +static volatile sig_atomic_t sigusr1_head, sigusr2_head; +static void handler_SIGUSR(int signum) { + switch (signum) { + case SIGUSR1: + sigusr1_head += 1; + return; + case SIGUSR2: + sigusr2_head += 1; + return; + default: + abort(); + } +} + +bool osal_progress_push(bool active) { + if (overlord_pid) { + if (kill(overlord_pid, active ? 
SIGUSR1 : SIGUSR2)) + failure_perror("osal_progress_push: kill(overload)", errno); + return true; + } + + return false; +} + +//----------------------------------------------------------------------------- + static std::unordered_map<pid_t, actor_status> childs; -static void handler_SIGCHLD(int unused) { (void)unused; } +static volatile sig_atomic_t sigalarm_head; +static void handler_SIGCHLD(int signum) { + if (signum == SIGALRM) + sigalarm_head += 1; +} mdbx_pid_t osal_getpid(void) { return getpid(); } int osal_delay(unsigned seconds) { return sleep(seconds) ? errno : 0; } int osal_actor_start(const actor_config &config, mdbx_pid_t &pid) { - if (childs.empty()) - signal(SIGCHLD, handler_SIGCHLD); + if (childs.empty()) { + struct sigaction act; + memset(&act, 0, sizeof(act)); + act.sa_handler = handler_SIGCHLD; + sigaction(SIGCHLD, &act, nullptr); + sigaction(SIGALRM, &act, nullptr); + act.sa_handler = handler_SIGUSR; + sigaction(SIGUSR1, &act, nullptr); + sigaction(SIGUSR2, &act, nullptr); + + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGCHLD); + sigaddset(&mask, SIGUSR1); + sigaddset(&mask, SIGUSR2); + sigprocmask(SIG_UNBLOCK, &mask, nullptr); + } pid = fork(); if (pid == 0) { + overlord_pid = getppid(); const bool result = test_execute(config); exit(result ? EXIT_SUCCESS : EXIT_FAILURE); } @@ -171,7 +373,8 @@ int osal_actor_start(const actor_config &config, mdbx_pid_t &pid) { if (pid < 0) return errno; - log_trace("osal_actor_start: fork pid %i for %u", pid, config.actor_id); + log_trace("osal_actor_start: fork pid %ld for %u", (long)pid, + config.actor_id); childs[pid] = as_running; return 0; } @@ -186,60 +389,67 @@ void osal_killall_actors(void) { } int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout) { - struct timespec ts; - ts.tv_nsec = 0; - ts.tv_sec = (timeout > INT_MAX) ? 
INT_MAX : timeout; -retry: - int status, options = WNOHANG; + static sig_atomic_t sigalarm_tail; + alarm(0) /* cancel prev timeout */; + sigalarm_tail = sigalarm_head /* reset timeout flag */; + + int options = WNOHANG; + if (timeout) { + alarm((timeout > INT_MAX) ? INT_MAX : timeout); + options = 0; + } + #ifdef WUNTRACED options |= WUNTRACED; #endif #ifdef WCONTINUED options |= WCONTINUED; #endif - pid = waitpid(0, &status, options); - - if (pid > 0) { - if (WIFEXITED(status)) - childs[pid] = - (WEXITSTATUS(status) == EXIT_SUCCESS) ? as_successful : as_failed; - else if (WCOREDUMP(status)) - childs[pid] = as_coredump; - else if (WIFSIGNALED(status)) - childs[pid] = as_killed; - else if (WIFSTOPPED(status)) - childs[pid] = as_debuging; - else if (WIFCONTINUED(status)) - childs[pid] = as_running; - else { - assert(false); - } - return 0; - } - if (pid == 0) { - /* child still running */ - if (ts.tv_sec == 0 && ts.tv_nsec == 0) - ts.tv_nsec = 1; - if (nanosleep(&ts, &ts) == 0) { - /* timeout and no signal from child */ - pid = 0; + while (sigalarm_tail == sigalarm_head) { + int status; + pid = waitpid(0, &status, options); + + if (pid > 0) { + if (WIFEXITED(status)) + childs[pid] = + (WEXITSTATUS(status) == EXIT_SUCCESS) ? 
as_successful : as_failed; + else if (WCOREDUMP(status)) + childs[pid] = as_coredump; + else if (WIFSIGNALED(status)) + childs[pid] = as_killed; + else if (WIFSTOPPED(status)) + childs[pid] = as_debugging; + else if (WIFCONTINUED(status)) + childs[pid] = as_running; + else { + assert(false); + } return 0; } - if (errno == EINTR) - goto retry; - } - switch (errno) { - case EINTR: - pid = 0; - return 0; + static sig_atomic_t sigusr1_tail, sigusr2_tail; + if (sigusr1_tail != sigusr1_head) { + sigusr1_tail = sigusr1_head; + logging::progress_canary(true); + if (pid < 0 && errno == EINTR) + continue; + } + if (sigusr2_tail != sigusr2_head) { + sigusr2_tail = sigusr2_head; + logging::progress_canary(false); + if (pid < 0 && errno == EINTR) + continue; + } - case ECHILD: - default: - pid = 0; - return errno; + if (pid == 0) + break; + + int err = errno; + if (err != EINTR) + return err; } + return 0 /* timeout */; } void osal_yield(void) { @@ -254,9 +464,17 @@ void osal_udelay(unsigned us) { static unsigned threshold_us; if (threshold_us == 0) { +#if defined(_POSIX_CPUTIME) && _POSIX_CPUTIME > -1 && \ + defined(CLOCK_PROCESS_CPUTIME_ID) if (clock_getres(CLOCK_PROCESS_CPUTIME_ID, &ts)) { int rc = errno; - failure_perror("clock_getres(CLOCK_PROCESS_CPUTIME_ID)", rc); + log_warning("clock_getres(CLOCK_PROCESS_CPUTIME_ID), failed errno %d", + rc); + } +#endif /* CLOCK_PROCESS_CPUTIME_ID */ + if (threshold_us == 0 && clock_getres(CLOCK_MONOTONIC, &ts)) { + int rc = errno; + failure_perror("clock_getres(CLOCK_MONOTONIC)", rc); } chrono::time threshold = chrono::from_timespec(ts); assert(threshold.seconds() == 0); diff --git a/libs/libmdbx/src/test/osal-windows.cc b/libs/libmdbx/src/test/osal-windows.cc index 975d8268f5..6de674651b 100644 --- a/libs/libmdbx/src/test/osal-windows.cc +++ b/libs/libmdbx/src/test/osal-windows.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -16,6 +16,7 @@ static std::unordered_map<unsigned, HANDLE> events; static HANDLE hBarrierSemaphore, hBarrierEvent; +static HANDLE hProgressActiveEvent, hProgressPassiveEvent; static int waitstatus2errcode(DWORD result) { switch (result) { @@ -85,6 +86,16 @@ void osal_setup(const std::vector<actor_config> &actors) { if (!hBarrierEvent) failure_perror("CreateEvent(BarrierEvent)", GetLastError()); hBarrierEvent = make_inheritable(hBarrierEvent); + + hProgressActiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL); + if (!hProgressActiveEvent) + failure_perror("CreateEvent(ProgressActiveEvent)", GetLastError()); + hProgressActiveEvent = make_inheritable(hProgressActiveEvent); + + hProgressPassiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL); + if (!hProgressPassiveEvent) + failure_perror("CreateEvent(ProgressPassiveEvent)", GetLastError()); + hProgressPassiveEvent = make_inheritable(hProgressPassiveEvent); } void osal_broadcast(unsigned id) { @@ -112,6 +123,8 @@ const std::string actor_config::osal_serialize(simple_checksum &checksum) const { checksum.push(hBarrierSemaphore); checksum.push(hBarrierEvent); + checksum.push(hProgressActiveEvent); + checksum.push(hProgressPassiveEvent); HANDLE hWait = INVALID_HANDLE_VALUE; if (wait4id) { @@ -125,8 +138,8 @@ actor_config::osal_serialize(simple_checksum &checksum) const { checksum.push(hSignal); } - return format("%p.%p.%p.%p", hBarrierSemaphore, hBarrierEvent, hWait, - hSignal); + return format("%p.%p.%p.%p.%p.%p", hBarrierSemaphore, hBarrierEvent, hWait, + hSignal, hProgressActiveEvent, hProgressPassiveEvent); } bool actor_config::osal_deserialize(const char *str, const char *end, @@ -137,17 +150,22 @@ bool actor_config::osal_deserialize(const char *str, const char *end, assert(hBarrierSemaphore == 0); assert(hBarrierEvent == 0); + assert(hProgressActiveEvent == 0); + assert(hProgressPassiveEvent == 0); assert(events.empty()); HANDLE hWait, hSignal; - if (sscanf_s(copy.c_str(), "%p.%p.%p.%p", &hBarrierSemaphore, 
&hBarrierEvent, - &hWait, &hSignal) != 4) { + if (sscanf_s(copy.c_str(), "%p.%p.%p.%p.%p.%p", &hBarrierSemaphore, + &hBarrierEvent, &hWait, &hSignal, &hProgressActiveEvent, + &hProgressPassiveEvent) != 6) { TRACE("<< osal_deserialize: failed\n"); return false; } checksum.push(hBarrierSemaphore); checksum.push(hBarrierEvent); + checksum.push(hProgressActiveEvent); + checksum.push(hProgressPassiveEvent); if (wait4id) { checksum.push(hWait); @@ -168,6 +186,17 @@ bool actor_config::osal_deserialize(const char *str, const char *end, typedef std::pair<HANDLE, actor_status> child; static std::unordered_map<mdbx_pid_t, child> childs; +bool osal_progress_push(bool active) { + if (!childs.empty()) { + if (!SetEvent(active ? hProgressActiveEvent : hProgressPassiveEvent)) + failure_perror("osal_progress_push: SetEvent(overlord.progress)", + GetLastError()); + return true; + } + + return false; +} + static void ArgvQuote(std::string &CommandLine, const std::string &Argument, bool Force = false) @@ -314,7 +343,7 @@ actor_status osal_actor_info(const mdbx_pid_t pid) { break; case EXCEPTION_BREAKPOINT: case EXCEPTION_SINGLE_STEP: - status = as_debuging; + status = as_debugging; break; case STATUS_CONTROL_C_EXIT: status = as_killed; @@ -344,32 +373,45 @@ void osal_killall_actors(void) { int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout) { std::vector<HANDLE> handles; - handles.reserve(childs.size()); + handles.reserve(childs.size() + 2); + handles.push_back(hProgressActiveEvent); + handles.push_back(hProgressPassiveEvent); for (const auto &pair : childs) if (pair.second.second <= as_running) handles.push_back(pair.second.first); - DWORD rc = - MsgWaitForMultipleObjectsEx((DWORD)handles.size(), &handles[0], - (timeout > 60) ? 60 * 1000 : timeout * 1000, - QS_ALLINPUT | QS_ALLPOSTMESSAGE, 0); + while (true) { + DWORD rc = + MsgWaitForMultipleObjectsEx((DWORD)handles.size(), &handles[0], + (timeout > 60) ? 
60 * 1000 : timeout * 1000, + QS_ALLINPUT | QS_ALLPOSTMESSAGE, 0); - if (rc >= WAIT_OBJECT_0 && rc < WAIT_OBJECT_0 + handles.size()) { - pid = 0; - for (const auto &pair : childs) - if (pair.second.first == handles[rc - WAIT_OBJECT_0]) { - pid = pair.first; - break; - } - return 0; - } + if (rc == WAIT_OBJECT_0) { + logging::progress_canary(true); + continue; + } + if (rc == WAIT_OBJECT_0 + 1) { + logging::progress_canary(false); + continue; + } - if (rc == WAIT_TIMEOUT) { - pid = 0; - return 0; - } + if (rc >= WAIT_OBJECT_0 + 2 && rc < WAIT_OBJECT_0 + handles.size()) { + pid = 0; + for (const auto &pair : childs) + if (pair.second.first == handles[rc - WAIT_OBJECT_0]) { + pid = pair.first; + break; + } + return 0; + } - return waitstatus2errcode(rc); + if (rc == WAIT_TIMEOUT) { + pid = 0; + return 0; + } + + return waitstatus2errcode(rc); + } } void osal_yield(void) { SwitchToThread(); } diff --git a/libs/libmdbx/src/test/osal.h b/libs/libmdbx/src/test/osal.h index 5acf7ad094..6d0e1c4eb6 100644 --- a/libs/libmdbx/src/test/osal.h +++ b/libs/libmdbx/src/test/osal.h @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -26,6 +26,8 @@ void osal_killall_actors(void); int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout); void osal_wait4barrier(void); +bool osal_progress_push(bool active); + mdbx_pid_t osal_getpid(void); int osal_delay(unsigned seconds); void osal_udelay(unsigned us); diff --git a/libs/libmdbx/src/test/pcrf/CMakeLists.txt b/libs/libmdbx/src/test/pcrf/CMakeLists.txt index 399c33f88f..8bd3e3d859 100644 --- a/libs/libmdbx/src/test/pcrf/CMakeLists.txt +++ b/libs/libmdbx/src/test/pcrf/CMakeLists.txt @@ -1,7 +1,5 @@ set(TARGET pcrf_test) -project(${TARGET}) - add_executable(${TARGET} pcrf_test.c) - +target_include_directories(${TARGET} PRIVATE "${PROJECT_SOURCE_DIR}") target_link_libraries(${TARGET} mdbx) diff --git a/libs/libmdbx/src/test/pcrf/pcrf_test.c b/libs/libmdbx/src/test/pcrf/pcrf_test.c index 213c8b1d80..206bd21d4e 100644 --- a/libs/libmdbx/src/test/pcrf/pcrf_test.c +++ b/libs/libmdbx/src/test/pcrf/pcrf_test.c @@ -1,5 +1,5 @@ /* - * Copyright 2016-2017 Leonid Yuriev <leo@yuriev.ru>. + * Copyright 2016-2019 Leonid Yuriev <leo@yuriev.ru>. * Copyright 2015 Vladimir Romanov * <https://www.linkedin.com/in/vladimirromanov>, Yota Lab. 
* @@ -36,7 +36,7 @@ (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), \ (int)((addr)&0xff) -char opt_db_path[PATH_MAX] = "/root/lmdbx_bench2"; +char opt_db_path[PATH_MAX] = "./mdbx_bench2"; static MDBX_env *env; #define REC_COUNT 10240000 int64_t ids[REC_COUNT * 10]; @@ -105,12 +105,12 @@ static void db_connect() { MDBX_dbi dbi_ip; MDBX_CHECK(mdbx_env_create(&env)); - MDBX_CHECK( - mdbx_env_set_mapsize(env, REC_COUNT * sizeof(session_data_t) * 10)); + MDBX_CHECK(mdbx_env_set_geometry( + env, 0, 0, REC_COUNT * sizeof(session_data_t) * 10, -1, -1, -1)); MDBX_CHECK(mdbx_env_set_maxdbs(env, 30)); MDBX_CHECK(mdbx_env_open(env, opt_db_path, MDBX_CREATE | MDBX_WRITEMAP | MDBX_MAPASYNC | - MDBX_NOSYNC | MDBX_LIFORECLAIM, + MDBX_SAFE_NOSYNC | MDBX_LIFORECLAIM, 0664)); MDBX_txn *txn; @@ -126,7 +126,7 @@ static void db_connect() { printf("Connection open\n"); } -static void create_record(int64_t record_id) { +static void create_record(uint64_t record_id) { MDBX_dbi dbi_session; MDBX_dbi dbi_session_id; MDBX_dbi dbi_event; @@ -136,11 +136,13 @@ static void create_record(int64_t record_id) { session_data_t data; // transaction init snprintf(data.session_id1, sizeof(data.session_id1), - "prefix%02ld_%02ld.fill.fill.fill.fill.fill.fill;%ld", - record_id % 3 + 1, record_id % 9 + 1, record_id); + "prefix%02u_%02u.fill.fill.fill.fill.fill.fill;%" PRIu64, + (unsigned)(record_id % 3) + 1, (unsigned)(record_id % 9) + 1, + record_id); snprintf(data.session_id2, sizeof(data.session_id2), - "dprefix%ld;%ld.fill.fill.;suffix", record_id, - record_id % 1000000000 + 99999); + "dprefix%" PRIu64 ";%" PRIu64 ".fill.fill.;suffix", record_id, + (record_id + UINT64_C(1442695040888963407)) % + UINT64_C(6364136223846793005)); snprintf(data.ip, sizeof(data.ip), "%d.%d.%d.%d", IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); event.obj_id = record_id; @@ -241,8 +243,9 @@ static void get_db_stat(const char *db, int64_t *ms_branch_pages, MDBX_CHECK(mdbx_dbi_open(txn, db, 
MDBX_CREATE, &dbi)); MDBX_CHECK(mdbx_dbi_stat(txn, dbi, &stat, sizeof(stat))); mdbx_txn_abort(txn); - printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", db, - stat.ms_branch_pages, stat.ms_depth, stat.ms_entries, + printf("%15s | %15" PRIu64 " | %5u | %10" PRIu64 " | %10" PRIu64 + " | %11" PRIu64 " |\n", + db, stat.ms_branch_pages, stat.ms_depth, stat.ms_entries, stat.ms_leaf_pages, stat.ms_overflow_pages); (*ms_branch_pages) += stat.ms_branch_pages; (*ms_leaf_pages) += stat.ms_leaf_pages; @@ -253,8 +256,8 @@ static void periodic_stat(void) { int64_t ms_leaf_pages = 0; MDBX_stat mst; MDBX_envinfo mei; - MDBX_CHECK(mdbx_env_stat(env, &mst, sizeof(mst))); - MDBX_CHECK(mdbx_env_info(env, &mei, sizeof(mei))); + MDBX_CHECK(mdbx_env_stat_ex(env, NULL, &mst, sizeof(mst))); + MDBX_CHECK(mdbx_env_info_ex(env, NULL, &mei, sizeof(mei))); printf("Environment Info\n"); printf(" Pagesize: %u\n", mst.ms_psize); if (mei.mi_geo.lower != mei.mi_geo.upper) { @@ -287,8 +290,8 @@ static void periodic_stat(void) { get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); - printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, - "", "", ms_leaf_pages, ""); + printf("%15s | %15" PRIu64 " | %5s | %10s | %10" PRIu64 " | %11s |\n", "", + ms_branch_pages, "", "", ms_leaf_pages, ""); static int64_t prev_add_count; static int64_t prev_del_count; @@ -297,26 +300,32 @@ static void periodic_stat(void) { static int64_t t = -1; if (t > 0) { int64_t delta = (getClockUs() - t); - printf( - "CPS: add %ld, delete %ld, items processed - %ldK data=%ldK key=%ldK\n", - (mdbx_add_count - prev_add_count) * 1000000 / delta, - (mdbx_del_count - prev_del_count) * 1000000 / delta, obj_id / 1024, - mdbx_data_size / 1024, mdbx_key_size / 1024); - printf("usage data=%ld%%", ((mdbx_data_size + mdbx_key_size) * 100) / - ((ms_leaf_pages + ms_branch_pages) * 4096)); + printf("CPS: 
add %" PRIu64 ", delete %" PRIu64 + ", items processed - %" PRIu64 "K data=%" PRIu64 "K key=%" PRIu64 + "K\n", + (mdbx_add_count - prev_add_count) * 1000000 / delta, + (mdbx_del_count - prev_del_count) * 1000000 / delta, obj_id / 1024, + mdbx_data_size / 1024, mdbx_key_size / 1024); + printf("usage data=%" PRIu64 "%%", + ((mdbx_data_size + mdbx_key_size) * 100) / + ((ms_leaf_pages + ms_branch_pages) * 4096)); if (prev_add_time != mdbx_add_time) { - printf(" Add : %ld c/s", (mdbx_add_count - prev_add_count) * 1000000 / - (mdbx_add_time - prev_add_time)); + printf(" Add : %" PRIu64 " c/s", (mdbx_add_count - prev_add_count) * + 1000000 / + (mdbx_add_time - prev_add_time)); } if (prev_del_time != mdbx_del_time) { - printf(" Del : %ld c/s", (mdbx_del_count - prev_del_count) * 1000000 / - (mdbx_del_time - prev_del_time)); + printf(" Del : %" PRIu64 " c/s", (mdbx_del_count - prev_del_count) * + 1000000 / + (mdbx_del_time - prev_del_time)); } if (mdbx_add_time) { - printf(" tAdd : %ld c/s", mdbx_add_count * 1000000 / mdbx_add_time); + printf(" tAdd : %" PRIu64 " c/s", + mdbx_add_count * 1000000 / mdbx_add_time); } if (mdbx_del_time) { - printf(" tDel : %ld c/s", mdbx_del_count * 1000000 / mdbx_del_time); + printf(" tDel : %" PRIu64 " c/s", + mdbx_del_count * 1000000 / mdbx_del_time); } puts(""); } @@ -385,14 +394,14 @@ int main(int argc, char **argv) { id = get_id_from_pool(); delete_record(id); } - // for (i = 0; i < 50; i++) { - // int64_t id = obj_id++; - // create_record(id); - // add_id_to_pool(id); - // } - // int64_t id = obj_id++; - // create_record(id); - // add_id_to_pool(id); + // for (i = 0; i < 50; i++) { + // int64_t id = obj_id++; + // create_record(id); + // add_id_to_pool(id); + // } + // int64_t id = obj_id++; + // create_record(id); + // add_id_to_pool(id); int64_t now = getClockUs(); if ((now - t) > 10000000L) { periodic_stat(); diff --git a/libs/libmdbx/src/test/test.cc b/libs/libmdbx/src/test/test.cc index e9e925e9a9..ff1676274b 100644 --- 
a/libs/libmdbx/src/test/test.cc +++ b/libs/libmdbx/src/test/test.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -37,6 +37,8 @@ const char *testcase2str(const actor_testcase testcase) { return "append"; case ac_ttl: return "ttl"; + case ac_nested: + return "nested"; } } @@ -45,8 +47,8 @@ const char *status2str(actor_status status) { default: assert(false); return "?!"; - case as_debuging: - return "debuging"; + case as_debugging: + return "debugging"; case as_running: return "running"; case as_successful: @@ -76,15 +78,16 @@ const char *keygencase2str(const keygen_case keycase) { //----------------------------------------------------------------------------- -int testcase::oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, - unsigned gap, int retry) { +int testcase::oom_callback(MDBX_env *env, mdbx_pid_t pid, mdbx_tid_t tid, + uint64_t txn, unsigned gap, size_t space, + int retry) { testcase *self = (testcase *)mdbx_env_get_userctx(env); if (retry == 0) - log_notice("oom_callback: waitfor pid %u, thread %" PRIuPTR - ", txn #%" PRIu64 ", gap %d", - pid, (size_t)tid, txn, gap); + log_notice("oom_callback: waitfor pid %lu, thread %" PRIuPTR + ", txn #%" PRIu64 ", gap %d, scape %zu", + (long)pid, (size_t)tid, txn, gap, space); if (self->should_continue(true)) { osal_yield(); @@ -186,7 +189,8 @@ int testcase::breakable_commit() { if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { rc = err; err = mdbx_txn_abort(txn); - if (unlikely(err != MDBX_SUCCESS && err != MDBX_THREAD_MISMATCH)) + if (unlikely(err != MDBX_SUCCESS && err != MDBX_THREAD_MISMATCH && + err != MDBX_BAD_TXN)) failure_perror("mdbx_txn_abort()", err); } else failure_perror("mdbx_txn_commit()", err); @@ -196,6 +200,20 @@ int testcase::breakable_commit() { return rc; } +unsigned testcase::txn_underutilization_x256(MDBX_txn *txn) const { + if (txn) { + MDBX_txn_info info; + int 
err = mdbx_txn_info(txn, &info, false); + if (unlikely(err != MDBX_SUCCESS)) + failure_perror("mdbx_txn_info()", err); + const size_t left = size_t(info.txn_space_leftover); + const size_t total = + size_t(info.txn_space_leftover) + size_t(info.txn_space_dirty); + return (unsigned)(left / (total >> 8)); + } + return 0; +} + void testcase::txn_end(bool abort) { log_trace(">> txn_end(%s)", abort ? "abort" : "commit"); assert(txn_guard); @@ -203,7 +221,8 @@ void testcase::txn_end(bool abort) { MDBX_txn *txn = txn_guard.release(); if (abort) { int err = mdbx_txn_abort(txn); - if (unlikely(err != MDBX_SUCCESS && err != MDBX_THREAD_MISMATCH)) + if (unlikely(err != MDBX_SUCCESS && err != MDBX_THREAD_MISMATCH && + err != MDBX_BAD_TXN)) failure_perror("mdbx_txn_abort()", err); } else { txn_inject_writefault(txn); @@ -215,18 +234,18 @@ void testcase::txn_end(bool abort) { log_trace("<< txn_end(%s)", abort ? "abort" : "commit"); } -void testcase::cursor_open(unsigned dbi) { - log_trace(">> cursor_open(%u)", dbi); +void testcase::cursor_open(MDBX_dbi handle) { + log_trace(">> cursor_open(%u)", handle); assert(!cursor_guard); assert(txn_guard); MDBX_cursor *cursor = nullptr; - int rc = mdbx_cursor_open(txn_guard.get(), dbi, &cursor); + int rc = mdbx_cursor_open(txn_guard.get(), handle, &cursor); if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_cursor_open()", rc); cursor_guard.reset(cursor); - log_trace("<< cursor_open(%u)", dbi); + log_trace("<< cursor_open(%u)", handle); } void testcase::cursor_close() { @@ -264,8 +283,9 @@ void testcase::txn_inject_writefault(MDBX_txn *txn) { if (config.params.inject_writefaultn && txn) { if (config.params.inject_writefaultn <= nops_completed && (mdbx_txn_flags(txn) & MDBX_RDONLY) == 0) { - log_info("== txn_inject_writefault(): got %u nops or more, inject FAULT", - config.params.inject_writefaultn); + log_verbose( + "== txn_inject_writefault(): got %u nops or more, inject FAULT", + config.params.inject_writefaultn); log_flush(); #if 
defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) TerminateProcess(GetCurrentProcess(), 42); @@ -305,21 +325,9 @@ bool testcase::wait4start() { } void testcase::kick_progress(bool active) const { - chrono::time now = chrono::now_motonic(); - if (active) { - static int last_point = -1; - int point = (now.fixedpoint >> 29) & 3; - if (point != last_point) { - last.progress_timestamp = now; - fprintf(stderr, "%c\b", "-\\|/"[last_point = point]); - fflush(stderr); - } - } else if (now.fixedpoint - last.progress_timestamp.fixedpoint > - chrono::from_seconds(2).fixedpoint) { - last.progress_timestamp = now; - fprintf(stderr, "%c\b", "@*"[now.utc & 1]); - fflush(stderr); - } + if (!global::config::progress_indicator) + return; + logging::progress_canary(active); } void testcase::report(size_t nops_done) { @@ -328,11 +336,10 @@ void testcase::report(size_t nops_done) { return; nops_completed += nops_done; - log_verbose("== complete +%" PRIuPTR " iteration, total %" PRIuPTR " done", - nops_done, nops_completed); + log_debug("== complete +%" PRIuPTR " iteration, total %" PRIuPTR " done", + nops_done, nops_completed); - if (global::config::progress_indicator) - kick_progress(true); + kick_progress(true); if (config.signal_nops && !signalled && config.signal_nops <= nops_completed) { @@ -387,7 +394,7 @@ bool testcase::should_continue(bool check_timeout_only) const { nops_completed >= config.params.test_nops) result = false; - if (result && global::config::progress_indicator) + if (result) kick_progress(false); return result; @@ -430,14 +437,14 @@ void testcase::update_canary(uint64_t increment) { log_trace("<< update_canary: sequence = %" PRIu64, canary_now.y); } -int testcase::db_open__begin__table_create_open_clean(MDBX_dbi &dbi) { +int testcase::db_open__begin__table_create_open_clean(MDBX_dbi &handle) { db_open(); int err, retry_left = 42; for (;;) { txn_begin(false); - dbi = db_table_open(true); - db_table_clear(dbi); + handle = db_table_open(true); + 
db_table_clear(handle); err = breakable_commit(); if (likely(err == MDBX_SUCCESS)) { txn_begin(false); @@ -464,7 +471,7 @@ MDBX_dbi testcase::db_table_open(bool create) { failure("snprintf(tablename): %d", rc); tablename = tablename_buf; } - log_verbose("use %s table", tablename ? tablename : "MAINDB"); + log_debug("use %s table", tablename ? tablename : "MAINDB"); MDBX_dbi handle = 0; int rc = mdbx_dbi_open(txn_guard.get(), tablename, @@ -490,9 +497,9 @@ void testcase::db_table_drop(MDBX_dbi handle) { } } -void testcase::db_table_clear(MDBX_dbi handle) { +void testcase::db_table_clear(MDBX_dbi handle, MDBX_txn *txn) { log_trace(">> testcase::db_table_clear, handle %u", handle); - int rc = mdbx_drop(txn_guard.get(), handle, false); + int rc = mdbx_drop(txn ? txn : txn_guard.get(), handle, false); if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_drop(delete=false)", rc); log_trace("<< testcase::db_table_clear"); @@ -510,7 +517,7 @@ void testcase::db_table_close(MDBX_dbi handle) { void testcase::checkdata(const char *step, MDBX_dbi handle, MDBX_val key2check, MDBX_val expected_valued) { MDBX_val actual_value = expected_valued; - int rc = mdbx_get2(txn_guard.get(), handle, &key2check, &actual_value); + int rc = mdbx_get_nearest(txn_guard.get(), handle, &key2check, &actual_value); if (unlikely(rc != MDBX_SUCCESS)) failure_perror(step, rc); if (!is_samedata(&actual_value, &expected_valued)) @@ -560,6 +567,9 @@ bool test_execute(const actor_config &config_const) { case ac_ttl: test.reset(new testcase_ttl(config, pid)); break; + case ac_nested: + test.reset(new testcase_nested(config, pid)); + break; default: test.reset(new testcase(config, pid)); break; @@ -582,13 +592,13 @@ bool test_execute(const actor_config &config_const) { } if (config.params.nrepeat == 1) - log_info("test successed"); + log_verbose("test successed"); else { if (config.params.nrepeat) - log_info("test successed (iteration %zi of %zi)", iter, - size_t(config.params.nrepeat)); + 
log_verbose("test successed (iteration %zi of %zi)", iter, + size_t(config.params.nrepeat)); else - log_info("test successed (iteration %zi)", iter); + log_verbose("test successed (iteration %zi)", iter); config.params.keygen.seed += INT32_C(0xA4F4D37B); } @@ -599,3 +609,135 @@ bool test_execute(const actor_config &config_const) { return false; } } + +//----------------------------------------------------------------------------- + +int testcase::insert(const keygen::buffer &akey, const keygen::buffer &adata, + unsigned flags) { + int err = mdbx_put(txn_guard.get(), dbi, &akey->value, &adata->value, flags); + if (err == MDBX_SUCCESS && config.params.speculum) { + const auto S_key = S(akey); + const auto S_data = S(adata); + const bool inserted = speculum.emplace(S_key, S_data).second; + assert(inserted); + (void)inserted; + } + return err; +} + +int testcase::replace(const keygen::buffer &akey, + const keygen::buffer &new_data, + const keygen::buffer &old_data, unsigned flags) { + if (config.params.speculum) { + const auto S_key = S(akey); + const auto S_old = S(old_data); + const auto S_new = S(new_data); + const auto removed = speculum.erase(SET::key_type(S_key, S_old)); + assert(removed == 1); + (void)removed; + const bool inserted = speculum.emplace(S_key, S_new).second; + assert(inserted); + (void)inserted; + } + return mdbx_replace(txn_guard.get(), dbi, &akey->value, &new_data->value, + &old_data->value, flags); +} + +int testcase::remove(const keygen::buffer &akey, const keygen::buffer &adata) { + if (config.params.speculum) { + const auto S_key = S(akey); + const auto S_data = S(adata); + const auto removed = speculum.erase(SET::key_type(S_key, S_data)); + assert(removed == 1); + (void)removed; + } + return mdbx_del(txn_guard.get(), dbi, &akey->value, &adata->value); +} + +bool testcase::speculum_verify() { + if (!config.params.speculum) + return true; + + if (!txn_guard) + txn_begin(true); + + char dump_key[128], dump_value[128]; + char dump_mkey[128], 
dump_mvalue[128]; + + MDBX_cursor *cursor; + int err = mdbx_cursor_open(txn_guard.get(), dbi, &cursor); + if (err != MDBX_SUCCESS) + failure_perror("mdbx_cursor_open()", err); + + bool rc = true; + MDBX_val akey, avalue; + MDBX_val mkey, mvalue; + err = mdbx_cursor_get(cursor, &akey, &avalue, MDBX_FIRST); + + unsigned extra = 0, lost = 0, n = 0; + assert(std::is_sorted(speculum.cbegin(), speculum.cend(), ItemCompare(this))); + auto it = speculum.cbegin(); + while (true) { + if (err != MDBX_SUCCESS) { + akey.iov_len = avalue.iov_len = 0; + akey.iov_base = avalue.iov_base = nullptr; + } + const auto S_key = S(akey); + const auto S_data = S(avalue); + if (it != speculum.cend()) { + mkey.iov_base = (void *)it->first.c_str(); + mkey.iov_len = it->first.size(); + mvalue.iov_base = (void *)it->second.c_str(); + mvalue.iov_len = it->second.size(); + } + if (err == MDBX_SUCCESS && it != speculum.cend() && S_key == it->first && + S_data == it->second) { + ++it; + err = mdbx_cursor_get(cursor, &akey, &avalue, MDBX_NEXT); + } else if (err == MDBX_SUCCESS && + (it == speculum.cend() || S_key < it->first || + (S_key == it->first && S_data < it->second))) { + extra += 1; + if (it != speculum.cend()) { + log_error("extra pair %u/%u: db{%s, %s} < mi{%s, %s}", n, extra, + mdbx_dump_val(&akey, dump_key, sizeof(dump_key)), + mdbx_dump_val(&avalue, dump_value, sizeof(dump_value)), + mdbx_dump_val(&mkey, dump_mkey, sizeof(dump_mkey)), + mdbx_dump_val(&mvalue, dump_mvalue, sizeof(dump_mvalue))); + } else { + log_error("extra pair %u/%u: db{%s, %s} < mi.END", n, extra, + mdbx_dump_val(&akey, dump_key, sizeof(dump_key)), + mdbx_dump_val(&avalue, dump_value, sizeof(dump_value))); + } + err = mdbx_cursor_get(cursor, &akey, &avalue, MDBX_NEXT); + rc = false; + } else if (it != speculum.cend() && + (err == MDBX_NOTFOUND || S_key > it->first || + (S_key == it->first && S_data > it->second))) { + lost += 1; + if (err == MDBX_NOTFOUND) { + log_error("lost pair %u/%u: db.END > mi{%s, %s}", n, 
lost, + mdbx_dump_val(&mkey, dump_mkey, sizeof(dump_mkey)), + mdbx_dump_val(&mvalue, dump_mvalue, sizeof(dump_mvalue))); + } else { + log_error("lost pair %u/%u: db{%s, %s} > mi{%s, %s}", n, lost, + mdbx_dump_val(&akey, dump_key, sizeof(dump_key)), + mdbx_dump_val(&avalue, dump_value, sizeof(dump_value)), + mdbx_dump_val(&mkey, dump_mkey, sizeof(dump_mkey)), + mdbx_dump_val(&mvalue, dump_mvalue, sizeof(dump_mvalue))); + } + ++it; + rc = false; + } else if (err == MDBX_NOTFOUND && it == speculum.cend()) { + break; + } else if (err != MDBX_SUCCESS) { + failure_perror("mdbx_cursor_get()", err); + } else { + assert(!"WTF?"); + } + n += 1; + } + + mdbx_cursor_close(cursor); + return rc; +} diff --git a/libs/libmdbx/src/test/test.h b/libs/libmdbx/src/test/test.h index fb5ad4ee83..630059ac3f 100644 --- a/libs/libmdbx/src/test/test.h +++ b/libs/libmdbx/src/test/test.h @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -22,6 +22,24 @@ #include "osal.h" #include "utils.h" +#include <deque> +#include <set> +#include <stack> +#include <tuple> + +#ifndef HAVE_cxx17_std_string_view +#if __cplusplus >= 201703L && __has_include(<string_view>) +#include <string_view> +#define HAVE_cxx17_std_string_view 1 +#else +#define HAVE_cxx17_std_string_view 0 +#endif +#endif /* HAVE_cxx17_std_string_view */ + +#if HAVE_cxx17_std_string_view +#include <string_view> +#endif + bool test_execute(const actor_config &config); std::string thunk_param(const actor_config &config); void testcase_setup(const char *casename, actor_params ¶ms, @@ -49,25 +67,26 @@ extern bool cleanup_before; extern bool cleanup_after; extern bool failfast; extern bool progress_indicator; +extern bool console_mode; } /* namespace config */ } /* namespace global */ //----------------------------------------------------------------------------- -struct db_deleter : public std::unary_function<void, MDBX_env *> { +struct db_deleter /* : public std::unary_function<void, MDBX_env *> */ { void operator()(MDBX_env *env) const { mdbx_env_close(env); } }; -struct txn_deleter : public std::unary_function<void, MDBX_txn *> { +struct txn_deleter /* : public std::unary_function<void, MDBX_txn *> */ { void operator()(MDBX_txn *txn) const { int rc = mdbx_txn_abort(txn); if (rc) - log_trouble(mdbx_func_, "mdbx_txn_abort()", rc); + log_trouble(__func__, "mdbx_txn_abort()", rc); } }; -struct cursor_deleter : public std::unary_function<void, MDBX_cursor *> { +struct cursor_deleter /* : public std::unary_function<void, MDBX_cursor *> */ { void operator()(MDBX_cursor *cursor) const { mdbx_cursor_close(cursor); } }; @@ -79,9 +98,45 @@ typedef std::unique_ptr<MDBX_cursor, cursor_deleter> scoped_cursor_guard; class testcase { protected: +#if HAVE_cxx17_std_string_view + using data_view = std::string_view; +#else + using data_view = std::string; +#endif + static inline data_view S(const MDBX_val &v) { + return data_view(static_cast<const char 
*>(v.iov_base), v.iov_len); + } + static inline data_view S(const keygen::buffer &b) { return S(b->value); } + + using Item = std::pair<std::string, std::string>; + struct ItemCompare { + const testcase *context; + ItemCompare(const testcase *owner) : context(owner) {} + + bool operator()(const Item &a, const Item &b) const { + MDBX_val va, vb; + va.iov_base = (void *)a.first.data(); + va.iov_len = a.first.size(); + vb.iov_base = (void *)b.first.data(); + vb.iov_len = b.first.size(); + int cmp = mdbx_cmp(context->txn_guard.get(), context->dbi, &va, &vb); + if (cmp == 0 && + (context->config.params.table_flags & MDBX_DUPSORT) != 0) { + va.iov_base = (void *)a.second.data(); + va.iov_len = a.second.size(); + vb.iov_base = (void *)b.second.data(); + vb.iov_len = b.second.size(); + cmp = mdbx_dcmp(context->txn_guard.get(), context->dbi, &va, &vb); + } + return cmp < 0; + } + }; + using SET = std::set<Item, ItemCompare>; + const actor_config &config; const mdbx_pid_t pid; + MDBX_dbi dbi; scoped_db_guard db_guard; scoped_txn_guard txn_guard; scoped_cursor_guard cursor_guard; @@ -95,12 +150,23 @@ protected: struct { mdbx_canary canary; - mutable chrono::time progress_timestamp; } last; - static int oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, - unsigned gap, int retry); + SET speculum; + bool speculum_verify(); + int insert(const keygen::buffer &akey, const keygen::buffer &adata, + unsigned flags); + int replace(const keygen::buffer &akey, const keygen::buffer &new_value, + const keygen::buffer &old_value, unsigned flags); + int remove(const keygen::buffer &akey, const keygen::buffer &adata); + static int oom_callback(MDBX_env *env, mdbx_pid_t pid, mdbx_tid_t tid, + uint64_t txn, unsigned gap, size_t space, int retry); + + bool is_nested_txn_available() const { + return (config.params.mode_flags & MDBX_WRITEMAP) == 0; + } + void kick_progress(bool active) const; void db_prepare(); void db_open(); void db_close(); @@ -109,21 +175,21 @@ protected: void 
txn_end(bool abort); int breakable_restart(); void txn_restart(bool abort, bool readonly, unsigned flags = 0); - void cursor_open(unsigned dbi); + void cursor_open(MDBX_dbi handle); void cursor_close(); void txn_inject_writefault(void); void txn_inject_writefault(MDBX_txn *txn); void fetch_canary(); void update_canary(uint64_t increment); - void kick_progress(bool active) const; void checkdata(const char *step, MDBX_dbi handle, MDBX_val key2check, MDBX_val expected_valued); + unsigned txn_underutilization_x256(MDBX_txn *txn) const; MDBX_dbi db_table_open(bool create); void db_table_drop(MDBX_dbi handle); - void db_table_clear(MDBX_dbi handle); + void db_table_clear(MDBX_dbi handle, MDBX_txn *txn = nullptr); void db_table_close(MDBX_dbi handle); - int db_open__begin__table_create_open_clean(MDBX_dbi &dbi); + int db_open__begin__table_create_open_clean(MDBX_dbi &handle); bool wait4start(); void report(size_t nops_done); @@ -131,13 +197,12 @@ protected: bool should_continue(bool check_timeout_only = false) const; void generate_pair(const keygen::serial_t serial, keygen::buffer &out_key, - keygen::buffer &out_value, keygen::serial_t data_age = 0) { - keyvalue_maker.pair(serial, out_key, out_value, data_age); + keygen::buffer &out_value, keygen::serial_t data_age) { + keyvalue_maker.pair(serial, out_key, out_value, data_age, false); } - void generate_pair(const keygen::serial_t serial, - keygen::serial_t data_age = 0) { - generate_pair(serial, key, data, data_age); + void generate_pair(const keygen::serial_t serial) { + keyvalue_maker.pair(serial, key, data, 0, true); } bool mode_readonly() const { @@ -146,7 +211,8 @@ protected: public: testcase(const actor_config &config, const mdbx_pid_t pid) - : config(config), pid(pid), signalled(false), nops_completed(0) { + : config(config), pid(pid), signalled(false), nops_completed(0), + speculum(ItemCompare(this)) { start_timestamp.reset(); memset(&last, 0, sizeof(last)); } @@ -161,49 +227,52 @@ class testcase_ttl : public 
testcase { public: testcase_ttl(const actor_config &config, const mdbx_pid_t pid) : testcase(config, pid) {} - bool run(); + bool run() override; }; class testcase_hill : public testcase { + using inherited = testcase; + SET speculum_commited; + public: testcase_hill(const actor_config &config, const mdbx_pid_t pid) - : testcase(config, pid) {} - bool run(); + : testcase(config, pid), speculum_commited(ItemCompare(this)) {} + bool run() override; }; class testcase_append : public testcase { public: testcase_append(const actor_config &config, const mdbx_pid_t pid) : testcase(config, pid) {} - bool run(); + bool run() override; }; class testcase_deadread : public testcase { public: testcase_deadread(const actor_config &config, const mdbx_pid_t pid) : testcase(config, pid) {} - bool run(); + bool run() override; }; class testcase_deadwrite : public testcase { public: testcase_deadwrite(const actor_config &config, const mdbx_pid_t pid) : testcase(config, pid) {} - bool run(); + bool run() override; }; class testcase_jitter : public testcase { public: testcase_jitter(const actor_config &config, const mdbx_pid_t pid) : testcase(config, pid) {} - bool run(); + bool run() override; }; class testcase_try : public testcase { public: testcase_try(const actor_config &config, const mdbx_pid_t pid) : testcase(config, pid) {} - bool run(); + bool run() override; }; class testcase_copy : public testcase { @@ -214,5 +283,31 @@ public: testcase_copy(const actor_config &config, const mdbx_pid_t pid) : testcase(config, pid), copy_pathname(config.params.pathname_db + "-copy") {} - bool run(); + bool run() override; +}; + +class testcase_nested : public testcase { + using inherited = testcase; + using FIFO = std::deque<std::pair<uint64_t, unsigned>>; + + uint64_t serial; + FIFO fifo; + std::stack<std::tuple<scoped_txn_guard, uint64_t, FIFO, SET>> stack; + + bool trim_tail(unsigned window_width); + bool grow_head(unsigned head_count); + bool pop_txn(bool abort); + bool pop_txn() { + 
return pop_txn(inherited::is_nested_txn_available() ? flipcoin_x3() + : flipcoin_x2()); + } + void push_txn(); + bool stochastic_breakable_restart_with_nested(bool force_restart = false); + +public: + testcase_nested(const actor_config &config, const mdbx_pid_t pid) + : testcase(config, pid) {} + bool setup() override; + bool run() override; + bool teardown() override; }; diff --git a/libs/libmdbx/src/test/test.vcxproj b/libs/libmdbx/src/test/test.vcxproj deleted file mode 100644 index 93de00e3a0..0000000000 --- a/libs/libmdbx/src/test/test.vcxproj +++ /dev/null @@ -1,209 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <ItemGroup Label="ProjectConfigurations"> - <ProjectConfiguration Include="Debug|Win32"> - <Configuration>Debug</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|Win32"> - <Configuration>Release</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Debug|x64"> - <Configuration>Debug</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|x64"> - <Configuration>Release</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - </ItemGroup> - <ItemGroup> - <ProjectReference Include="..\dll.vcxproj"> - <Project>{6d19209b-ece7-4b9c-941c-0aa2b484f199}</Project> - </ProjectReference> - </ItemGroup> - <PropertyGroup Label="Globals"> - <ProjectGuid>{30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}</ProjectGuid> - <Keyword>Win32Proj</Keyword> - <RootNamespace>mdbxtest</RootNamespace> - <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> - 
<ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> - <ImportGroup Label="ExtensionSettings"> - </ImportGroup> - <ImportGroup Label="Shared"> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup 
Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <PropertyGroup Label="UserMacros" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - <TargetName>mdbx_test</TargetName> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - <TargetName>mdbx_test</TargetName> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - <TargetName>mdbx_test</TargetName> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - <TargetName>mdbx_test</TargetName> - </PropertyGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <ClCompile> - 
<PrecompiledHeader>Use</PrecompiledHeader> - <WarningLevel>Level4</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <SDLCheck>true</SDLCheck> - <PrecompiledHeaderFile>test.h</PrecompiledHeaderFile> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <GenerateDebugInformation>true</GenerateDebugInformation> - <AdditionalDependencies>winmm.lib;%(AdditionalDependencies)</AdditionalDependencies> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <ClCompile> - <PrecompiledHeader>Use</PrecompiledHeader> - <WarningLevel>Level4</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>_DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <SDLCheck>true</SDLCheck> - <PrecompiledHeaderFile>test.h</PrecompiledHeaderFile> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <GenerateDebugInformation>true</GenerateDebugInformation> - <AdditionalDependencies>winmm.lib;%(AdditionalDependencies)</AdditionalDependencies> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <ClCompile> - <WarningLevel>Level4</WarningLevel> - <PrecompiledHeader>Use</PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <SDLCheck>true</SDLCheck> - <PrecompiledHeaderFile>test.h</PrecompiledHeaderFile> - <OmitFramePointers>true</OmitFramePointers> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - 
<EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - <GenerateDebugInformation>true</GenerateDebugInformation> - <AdditionalDependencies>winmm.lib;%(AdditionalDependencies)</AdditionalDependencies> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <ClCompile> - <WarningLevel>Level4</WarningLevel> - <PrecompiledHeader>Use</PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <SDLCheck>true</SDLCheck> - <PrecompiledHeaderFile>test.h</PrecompiledHeaderFile> - <OmitFramePointers>true</OmitFramePointers> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - <GenerateDebugInformation>true</GenerateDebugInformation> - <AdditionalDependencies>winmm.lib;%(AdditionalDependencies)</AdditionalDependencies> - </Link> - </ItemDefinitionGroup> - <ItemGroup> - <ClInclude Include="base.h" /> - <ClInclude Include="chrono.h" /> - <ClInclude Include="config.h" /> - <ClInclude Include="keygen.h" /> - <ClInclude Include="log.h" /> - <ClInclude Include="osal.h" /> - <ClInclude Include="test.h" /> - <ClInclude Include="utils.h" /> - </ItemGroup> - <ItemGroup> - <ClCompile Include="ttl.cc" /> - <ClCompile Include="append.cc" /> - <ClCompile Include="cases.cc" /> - <ClCompile Include="chrono.cc" /> - <ClCompile Include="config.cc" /> - <ClCompile Include="copy.cc" /> - <ClCompile Include="dead.cc" /> - <ClCompile Include="hill.cc" /> - <ClCompile Include="try.cc" /> - <ClCompile Include="jitter.cc" /> - <ClCompile Include="keygen.cc" /> - <ClCompile Include="log.cc" /> - <ClCompile Include="main.cc" /> - <ClCompile Include="osal-windows.cc"> - 
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader> - <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader> - <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader> - <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader> - </ClCompile> - <ClCompile Include="test.cc" /> - <ClCompile Include="utils.cc" /> - </ItemGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> - <ImportGroup Label="ExtensionTargets"> - </ImportGroup> -</Project> diff --git a/libs/libmdbx/src/test/try.cc b/libs/libmdbx/src/test/try.cc index 150abd36de..adb0113096 100644 --- a/libs/libmdbx/src/test/try.cc +++ b/libs/libmdbx/src/test/try.cc @@ -1,4 +1,4 @@ -#include "test.h" +#include "test.h" bool testcase_try::run() { db_open(); diff --git a/libs/libmdbx/src/test/ttl.cc b/libs/libmdbx/src/test/ttl.cc index b2650f0c13..782a8b4d56 100644 --- a/libs/libmdbx/src/test/ttl.cc +++ b/libs/libmdbx/src/test/ttl.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -29,11 +29,10 @@ static unsigned edge2count(uint64_t edge, unsigned count_max) { } bool testcase_ttl::run() { - MDBX_dbi dbi; int err = db_open__begin__table_create_open_clean(dbi); if (unlikely(err != MDBX_SUCCESS)) { log_notice("ttl: bailout-prepare due '%s'", mdbx_strerror(err)); - return true; + return false; } /* LY: тест "эмуляцией time-to-live": @@ -54,18 +53,8 @@ bool testcase_ttl::run() { /* LY: для параметризации используем подходящие параметры, которые не имеют * здесь смысла в первоначальном значении. 
*/ - const unsigned window_max_lower = -#ifdef __APPLE__ - 333; -#else - 999; -#endif - const unsigned count_max_lower = -#ifdef __APPLE__ - 333; -#else - 999; -#endif + const unsigned window_max_lower = 333; + const unsigned count_max_lower = 333; const unsigned window_max = (config.params.batch_read > window_max_lower) ? config.params.batch_read @@ -73,8 +62,8 @@ bool testcase_ttl::run() { const unsigned count_max = (config.params.batch_write > count_max_lower) ? config.params.batch_write : count_max_lower; - log_info("ttl: using `batch_read` value %u for window_max", window_max); - log_info("ttl: using `batch_write` value %u for count_max", count_max); + log_verbose("ttl: using `batch_read` value %u for window_max", window_max); + log_verbose("ttl: using `batch_write` value %u for count_max", count_max); uint64_t seed = prng64_map2_white(config.params.keygen.seed) + config.actor_id; @@ -87,14 +76,16 @@ bool testcase_ttl::run() { std::deque<std::pair<uint64_t, unsigned>> fifo; uint64_t serial = 0; + bool rc = false; while (should_continue()) { const uint64_t salt = prng64_white(seed) /* mdbx_txn_id(txn_guard.get()) */; - const unsigned window_width = edge2window(salt, window_max); + const unsigned window_width = + flipcoin_x4() ? 
0 : edge2window(salt, window_max); unsigned head_count = edge2count(salt, count_max); - log_verbose("ttl: step #%zu (serial %" PRIu64 - ", window %u, count %u) salt %" PRIu64, - nops_completed, serial, window_width, head_count, salt); + log_debug("ttl: step #%zu (serial %" PRIu64 + ", window %u, count %u) salt %" PRIu64, + nops_completed, serial, window_width, head_count, salt); if (window_width) { while (fifo.size() > window_width) { @@ -104,7 +95,7 @@ bool testcase_ttl::run() { tail_count); fifo.pop_back(); for (unsigned n = 0; n < tail_count; ++n) { - log_trace("ttl: remove-tail %" PRIu64, serial); + log_trace("ttl: remove-tail %" PRIu64, tail_serial); generate_pair(tail_serial); err = mdbx_del(txn_guard.get(), dbi, &key->value, &data->value); if (unlikely(err != MDBX_SUCCESS)) { @@ -158,7 +149,9 @@ bool testcase_ttl::run() { serial = fifo.front().first; fifo.pop_front(); } + report(1); + rc = true; } bailout: @@ -170,10 +163,10 @@ bailout: err = breakable_commit(); if (unlikely(err != MDBX_SUCCESS)) { log_notice("ttl: bailout-clean due '%s'", mdbx_strerror(err)); - return true; + return false; } } else db_table_close(dbi); } - return true; + return rc; } diff --git a/libs/libmdbx/src/test/utils.cc b/libs/libmdbx/src/test/utils.cc index d9b3538b99..311cf544ef 100644 --- a/libs/libmdbx/src/test/utils.cc +++ b/libs/libmdbx/src/test/utils.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -331,6 +331,9 @@ double double_from_upper(uint64_t salt) { } bool flipcoin() { return bleach32((uint32_t)entropy_ticks()) & 1; } +bool flipcoin_x2() { return (bleach32((uint32_t)entropy_ticks()) & 3) == 0; } +bool flipcoin_x3() { return (bleach32((uint32_t)entropy_ticks()) & 7) == 0; } +bool flipcoin_x4() { return (bleach32((uint32_t)entropy_ticks()) & 15) == 0; } bool jitter(unsigned probability_percent) { const uint32_t top = UINT32_MAX - UINT32_MAX % 100; diff --git a/libs/libmdbx/src/test/utils.h b/libs/libmdbx/src/test/utils.h index d1b859acd4..2a5a54de6d 100644 --- a/libs/libmdbx/src/test/utils.h +++ b/libs/libmdbx/src/test/utils.h @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -151,7 +151,7 @@ template <typename T> static __inline T load(const void *ptr) { #if defined(_MSC_VER) && \ (defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64)) return *(const T __unaligned *)ptr; -#elif UNALIGNED_OK +#elif MDBX_UNALIGNED_OK return *(const T *)ptr; #else T local; @@ -161,14 +161,14 @@ template <typename T> static __inline T load(const void *ptr) { memcpy(&local, (const T *)ptr, sizeof(T)); #endif /* __GNUC__ || __clang__ */ return local; -#endif /* UNALIGNED_OK */ +#endif /* MDBX_UNALIGNED_OK */ } template <typename T> static __inline void store(void *ptr, const T &value) { #if defined(_MSC_VER) && \ (defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64)) *((T __unaligned *)ptr) = value; -#elif UNALIGNED_OK +#elif MDBX_UNALIGNED_OK *(volatile T *)ptr = value; #else #if defined(__GNUC__) || defined(__clang__) @@ -176,7 +176,7 @@ template <typename T> static __inline void store(void *ptr, const T &value) { #else memcpy(ptr, &value, sizeof(T)); #endif /* __GNUC__ || __clang__ */ -#endif /* UNALIGNED_OK */ +#endif /* MDBX_UNALIGNED_OK */ } } /* namespace unaligned */ @@ -355,5 +355,8 @@ uint64_t prng64(void); void prng_fill(void *ptr, size_t bytes); bool 
flipcoin(); +bool flipcoin_x2(); +bool flipcoin_x3(); +bool flipcoin_x4(); bool jitter(unsigned probability_percent); void jitter_delay(bool extra = false); |