diff options
Diffstat (limited to 'libs/libmdbx/src')
47 files changed, 3420 insertions, 1384 deletions
diff --git a/libs/libmdbx/src/.travis.yml b/libs/libmdbx/src/.travis.yml index 9359084060..e46991f597 100644 --- a/libs/libmdbx/src/.travis.yml +++ b/libs/libmdbx/src/.travis.yml @@ -7,6 +7,7 @@ compiler: os: - linux +- osx script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all check; fi diff --git a/libs/libmdbx/src/Makefile b/libs/libmdbx/src/Makefile index 145b7fadcc..2d261bc7c3 100644 --- a/libs/libmdbx/src/Makefile +++ b/libs/libmdbx/src/Makefile @@ -1,4 +1,4 @@ -# GNU Makefile for libmdbx, https://github.com/leo-yuriev/libmdbx +# GNU Makefile for libmdbx, https://abf.io/erthink/libmdbx ######################################################################## # Configuration. The compiler options must enable threaded compilation. @@ -23,17 +23,18 @@ suffix ?= CC ?= gcc CXX ?= g++ +LD ?= ld CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden -XCFLAGS ?= -DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1 +XCFLAGS ?= -DNDEBUG=1 -DLIBMDBX_EXPORTS=1 CFLAGS += -D_GNU_SOURCE=1 -std=gnu11 -pthread $(XCFLAGS) CXXFLAGS = -std=c++11 $(filter-out -std=gnu11,$(CFLAGS)) TESTDB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.db TESTLOG ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.log # LY: '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old -LDFLAGS ?= -Wl,--gc-sections,-z,relro,-O,--no-as-needed,-lrt -EXE_LDFLAGS ?= -pthread -lrt +LDFLAGS ?= $(shell $(LD) --help 2>/dev/null | grep -q -- --gc-sections && echo '-Wl,--gc-sections,-z,relro,-O1')$(shell $(LD) --help 2>/dev/null | grep -q -- -dead_strip && echo '-Wl,-dead_strip') +EXE_LDFLAGS ?= -pthread # LY: just for benchmarking IOARENA ?= $(shell \ @@ -44,16 +45,55 @@ NN ?= 25000000 ######################################################################## +ifdef MSVC + UNAME := Windows + LCK_IMPL := windows + TEST_OSAL := windows + TEST_ITER := 42 +else + UNAME := $(shell uname -s 2>/dev/null || echo Unknown) + 
define uname2lck + case "$(UNAME)" in + Linux) echo linux;; + CYGWIN*|MINGW*|MSYS*|Windows*) echo windows;; + *) echo posix;; + esac + endef + define uname2osal + case "$(UNAME)" in + CYGWIN*|MINGW*|MSYS*|Windows*) echo windows;; + *) echo unix;; + esac + endef + define uname2titer + case "$(UNAME)" in + Darwin*|Mach*) echo 3;; + *) echo 42;; + esac + endef + define uname2suffix + case "$(UNAME)" in + Darwin*|Mach*) echo dylib;; + CYGWIN*|MINGW*|MSYS*|Windows*) echo dll;; + *) echo so;; + esac + endef + LCK_IMPL := $(shell $(uname2lck)) + TEST_OSAL := $(shell $(uname2osal)) + TEST_ITER := $(shell $(uname2titer)) + SO_SUFFIX := $(shell $(uname2suffix)) +endif + HEADERS := mdbx.h -LIBRARIES := libmdbx.a libmdbx.so +LIBRARIES := libmdbx.a libmdbx.$(SO_SUFFIX) TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 SHELL := /bin/bash -CORE_SRC := $(filter-out src/lck-windows.c, $(wildcard src/*.c)) +CORE_SRC := src/lck-$(LCK_IMPL).c $(filter-out $(wildcard src/lck-*.c), $(wildcard src/*.c)) CORE_INC := $(wildcard src/*.h) CORE_OBJ := $(patsubst %.c,%.o,$(CORE_SRC)) -TEST_SRC := $(filter-out test/osal-windows.cc, $(wildcard test/*.cc)) +TEST_SRC := test/osal-$(TEST_OSAL).cc $(filter-out $(wildcard test/osal-*.cc), $(wildcard test/*.cc)) TEST_INC := $(wildcard test/*.h) TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC)) @@ -61,10 +101,10 @@ TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC)) all: $(LIBRARIES) $(TOOLS) mdbx_test example -mdbx: libmdbx.a libmdbx.so +mdbx: libmdbx.a libmdbx.$(SO_SUFFIX) -example: mdbx.h tutorial/sample-mdbx.c libmdbx.so - $(CC) $(CFLAGS) -I. tutorial/sample-mdbx.c ./libmdbx.so -o example +example: mdbx.h tutorial/sample-mdbx.c libmdbx.$(SO_SUFFIX) + $(CC) $(CFLAGS) -I. 
tutorial/sample-mdbx.c ./libmdbx.$(SO_SUFFIX) -o example tools: $(TOOLS) @@ -82,16 +122,19 @@ clean: rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err src/*.o test/*.o check: all - rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --repeat=42 --pathname=$(TESTDB) --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \ + rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --repeat=$(TEST_ITER) --pathname=$(TESTDB) --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \ && ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy check-singleprocess: all - rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --repeat=42 --pathname=$(TESTDB) --dont-cleanup-after --hill --copy | tee -a $(TESTLOG) | tail -n 42) \ + rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; \ + ./mdbx_test --repeat=4 --pathname=$(TESTDB) --dont-cleanup-after --hill && \ + ./mdbx_test --repeat=2 --pathname=$(TESTDB) --dont-cleanup-before --dont-cleanup-after --copy \ + | tee -a $(TESTLOG) | tail -n 42) \ && ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy check-fault: all rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --pathname=$(TESTDB) --inject-writefault=42 --dump-config --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \ - && ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy + ; ./mdbx_chk -vvnw $(TESTDB) && ([ ! -e $(TESTDB)-copy ] || ./mdbx_chk -vvn $(TESTDB)-copy) define core-rule $(patsubst %.c,%.o,$(1)): $(1) $(CORE_INC) mdbx.h Makefile @@ -110,13 +153,13 @@ $(foreach file,$(TEST_SRC),$(eval $(call test-rule,$(file)))) libmdbx.a: $(CORE_OBJ) $(AR) rs $@ $? -libmdbx.so: $(CORE_OBJ) +libmdbx.$(SO_SUFFIX): $(CORE_OBJ) $(CC) $(CFLAGS) -save-temps $^ -pthread -shared $(LDFLAGS) -o $@ mdbx_%: src/tools/mdbx_%.c libmdbx.a $(CC) $(CFLAGS) $^ $(EXE_LDFLAGS) -o $@ -mdbx_test: $(TEST_OBJ) libmdbx.so +mdbx_test: $(TEST_OBJ) libmdbx.$(SO_SUFFIX) $(CXX) $(CXXFLAGS) $(TEST_OBJ) -Wl,-rpath . -L . 
-l mdbx $(EXE_LDFLAGS) -o $@ ############################################################################### @@ -142,7 +185,7 @@ bench-$(1)_$(2).txt: $(3) $(IOARENA) Makefile endef -$(eval $(call bench-rule,mdbx,$(NN),libmdbx.so)) +$(eval $(call bench-rule,mdbx,$(NN),libmdbx.$(SO_SUFFIX))) $(eval $(call bench-rule,sophia,$(NN))) $(eval $(call bench-rule,leveldb,$(NN))) diff --git a/libs/libmdbx/src/README-RU.md b/libs/libmdbx/src/README-RU.md index 97888cbf88..e040b5c4bb 100644 --- a/libs/libmdbx/src/README-RU.md +++ b/libs/libmdbx/src/README-RU.md @@ -1,56 +1,55 @@ +### The [repository now only mirrored on the Github](https://abf.io/erthink/libmdbx) due to illegal discriminatory restrictions for Russian Crimea and for sovereign crimeans. +<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences --> +----- + libmdbx ====================================== -**The revised and extended descendant of [Symas LMDB](https://symas.com/lmdb/).** - -*The Future will Positive. Всё будет хорошо.* -[](https://travis-ci.org/leo-yuriev/libmdbx) -[](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master) -[](https://scan.coverity.com/projects/reopen-libmdbx) +Доработанный и расширенный потомок [Lightning Memory-Mapped Database](https://ru.bmstu.wiki/LMDB_(Lightning_Memory-Mapped_Database)) (aka _LMDB_). +English version is [here](README.md). -English version [by Google](https://translate.googleusercontent.com/translate_c?act=url&ie=UTF8&sl=ru&tl=en&u=https://github.com/leo-yuriev/libmdbx/tree/master) -and [by Yandex](https://translate.yandex.ru/translate?url=https%3A%2F%2Fgithub.com%2FReOpen%2Flibmdbx%2Ftree%2Fmaster&lang=ru-en). +_libmdbx_ превосходит LMDB по возможностям и надежности, не уступая в +производительности. _libmdbx_ работает на Linux, FreeBSD, MacOS X и +других ОС соответствующих POSIX.1-2008, а также поддерживает Windows в +качестве дополнительной платформы. 
-### Project Status +Отдельно ведётся не-публичная разработка следующей версии, в которой +будет кардинальное изменение как API, так и формата базы данных. Цель +этой революции - обеспечение более четкого и надежного API, добавление +новых функций, а также наделение базы данных новыми свойствами. -**Сейчас MDBX _активно перерабатывается_** предстоит -большое изменение как API, так и формата базы данных. К сожалению, -обновление приведет к потере совместимости с предыдущими версиями. +*Всё будет хорошо. The Future will (be) [Positive](https://www.ptsecurity.ru).* -Цель этой революции - обеспечение более четкого надежного API и -добавление новых функции, а также наделение базы данных новыми -свойствами. - -В настоящее время MDBX предназначена для Linux, а также поддерживает -Windows (начиная с Windows Server 2008) в качестве дополнительной -платформы. Поддержка других ОС может быть обеспечена на коммерческой -основе. Однако такие усовершенствования (т. е. pull-requests) могут быть -приняты в мейнстрим только в том случае, если будет доступен -соответствующий публичный и бесплатный сервис непрерывной интеграции -(aka Continuous Integration). 
+[](https://travis-ci.org/leo-yuriev/libmdbx) +[](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master) +[](https://scan.coverity.com/projects/reopen-libmdbx) ## Содержание - [Обзор](#Обзор) - - [Сравнение с другими СУБД](#Сравнение-с-другими-СУБД) - - [История & Acknowledgments](#История) -- [Основные свойства](#Основные-свойства) -- [Доработки и усовершенствования относительно LMDB](#Доработки-и-усовершенствования-относительно-lmdb) -- [Недостатки и Компромиссы](#Недостатки-и-Компромиссы) - - [Проблема долгих чтений](#Проблема-долгих-чтений) - - [Сохранность данных в режиме асинхронной фиксации](#Сохранность-данных-в-режиме-асинхронной-фиксации) + - [Сравнение с другими базами данных](#Сравнение-с-другими-базами-данных) + - [История & Выражение признательности](#История) +- [Описание](#Описание) + - [Ключевые свойства](#Ключевые-свойства) + - [Доработки и усовершенствования относительно LMDB](#Доработки-и-усовершенствования-относительно-lmdb) + - [Недостатки и Компромиссы](#Недостатки-и-Компромиссы) + - [Проблема долгих чтений](#Проблема-долгих-чтений) + - [Сохранность данных в режиме асинхронной фиксации](#Сохранность-данных-в-режиме-асинхронной-фиксации) +- [Использование](#Использование) + - [Сборка](#Сборка) + - [Привязки к другим языкам](#Привязки-к-другим-языкам) - [Сравнение производительности](#Сравнение-производительности) - - [Интегральная производительность](#Интегральная-производительность) - - [Масштабируемость чтения](#Масштабируемость-чтения) - - [Синхронная фиксация](#Синхронная-фиксация) - - [Отложенная фиксация](#Отложенная-фиксация) - - [Асинхронная фиксация](#Асинхронная-фиксация) - - [Потребление ресурсов](#Потребление-ресурсов) + - [Интегральная производительность](#Интегральная-производительность) + - [Масштабируемость чтения](#Масштабируемость-чтения) + - [Синхронная фиксация](#Синхронная-фиксация) + - [Отложенная фиксация](#Отложенная-фиксация) + - [Асинхронная фиксация](#Асинхронная-фиксация) + - [Потребление 
ресурсов](#Потребление-ресурсов) +----- ## Обзор _libmdbx_ - это встраиваемый key-value движок хранения со специфическим набором свойств и возможностей, ориентированный на создание уникальных -легковесных решений с предельной производительностью под Linux и -Windows. +легковесных решений с предельной производительностью. _libmdbx_ позволяет множеству процессов совместно читать и обновлять несколько key-value таблиц с соблюдением @@ -79,11 +78,12 @@ _libmdbx_ не использует сценарии использования. -### Сравнение с другими СУБД -Ввиду того, что в _libmdbx_ сейчас происходит революция, я посчитал -лучшим решением ограничится здесь ссылкой на [главу Comparison with -other databases](https://github.com/coreos/bbolt#comparison-with-other-databases) -в описании _BoltDB_. +### Сравнение с другими базами данных + +На данный момент, пожалуйста, обратитесь к [главе "сравнение BoltDB с +другими базами +данных"](https://github.com/coreos/bbolt#comparison-with-other-databases), +которая также (в основном) применима к MDBX. ### История @@ -104,16 +104,20 @@ Tables](https://github.com/leo-yuriev/libfpta), aka ["Позитивные Technologies](https://www.ptsecurity.ru). -#### Acknowledgments -Howard Chu (Symas Corporation) - the author of LMDB, from which -originated the MDBX in 2015. +### Выражение признательности + +Говард Чу (Howard Chu) <hyc@openldap.org> является автором движка LMDB, от +которого в 2015 году произошел MDBX. + +Мартин Хеденфальк (Martin Hedenfalk) <martin@bzero.se> является автором кода +`btree.c`, который использовался для начала разработки LMDB. -Martin Hedenfalk <martin@bzero.se> - the author of `btree.c` code, which -was used for begin development of LMDB. +----- +Описание +======== -Основные свойства -================= +## Ключевые свойства _libmdbx_ наследует все ключевые возможности и особенности своего прародителя @@ -168,8 +172,7 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N). кэшированием. Всё необходимое штатно выполняет ядро ОС. 
-Доработки и усовершенствования относительно LMDB -================================================ +## Доработки и усовершенствования относительно LMDB 1. Автоматическое динамическое управление размером БД согласно параметрам задаваемым функцией `mdbx_env_set_geometry()`, включая шаг @@ -328,7 +331,25 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N). > - попытки повторного освобождения памяти; > - повреждение памяти и ошибки сегментации. --------------------------------------------------------------------------------- +32. На **MacOS X** для синхронизации данных с диском _по-умолчанию_ +используется системная функция `fcntl(F_FULLFSYNC)`, так как [только +этим гарантируется сохранность +данных](https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fsync.2.html) +при сбое электропитания. К сожалению, в сценариях с высокой +интенсивностью пишущих транзакций, использование `F_FULLFSYNC` приводит +к существенной деградации производительности в сравнении с LMDB, где +используется системная функция `fsync()`. Поэтому _libmdbx_ позволяет +переопределить это поведение определением опции +`MDBX_OSX_SPEED_INSTEADOF_DURABILITY=1` при сборке библиотеки. + +33. На **Windows** _libmdbx_ использует файловые блокировки +`LockFileEx()`, так как это позволяет размещать БД на сетевых дисках, а +также обеспечивает защиту от некомпетентных действий пользователя +([защиту от +дурака](https://ru.wikipedia.org/wiki/%D0%97%D0%B0%D1%89%D0%B8%D1%82%D0%B0_%D0%BE%D1%82_%D0%B4%D1%83%D1%80%D0%B0%D0%BA%D0%B0)). +Поэтому _libmdbx_ может немного отставать в тестах производительность от +LMDB, где используются именованные мьютексы. + ## Недостатки и Компромиссы @@ -386,7 +407,7 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N). > Подробности ниже. 
-#### Проблема долгих чтений +### Проблема долгих чтений *Следует отметить*, что проблема "сборки мусора" так или иначе существует во всех СУБД (Vacuum в PostgreSQL). Однако в случае _libmdbx_ и LMDB она проявляется более остро, прежде всего из-за высокой @@ -452,7 +473,7 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N). за счет эффективной работы [BBWC](https://en.wikipedia.org/wiki/BBWC) при включении `LIFO RECLAIM` в _libmdbx_. -#### Сохранность данных в режиме асинхронной фиксации +### Сохранность данных в режиме асинхронной фиксации При работе в режиме `WRITEMAP+MAPSYNC` запись измененных страниц выполняется ядром ОС, что имеет ряд преимуществ. Так например, при крахе приложения, ядро ОС сохранит все изменения. @@ -515,6 +536,119 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N). -------------------------------------------------------------------------------- +Использование +============= + +## Сборка + +Для сборки на всех платформах кроме Windows вам потребуются не-дремучие +версии: GNU Make, [bash](https://ru.wikipedia.org/wiki/Bash), компиляторы C и C++ совместимые с GCC или CLANG. + +Исторически сборка _libmdbx_ основывается на одном +[Makefile](https://ru.wikipedia.org/wiki/Makefile), что предполагает +разные рецепты сборки в зависимости от целевой платформы. В следующих +версиях планируется переход на использование +[CMake](https://ru.wikipedia.org/wiki/CMake), с отказом от поддержки +других инструментов. + +#### Выгрузка DSO/DLL и деструкторы Thread-Local-Storage объектов +При сборке _libmdbx_ в виде разделяемой библиотеки, либо использовании +статической _libmdbx_ в составе другой динамической библиотеки, +желательно убедиться, что ваша система обеспечивает корректность вызова +деструкторов Thread-Local-Storage объектов при выгрузке динамических +библиотек.
+ +Если это не так, то при выгрузке динамической библиотеки с _libmdbx_ +внутри возможна либо утечка ресурсов, либо падения из-за вызова +деструкторов из уже выгруженного DSO/DLL объекта. Проблема может +проявляться только в многопоточном приложении, которое производит +выгрузку разделяемых динамических библиотек с кодом _libmdbx_ внутри, +после использования _libmdbx_. Заведомо известно, что TLS-деструкторы +корректно обслуживаются: + +- На всех актуальных версиях Windows (Windows 7 и последующих). + +- На системах c функцией +[`__cxa_thread_atexit_impl()`](https://sourceware.org/glibc/wiki/Destructor%20support%20for%20thread_local%20variables) +в стандартной библиотеке C. В том числе на системах с GNU libc версии +2.18 и выше. + +- На системах с libpthread/nptl из состава GNU libc с исправлением +ошибок [#21031](https://sourceware.org/bugzilla/show_bug.cgi?id=21031) и +[#21032](https://sourceware.org/bugzilla/show_bug.cgi?id=21032), либо +где нет подобных ошибок в реализации pthreads. + +### Linux и другие платформы с GNU Make +Для сборки библиотеки достаточно выполнить `make all` в директории с +исходными текстами, а для выполнения базовых тестов `make check`. + +Если установленный в системе `make` не является GNU Make, то при попытке +сборки будет масса ошибок от make. В этом случае, возможно, вместо +`make` вам следует использовать `gmake`, либо даже `gnu-make` и т.п. + +### FreeBSD и родственные платформы +Как правило, на таких системах по-умолчанию используется Berkeley Make. +А GNU Make вызывается командой `gmake` или может отсутствовать. Кроме +этого может отсутствовать [`bash`](https://ru.wikipedia.org/wiki/Bash). + +Вам необходимо установить требуемые компоненты: GNU Make, bash, +компиляторы C и C++ совместимые с GCC или CLANG. После этого для сборки +библиотеки достаточно выполнить `gmake all` (или `make all`) в +директории с исходными текстами, а для выполнения базовых тестов `gmake +check` (или `make check`).
+ +### Windows +Для сборки _libmdbx_ для ОС Windows рекомендуется использовать [Microsoft +Visual Studio](https://ru.wikipedia.org/wiki/Microsoft_Visual_Studio), +но не такие инструменты как MinGW, MSYS или Cygwin. Для этого в набор +исходных кодов _libmdbx_ входят соответствующие файлы проектов +совместимые с Visual Studio 2015, Windows SDK для Windows 8.1 и более +поздними версиями. Достаточно открыть `mdbx.sln` и выполнить сборку +библиотеки. + +Для сборки с более новыми версиями SDK или Visual Studio должно быть +достаточно выполнить "Retarget solution". Для сборки под старые версии +Windows (например Windows XP) или более старыми компиляторами вам +потребуется самостоятельно преобразовать или воссоздать файлы проектов. + +Сборка посредством MinGW, MSYS или Cygwin потенциально возможна. Однако, +эти сценарии не тестируются и вероятно потребуют от вас доработки +`Makefile`. Следует отметить, что в _libmdbx_ предприняты усилия для +устранения runtime зависимостей от CRT и других библиотек Visual Studio. +Для этого достаточно при сборке определить опцию `MDBX_AVOID_CRT`. + +Пример запуска базового сценария тестирования можно найти в +[CI-сценарии](appveyor.yml) для [AppVeyor](https://www.appveyor.com/). +Для выполнения [сценария длительного стохастического +тестирования](test/long_stochastic.sh) потребуется +[`bash`](https://ru.wikipedia.org/wiki/Bash), а само тестирование +рекомендуется выполнять с размещением тестовых данных на +[RAM-диске](https://ru.wikipedia.org/wiki/RAM-%D0%B4%D0%B8%D1%81%D0%BA). + +### MacOS X +Актуальные [нативные сборочные +инструменты](https://ru.wikipedia.org/wiki/Xcode) для MacOS X включают +GNU Make, CLANG и устаревшую версию bash. Поэтому для сборки библиотеки +достаточно выполнить `make all` в директории с исходными текстами, а для +выполнения базовых тестов `make check`. Если же что-то пойдет не так, то +рекомендуется установить [Homebrew](https://brew.sh/) и попробовать ещё +раз.
+ +Для выполнения [сценария длительного стохастического +тестирования](test/long_stochastic.sh) потребуется установка актуальной +(не устаревшей) версии [`bash`](https://ru.wikipedia.org/wiki/Bash). Для +этого рекомендуется установить [Homebrew](https://brew.sh/), а затем +выполнить `brew install bash`. + +## Привязки к другим языкам + + | Runtime | GitHub | Author | + | -------- | ------ | ------ | + | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | + | .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | + +-------------------------------------------------------------------------------- + Сравнение производительности ============================ @@ -526,9 +660,8 @@ SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Гб. сценарии тестирования [доступны на github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). --------------------------------------------------------------------------------- -### Интегральная производительность +## Интегральная производительность Показана соотнесенная сумма ключевых показателей производительности в трёх бенчмарках: @@ -555,7 +688,7 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). -------------------------------------------------------------------------------- -### Масштабируемость чтения +## Масштабируемость чтения Для каждого движка показана суммарная производительность при одновременном выполнении запросов чтения/поиска в 1-2-4-8 потоков на @@ -565,7 +698,7 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). -------------------------------------------------------------------------------- -### Синхронная фиксация +## Синхронная фиксация - Линейная шкала слева и темные прямоугольники соответствуют количеству транзакций в секунду, усредненному за всё время теста. @@ -591,7 +724,7 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). 
-------------------------------------------------------------------------------- -### Отложенная фиксация +## Отложенная фиксация - Линейная шкала слева и темные прямоугольники соответствуют количеству транзакций в секунду, усредненному за всё время теста. @@ -622,7 +755,7 @@ _libmdbx_ при этом не ведет WAL, а передает весь ко -------------------------------------------------------------------------------- -### Асинхронная фиксация +## Асинхронная фиксация - Линейная шкала слева и темные прямоугольники соответствуют количеству транзакций в секунду, усредненному за всё время теста. @@ -652,7 +785,7 @@ _libmdbx_ при этом не ведет WAL, а передает весь ко -------------------------------------------------------------------------------- -### Потребление ресурсов +## Потребление ресурсов Показана соотнесенная сумма использованных ресурсов в ходе бенчмарка в режиме отложенной фиксации: diff --git a/libs/libmdbx/src/README.md b/libs/libmdbx/src/README.md index 78cc906763..a5b420f35b 100644 --- a/libs/libmdbx/src/README.md +++ b/libs/libmdbx/src/README.md @@ -1,159 +1,139 @@ +### The [repository now only mirrored on the Github](https://abf.io/erthink/libmdbx) due to illegal discriminatory restrictions for Russian Crimea and for sovereign crimeans. +<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences --> +----- + libmdbx ====================================== -**Revised and extended descendant of [Symas LMDB](https://symas.com/lmdb/).** +Revised and extended descendant of [Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) (aka _LMDB_). +Русскоязычная версия [здесь](README-RU.md). 
-*The Future will be positive.* -[](https://travis-ci.org/leo-yuriev/libmdbx) -[](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master) -[](https://scan.coverity.com/projects/reopen-libmdbx) +_libmdbx_ is superior to LMDB in terms of features and reliability, and is not +inferior in performance. _libmdbx_ works on Linux, FreeBSD, MacOS X and +other systems compliant with POSIX.1-2008, and also supports Windows as a +complementary platform. -## Project Status for now - - - The stable versions - ([_stable/0.0_](https://github.com/leo-yuriev/libmdbx/tree/stable/0.0) - and - [_stable/0.1_](https://github.com/leo-yuriev/libmdbx/tree/stable/0.1) - branches) of _MDBX_ are frozen, i.e. no new features or API changes, but - only bug fixes. - - - The next version - ([_devel_](https://github.com/leo-yuriev/libmdbx/tree/devel) branch) - **is under active non-public development**, i.e. current API and set of - features are extreme volatile. - - - The immediate goal of development is formation of the stable API and - the stable internal database format, which allows realise all PLANNED - FEATURES: - 1. Integrity check by [Merkle tree](https://en.wikipedia.org/wiki/Merkle_tree); - 2. Support for [raw block devices](https://en.wikipedia.org/wiki/Raw_device); - 3. Separate place (HDD) for large data items; - 4. Using "[Roaring bitmaps](http://roaringbitmap.org/about/)" inside garbage collector; - 5. Non-sequential reclaiming, like PostgreSQL's [Vacuum](https://www.postgresql.org/docs/9.1/static/sql-vacuum.html); - 6. [Asynchronous lazy data flushing](https://sites.fas.harvard.edu/~cs265/papers/kathuria-2008.pdf) to disk(s); - 7. etc... - -Don't miss libmdbx for other runtimes.
- -| Runtime | GitHub | Author | -| ------------- | ------------- | ------------- | -| JVM | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | -| .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | +The next version is under active non-public development, which will +radically change both the API and the database format. The goal of this +revolution is to provide a clearer and more reliable API, add more +features and new database properties. ------ +*The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.* -Nowadays MDBX intended for Linux, and support Windows (since Windows -Server 2008) as a complementary platform. Support for other OS could be -implemented on commercial basis. However such enhancements (i.e. pull -requests) could be accepted in mainstream only when corresponding public -and free Continuous Integration service will be available. +[](https://travis-ci.org/leo-yuriev/libmdbx) +[](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master) +[](https://scan.coverity.com/projects/reopen-libmdbx) -## Contents +## Table of Contents - [Overview](#overview) - - [Comparison with other DBs](#comparison-with-other-dbs) - - [History & Acknowledgments](#history) -- [Main features](#main-features) -- [Improvements over LMDB](#improvements-over-lmdb) -- [Gotchas](#gotchas) - - [Long-time read transactions problem](#long-time-read-transactions-problem) - - [Data safety in async-write-mode](#data-safety-in-async-write-mode) + - [Comparison with other databases](#comparison-with-other-databases) + - [History & Acknowledgments](#history) +- [Description](#description) + - [Key features](#key-features) + - [Improvements over LMDB](#improvements-over-lmdb) + - [Gotchas](#gotchas) + - [Problem of long-time reading](#problem-of-long-time-reading) + - [Durability in asynchronous writing mode](#durability-in-asynchronous-writing-mode) +- 
[Usage](#usage) + - [Building](#building) + - [Bindings](#bindings) - [Performance comparison](#performance-comparison) - - [Integral performance](#integral-performance) - - [Read scalability](#read-scalability) - - [Sync-write mode](#sync-write-mode) - - [Lazy-write mode](#lazy-write-mode) - - [Async-write mode](#async-write-mode) - - [Cost comparison](#cost-comparison) + - [Integral performance](#integral-performance) + - [Read scalability](#read-scalability) + - [Sync-write mode](#sync-write-mode) + - [Lazy-write mode](#lazy-write-mode) + - [Async-write mode](#async-write-mode) + - [Cost comparison](#cost-comparison) +----- ## Overview _libmdbx_ is an embedded lightweight key-value database engine oriented -for performance under Linux and Windows. +for performance. _libmdbx_ allows multiple processes to read and update several key-value tables concurrently, while being [ACID](https://en.wikipedia.org/wiki/ACID)-compliant, with minimal -overhead and operation cost of Olog(N). - -_libmdbx_ provides -[serializability](https://en.wikipedia.org/wiki/Serializability) and -consistency of data after crash. Read-write transactions don't block -read-only transactions and are -[serialized](https://en.wikipedia.org/wiki/Serializability) by -[mutex](https://en.wikipedia.org/wiki/Mutual_exclusion). +overhead and O(log N) operation cost. -_libmdbx_ +_libmdbx_ enforces +[serializability](https://en.wikipedia.org/wiki/Serializability) for +writers by a single +[mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) and affords [wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom) -provides parallel read transactions without atomic operations or -synchronization primitives. +for parallel readers without atomic/interlocked operations, while +writing and reading transactions do not block each other. + +_libmdbx_ can guarantee consistency after a crash depending on the operation +mode.
_libmdbx_ uses [B+Trees](https://en.wikipedia.org/wiki/B%2B_tree) and -[mmap](https://en.wikipedia.org/wiki/Memory-mapped_file), doesn't use -[WAL](https://en.wikipedia.org/wiki/Write-ahead_logging). This might -have caveats for some workloads. +[Memory-Mapping](https://en.wikipedia.org/wiki/Memory-mapped_file), +doesn't use [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) +which might be a caveat for some workloads. -### Comparison with other DBs -Because _libmdbx_ is currently overhauled, I think it's better to just -link [chapter of Comparison with other -databases](https://github.com/coreos/bbolt#comparison-with-other-databases) -here. +### Comparison with other databases +For now please refer to [chapter of "BoltDB comparison with other +databases"](https://github.com/coreos/bbolt#comparison-with-other-databases) +which is also (mostly) applicable to MDBX. ### History The _libmdbx_ design is based on [Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). Initial development was going in -[ReOpenLDAP](https://github.com/leo-yuriev/ReOpenLDAP) project, about a -year later it received separate development effort and in autumn 2015 -was isolated to separate project, which was [presented at Highload++ -2015 conference](http://www.highload.ru/2015/abstracts/1831.html). +[ReOpenLDAP](https://github.com/leo-yuriev/ReOpenLDAP) project. About a +year later libmdbx was isolated to separate project, which was +[presented at Highload++ 2015 +conference](http://www.highload.ru/2015/abstracts/1831.html). -Since early 2017 _libmdbx_ is used in [Fast PositiveTables](https://github.com/leo-yuriev/libfpta), -by [Positive Technologies](https://www.ptsecurity.com). +Since early 2017 _libmdbx_ is used in [Fast Positive Tables](https://github.com/leo-yuriev/libfpta), +and development is funded by [Positive Technologies](https://www.ptsecurity.com). 
-#### Acknowledgments -Howard Chu (Symas Corporation) - the author of LMDB, from which +### Acknowledgments +Howard Chu <hyc@openldap.org> is the author of LMDB, from which originated the MDBX in 2015. -Martin Hedenfalk <martin@bzero.se> - the author of `btree.c` code, which +Martin Hedenfalk <martin@bzero.se> is the author of `btree.c` code, which was used for begin development of LMDB. +----- + +Description +=========== -Main features -============= +## Key features -_libmdbx_ inherits all keys features and characteristics from +_libmdbx_ inherits all features and characteristics from [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database): -1. Data is stored in ordered map, keys are always sorted, range lookups -are supported. +1. Key-value pairs are stored in ordered map(s), keys are always sorted, +range lookups are supported. -2. Data is [mmaped](https://en.wikipedia.org/wiki/Memory-mapped_file) to -memory of each worker DB process, read transactions are zero-copy. +2. Data is [memory-mapped](https://en.wikipedia.org/wiki/Memory-mapped_file) +into each worker DB process, and could be accessed zero-copy from transactions. 3. Transactions are -[ACID](https://en.wikipedia.org/wiki/ACID)-compliant, thanks to +[ACID](https://en.wikipedia.org/wiki/ACID)-compliant, through to [MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control) and [CoW](https://en.wikipedia.org/wiki/Copy-on-write). Writes are strongly serialized and aren't blocked by reads, transactions can't conflict with each other. Reads are guaranteed to get only commited data ([relaxing serializability](https://en.wikipedia.org/wiki/Serializability#Relaxing_serializability)). -4. Reads and queries are +4. Read transactions are [non-blocking](https://en.wikipedia.org/wiki/Non-blocking_algorithm), -don't use [atomic -operations](https://en.wikipedia.org/wiki/Linearizability#High-level_atomic_operations). 
+don't use [atomic operations](https://en.wikipedia.org/wiki/Linearizability#High-level_atomic_operations). Readers don't block each other and aren't blocked by writers. Read performance scales linearly with CPU core count. - > Though "connect to DB" (start of first read transaction in thread) and - > "disconnect from DB" (shutdown or thread termination) requires to - > acquire a lock to register/unregister current thread from "readers - > table" + > Nonetheless, "connect to DB" (starting the first read transaction in a thread) and + > "disconnect from DB" (closing DB or thread termination) requires a lock + > acquisition to register/unregister at the "readers table". 5. Keys with multiple values are stored efficiently without key -duplication, sorted by value, including integers (reasonable for +duplication, sorted by value, including integers (valuable for secondary indexes). -6. Efficient operation on short fixed length keys, including integer -ones. +6. Efficient operation on short fixed length keys, +including 32/64-bit integer types. 7. [WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write Amplification Factor) and RAF (Read Amplification Factor) are O(log N). @@ -161,14 +141,12 @@ Amplification Factor) and RAF (Read Amplification Factor) are O(log N). 8. No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) and transaction journal. In case of a crash no recovery needed. No need for regular maintenance. Backups can be made on the fly on working DB - without freezing writers. +without freezing writers. -9. No custom memory management, all done with standard OS syscalls. +9. No additional memory management, all done by basic OS services. --------------------------------------------------------------------------------- -Improvements over LMDB -====================== +## Improvements over LMDB 1. Automatic dynamic DB size management according to the parameters specified by `mdbx_env_set_geometry()` function.
Including @@ -192,21 +170,23 @@ additional resources for that. [BBWC](https://en.wikipedia.org/wiki/Disk_buffer#Write_acceleration) this may greatly improve write performance. -4. Fast estimation of range query result size via functions `mdbx_estimate_range()`, -`mdbx_estimate_move()` and `mdbx_estimate_distance()`. E.g. for selection the -optimal query execution plan. +4. Fast estimation of range query result size via functions +`mdbx_estimate_range()`, `mdbx_estimate_move()` and +`mdbx_estimate_distance()`. E.g. for selection the optimal query +execution plan. 5. `mdbx_chk` tool for DB integrity check. -6. Support for keys and values of zero length, including sorted -duplicates. +6. Support for keys and values of zero length, including multi-values +(aka sorted duplicates). -7. Ability to assign up to 3 markers to commiting transaction with -`mdbx_canary_put()` and then get them in read transaction by -`mdbx_canary_get()`. +7. Ability to assign up to 3 persistent 64-bit markers to commiting +transaction with `mdbx_canary_put()` and then get them in read +transaction by `mdbx_canary_get()`. 8. Ability to update or delete record and get previous value via -`mdbx_replace()`. Also can update specific multi-value. +`mdbx_replace()`. Also allows update the specific item from multi-value +with the same key. 9. Sequence generation via `mdbx_dbi_sequence()`. @@ -228,30 +208,29 @@ duplicates. * abort current write transaction with returning error code. -11. Ability to open DB in exclusive mode with `MDBX_EXCLUSIVE` flag. +11. Ability to open DB in exclusive mode by `MDBX_EXCLUSIVE` flag. -12. Ability to get how far current read-only snapshot is from latest -version of the DB by `mdbx_txn_straggler()`. +12. Ability to get how far current read-transaction snapshot lags +from the latest version of the DB by `mdbx_txn_straggler()`. -13. Ability to explicitly request update of present record without -creating new record. Implemented as `MDBX_CURRENT` flag for -`mdbx_put()`. 
+13. Ability to explicitly update the existing record, not insertion +a new one. Implemented as `MDBX_CURRENT` flag for `mdbx_put()`. 14. Fixed `mdbx_cursor_count()`, which returns correct count of -duplicated for all table types and any cursor position. +duplicated (aka multi-value) for all cases and any cursor position. 15. `mdbx_env_info()` to getting additional info, including number of -the oldest snapshot of DB, which is used by one of the readers. +the oldest snapshot of DB, which is used by someone of the readers. 16. `mdbx_del()` doesn't ignore additional argument (specifier) `data` for tables without duplicates (without flag `MDBX_DUPSORT`), if `data` is not null then always uses it to verify record, which is being deleted. -17. Ability to open dbi-table with simultaneous setup of comparators for -keys and values, via `mdbx_dbi_open_ex()`. +17. Ability to open dbi-table with simultaneous with race-free setup +of comparators for keys and values, via `mdbx_dbi_open_ex()`. -18. `mdbx_is_dirty()`to find out if key or value is on dirty page, that +18. `mdbx_is_dirty()`to find out if given key or value is on dirty page, that useful to avoid copy-out before updates. 19. Correct update of current record in `MDBX_CURRENT` mode of @@ -266,21 +245,21 @@ useful to avoid copy-out before updates. 22. Ability to get value by key and duplicates count by `mdbx_get_ex()`. 23. Functions `mdbx_cursor_on_first()` and `mdbx_cursor_on_last()`, -which allows to know if cursor is currently on first or last position +which allows to check cursor is currently on first or last position respectively. -24. Automatic creation of synchronization points (flush changes to -persistent storage) when changes reach set threshold (threshold can be -set by `mdbx_env_set_syncbytes()`). +24. Automatic creation of steady commit-points (flushing data to the +disk) when the volume of changes reaches a threshold, which can be +set by `mdbx_env_set_syncbytes()`. 25. 
Control over debugging and receiving of debugging messages via `mdbx_setup_debug()`. -26. Function `mdbx_env_pgwalk()` for page-walking all pages in DB. +26. Function `mdbx_env_pgwalk()` for page-walking the DB. -27. Three meta-pages instead of two, this allows to guarantee -consistently update weak sync-points without risking to corrupt last -steady sync-point. +27. Three meta-pages instead of two, that allows to guarantee +consistency of data when updating weak commit-points without the +risk of damaging the last steady commit-point. 28. Guarantee of DB integrity in `WRITEMAP+MAPSYNC` mode: > Current _libmdbx_ gives a choice of safe async-write mode (default) @@ -292,9 +271,9 @@ steady sync-point. creation of steady synchronization point) via `mdbx_env_close_ex()`. 30. If read transaction is aborted via `mdbx_txn_abort()` or -`mdbx_txn_reset()` then DBI-handles, which were opened in it, aren't -closed or deleted. This allows to avoid several types of hard-to-debug -errors. +`mdbx_txn_reset()` then DBI-handles, which were opened during it, +will not be closed or deleted. In several cases this allows +to avoid hard-to-debug errors. 31. All cursors in all read and write transactions can be reused by `mdbx_cursor_renew()` and MUST be freed explicitly. @@ -307,13 +286,29 @@ errors. > - double-free; > - memory corruption and segfaults. --------------------------------------------------------------------------------- + +32. On **Mac OS X** the `fcntl(F_FULLFSYNC)` syscall is used _by +default_ to synchronize data with the disk, as this is [the only way to +guarantee data +durability](https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fsync.2.html) +in case of power failure. Unfortunately, in scenarios with high write +intensity, the use of `F_FULLFSYNC` significant degrades performance +compared to LMDB, where the `fsync()` syscall is used. 
Therefore, +_libmdbx_ allows you to override this behavior by defining the +`MDBX_OSX_SPEED_INSTEADOF_DURABILITY=1` option while build the library. + +33. On **Windows** the `LockFileEx()` syscall is used for locking, since +it allows place the database on network drives, and provides protection +against incompetent user actions (aka +[poka-yoke](https://en.wikipedia.org/wiki/Poka-yoke)). Therefore +_libmdbx_ may be a little lag in performance tests from LMDB where a +named mutexes are used. + ## Gotchas -1. At one moment there can be only one writer. But this allows to -serialize writes and eliminate any possibility of conflict or logical -errors during transaction rollback. +1. There cannot be more than one writer at a time. This allows serialize an +updates and eliminate any possibility of conflicts, deadlocks or logical errors. 2. No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) means relatively big [WAF](https://en.wikipedia.org/wiki/Write_amplification) @@ -346,7 +341,7 @@ performance bottleneck in `MAPASYNC` mode. > storage then it's much more preferable to use `std::map`. -4. LMDB has a problem of long-time readers which degrades performance +4. _LMDB_ has a problem of long-time readers which degrades performance and bloats DB. > _libmdbx_ addresses that, details below. @@ -357,56 +352,41 @@ of data. > Details below. -#### Long-time read transactions problem +### Problem of long-time reading Garbage collection problem exists in all databases one way or another (e.g. VACUUM in PostgreSQL). But in _libmdbx_ and LMDB it's even more -important because of high performance and deliberate simplification of -internals with emphasis on performance. +discernible because of high transaction rate and intentional internals +simplification in favor of performance. + +Understanding the problem requires some explanation, but can be +difficult for quick perception. 
So it is reasonable +to simplify this as follows: -* Altering data during long read operation may exhaust available space -on persistent storage. +* Massive altering of data during a parallel long read operation may +exhaust the free DB space. -* If available space is exhausted then any attempt to update data -results in `MAP_FULL` error until long read operation ends. +* If the available space is exhausted, any attempt to update the data +will cause a "MAP_FULL" error until a long read transaction is +completed. -* Main examples of long readers is hot backup and debugging of client -application which actively uses read transactions. +* A good example of long readers is a hot backup or debugging of +a client application while retaining an active read transaction. * In _LMDB_ this results in degraded performance of all operations of -syncing data to persistent storage. - -* _libmdbx_ has a mechanism which aborts such operations and `LIFO RECLAIM` -mode which addresses performance degradation. - -Read operations operate only over snapshot of DB which is consistent on -the moment when read transaction started. This snapshot doesn't change -throughout the transaction but this leads to inability to reclaim the -pages until read transaction ends. - -In _LMDB_ this leads to a problem that memory pages, allocated for -operations during long read, will be used for operations and won't be -reclaimed until DB process terminates. In _LMDB_ they are used in -[FIFO](https://en.wikipedia.org/wiki/FIFO_(computing_and_electronics)) -manner, which causes increased page count and less chance of cache hit -during I/O. In other words: one long-time reader can impact performance -of all database until it'll be reopened. - -_libmdbx_ addresses the problem, details below. Illustrations to this -problem can be found in the -[presentation](http://www.slideshare.net/leoyuriev/lmdb).
There is also -example of performance increase thanks to -[BBWC](https://en.wikipedia.org/wiki/Disk_buffer#Write_acceleration) -when `LIFO RECLAIM` enabled in _libmdbx_. - -#### Data safety in async-write mode -In `WRITEMAP+MAPSYNC` mode dirty pages are written to persistent storage -by kernel. This means that in case of application crash OS kernel will -write all dirty data to disk and nothing will be lost. But in case of -hardware malfunction or OS kernel fatal error only some dirty data might -be synced to disk, and there is high probability that pages with -metadata saved, will point to non-saved, hence non-existent, data pages. -In such situation, DB is completely corrupted and can't be repaired even -if there was full sync before the crash via `mdbx_env_sync(). +writing data to persistent storage. + +* _libmdbx_ has the `OOM-KICK` mechanism which allow to abort such +operations and the `LIFO RECLAIM` mode which addresses performance +degradation. + +### Durability in asynchronous writing mode +In `WRITEMAP+MAPSYNC` mode updated (aka dirty) pages are written to +persistent storage by the OS kernel. This means that if the application +fails, the OS kernel will finish writing all updated data to disk and +nothing will be lost. However, in the case of hardware malfunction or OS +kernel fatal error, only some updated data can be written to disk and +the database structure is likely to be destroyed. In such situation, DB +is completely corrupted and can't be repaired. _libmdbx_ addresses this by fully reimplementing write path of data: @@ -414,39 +394,154 @@ _libmdbx_ addresses this by fully reimplementing write path of data: instead their shadow copies are used and their updates are synced after data is flushed to disk. -* During transaction commit _libmdbx_ marks synchronization points as -steady or weak depending on how much synchronization needed between RAM -and persistent storage, e.g. 
in `WRITEMAP+MAPSYNC` commited transactions -are marked as weak, but during explicit data synchronization - as -steady. +* During transaction commit _libmdbx_ marks it as a steady or weak +depending on synchronization status between RAM and persistent storage. +For instance, in the `WRITEMAP+MAPSYNC` mode committed transactions +are marked as weak by default, but as steady after explicit data flushes. * _libmdbx_ maintains three separate meta-pages instead of two. This -allows to commit transaction with steady or weak synchronization point -without losing two previous synchronization points (one of them can be -steady, and second - weak). This allows to order weak and steady -synchronization points in any order without losing consistency in case -of system crash. - -* During DB open _libmdbx_ rollbacks to the last steady synchronization -point, this guarantees database integrity. - -For data safety pages which form database snapshot with steady -synchronization point must not be updated until next steady -synchronization point. So last steady synchronization point creates -"long-time read" effect. The only difference that in case of memory -exhaustion the problem will be immediately addressed by flushing changes -to persistent storage and forming new steady synchronization point. +allows to commit transaction as steady or weak without losing two +previous commit points (one of them can be steady, and another +weak). Thus, after a fatal system failure, it will be possible to +rollback to the last steady commit point. + +* During DB open _libmdbx_ rollbacks to the last steady commit point, +this guarantees database integrity after a crash. However, if the +database opening in read-only mode, such rollback cannot be performed +which will cause returning the MDBX_WANNA_RECOVERY error. + +For data integrity a pages which form database snapshot with steady +commit point, must not be updated until next steady commit point. 
+Therefore the last steady commit point creates an effect analogous to +"long-time read". The only difference is that now in case of space +exhaustion the problem will be immediately addressed by writing changes +to disk and forming the new steady commit point. + +So in async-write mode _libmdbx_ will always use new pages until the +free DB space will be exhausted or `mdbx_env_sync()` will be invoked, +and the total write traffic to the disk will be the same as in +sync-write mode. + +Currently libmdbx gives a choice between a safe async-write mode +(default) and `UTTERLY_NOSYNC` mode which may lead to DB corruption +after a system crash, i.e. like the LMDB. + +Next version of _libmdbx_ will automatically create steady commit +points in async-write mode upon completion of data transfer to the disk. -So in async-write mode _libmdbx_ will always use new pages until memory -is exhausted or `mdbx_env_sync()` is invoked. Total disk usage will be -almost the same as in sync-write mode. +-------------------------------------------------------------------------------- -Current _libmdbx_ gives a choice of safe async-write mode (default) and -`UTTERLY_NOSYNC` mode which may result in full DB corruption during -system crash as with LMDB. +Usage +===== + +## Building + +To build on all platforms except Windows the prerequisites are the +same: non-obsolete versions of GNU Make, +[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)), C and C++ +compilers compatible with GCC or CLANG. On Windows you will need only: +Microsoft Visual Studio 2015 or later, Windows SDK for Windows 8 or +later. + +Historically, the libmdbx building is based on a single +[Makefile](https://en.wikipedia.org/wiki/Makefile) which assumes +different recipes depending on target platform. In the next versions, it +is planned to switch to [CMake](https://en.wikipedia.org/wiki/CMake), +with the refusal to support other tools.
+ +#### DSO/DLL unloading and destructors of Thread-Local-Storage objects +When building _libmdbx_ as a shared library or using static _libmdbx_ as a +part of another dynamic library, it is advisable to make sure that your +system ensures the correct calling of destructors of +Thread-Local-Storage objects when unloading dynamic libraries. + +If this is not the case, then unloading a dynamic-link library with +_libmdbx_ code inside, can result in either a resource leak or a crash +due to calling destructors from an already unloaded DSO/DLL object. The +problem can only manifest in a multithreaded application, which performs +the unloading of shared dynamic libraries with _libmdbx_ code inside, +after using _libmdbx_. It is known that TLS-destructors are properly +maintained in the following cases: + +- On all modern versions of Windows (Windows 7 and later). + +- On systems with the +[`__cxa_thread_atexit_impl()`](https://sourceware.org/glibc/wiki/Destructor%20support%20for%20thread_local%20variables) +function in the standard C library, including systems with GNU libc +version 2.18 and later. + +- On systems with libpthread/nptl from GNU libc with bug fixes +[#21031](https://sourceware.org/bugzilla/show_bug.cgi?id=21031) and +[#21032](https://sourceware.org/bugzilla/show_bug.cgi?id=21032), or +where there are no similar bugs in the pthreads implementation. + +### Linux and other platforms with GNU Make +To build the library it is enough to execute `make all` in the directory +of source code, and `make check` to execute the basic tests. + +If the `make` installed on the system is not GNU Make, there will be a +lot of errors from make when trying to build. In this case, perhaps you +should use `gmake` instead of `make`, or even `gnu-make`, etc. + +### FreeBSD and related platforms +As a rule, in such systems, the default is to use Berkeley Make. And GNU +Make is called by the gmake command or may be missing.
In addition, +[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)) may be absent. + +You need to install the required components: GNU Make, bash, C and C++ +compilers compatible with GCC or CLANG. After that, to build the +library, it is enough execute `gmake all` (or `make all`) in the +directory with source code, and `gmake check` (or `make check`) to run +the basic tests. + +### Windows +For building _libmdbx_ on Windows the [Microsoft Visual +Studio](https://en.wikipedia.org/wiki/Microsoft_Visual_Studio) is +recommended, but not tools such as MinGW, MSYS, or Cygwin. To do this, +the libmdbx source code includes the set of appropriate project files +that are compatible with Visual Studio 2015, the Windows SDK for Windows +8.1, and later. Just open `mdbx.sln` in Visual Studio and build the +library. + +To build with newer versions of the SDK or Visual Studio, it should be +sufficient to execute "Retarget solution". To build for older versions +of Windows (such as Windows XP) or by older compilers, you will need to +convert or recreate the corresponding project files yourself. + +Building by MinGW, MSYS or Cygwin is potentially possible. However, +these scripts are not tested and will probably require you to modify the +Makefile. It should be noted that in _libmdbx_ was efforts to resolve +runtime dependencies from CRT and other libraries Visual Studio. +For this is enough define the `MDBX_AVOID_CRT` during build. + +An example of running a basic test script can be found in the +[CI-script](appveyor.yml) for [AppVeyor](https://www.appveyor.com/). To +run the [long stochastic test scenario](test/long_stochastic.sh), +[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)) is required, and +the such testing is recommended with place the test data on the +[RAM-disk](https://en.wikipedia.org/wiki/RAM_drive). + +### MacOS X +Current [native build tools](https://en.wikipedia.org/wiki/Xcode) for +MacOS X include GNU Make, CLANG and an outdated version of bash. 
+Therefore, to build the library, it is enough to run `make all` in the +directory with source code, and run `make check` to execute the base +tests. If something goes wrong, it is recommended to install +[Homebrew](https://brew.sh/) and try again. + +To run the [long stochastic test scenario](test/long_stochastic.sh), you +will need to install the current (not outdated) version of +[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)). To do this, we +recommend that you install [Homebrew](https://brew.sh/) and then execute +`brew install bash`. + +## Bindings + + | Runtime | GitHub | Author | + | -------- | ------ | ------ | + | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | + | .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | -Next version of _libmdbx_ will create steady synchronization points -automatically in async-write mode. -------------------------------------------------------------------------------- @@ -458,9 +553,7 @@ and multiple [scripts](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015) runs on Lenovo Carbon-2 laptop, i7-4600U 2.1 GHz, 8 Gb RAM, SSD SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Gb. --------------------------------------------------------------------------------- - -### Integral performance +## Integral performance Here showed sum of performance metrics in 3 benchmarks: @@ -486,7 +579,7 @@ Here showed sum of performance metrics in 3 benchmarks: -------------------------------------------------------------------------------- -### Read Scalability +## Read Scalability Summary performance with concurrent read/search queries in 1-2-4-8 threads on 4 CPU cores machine. @@ -495,7 +588,7 @@ threads on 4 CPU cores machine. 
-------------------------------------------------------------------------------- -### Sync-write mode +## Sync-write mode - Linear scale on left and dark rectangles mean arithmetic mean transactions per second; @@ -517,7 +610,7 @@ and after full run the database contains 10,000 small key-value records. -------------------------------------------------------------------------------- -### Lazy-write mode +## Lazy-write mode - Linear scale on left and dark rectangles mean arithmetic mean of thousands transactions per second; @@ -544,7 +637,7 @@ records. -------------------------------------------------------------------------------- -### Async-write mode +## Async-write mode - Linear scale on left and dark rectangles mean arithmetic mean of thousands transactions per second; @@ -568,7 +661,7 @@ and after full run the database contains 10,000 small key-value records. -------------------------------------------------------------------------------- -### Cost comparison +## Cost comparison Summary of used resources during lazy-write mode benchmarks: diff --git a/libs/libmdbx/src/appveyor.yml b/libs/libmdbx/src/appveyor.yml index ca70d96265..0c0dd8a6a6 100644 --- a/libs/libmdbx/src/appveyor.yml +++ b/libs/libmdbx/src/appveyor.yml @@ -1,4 +1,4 @@ -version: 0.2.0.{build} +version: 0.3.2.{build} environment: matrix: diff --git a/libs/libmdbx/src/dll.vcxproj b/libs/libmdbx/src/dll.vcxproj index c0743890ce..5464c15875 100644 --- a/libs/libmdbx/src/dll.vcxproj +++ b/libs/libmdbx/src/dll.vcxproj @@ -99,7 +99,7 @@ </PropertyGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ClCompile> - <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions);MDBX_DEBUG=1</PreprocessorDefinitions> + <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions)</PreprocessorDefinitions> 
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> <WarningLevel>EnableAllWarnings</WarningLevel> <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> @@ -162,7 +162,7 @@ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <ClCompile> <WarningLevel>EnableAllWarnings</WarningLevel> - <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions);MDBX_DEBUG=1</PreprocessorDefinitions> + <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions)</PreprocessorDefinitions> <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> <StringPooling>true</StringPooling> <TreatWarningAsError>true</TreatWarningAsError> diff --git a/libs/libmdbx/src/libmdbx.cflags b/libs/libmdbx/src/libmdbx.cflags new file mode 100644 index 0000000000..bb8c4c3337 --- /dev/null +++ b/libs/libmdbx/src/libmdbx.cflags @@ -0,0 +1 @@ +-std=c11
\ No newline at end of file diff --git a/libs/libmdbx/src/libmdbx.cxxflags b/libs/libmdbx/src/libmdbx.cxxflags new file mode 100644 index 0000000000..6435dfce2f --- /dev/null +++ b/libs/libmdbx/src/libmdbx.cxxflags @@ -0,0 +1 @@ +-std=c++17
\ No newline at end of file diff --git a/libs/libmdbx/src/libmdbx.files b/libs/libmdbx/src/libmdbx.files index f147554a2a..cdea405b5d 100644 --- a/libs/libmdbx/src/libmdbx.files +++ b/libs/libmdbx/src/libmdbx.files @@ -1,26 +1,20 @@ -CMakeLists.txt -README-RU.md -pcrf_test/CMakeLists.txt -src/tools/CMakeLists.txt -test/CMakeLists.txt -test/append.cc -test/copy.cc -test/ttl.cc -tutorial/CMakeLists.txt -tutorial/sample-mdbx.c AUTHORS +CMakeLists.txt LICENSE Makefile +README-RU.md README.md TODO.md mdbx.h src/bits.h src/defs.h +src/lck-linux.c src/lck-posix.c src/lck-windows.c src/mdbx.c src/osal.c src/osal.h +src/tools/CMakeLists.txt src/tools/mdbx_chk.c src/tools/mdbx_copy.1 src/tools/mdbx_copy.c @@ -33,11 +27,16 @@ src/tools/mdbx_stat.c src/tools/wingetopt.c src/tools/wingetopt.h src/version.c +test/CMakeLists.txt test/actor.cc +test/append.cc test/base.h +test/cases.cc test/chrono.cc test/chrono.h +test/config.cc test/config.h +test/copy.cc test/dead.cc test/hill.cc test/jitter.cc @@ -46,16 +45,18 @@ test/keygen.h test/log.cc test/log.h test/main.cc -test/config.cc -test/cases.cc test/osal-unix.cc test/osal-windows.cc test/osal.h +test/pcrf/CMakeLists.txt test/test.cc test/test.h test/try.cc +test/ttl.cc test/utils.cc test/utils.h +tutorial/CMakeLists.txt tutorial/README.md tutorial/sample-bdb.txt tutorial/sample-mdb.txt +tutorial/sample-mdbx.c diff --git a/libs/libmdbx/src/libmdbx.includes b/libs/libmdbx/src/libmdbx.includes index 0b5ade5690..eb512a01ce 100644 --- a/libs/libmdbx/src/libmdbx.includes +++ b/libs/libmdbx/src/libmdbx.includes @@ -2,5 +2,3 @@ src src/tools test -pcrf_test -tutorial diff --git a/libs/libmdbx/src/mdbx.h b/libs/libmdbx/src/mdbx.h index 8f3b1575dc..67b725139a 100644 --- a/libs/libmdbx/src/mdbx.h +++ b/libs/libmdbx/src/mdbx.h @@ -117,7 +117,11 @@ typedef DWORD mdbx_tid_t; typedef int mdbx_filehandle_t; typedef pid_t mdbx_pid_t; typedef pthread_t mdbx_tid_t; +#ifdef ENODATA #define MDBX_ENODATA ENODATA +#else +#define MDBX_ENODATA -1 
+#endif #define MDBX_EINVAL EINVAL #define MDBX_EACCESS EACCES #define MDBX_ENOMEM ENOMEM @@ -149,8 +153,8 @@ typedef pthread_t mdbx_tid_t; #else #define __dll_export #endif -#elif defined(__GNUC__) || __has_attribute(visibility) -#define __dll_export __attribute__((visibility("default"))) +#elif defined(__GNUC__) || __has_attribute(__visibility__) +#define __dll_export __attribute__((__visibility__("default"))) #else #define __dll_export #endif @@ -736,6 +740,8 @@ LIBMDBX_API int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, * [out] stat The address of an MDBX_stat structure where the statistics * will be copied */ LIBMDBX_API int mdbx_env_stat(MDBX_env *env, MDBX_stat *stat, size_t bytes); +LIBMDBX_API int mdbx_env_stat2(const MDBX_env *env, const MDBX_txn *txn, + MDBX_stat *stat, size_t bytes); /* Return information about the MDBX environment. * @@ -743,6 +749,8 @@ LIBMDBX_API int mdbx_env_stat(MDBX_env *env, MDBX_stat *stat, size_t bytes); * [out] stat The address of an MDBX_envinfo structure * where the information will be copied */ LIBMDBX_API int mdbx_env_info(MDBX_env *env, MDBX_envinfo *info, size_t bytes); +LIBMDBX_API int mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *info, size_t bytes); /* Flush the data buffers to disk. * @@ -1578,14 +1586,15 @@ LIBMDBX_API char *mdbx_dkey(const MDBX_val *key, char *const buf, LIBMDBX_API int mdbx_env_close_ex(MDBX_env *env, int dont_sync); -/* Set threshold to force flush the data buffers to disk, +/* Sets threshold to force flush the data buffers to disk, * even of MDBX_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC flags - * in the environment. + * in the environment. The value affects all processes which operates with given + * DB until the last process close DB or a new value will be settled. * * Data is always written to disk when mdbx_txn_commit() is called, * but the operating system may keep it buffered. 
MDBX always flushes * the OS buffers upon commit as well, unless the environment was - * opened with MDBX_NOSYNC or in part MDBX_NOMETASYNC. + * opened with MDBX_NOSYNC, MDBX_MAPASYNC or in part MDBX_NOMETASYNC. * * The default is 0, which means no threshold is checked, and no additional * flush will be made. @@ -1597,6 +1606,32 @@ LIBMDBX_API int mdbx_env_close_ex(MDBX_env *env, int dont_sync); * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes); +/* Sets relative period since the last unsteady commit to force flush the data + * buffers to disk, even of MDBX_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC flags + * in the environment. The value affects all processes which operates with given + * DB until the last process close DB or a new value will be settled. + * + * Data is always written to disk when mdbx_txn_commit() is called, + * but the operating system may keep it buffered. MDBX always flushes + * the OS buffers upon commit as well, unless the environment was + * opened with MDBX_NOSYNC, MDBX_MAPASYNC or in part MDBX_NOMETASYNC. + * + * The period is not checked asynchronously, but only inside the functions + * mdbx_txn_commit() and mdbx_env_sync(). Therefore, in cases where transactions + * are committed infrequently and/or irregularly, polling by mdbx_env_sync() may + * be a reasonable solution to timeout enforcement. + * + * The default is 0, which means no timeout is checked, and no additional + * flush will be made. + * + * [in] env An environment handle returned by mdbx_env_create() + * [in] seconds_16dot16 The period in 1/65536 of second when a synchronous + * flush would be made since the last unsteady commit. + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_env_set_syncperiod(MDBX_env *env, + unsigned seconds_16dot16); + /* Returns a lag of the reading for the given transaction.
* * Returns an information for estimate how much given read-only @@ -1654,6 +1689,7 @@ LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDBX_env *env); #define MDBX_DBG_AUDIT 16 #define MDBX_DBG_JITTER 32 #define MDBX_DBG_DUMP 64 +#define MDBX_DBG_LEGACY_MULTIOPEN 128 typedef void MDBX_debug_func(int type, const char *function, int line, const char *msg, va_list args); @@ -1675,11 +1711,12 @@ typedef enum { #define MDBX_PGWALK_GC ((const char *)((ptrdiff_t)-1)) #define MDBX_PGWALK_META ((const char *)((ptrdiff_t)-2)) -typedef int MDBX_pgvisitor_func(uint64_t pgno, unsigned number, void *ctx, - int deep, const char *dbi, size_t page_size, - MDBX_page_type_t type, size_t nentries, - size_t payload_bytes, size_t header_bytes, - size_t unused_bytes); +typedef int +MDBX_pgvisitor_func(const uint64_t pgno, const unsigned number, void *const ctx, + const int deep, const char *const dbi, + const size_t page_size, const MDBX_page_type_t type, + const size_t nentries, const size_t payload_bytes, + const size_t header_bytes, const size_t unused_bytes); LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *ctx); diff --git a/libs/libmdbx/src/packages/rpm/CMakeLists.txt b/libs/libmdbx/src/packages/rpm/CMakeLists.txt index 55fbe964fb..5949e9f0ac 100644 --- a/libs/libmdbx/src/packages/rpm/CMakeLists.txt +++ b/libs/libmdbx/src/packages/rpm/CMakeLists.txt @@ -3,8 +3,8 @@ set(TARGET mdbx) project(${TARGET}) set(MDBX_VERSION_MAJOR 0) -set(MDBX_VERSION_MINOR 2) -set(MDBX_VERSION_RELEASE 0) +set(MDBX_VERSION_MINOR 3) +set(MDBX_VERSION_RELEASE 1) set(MDBX_VERSION_REVISION 0) set(MDBX_VERSION_STRING ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}.${MDBX_VERSION_RELEASE}) @@ -83,7 +83,7 @@ set(${TARGET}_SRC mdbx.h src/bits.h src/defs.h - src/lck-posix.c + src/lck-linux.c src/mdbx.c src/osal.c src/osal.h diff --git a/libs/libmdbx/src/src/bits.h b/libs/libmdbx/src/src/bits.h index ad335b6b65..fd36cac918 100644 --- a/libs/libmdbx/src/src/bits.h +++ 
b/libs/libmdbx/src/src/bits.h @@ -1,4 +1,4 @@ -/* +/* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -15,30 +15,37 @@ /* *INDENT-OFF* */ /* clang-format off */ +/* In case the MDBX_DEBUG is undefined set it corresponding to NDEBUG */ #ifndef MDBX_DEBUG +#ifdef NDEBUG # define MDBX_DEBUG 0 +#else +# define MDBX_DEBUG 1 +#endif #endif +/* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */ #if MDBX_DEBUG # undef NDEBUG #endif +#define MDBX_OSX_WANNA_DURABILITY 0 /* using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ +#define MDBX_OSX_WANNA_SPEED 1 /* using fsync() with chance of data lost on power failure */ +#ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY +#define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY +#endif + /*----------------------------------------------------------------------------*/ /* Should be defined before any includes */ -#ifndef _GNU_SOURCE -# define _GNU_SOURCE 1 -#endif -#ifndef _POSIX_C_SOURCE -# define _POSIX_C_SOURCE 200112L -#endif -#ifndef _XOPEN_SOURCE -# define _XOPEN_SOURCE 500 -#endif #ifndef _FILE_OFFSET_BITS # define _FILE_OFFSET_BITS 64 #endif +#ifdef __APPLE__ +#define _DARWIN_C_SOURCE +#endif + #ifdef _MSC_VER # if _MSC_VER < 1400 # error "Microsoft Visual C++ 8.0 (Visual Studio 2005) or later version is required" @@ -156,7 +163,7 @@ /* The version number for a database's datafile format. */ #define MDBX_DATA_VERSION 2 /* The version number for a database's lockfile format. */ -#define MDBX_LOCK_VERSION 2 +#define MDBX_LOCK_VERSION 3 /* handle for the DB used to track free pages. */ #define FREE_DBI 0 @@ -198,74 +205,6 @@ typedef uint16_t indx_t; /* Core structures for database and shared memory (i.e. format definition) */ #pragma pack(push, 1) -/* Reader Lock Table - * - * Readers don't acquire any locks for their data access. Instead, they - * simply record their transaction ID in the reader table. 
The reader - * mutex is needed just to find an empty slot in the reader table. The - * slot's address is saved in thread-specific data so that subsequent - * read transactions started by the same thread need no further locking to - * proceed. - * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. - * - * Since the database uses multi-version concurrency control, readers don't - * actually need any locking. This table is used to keep track of which - * readers are using data from which old transactions, so that we'll know - * when a particular old transaction is no longer in use. Old transactions - * that have discarded any data pages can then have those pages reclaimed - * for use by a later write transaction. - * - * The lock table is constructed such that reader slots are aligned with the - * processor's cache line size. Any slot is only ever used by one thread. - * This alignment guarantees that there will be no contention or cache - * thrashing as threads update their own slot info, and also eliminates - * any need for locking when accessing a slot. - * - * A writer thread will scan every slot in the table to determine the oldest - * outstanding reader transaction. Any freed pages older than this will be - * reclaimed by the writer. The writer doesn't use any locks when scanning - * this table. This means that there's no guarantee that the writer will - * see the most up-to-date reader info, but that's not required for correct - * operation - all we need is to know the upper bound on the oldest reader, - * we don't care at all about the newest reader. So the only consequence of - * reading stale information here is that old pages might hang around a - * while longer before being reclaimed. 
That's actually good anyway, because - * the longer we delay reclaiming old pages, the more likely it is that a - * string of contiguous pages can be found after coalescing old pages from - * many old transactions together. */ - -/* The actual reader record, with cacheline padding. */ -typedef struct MDBX_reader { - /* Current Transaction ID when this transaction began, or (txnid_t)-1. - * Multiple readers that start at the same time will probably have the - * same ID here. Again, it's not important to exclude them from - * anything; all we need to know is which version of the DB they - * started from so we can avoid overwriting any data used in that - * particular version. */ - volatile txnid_t mr_txnid; - - /* The information we store in a single slot of the reader table. - * In addition to a transaction ID, we also record the process and - * thread ID that owns a slot, so that we can detect stale information, - * e.g. threads or processes that went away without cleaning up. - * - * NOTE: We currently don't check for stale records. - * We simply re-init the table when we know that we're the only process - * opening the lock file. */ - - /* The process ID of the process owning this reader txn. */ - volatile mdbx_pid_t mr_pid; - /* The thread ID of the thread owning this txn. */ - volatile mdbx_tid_t mr_tid; - - /* cache line alignment */ - uint8_t pad[MDBX_CACHELINE_SIZE - - (sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t)) % - MDBX_CACHELINE_SIZE]; -} MDBX_reader; - /* Information about a single database in the environment. */ typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ @@ -324,6 +263,12 @@ typedef struct MDBX_meta { /* txnid that committed this page, the second of a two-phase-update pair */ volatile txnid_t mm_txnid_b; + + /* Number of non-meta pages which were put in GC after COW. May be 0 in case + * DB was previously handled by libmdbx without corresponding feature. 
+ * This value in couple with mr_snapshot_pages_retired allows fast estimation + * of "how much reader is restraining GC recycling". */ + uint64_t mm_pages_retired; } MDBX_meta; /* Common header for all page types. The page type depends on mp_flags. @@ -412,6 +357,84 @@ typedef struct MDBX_page { #define MAX_MAPSIZE MAX_MAPSIZE32 #endif /* MDBX_WORDBITS */ +#pragma pack(pop) + +/* Reader Lock Table + * + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. The + * slot's address is saved in thread-specific data so that subsequent + * read transactions started by the same thread need no further locking to + * proceed. + * + * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. + * No reader table is used if the database is on a read-only filesystem. + * + * Since the database uses multi-version concurrency control, readers don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with the + * processor's cache line size. Any slot is only ever used by one thread. + * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the oldest + * outstanding reader transaction. Any freed pages older than this will be + * reclaimed by the writer. The writer doesn't use any locks when scanning + * this table. 
This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for correct + * operation - all we need is to know the upper bound on the oldest reader, + * we don't care at all about the newest reader. So the only consequence of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages from + * many old transactions together. */ + +/* The actual reader record, with cacheline padding. */ +typedef struct MDBX_reader { + /* Current Transaction ID when this transaction began, or (txnid_t)-1. + * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. */ + volatile txnid_t mr_txnid; + + /* The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * + * NOTE: We currently don't check for stale records. + * We simply re-init the table when we know that we're the only process + * opening the lock file. */ + + /* The thread ID of the thread owning this txn. */ + union { + volatile mdbx_tid_t mr_tid; + volatile uint64_t mr_tid_u64; + }; + /* The process ID of the process owning this reader txn. */ + union { + volatile mdbx_pid_t mr_pid; + volatile uint32_t mr_pid_u32; + }; + /* The number of pages used in the reader's MVCC snapshot, + * i.e. 
the value of meta->mm_geo.next and txn->mt_next_pgno */ + volatile pgno_t mr_snapshot_pages_used; + /* Number of retired pages at the time this reader starts transaction. So, + * at any time the difference mm_pages_retired - mr_snapshot_pages_retired + * will give the number of pages which this reader restraining from reuse. */ + volatile uint64_t mr_snapshot_pages_retired; +} MDBX_reader; + /* The header for the reader table (a memory-mapped lock file). */ typedef struct MDBX_lockinfo { /* Stamp identifying this as an MDBX file. @@ -424,68 +447,62 @@ typedef struct MDBX_lockinfo { /* Flags which environment was opened. */ volatile uint32_t mti_envmode; + /* Threshold of un-synced-with-disk pages for auto-sync feature, + * zero means no-threshold, i.e. auto-sync is disabled. */ + volatile pgno_t mti_autosync_threshold; + /* Period for timed auto-sync feature, i.e. at the every steady checkpoint + * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. + * The time value is represented in a suitable system-dependent form, for + * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). + * Zero means timed auto-sync is disabled. */ + volatile uint64_t mti_autosync_period; + + alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ #ifdef MDBX_OSAL_LOCK - /* Mutex protecting write-txn. */ - union { - MDBX_OSAL_LOCK mti_wmutex; - uint8_t pad_mti_wmutex[MDBX_OSAL_LOCK_SIZE % sizeof(size_t)]; - }; + /* Mutex protecting write-txn. */ + MDBX_OSAL_LOCK mti_wmutex; #endif -#define MDBX_lockinfo_SIZE_A \ - (8 /* mti_magic_and_version */ + 4 /* mti_os_and_format */ + \ - 4 /* mti_envmode */ + MDBX_OSAL_LOCK_SIZE /* mti_wmutex */ + \ - MDBX_OSAL_LOCK_SIZE % sizeof(size_t) /* pad_mti_wmutex */) - /* cache-line alignment */ - uint8_t - pad_a[MDBX_CACHELINE_SIZE - MDBX_lockinfo_SIZE_A % MDBX_CACHELINE_SIZE]; - - /* The number of slots that have been used in the reader table. 
- * This always records the maximum count, it is not decremented - * when readers release their slots. */ - volatile unsigned mti_numreaders; + volatile txnid_t mti_oldest_reader; -#ifdef MDBX_OSAL_LOCK - /* Mutex protecting readers registration access to this table. */ - union { - MDBX_OSAL_LOCK mti_rmutex; - uint8_t pad_mti_rmutex[MDBX_OSAL_LOCK_SIZE % sizeof(size_t)]; - }; -#endif + /* Timestamp for auto-sync feature, i.e. the steady checkpoint should be + * created at the first commit that will be not early this timestamp. + * The time value is represented in a suitable system-dependent form, for + * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). + * Zero means timed auto-sync is not pending. */ + volatile uint64_t mti_unsynced_timeout; - volatile txnid_t mti_oldest; - volatile uint32_t mti_readers_refresh_flag; + /* Number un-synced-with-disk pages for auto-sync feature. */ + volatile pgno_t mti_unsynced_pages; -#define MDBX_lockinfo_SIZE_B \ - (sizeof(unsigned) /* mti_numreaders */ + \ - MDBX_OSAL_LOCK_SIZE /* mti_rmutex */ + sizeof(txnid_t) /* mti_oldest */ + \ - sizeof(uint32_t) /* mti_readers_refresh_flag */ + \ - MDBX_OSAL_LOCK_SIZE % sizeof(size_t) /* pad_mti_rmutex */) + alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ - /* cache-line alignment */ - uint8_t - pad_b[MDBX_CACHELINE_SIZE - MDBX_lockinfo_SIZE_B % MDBX_CACHELINE_SIZE]; +#ifdef MDBX_OSAL_LOCK + /* Mutex protecting readers registration access to this table. */ + MDBX_OSAL_LOCK mti_rmutex; +#endif - MDBX_reader mti_readers[1]; + /* The number of slots that have been used in the reader table. + * This always records the maximum count, it is not decremented + * when readers release their slots. 
*/ + volatile unsigned mti_numreaders; + volatile unsigned mti_readers_refresh_flag; + alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ + MDBX_reader mti_readers[1]; } MDBX_lockinfo; -#pragma pack(pop) - -#define MDBX_LOCKINFO_WHOLE_SIZE \ - ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ - ~((size_t)MDBX_CACHELINE_SIZE - 1)) - /* Lockfile format signature: version, features and field layout */ #define MDBX_LOCK_FORMAT \ - ((MDBX_OSAL_LOCK_SIGN << 16) + \ - (uint16_t)(MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1)) + (MDBX_OSAL_LOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 + \ + (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 + \ + (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 + \ + (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 29) #define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) #define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) -#define MDBX_LOCK_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) #ifndef MDBX_ASSUME_MALLOC_OVERHEAD #define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u) @@ -768,8 +785,6 @@ struct MDBX_env { unsigned me_psize2log; /* log2 of DB page size */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ - /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */ - unsigned me_close_readers; mdbx_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_dbi me_maxdbs; /* size of the DB table */ @@ -798,14 +813,24 @@ struct MDBX_env { unsigned me_maxgc_ov1page; /* Max size of a node on a page */ unsigned me_nodemax; - unsigned me_maxkey_limit; /* max size of a key */ - mdbx_pid_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ - size_t me_sync_pending; /* Total dirty/non-sync'ed bytes - * since the last mdbx_env_sync() */ - size_t 
me_sync_threshold; /* Treshold of above to force synchronous flush */ + unsigned me_maxkey_limit; /* max size of a key */ + mdbx_pid_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ + volatile uint64_t *me_unsynced_timeout; + volatile uint64_t *me_autosync_period; + volatile pgno_t *me_unsynced_pages; + volatile pgno_t *me_autosync_threshold; MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */ - txnid_t me_oldest_stub; + struct { +#ifdef MDBX_OSAL_LOCK + MDBX_OSAL_LOCK wmutex; +#endif + txnid_t oldest; + uint64_t unsynced_timeout; + uint64_t autosync_period; + pgno_t autosync_pending; + pgno_t autosync_threshold; + } me_lckless_stub; #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif @@ -826,7 +851,6 @@ struct MDBX_env { /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else - mdbx_fastmutex_t me_lckless_wmutex; mdbx_fastmutex_t me_remap_guard; #endif }; @@ -840,22 +864,16 @@ typedef struct MDBX_ntxn { /*----------------------------------------------------------------------------*/ /* Debug and Logging stuff */ +#ifndef mdbx_runtime_flags /* avoid override from tools */ extern int mdbx_runtime_flags; +#endif extern MDBX_debug_func *mdbx_debug_logger; extern txnid_t mdbx_debug_edge; void mdbx_debug_log(int type, const char *function, int line, const char *fmt, - ...) -#if defined(__GNUC__) || __has_attribute(format) - __attribute__((format(printf, 4, 5))) -#endif - ; + ...) __printf_args(4, 5); -void mdbx_panic(const char *fmt, ...) -#if defined(__GNUC__) || __has_attribute(format) - __attribute__((format(printf, 1, 2))) -#endif - ; +void mdbx_panic(const char *fmt, ...) 
__printf_args(1, 2); #if MDBX_DEBUG @@ -1212,10 +1230,23 @@ static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { } static __inline void mdbx_jitter4testing(bool tiny) { -#ifndef NDEBUG +#if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); #else (void)tiny; #endif } + +/* Controls checking PID against reuse DB environment after the fork() */ +#ifndef MDBX_TXN_CHECKPID +#if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64) +/* PID check could be ommited: + * - on Linux when madvise(MADV_DONTFORK) is available. i.e. after the fork() + * mapped pages will not be available for child process. + * - in Windows where fork() not available. */ +#define MDBX_TXN_CHECKPID 0 +#else +#define MDBX_TXN_CHECKPID 1 +#endif +#endif /* MDBX_TXN_CHECKPID */ diff --git a/libs/libmdbx/src/src/defs.h b/libs/libmdbx/src/src/defs.h index a672b8bdf0..622f1c13b6 100644 --- a/libs/libmdbx/src/src/defs.h +++ b/libs/libmdbx/src/src/defs.h @@ -104,8 +104,8 @@ /*----------------------------------------------------------------------------*/ #ifndef __always_inline -# if defined(__GNUC__) || __has_attribute(always_inline) -# define __always_inline __inline __attribute__((always_inline)) +# if defined(__GNUC__) || __has_attribute(__always_inline__) +# define __always_inline __inline __attribute__((__always_inline__)) # elif defined(_MSC_VER) # define __always_inline __forceinline # else @@ -114,8 +114,8 @@ #endif /* __always_inline */ #ifndef __noinline -# if defined(__GNUC__) || __has_attribute(noinline) -# define __noinline __attribute__((noinline)) +# if defined(__GNUC__) || __has_attribute(__noinline__) +# define __noinline __attribute__((__noinline__)) # elif defined(_MSC_VER) # define __noinline __declspec(noinline) # elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) @@ -126,16 +126,16 @@ #endif /* __noinline */ #ifndef __must_check_result -# if defined(__GNUC__) || __has_attribute(warn_unused_result) -# define __must_check_result 
__attribute__((warn_unused_result)) +# if defined(__GNUC__) || __has_attribute(__warn_unused_result__) +# define __must_check_result __attribute__((__warn_unused_result__)) # else # define __must_check_result # endif #endif /* __must_check_result */ #ifndef __deprecated -# if defined(__GNUC__) || __has_attribute(deprecated) -# define __deprecated __attribute__((deprecated)) +# if defined(__GNUC__) || __has_attribute(__deprecated__) +# define __deprecated __attribute__((__deprecated__)) # elif defined(_MSC_VER) # define __deprecated __declspec(deprecated) # else @@ -163,8 +163,8 @@ #endif /* __noop */ #ifndef __fallthrough -# if __GNUC_PREREQ(7, 0) || __has_attribute(fallthrough) -# define __fallthrough __attribute__((fallthrough)) +# if __GNUC_PREREQ(7, 0) || __has_attribute(__fallthrough__) +# define __fallthrough __attribute__((__fallthrough__)) # else # define __fallthrough __noop() # endif @@ -189,8 +189,8 @@ #endif /* __prefetch */ #ifndef __noreturn -# if defined(__GNUC__) || __has_attribute(noreturn) -# define __noreturn __attribute__((noreturn)) +# if defined(__GNUC__) || __has_attribute(__noreturn__) +# define __noreturn __attribute__((__noreturn__)) # elif defined(_MSC_VER) # define __noreturn __declspec(noreturn) # else @@ -199,8 +199,14 @@ #endif /* __noreturn */ #ifndef __nothrow -# if defined(__GNUC__) || __has_attribute(nothrow) -# define __nothrow __attribute__((nothrow)) +# if defined(__cplusplus) +# if __cplusplus < 201703L +# define __nothrow throw() +# else +# define __nothrow noexcept(true) +# endif /* __cplusplus */ +# elif defined(__GNUC__) || __has_attribute(__nothrow__) +# define __nothrow __attribute__((__nothrow__)) # elif defined(_MSC_VER) && defined(__cplusplus) # define __nothrow __declspec(nothrow) # else @@ -214,8 +220,8 @@ * Such a function can be subject to common subexpression elimination * and loop optimization just as an arithmetic operator would be. * These functions should be declared with the attribute pure. 
*/ -# if defined(__GNUC__) || __has_attribute(pure) -# define __pure_function __attribute__((pure)) +# if defined(__GNUC__) || __has_attribute(__pure__) +# define __pure_function __attribute__((__pure__)) # else # define __pure_function # endif @@ -231,27 +237,27 @@ * data pointed to must not be declared const. Likewise, a function * that calls a non-const function usually must not be const. * It does not make sense for a const function to return void. */ -# if defined(__GNUC__) || __has_attribute(const) -# define __const_function __attribute__((const)) +# if defined(__GNUC__) || __has_attribute(__const__) +# define __const_function __attribute__((__const__)) # else # define __const_function # endif #endif /* __const_function */ -#ifndef __dll_hidden -# if defined(__GNUC__) || __has_attribute(visibility) -# define __hidden __attribute__((visibility("hidden"))) +#ifndef __hidden +# if defined(__GNUC__) || __has_attribute(__visibility__) +# define __hidden __attribute__((__visibility__("hidden"))) # else # define __hidden # endif -#endif /* __dll_hidden */ +#endif /* __hidden */ #ifndef __optimize # if defined(__OPTIMIZE__) -# if defined(__clang__) && !__has_attribute(optimize) +# if defined(__clang__) && !__has_attribute(__optimize__) # define __optimize(ops) -# elif defined(__GNUC__) || __has_attribute(optimize) -# define __optimize(ops) __attribute__((optimize(ops))) +# elif defined(__GNUC__) || __has_attribute(__optimize__) +# define __optimize(ops) __attribute__((__optimize__(ops))) # else # define __optimize(ops) # endif @@ -263,12 +269,13 @@ #ifndef __hot # if defined(__OPTIMIZE__) # if defined(__e2k__) -# define __hot __attribute__((hot)) __optimize(3) -# elif defined(__clang__) && !__has_attribute(hot) +# define __hot __attribute__((__hot__)) __optimize(3) +# elif defined(__clang__) && !__has_attribute(__hot_) \ + && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__)) /* just put frequently used functions in separate section */ -# 
define __hot __attribute__((section("text.hot"))) __optimize("O3") -# elif defined(__GNUC__) || __has_attribute(hot) -# define __hot __attribute__((hot)) __optimize("O3") +# define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +# elif defined(__GNUC__) || __has_attribute(__hot__) +# define __hot __attribute__((__hot__)) __optimize("O3") # else # define __hot __optimize("O3") # endif @@ -280,12 +287,13 @@ #ifndef __cold # if defined(__OPTIMIZE__) # if defined(__e2k__) -# define __cold __attribute__((cold)) __optimize(1) -# elif defined(__clang__) && !__has_attribute(cold) +# define __cold __attribute__((__cold__)) __optimize(1) +# elif defined(__clang__) && !__has_attribute(cold) \ + && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__)) /* just put infrequently used functions in separate section */ -# define __cold __attribute__((section("text.unlikely"))) __optimize("Os") +# define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") # elif defined(__GNUC__) || __has_attribute(cold) -# define __cold __attribute__((cold)) __optimize("Os") +# define __cold __attribute__((__cold__)) __optimize("Os") # else # define __cold __optimize("Os") # endif @@ -295,8 +303,8 @@ #endif /* __cold */ #ifndef __flatten -# if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(flatten)) -# define __flatten __attribute__((flatten)) +# if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__)) +# define __flatten __attribute__((__flatten__)) # else # define __flatten # endif @@ -338,9 +346,9 @@ typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__TC__))); # define mdbx_func_ "<mdbx_unknown>" #endif -#if defined(__GNUC__) || __has_attribute(format) +#if defined(__GNUC__) || __has_attribute(__format__) #define __printf_args(format_index, first_arg) \ - __attribute__((format(printf, format_index, first_arg))) + __attribute__((__format__(printf, format_index, first_arg))) #else #define 
__printf_args(format_index, first_arg) #endif diff --git a/libs/libmdbx/src/src/lck-linux.c b/libs/libmdbx/src/src/lck-linux.c new file mode 100644 index 0000000000..925941b229 --- /dev/null +++ b/libs/libmdbx/src/src/lck-linux.c @@ -0,0 +1,428 @@ +/* + * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + */ + +#if !(defined(__linux__) || defined(__gnu_linux__)) +#error "This implementation of locking only supports Linux,\ + where is no interaction between the types of lock placed\ + by flock() and fcntl()." +#endif + +#include "./bits.h" +#include <sys/utsname.h> + +/* Some platforms define the EOWNERDEAD error code + * even though they don't support Robust Mutexes. + * Compile with -DMDBX_USE_ROBUST=0. */ +#ifndef MDBX_USE_ROBUST +/* Howard Chu: Android currently lacks Robust Mutex support */ +#if defined(EOWNERDEAD) && \ + !defined(__ANDROID__) /* LY: glibc before 2.10 has a troubles \ + with Robust Mutex too. 
*/ \ + && (!defined(__GLIBC__) || __GLIBC_PREREQ(2, 10) || \ + _POSIX_C_SOURCE >= 200809L) +#define MDBX_USE_ROBUST 1 +#else +#define MDBX_USE_ROBUST 0 +#endif +#endif /* MDBX_USE_ROBUST */ + +/*----------------------------------------------------------------------------*/ +/* global constructor/destructor */ + +uint32_t mdbx_linux_kernel_version; +static __cold __attribute__((__constructor__)) void +mdbx_global_constructor(void) { + struct utsname buffer; + if (uname(&buffer) == 0) { + int i = 0; + char *p = buffer.release; + while (*p && i < 4) { + if (*p >= '0' && *p <= '9') { + long number = strtol(p, &p, 10); + if (number > 0) { + if (number > 255) + number = 255; + mdbx_linux_kernel_version += number << (24 - i * 8); + } + ++i; + } else { + ++p; + } + } + } + + mdbx_rthc_global_init(); +} + +static __cold __attribute__((__destructor__)) void +mdbx_global_destructor(void) { + mdbx_rthc_global_dtor(); +} + +/*----------------------------------------------------------------------------*/ +/* lck */ + +/* Описание реализации блокировок для Linux: + * + * lck-файл отображается в память, в нём организуется таблица читателей и + * размещаются совместно используемые posix-мьютексы (futex). Посредством + * этих мьютексов (см struct MDBX_lockinfo) реализуются: + * - Блокировка таблицы читателей для регистрации, + * т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock(). + * - Блокировка БД для пишущих транзакций, + * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). + * + * Остальной функционал реализуется отдельно посредством файловых блокировок: + * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод + * в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade(). + * - Проверка присутствие процессов-читателей, + * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). 
+ * + * Используется два вида файловых блокировок flock() и fcntl(F_SETLK), + * как для lck-файла, так и для основного файла БД: + * - Для контроля процессов-читателей используются однобайтовые + * range-блокировки lck-файла посредством fcntl(F_SETLK). При этом + * в качестве позиции используется pid процесса-читателя. + * - Для первоначального захвата и shared/exclusive блокировок используется + * комбинация flock() и fcntl(F_SETLK) блокировки одного байта lck-файла + * в нулевой позиции (нулевая позиция не используется механизмом контроля + * процессов-читателей, так как pid пользовательского процесса в Linux + * всегда больше 0). + * - Кроме этого, flock() блокировка основного файла БД используется при работе + * в режимах без lck-файла, как в в read-only, так и в эксклюзивном. + * - Блокировки flock() и fcntl(F_SETLK) в Linux работают независимо. Поэтому + * их комбинирование позволяет предотвратить совместное использование БД + * через NFS, что позволяет fcntl(F_SETLK), одновременно защитившись + * от проблем не-аторманости flock() при переходе между эксклюзивным + * и атомарным режимами блокировок. + */ + +static int op_setlk, op_setlkw, op_getlk; +static void __cold choice_fcntl() { + assert(!op_setlk && !op_setlkw && !op_getlk); +#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) + if (mdbx_linux_kernel_version > + 0x030f0000 /* OFD locks are available since 3.15, but engages here + only for 3.16 and larer kernels (LTS) for reliability reasons */ + && (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0) { + op_setlk = F_OFD_SETLK; + op_setlkw = F_OFD_SETLKW; + op_getlk = F_OFD_GETLK; + return; + } +#endif /* OFD locks */ + op_setlk = F_SETLK; + op_setlkw = F_SETLKW; + op_getlk = F_GETLK; +} + +#ifndef OFF_T_MAX +#define OFF_T_MAX \ + ((sizeof(off_t) > 4 ? 
INT64_MAX : INT32_MAX) & ~(size_t)0xffff) +#endif +#define LCK_WHOLE OFF_T_MAX + +static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset, + off_t len) { + for (;;) { + struct flock lock_op; + memset(&lock_op, 0, sizeof(lock_op)); + lock_op.l_type = lck; + lock_op.l_whence = SEEK_SET; + lock_op.l_start = offset; + lock_op.l_len = len; + if (fcntl(fd, cmd, &lock_op) == 0) { + if (cmd == op_getlk) { + /* Checks reader by pid. Returns: + * MDBX_RESULT_TRUE - if pid is live (unable to acquire lock) + * MDBX_RESULT_FALSE - if pid is dead (lock acquired). */ + return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; + } + return 0; + } + int rc = errno; + if (rc != EINTR || cmd == op_setlkw) + return rc; + } +} + +static __inline int mdbx_lck_exclusive(int lfd, bool fallback2shared) { + assert(lfd != INVALID_HANDLE_VALUE); + if (flock(lfd, LOCK_EX | LOCK_NB)) + return errno; + int rc = mdbx_lck_op(lfd, op_setlk, F_WRLCK, 0, 1); + if (rc != 0 && fallback2shared) { + while (flock(lfd, LOCK_SH)) { + int rc = errno; + if (rc != EINTR) + return rc; + } + } + return rc; +} + +static __inline int mdbx_lck_shared(int lfd) { + assert(lfd != INVALID_HANDLE_VALUE); + while (flock(lfd, LOCK_SH)) { + int rc = errno; + if (rc != EINTR) + return rc; + } + return mdbx_lck_op(lfd, op_setlkw, F_RDLCK, 0, 1); +} + +int mdbx_lck_downgrade(MDBX_env *env, bool complete) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + return complete ? 
mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS; +} + +int mdbx_rpid_set(MDBX_env *env) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + assert(env->me_pid > 0); + return mdbx_lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); +} + +int mdbx_rpid_clear(MDBX_env *env) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + assert(env->me_pid > 0); + return mdbx_lck_op(env->me_lfd, op_setlkw, F_UNLCK, env->me_pid, 1); +} + +int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + assert(pid > 0); + return mdbx_lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); +} + +/*---------------------------------------------------------------------------*/ + +static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, + const int rc); + +int __cold mdbx_lck_init(MDBX_env *env) { + pthread_mutexattr_t ma; + int rc = pthread_mutexattr_init(&ma); + if (rc) + return rc; + + rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (rc) + goto bailout; + +#if MDBX_USE_ROBUST +#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \ + !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L + rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); +#else + rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); +#endif + if (rc) + goto bailout; +#endif /* MDBX_USE_ROBUST */ + +#if _POSIX_C_SOURCE >= 199506L && !defined(MDBX_SAFE4QEMU) + rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT); + if (rc == ENOTSUP) + rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE); + if (rc) + goto bailout; +#endif /* PTHREAD_PRIO_INHERIT */ + + rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (rc) + goto bailout; + + rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma); + if (rc) + goto bailout; + rc = pthread_mutex_init(&env->me_lck->mti_wmutex, &ma); + +bailout: + pthread_mutexattr_destroy(&ma); + return rc; +} + +void __cold mdbx_lck_destroy(MDBX_env *env) { + if (env->me_lfd != 
INVALID_HANDLE_VALUE) { + /* try get exclusive access */ + if (env->me_lck && mdbx_lck_exclusive(env->me_lfd, false) == 0) { + mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_); + int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); + if (rc == 0) + rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); + assert(rc == 0); + (void)rc; + /* file locks would be released (by kernel) + * while the me_lfd will be closed */ + } + } +} + +static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { + int rc = pthread_mutex_lock(mutex); + if (unlikely(rc != 0)) + rc = mdbx_mutex_failed(env, mutex, rc); + return rc; +} + +static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) { + int rc = pthread_mutex_trylock(mutex); + if (unlikely(rc != 0 && rc != EBUSY)) + rc = mdbx_mutex_failed(env, mutex, rc); + return (rc != EBUSY) ? rc : MDBX_BUSY; +} + +static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) { + int rc = pthread_mutex_unlock(mutex); + if (unlikely(rc != 0)) + rc = mdbx_mutex_failed(env, mutex, rc); + return rc; +} + +int mdbx_rdt_lock(MDBX_env *env) { + mdbx_trace(">>"); + int rc = mdbx_robust_lock(env, &env->me_lck->mti_rmutex); + mdbx_trace("<< rc %d", rc); + return rc; +} + +void mdbx_rdt_unlock(MDBX_env *env) { + mdbx_trace(">>"); + int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex); + mdbx_trace("<< rc %d", rc); + if (unlikely(MDBX_IS_ERROR(rc))) + mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); +} + +int mdbx_txn_lock(MDBX_env *env, bool dontwait) { + mdbx_trace(">>"); + int rc = dontwait ? mdbx_robust_trylock(env, env->me_wmutex) + : mdbx_robust_lock(env, env->me_wmutex); + mdbx_trace("<< rc %d", rc); + return MDBX_IS_ERROR(rc) ? 
rc : MDBX_SUCCESS; +} + +void mdbx_txn_unlock(MDBX_env *env) { + mdbx_trace(">>"); + int rc = mdbx_robust_unlock(env, env->me_wmutex); + mdbx_trace("<< rc %d", rc); + if (unlikely(MDBX_IS_ERROR(rc))) + mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); +} + +static int __cold internal_seize_lck(int lfd) { + assert(lfd != INVALID_HANDLE_VALUE); + + /* try exclusive access */ + int rc = mdbx_lck_exclusive(lfd, false); + if (rc == 0) + /* got exclusive */ + return MDBX_RESULT_TRUE; + if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { + /* get shared access */ + rc = mdbx_lck_shared(lfd); + if (rc == 0) { + /* got shared, try exclusive again */ + rc = mdbx_lck_exclusive(lfd, true); + if (rc == 0) + /* now got exclusive */ + return MDBX_RESULT_TRUE; + if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) + /* unable exclusive, but stay shared */ + return MDBX_RESULT_FALSE; + } + } + assert(MDBX_IS_ERROR(rc)); + return rc; +} + +int __cold mdbx_lck_seize(MDBX_env *env) { + assert(env->me_fd != INVALID_HANDLE_VALUE); + if (unlikely(op_setlk == 0)) + choice_fcntl(); + + if (env->me_lfd == INVALID_HANDLE_VALUE) { + /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ + int rc = mdbx_lck_op(env->me_fd, op_setlk, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, + LCK_WHOLE); + if (rc != 0) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); + return rc; + } + return MDBX_RESULT_TRUE; + } + + if ((env->me_flags & MDBX_RDONLY) == 0) { + /* Check that another process don't operates in without-lck mode. 
*/ + int rc = mdbx_lck_op(env->me_fd, op_setlk, F_WRLCK, env->me_pid, 1); + if (rc != 0) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + "lock-against-without-lck", rc); + return rc; + } + } + + return internal_seize_lck(env->me_lfd); +} + +static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, + const int err) { + int rc = err; +#if MDBX_USE_ROBUST + if (err == EOWNERDEAD) { + /* We own the mutex. Clean up after dead previous owner. */ + + int rlocked = (env->me_lck && mutex == &env->me_lck->mti_rmutex); + rc = MDBX_SUCCESS; + if (!rlocked) { + if (unlikely(env->me_txn)) { + /* env is hosed if the dead thread was ours */ + env->me_flags |= MDBX_FATAL_ERROR; + env->me_txn = NULL; + rc = MDBX_PANIC; + } + } + mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); + + int check_rc = mdbx_reader_check0(env, rlocked, NULL); + check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; + +#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \ + !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L + int mreco_rc = pthread_mutex_consistent_np(mutex); +#else + int mreco_rc = pthread_mutex_consistent(mutex); +#endif + check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; + + if (unlikely(mreco_rc)) + mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc)); + + rc = (rc == MDBX_SUCCESS) ? check_rc : rc; + if (MDBX_IS_ERROR(rc)) + pthread_mutex_unlock(mutex); + return rc; + } +#else + (void)mutex; +#endif /* MDBX_USE_ROBUST */ + + mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); + if (rc != EDEADLK) + env->me_flags |= MDBX_FATAL_ERROR; + return rc; +} diff --git a/libs/libmdbx/src/src/lck-posix.c b/libs/libmdbx/src/src/lck-posix.c index e8d4539a1b..3b93157980 100644 --- a/libs/libmdbx/src/src/lck-posix.c +++ b/libs/libmdbx/src/src/lck-posix.c @@ -18,11 +18,7 @@ * even though they don't support Robust Mutexes. * Compile with -DMDBX_USE_ROBUST=0. 
*/ #ifndef MDBX_USE_ROBUST -/* Howard Chu: Android currently lacks Robust Mutex support */ -#if defined(EOWNERDEAD) && \ - !defined(__ANDROID__) /* LY: glibc before 2.10 has a troubles \ - with Robust Mutex too. */ \ - && __GLIBC_PREREQ(2, 10) +#if (defined(EOWNERDEAD) || _POSIX_C_SOURCE >= 200809L) && !defined(__APPLE__) #define MDBX_USE_ROBUST 1 #else #define MDBX_USE_ROBUST 0 @@ -32,95 +28,199 @@ /*----------------------------------------------------------------------------*/ /* rthc */ -static __cold __attribute__((constructor)) void mdbx_global_constructor(void) { +static __cold __attribute__((__constructor__)) void +mdbx_global_constructor(void) { mdbx_rthc_global_init(); } -static __cold __attribute__((destructor)) void mdbx_global_destructor(void) { +static __cold __attribute__((__destructor__)) void +mdbx_global_destructor(void) { mdbx_rthc_global_dtor(); } /*----------------------------------------------------------------------------*/ /* lck */ +/* Описание реализации блокировок для POSIX: + * + * lck-файл отображается в память, в нём организуется таблица читателей и + * размещаются совместно используемые posix-мьютексы (futex). Посредством + * этих мьютексов (см struct MDBX_lockinfo) реализуются: + * - Блокировка таблицы читателей для регистрации, + * т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock(). + * - Блокировка БД для пишущих транзакций, + * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). + * + * Остальной функционал реализуется отдельно посредством файловых блокировок: + * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод + * в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade(). + * - Проверка присутствие процессов-читателей, + * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). + * + * Для блокировки файлов Используется только fcntl(F_SETLK), так как: + * - lockf() оперирует только эксклюзивной блокировкой и требует + * открытия файла в RW-режиме. 
+ * - flock() не гарантирует атомарности при смене блокировок + * и оперирует только всем файлом целиком. + * - Для контроля процессов-читателей используются однобайтовые + * range-блокировки lck-файла посредством fcntl(F_SETLK). При этом + * в качестве позиции используется pid процесса-читателя. + * - Для первоначального захвата и shared/exclusive выполняется блокировка + * основного файла БД и при успехе lck-файла. + */ + #ifndef OFF_T_MAX -#define OFF_T_MAX (sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) +#define OFF_T_MAX \ + ((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff) +#endif +#ifndef PID_T_MAX +#define PID_T_MAX INT_MAX #endif -#define LCK_WHOLE OFF_T_MAX -static int mdbx_lck_op(mdbx_filehandle_t fd, int op, short lck, off_t offset, +#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) +#define OP_SETLK F_OFD_SETLK +#define OP_SETLKW F_OFD_SETLKW +#define OP_GETLK F_OFD_GETLK +#else +#define OP_SETLK F_SETLK +#define OP_SETLKW F_SETLKW +#define OP_GETLK F_GETLK +#endif /* OFD locks */ + +static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset, off_t len) { for (;;) { - int rc; struct flock lock_op; memset(&lock_op, 0, sizeof(lock_op)); lock_op.l_type = lck; lock_op.l_whence = SEEK_SET; lock_op.l_start = offset; lock_op.l_len = len; - if ((rc = fcntl(fd, op, &lock_op)) == 0) { - if (op == F_GETLK && lock_op.l_type != F_UNLCK) - rc = -lock_op.l_pid; - } else if ((rc = errno) == EINTR) { - continue; - } - return rc; - } -} - -static __inline int mdbx_lck_exclusive(int lfd, bool fallback2shared) { - assert(lfd != INVALID_HANDLE_VALUE); - if (flock(lfd, LOCK_EX | LOCK_NB)) - return errno; - int rc = mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); - if (rc != 0 && fallback2shared) { - while (flock(lfd, LOCK_SH)) { - int rc = errno; - if (rc != EINTR) - return rc; + if (fcntl(fd, cmd, &lock_op) == 0) { + if (cmd == OP_GETLK) { + /* Checks reader by pid. 
Returns: + * MDBX_RESULT_TRUE - if pid is live (unable to acquire lock) + * MDBX_RESULT_FALSE - if pid is dead (lock acquired). */ + return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; + } + return 0; } - } - return rc; -} - -static __inline int mdbx_lck_shared(int lfd) { - assert(lfd != INVALID_HANDLE_VALUE); - while (flock(lfd, LOCK_SH)) { int rc = errno; - if (rc != EINTR) + if (rc != EINTR || cmd == F_SETLKW) return rc; } - return mdbx_lck_op(lfd, F_SETLKW, F_RDLCK, 0, 1); -} - -int mdbx_lck_downgrade(MDBX_env *env, bool complete) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - return complete ? mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS; } int mdbx_rpid_set(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid, 1); + assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); + return mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, env->me_pid, 1); } int mdbx_rpid_clear(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid, 1); + assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); + return mdbx_lck_op(env->me_lfd, OP_SETLKW, F_UNLCK, env->me_pid, 1); } -/* Checks reader by pid. - * - * Returns: - * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) - * MDBX_RESULT_FALSE, if pid is dead (lock acquired) - * or otherwise the errcode. 
*/ int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid, 1); - if (rc == 0) - return MDBX_RESULT_FALSE; - if (rc < 0 && -rc == pid) + assert(pid > 0 && pid <= PID_T_MAX); + assert(PID_T_MAX < OFF_T_MAX); + return mdbx_lck_op(env->me_lfd, OP_GETLK, F_WRLCK, pid, 1); +} + +int __cold mdbx_lck_seize(MDBX_env *env) { + assert(env->me_fd != INVALID_HANDLE_VALUE); + assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); + + if (env->me_lfd == INVALID_HANDLE_VALUE) { + /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ + int rc = mdbx_lck_op(env->me_fd, OP_SETLK, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, + OFF_T_MAX); + if (rc != 0) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); + return rc; + } return MDBX_RESULT_TRUE; + } + + /* try exclusive access */ + int rc = mdbx_lck_op(env->me_fd, OP_SETLK, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, + OFF_T_MAX); + if (rc == 0) { + continue_exclusive: + /* got dxb-exclusive, continue lck-exclusive */ + rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_WRLCK, 0, OFF_T_MAX); + if (rc == 0) { + /* got both exclusive */ + return MDBX_RESULT_TRUE; + } + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + "lck-after-dxb-exclusive", rc); + assert(MDBX_IS_ERROR(rc)); + goto bailout; + } + + if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { + rc = mdbx_lck_op(env->me_fd, OP_SETLKW, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, + env->me_pid, 1); + if (rc == 0) { + /* got dxb-shared, try again dxb-exclusive */ + rc = mdbx_lck_op(env->me_fd, OP_SETLK, + (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, 0, + OFF_T_MAX); + if (rc == 0) + goto continue_exclusive; + + /* continue lck-shared */ + rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1); + if (rc == 0) { + /* got both dxb and lck shared lock */ + return MDBX_RESULT_FALSE; + } + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck-shared", rc); + } else { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb-shared", rc); + } + assert(MDBX_IS_ERROR(rc)); + } + +bailout: + (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); + (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); + assert(MDBX_IS_ERROR(rc)); + return rc; +} + +int mdbx_lck_downgrade(MDBX_env *env, bool complete) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + int rc = mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 1, OFF_T_MAX - 1); + if (rc == 0) + rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1); + if (unlikely(rc != 0)) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck", rc); + goto bailout; + } + if (complete) { + rc = mdbx_lck_op(env->me_fd, OP_SETLK, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, + env->me_pid, 1); + if (unlikely(rc != 0)) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb", rc); + goto bailout; + } + } + return MDBX_SUCCESS; + +bailout: + (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); + (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); + assert(MDBX_IS_ERROR(rc)); return rc; } @@ -140,11 +240,7 @@ int __cold mdbx_lck_init(MDBX_env *env) { goto bailout; #if MDBX_USE_ROBUST -#if __GLIBC_PREREQ(2, 12) rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); -#else - rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); -#endif if (rc) goto bailout; #endif /* MDBX_USE_ROBUST */ @@ -172,19 +268,28 @@ bailout: } void __cold mdbx_lck_destroy(MDBX_env *env) { + /* File locks would be released (by kernel) while the file-descriptors + * will be closed. 
But to avoid false-positive EDEADLK from the kernel, + * locks should be released here explicitly with properly order. */ if (env->me_lfd != INVALID_HANDLE_VALUE) { /* try get exclusive access */ - if (env->me_lck && mdbx_lck_exclusive(env->me_lfd, false) == 0) { + if (env->me_lck && + mdbx_lck_op(env->me_fd, OP_SETLK, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, + OFF_T_MAX) == 0 && + mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, 0, OFF_T_MAX) == 0) { mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_); int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); if (rc == 0) rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); assert(rc == 0); (void)rc; - /* file locks would be released (by kernel) - * while the me_lfd will be closed */ + msync(env->me_lck, env->me_os_psize, MS_ASYNC); } + (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); } + if (env->me_fd != INVALID_HANDLE_VALUE) + (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); } static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { @@ -239,64 +344,6 @@ void mdbx_txn_unlock(MDBX_env *env) { mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); } -static int __cold internal_seize_lck(int lfd) { - assert(lfd != INVALID_HANDLE_VALUE); - - /* try exclusive access */ - int rc = mdbx_lck_exclusive(lfd, false); - if (rc == 0) - /* got exclusive */ - return MDBX_RESULT_TRUE; - if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { - /* get shared access */ - rc = mdbx_lck_shared(lfd); - if (rc == 0) { - /* got shared, try exclusive again */ - rc = mdbx_lck_exclusive(lfd, true); - if (rc == 0) - /* now got exclusive */ - return MDBX_RESULT_TRUE; - if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) - /* unable exclusive, but stay shared */ - return MDBX_RESULT_FALSE; - } - } - assert(MDBX_IS_ERROR(rc)); - return rc; -} - -int __cold mdbx_lck_seize(MDBX_env *env) { - assert(env->me_fd != INVALID_HANDLE_VALUE); - - if 
(env->me_lfd == INVALID_HANDLE_VALUE) { - /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ - int rc = mdbx_lck_op(env->me_fd, F_SETLK, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - LCK_WHOLE); - if (rc != 0) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); - return rc; - } - return MDBX_RESULT_TRUE; - } - - if ((env->me_flags & MDBX_RDONLY) == 0) { - /* Check that another process don't operates in without-lck mode. */ - int rc = mdbx_lck_op(env->me_fd, F_SETLK, F_WRLCK, env->me_pid, 1); - if (rc != 0) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, - "lock-against-without-lck", rc); - return rc; - } - } - - return internal_seize_lck(env->me_lfd); -} - -#if !__GLIBC_PREREQ(2, 12) && !defined(pthread_mutex_consistent) -#define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) -#endif - static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, const int err) { int rc = err; @@ -331,6 +378,8 @@ static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, pthread_mutex_unlock(mutex); return rc; } +#else + (void)mutex; #endif /* MDBX_USE_ROBUST */ mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); diff --git a/libs/libmdbx/src/src/lck-windows.c b/libs/libmdbx/src/src/lck-windows.c index 822ba9c293..34a6e5a5dc 100644 --- a/libs/libmdbx/src/src/lck-windows.c +++ b/libs/libmdbx/src/src/lck-windows.c @@ -1,4 +1,4 @@ -/* +/* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
@@ -102,7 +102,7 @@ static # ifdef _WIN64 const # endif - PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((section(".CRT$XLB"), used)) = mdbx_dll_callback; + PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_dll_callback; #else # error FIXME #endif @@ -179,7 +179,7 @@ void mdbx_txn_unlock(MDBX_env *env) { #define LCK_LO_OFFSET 0 #define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders) #define LCK_UP_OFFSET LCK_LO_LEN -#define LCK_UP_LEN (MDBX_LOCKINFO_WHOLE_SIZE - LCK_UP_OFFSET) +#define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET) #define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN #define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN @@ -414,7 +414,9 @@ int mdbx_lck_seize(MDBX_env *env) { assert(env->me_fd != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) - return MDBX_RESULT_TRUE /* files were must be opened non-shareable */; + return MDBX_RESULT_TRUE /* nope since files were must be opened + non-shareable */ + ; if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. 
on read-only filesystem) */ @@ -459,7 +461,8 @@ int mdbx_lck_downgrade(MDBX_env *env, bool complete) { assert(env->me_lfd != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) - return MDBX_SUCCESS /* files were must be opened non-shareable */; + return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ + ; /* 1) must be at E-E (exclusive-write) */ if (!complete) { @@ -664,6 +667,7 @@ MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; +MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; MDBX_NtFsControlFile mdbx_NtFsControlFile; static void mdbx_winnt_import(void) { @@ -688,21 +692,14 @@ static void mdbx_winnt_import(void) { mdbx_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; } - mdbx_GetFileInformationByHandleEx = - (MDBX_GetFileInformationByHandleEx)GetProcAddress( - hKernel32dll, "GetFileInformationByHandleEx"); +#define GET_KERNEL32_PROC(ENTRY) \ + mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(hKernel32dll, #ENTRY) - mdbx_GetVolumeInformationByHandleW = - (MDBX_GetVolumeInformationByHandleW)GetProcAddress( - hKernel32dll, "GetVolumeInformationByHandleW"); - - mdbx_GetFinalPathNameByHandleW = - (MDBX_GetFinalPathNameByHandleW)GetProcAddress( - hKernel32dll, "GetFinalPathNameByHandleW"); - - mdbx_SetFileInformationByHandle = - (MDBX_SetFileInformationByHandle)GetProcAddress( - hKernel32dll, "SetFileInformationByHandle"); + GET_KERNEL32_PROC(GetFileInformationByHandleEx); + GET_KERNEL32_PROC(GetVolumeInformationByHandleW); + GET_KERNEL32_PROC(GetFinalPathNameByHandleW); + GET_KERNEL32_PROC(SetFileInformationByHandle); + GET_KERNEL32_PROC(PrefetchVirtualMemory); const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); mdbx_NtFsControlFile = diff --git a/libs/libmdbx/src/src/mdbx.c b/libs/libmdbx/src/src/mdbx.c 
index 7a5e73b7f1..35d8ae1b68 100644 --- a/libs/libmdbx/src/src/mdbx.c +++ b/libs/libmdbx/src/src/mdbx.c @@ -291,7 +291,17 @@ typedef struct rthc_entry_t { static CRITICAL_SECTION rthc_critical_section; #else int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, void *dso_symbol) - __attribute__((weak)); + __attribute__((__weak__)); +#ifdef __APPLE__ /* FIXME: Thread-Local Storage destructors & DSO-unloading */ +int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, + void *dso_symbol) { + (void)dtor; + (void)obj; + (void)dso_symbol; + return -1; +} +#endif /* __APPLE__ */ + static pthread_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t mdbx_rthc_cond = PTHREAD_COND_INITIALIZER; static mdbx_thread_key_t mdbx_rthc_key; @@ -515,9 +525,9 @@ __cold void mdbx_rthc_global_dtor(void) { mdbx_thread_key_delete(key); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { - mdbx_trace("== [%i] = key %u, %p ... %p, rthc %p (%+i), " + mdbx_trace("== [%i] = key %zu, %p ... %p, rthc %p (%+i), " "rthc-pid %i, current-pid %i", - i, key, rthc_table[i].begin, rthc_table[i].end, rthc, + i, (size_t)key, rthc_table[i].begin, rthc_table[i].end, rthc, (int)(rthc - rthc_table[i].begin), rthc->mr_pid, self_pid); if (rthc->mr_pid == self_pid) { rthc->mr_pid = 0; @@ -553,8 +563,8 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, return rc; mdbx_rthc_lock(); - mdbx_trace(">> key 0x%x, rthc_count %u, rthc_limit %u", *key, rthc_count, - rthc_limit); + mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (size_t)*key, + rthc_count, rthc_limit); if (rthc_count == rthc_limit) { rthc_entry_t *new_table = mdbx_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, @@ -568,13 +578,14 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, rthc_table = new_table; rthc_limit *= 2; } - mdbx_trace("== [%i] = key %u, %p ... 
%p", rthc_count, *key, begin, end); + mdbx_trace("== [%i] = key %zu, %p ... %p", rthc_count, (size_t)*key, begin, + end); rthc_table[rthc_count].key = *key; rthc_table[rthc_count].begin = begin; rthc_table[rthc_count].end = end; ++rthc_count; - mdbx_trace("<< key 0x%x, rthc_count %u, rthc_limit %u", *key, rthc_count, - rthc_limit); + mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)*key, + rthc_count, rthc_limit); mdbx_rthc_unlock(); return MDBX_SUCCESS; @@ -587,8 +598,8 @@ bailout: __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { mdbx_thread_key_delete(key); mdbx_rthc_lock(); - mdbx_trace(">> key 0x%x, rthc_count %u, rthc_limit %u", key, rthc_count, - rthc_limit); + mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (size_t)key, + rthc_count, rthc_limit); for (unsigned i = 0; i < rthc_count; ++i) { if (key == rthc_table[i].key) { @@ -614,8 +625,8 @@ __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { } } - mdbx_trace("<< key 0x%x, rthc_count %u, rthc_limit %u", key, rthc_count, - rthc_limit); + mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, + rthc_count, rthc_limit); mdbx_rthc_unlock(); } @@ -1331,7 +1342,7 @@ const char *__cold mdbx_strerror_r(int errnum, char *buf, size_t buflen) { errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, NULL); return size ? 
buf : NULL; -#elif defined(_GNU_SOURCE) +#elif defined(_GNU_SOURCE) && defined(__GLIBC__) /* GNU-specific */ msg = strerror_r(errnum, buf, buflen); #elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) @@ -2008,7 +2019,7 @@ static __inline void mdbx_meta_update_begin(const MDBX_env *env, mdbx_assert(env, meta->mm_txnid_a < txnid && meta->mm_txnid_b < txnid); meta->mm_txnid_a = txnid; (void)env; - mdbx_coherent_barrier(); + mdbx_flush_noncoherent_cpu_writeback(); } static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta, @@ -2019,7 +2030,7 @@ static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta, mdbx_jitter4testing(true); meta->mm_txnid_b = txnid; - mdbx_coherent_barrier(); + mdbx_flush_noncoherent_cpu_writeback(); } static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, @@ -2139,7 +2150,9 @@ static __hot MDBX_meta *mdbx_meta_head(const MDBX_env *env) { static __hot txnid_t mdbx_reclaiming_detent(const MDBX_env *env) { if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC)) - return env->me_txn->mt_txnid - 1; + return likely(env->me_txn0->mt_owner == mdbx_thread_self()) + ? 
env->me_txn0->mt_txnid - 1 + : mdbx_meta_txnid_fluid(env, mdbx_meta_head(env)); return mdbx_meta_txnid_stable(env, mdbx_meta_steady(env)); } @@ -2160,13 +2173,13 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { mdbx_tassert(txn, (txn->mt_flags & MDBX_RDONLY) == 0); MDBX_env *env = txn->mt_env; const txnid_t edge = mdbx_reclaiming_detent(env); - mdbx_tassert(txn, edge <= txn->mt_txnid - 1); + mdbx_tassert(txn, edge <= txn->mt_txnid); MDBX_lockinfo *const lck = env->me_lck; - if (unlikely(env->me_lck == NULL /* exclusive mode */)) - return env->me_oldest_stub = edge; + if (unlikely(lck == NULL /* exclusive mode */)) + return env->me_lckless_stub.oldest = edge; - const txnid_t last_oldest = lck->mti_oldest; + const txnid_t last_oldest = lck->mti_oldest_reader; mdbx_tassert(txn, edge >= last_oldest); if (likely(last_oldest == edge)) return edge; @@ -2179,7 +2192,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { txnid_t oldest = edge; lck->mti_readers_refresh_flag = nothing_changed; - mdbx_coherent_barrier(); + mdbx_flush_noncoherent_cpu_writeback(); const unsigned snap_nreaders = lck->mti_numreaders; for (unsigned i = 0; i < snap_nreaders; ++i) { if (lck->mti_readers[i].mr_pid) { @@ -2195,12 +2208,38 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { if (oldest != last_oldest) { mdbx_notice("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, oldest); - mdbx_tassert(txn, oldest >= lck->mti_oldest); - lck->mti_oldest = oldest; + mdbx_tassert(txn, oldest >= lck->mti_oldest_reader); + lck->mti_oldest_reader = oldest; } return oldest; } +/* Find largest mvcc-snapshot still referenced. 
*/ +static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { + MDBX_lockinfo *const lck = env->me_lck; + if (likely(lck != NULL /* exclusive mode */)) { + const unsigned snap_nreaders = lck->mti_numreaders; + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (lck->mti_readers[i].mr_pid) { + /* mdbx_jitter4testing(true); */ + const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages_used; + const txnid_t snap_txnid = lck->mti_readers[i].mr_txnid; + mdbx_memory_barrier(); + if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages_used || + snap_txnid != lck->mti_readers[i].mr_txnid)) + goto retry; + if (largest < snap_pages && + lck->mti_oldest_reader <= /* ignore pending updates */ snap_txnid && + snap_txnid <= env->me_txn0->mt_txnid) + largest = snap_pages; + } + } + } + + return largest; +} + /* Add a page to the txn's dirty list */ static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { int (*const adder)(MDBX_DPL, pgno_t pgno, MDBX_page * page) = @@ -2252,8 +2291,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, * 2) At least on Windows 10 1803 the entire mapped section is unavailable * for short time during NtExtendSection() or VirtualAlloc() execution. * - * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! - */ + * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! 
*/ array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); array_onstack.count = 0; suspended = &array_onstack; @@ -2286,7 +2324,7 @@ bailout: env->me_dbgeo.upper = limit_bytes; if (env->me_txn) { mdbx_tassert(env->me_txn, size_pgno >= env->me_txn->mt_next_pgno); - env->me_txn->mt_end_pgno = size_pgno; + env->me_txn->mt_end_pgno = env->me_txn0->mt_end_pgno = size_pgno; } #ifdef USE_VALGRIND if (prev_mapsize != env->me_mapsize || prev_mapaddr != env->me_map) { @@ -2985,42 +3023,47 @@ __cold static int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { return rc; } - MDBX_meta *head = mdbx_meta_head(env); - if (!META_IS_STEADY(head) || env->me_sync_pending) { - - if (force || (env->me_sync_threshold && - env->me_sync_pending >= env->me_sync_threshold)) + const MDBX_meta *head = mdbx_meta_head(env); + pgno_t unsynced_pages = *env->me_unsynced_pages; + if (!META_IS_STEADY(head) || unsynced_pages) { + const pgno_t autosync_threshold = *env->me_autosync_threshold; + const uint64_t unsynced_timeout = *env->me_unsynced_timeout; + if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || + (unsynced_timeout && mdbx_osal_monotime() >= unsynced_timeout)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - if (outside_txn && - env->me_sync_pending > - pgno2bytes(env, 16 /* FIXME: define threshold */) && - (flags & MDBX_NOSYNC) == 0) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); + if (outside_txn) { + if (unsynced_pages > /* FIXME: define threshold */ 16 && + (flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0) { + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); - mdbx_txn_unlock(env); + mdbx_txn_unlock(env); - /* LY: pre-sync without holding lock to reduce latency for writer(s) */ - int rc = (flags & MDBX_WRITEMAP) - ? 
mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, - flags & MDBX_MAPASYNC) - : mdbx_filesync(env->me_fd, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + /* LY: pre-sync without holding lock to reduce latency for writer(s) */ + int rc = (flags & MDBX_WRITEMAP) + ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false) + : mdbx_filesync(env->me_fd, MDBX_SYNC_DATA); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - rc = mdbx_txn_lock(env, nonblock); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + rc = mdbx_txn_lock(env, nonblock); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - /* LY: head may be changed. */ - head = mdbx_meta_head(env); + /* LY: head and unsynced_pages may be changed. */ + head = mdbx_meta_head(env); + unsynced_pages = *env->me_unsynced_pages; + } + env->me_txn0->mt_txnid = meta_txnid(env, head, false); + mdbx_find_oldest(env->me_txn0); } - if (!META_IS_STEADY(head) || env->me_sync_pending) { - mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIuPTR, + if (!META_IS_STEADY(head) || + ((flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0 && unsynced_pages)) { + mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, container_of(head, MDBX_page, mp_data)->mp_pgno, - mdbx_durable_str(head), env->me_sync_pending); + mdbx_durable_str(head), unsynced_pages); MDBX_meta meta = *head; int rc = mdbx_sync_locked(env, flags | MDBX_SHRINK_ALLOWED, &meta); if (unlikely(rc != MDBX_SUCCESS)) { @@ -3129,18 +3172,27 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { MDBX_env *env = txn->mt_env; int rc; +#if MDBX_TXN_CHECKPID if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } +#endif /* MDBX_TXN_CHECKPID */ - STATIC_ASSERT(sizeof(MDBX_reader) == MDBX_CACHELINE_SIZE); + STATIC_ASSERT(sizeof(MDBX_reader) == 32); +#ifdef MDBX_OSAL_LOCK + STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wmutex) % MDBX_CACHELINE_SIZE == 0); + STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rmutex) % MDBX_CACHELINE_SIZE == 
0); +#else + STATIC_ASSERT( + offsetof(MDBX_lockinfo, mti_oldest_reader) % MDBX_CACHELINE_SIZE == 0); STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE == 0); +#endif STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE == 0); - pgno_t upper_pgno = 0; + pgno_t upper_limit_pgno = 0; if (flags & MDBX_TXN_RDONLY) { txn->mt_flags = MDBX_TXN_RDONLY; MDBX_reader *r = txn->mt_ro_reader; @@ -3207,17 +3259,15 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { r = &env->me_lck->mti_readers[slot]; /* Claim the reader slot, carefully since other code * uses the reader table un-mutexed: First reset the - * slot, next publish it in mtb.mti_numreaders. After + * slot, next publish it in lck->mti_numreaders. After * that, it is safe for mdbx_env_close() to touch it. * When it will be closed, we can finally claim it. */ r->mr_pid = 0; r->mr_txnid = ~(txnid_t)0; r->mr_tid = tid; - mdbx_coherent_barrier(); + mdbx_flush_noncoherent_cpu_writeback(); if (slot == nreaders) env->me_lck->mti_numreaders = ++nreaders; - if (env->me_close_readers < nreaders) - env->me_close_readers = nreaders; r->mr_pid = env->me_pid; mdbx_rdt_unlock(env); @@ -3232,14 +3282,16 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { mdbx_jitter4testing(false); const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); mdbx_jitter4testing(false); - if (r) { + if (likely(r)) { + r->mr_snapshot_pages_used = meta->mm_geo.next; r->mr_txnid = snap; mdbx_jitter4testing(false); mdbx_assert(env, r->mr_pid == mdbx_getpid()); mdbx_assert(env, r->mr_tid == mdbx_thread_self()); mdbx_assert(env, r->mr_txnid == snap); - mdbx_coherent_barrier(); + mdbx_compiler_barrier(); env->me_lck->mti_readers_refresh_flag = true; + mdbx_flush_noncoherent_cpu_writeback(); } mdbx_jitter4testing(true); @@ -3247,7 +3299,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_txnid = snap; txn->mt_next_pgno = meta->mm_geo.next; txn->mt_end_pgno = meta->mm_geo.now; - 
upper_pgno = meta->mm_geo.upper; + upper_limit_pgno = meta->mm_geo.upper; memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); txn->mt_canary = meta->mm_canary; @@ -3269,6 +3321,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { mdbx_assert(env, txn->mt_txnid >= *env->me_oldest); txn->mt_ro_reader = r; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ *env->me_oldest); } else { /* Not yet touching txn == env->me_txn0, it may be active */ mdbx_jitter4testing(false); @@ -3317,7 +3371,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { /* Moved to here to avoid a data race in read TXNs */ txn->mt_next_pgno = meta->mm_geo.next; txn->mt_end_pgno = meta->mm_geo.now; - upper_pgno = meta->mm_geo.upper; + upper_limit_pgno = meta->mm_geo.upper; } /* Setup db info */ @@ -3338,12 +3392,13 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } else { const size_t size = pgno2bytes(env, txn->mt_end_pgno); if (unlikely(size > env->me_mapsize)) { - if (upper_pgno > MAX_PAGENO || - bytes2pgno(env, pgno2bytes(env, upper_pgno)) != upper_pgno) { + if (upper_limit_pgno > MAX_PAGENO || + bytes2pgno(env, pgno2bytes(env, upper_limit_pgno)) != + upper_limit_pgno) { rc = MDBX_MAP_RESIZED; goto bailout; } - rc = mdbx_mapresize(env, txn->mt_end_pgno, upper_pgno); + rc = mdbx_mapresize(env, txn->mt_end_pgno, upper_limit_pgno); if (rc != MDBX_SUCCESS) { if (rc == MDBX_RESULT_TRUE) rc = MDBX_MAP_RESIZED; @@ -3407,8 +3462,10 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; +#if MDBX_TXN_CHECKPID if (unlikely(env->me_pid != mdbx_getpid())) env->me_flags |= MDBX_FATAL_ERROR; +#endif /* MDBX_TXN_CHECKPID */ if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; @@ -3606,10 +3663,12 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { MDBX_env *env = txn->mt_env; 
static const char *const names[] = MDBX_END_NAMES; +#if MDBX_TXN_CHECKPID if (unlikely(txn->mt_env->me_pid != mdbx_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } +#endif /* MDBX_TXN_CHECKPID */ mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO "/%" PRIaPGNO, @@ -3618,13 +3677,21 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ *env->me_oldest); if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { #if defined(_WIN32) || defined(_WIN64) if (txn->mt_flags & MDBX_SHRINK_ALLOWED) mdbx_srwlock_ReleaseShared(&env->me_remap_guard); #endif if (txn->mt_ro_reader) { + mdbx_ensure(env, /* paranoia is appropriate here */ + txn->mt_txnid == txn->mt_ro_reader->mr_txnid && + txn->mt_ro_reader->mr_txnid >= + env->me_lck->mti_oldest_reader); + txn->mt_ro_reader->mr_snapshot_pages_used = 0; txn->mt_ro_reader->mr_txnid = ~(txnid_t)0; + mdbx_memory_barrier(); env->me_lck->mti_readers_refresh_flag = true; if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) @@ -3632,7 +3699,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { txn->mt_ro_reader = NULL; } } - mdbx_coherent_barrier(); + mdbx_flush_noncoherent_cpu_writeback(); txn->mt_numdbs = 0; /* prevent further DBI activity */ txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; txn->mt_owner = 0; @@ -3741,7 +3808,7 @@ static __inline int mdbx_backlog_size(MDBX_txn *txn) { ? MDBX_PNL_SIZE(txn->mt_env->me_reclaimed_pglist) + txn->mt_loose_count : 0; - return reclaimed_and_loose + txn->mt_end_pgno - txn->mt_next_pgno; + return reclaimed_and_loose; } static __inline int mdbx_backlog_extragap(MDBX_env *env) { @@ -3754,7 +3821,9 @@ static __inline int mdbx_backlog_extragap(MDBX_env *env) { * in mdbx_page_alloc() during a deleting, when freeDB tree is unbalanced. 
*/ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { /* LY: extra page(s) for b-tree rebalancing */ - const int extra = mdbx_backlog_extragap(txn->mt_env); + const int extra = + mdbx_backlog_extragap(txn->mt_env) + + MDBX_PNL_SIZEOF(txn->mt_befree_pages) / txn->mt_env->me_maxkey_limit; if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) { mc->mc_flags &= ~C_RECLAIMING; @@ -3762,11 +3831,10 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { if (unlikely(rc)) return rc; - int backlog; - while (unlikely((backlog = mdbx_backlog_size(txn)) < extra)) { + while (unlikely(mdbx_backlog_size(txn) < extra)) { rc = mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC); if (unlikely(rc)) { - if (unlikely(rc != MDBX_NOTFOUND)) + if (rc != MDBX_NOTFOUND) return rc; break; } @@ -3777,6 +3845,20 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { return MDBX_SUCCESS; } +static void mdbx_prep_backlog_data(MDBX_txn *txn, MDBX_cursor *mc, + size_t bytes) { + const int wanna = + (int)OVPAGES(txn->mt_env, bytes) + mdbx_backlog_extragap(txn->mt_env); + if (unlikely(wanna > mdbx_backlog_size(txn))) { + mc->mc_flags &= ~C_RECLAIMING; + do { + if (mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC) != MDBX_SUCCESS) + break; + } while (wanna > mdbx_backlog_size(txn)); + mc->mc_flags |= C_RECLAIMING; + } +} + /* Count all the pages in each DB and in the freelist and make sure * it matches the actual number of pages being used. * All named DBs must be open for a correct count. 
*/ @@ -4108,7 +4190,7 @@ retry: mc.mc_flags &= ~C_RECLAIMING; rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); mc.mc_flags |= C_RECLAIMING; - if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND)) + if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } /* Write to last page of freeDB */ @@ -4116,6 +4198,7 @@ retry: key.iov_base = &txn->mt_txnid; do { data.iov_len = MDBX_PNL_SIZEOF(txn->mt_befree_pages); + mdbx_prep_backlog_data(txn, &mc, data.iov_len); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -4325,6 +4408,7 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, settled + 1, settled + chunk + 1, reservation_gc_id); + mdbx_prep_backlog_data(txn, &mc, data.iov_len); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); if (unlikely(rc != MDBX_SUCCESS)) @@ -4422,9 +4506,10 @@ retry: if (unlikely(chunk > left)) { mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, left, fill_gc_id); - if (loop < 5 || chunk - left > env->me_maxgc_ov1page) { + if ((loop < 5 && chunk - left > loop / 2) || + chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); - if (loop < 21) + if (loop < 7) mc.mc_flags &= ~C_GCFREEZE; } chunk = left; @@ -4545,8 +4630,7 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) { } dp->mp_flags &= ~P_DIRTY; dp->mp_validator = 0 /* TODO */; - env->me_sync_pending += - IS_OVERFLOW(dp) ? pgno2bytes(env, dp->mp_pages) : env->me_psize; + *env->me_unsynced_pages += IS_OVERFLOW(dp) ? dp->mp_pages : 1; } goto done; } @@ -4567,8 +4651,9 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) { dp->mp_flags &= ~P_DIRTY; dp->mp_validator = 0 /* TODO */; pos = pgno2bytes(env, pgno); - size = IS_OVERFLOW(dp) ? 
pgno2bytes(env, dp->mp_pages) : env->me_psize; - env->me_sync_pending += size; + const unsigned npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1; + *env->me_unsynced_pages += npages; + size = pgno2bytes(env, npages); } /* Write up to MDBX_COMMIT_PAGES dirty pages at a time. */ if (pos != next_pos || n == MDBX_COMMIT_PAGES || wsize + size > MAX_WRITE) { @@ -4579,6 +4664,17 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) { mdbx_debug("Write error: %s", mdbx_strerror(rc)); return rc; } + +#if MDBX_CPU_CACHE_MMAP_NONCOHERENT +#if defined(__linux__) || defined(__gnu_linux__) + if (linux_kernel_version >= 0x02060b00) + /* Linux kernels older than version 2.6.11 ignore the addr and nbytes + * arguments, making this function fairly expensive. Therefore, the + * whole cache is always flushed. */ +#endif /* Linux */ + mdbx_invalidate_mmap_noncoherent_cache(env->me_map + wpos, wsize); +#endif /* MDBX_CPU_CACHE_MMAP_NONCOHERENT */ + n = 0; } if (i > pagecount) @@ -4594,7 +4690,16 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) { n++; } - mdbx_invalidate_cache(env->me_map, pgno2bytes(env, txn->mt_next_pgno)); +#if MDBX_CPU_CACHE_MMAP_NONCOHERENT && \ + (defined(__linux__) || defined(__gnu_linux__)) + if (linux_kernel_version < 0x02060b00) { + /* Linux kernels older than version 2.6.11 ignore the addr and nbytes + * arguments, making this function fairly expensive. Therefore, the whole + * cache is always flushed. 
*/ + mdbx_invalidate_mmap_noncoherent_cache(env->me_map, + pgno2bytes(env, txn->mt_next_pgno)); + } +#endif /* MDBX_CPU_CACHE_MMAP_NONCOHERENT && Linux */ for (i = keep; ++i <= pagecount;) { dp = dl[i].ptr; @@ -4665,10 +4770,12 @@ int mdbx_txn_commit(MDBX_txn *txn) { return MDBX_THREAD_MISMATCH; MDBX_env *env = txn->mt_env; +#if MDBX_TXN_CHECKPID if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } +#endif /* MDBX_TXN_CHECKPID */ if (txn->mt_child) { rc = mdbx_txn_commit(txn->mt_child); @@ -5281,12 +5388,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); - mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0); + mdbx_assert(env, !META_IS_STEADY(head) || *env->me_unsynced_pages != 0); mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); - const size_t usedbytes = pgno_align2os_bytes(env, pending->mm_geo.next); - if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ + if (flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) { + /* Check auto-sync conditions */ + const pgno_t autosync_threshold = *env->me_autosync_threshold; + const uint64_t unsynced_timeout = *env->me_unsynced_timeout; + if ((autosync_threshold && *env->me_unsynced_pages >= autosync_threshold) || + (unsynced_timeout && mdbx_osal_monotime() >= unsynced_timeout)) + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ + } /* LY: check conditions to shrink datafile */ const pgno_t backlog_gap = @@ -5295,51 +5407,68 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink && pending->mm_geo.now - pending->mm_geo.next > pending->mm_geo.shrink + backlog_gap) { - const pgno_t aligner = - pending->mm_geo.grow ? 
pending->mm_geo.grow : pending->mm_geo.shrink; - const pgno_t with_backlog_gap = pending->mm_geo.next + backlog_gap; - const pgno_t aligned = pgno_align2os_pgno( - env, with_backlog_gap + aligner - with_backlog_gap % aligner); - const pgno_t bottom = - (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; - if (pending->mm_geo.now > bottom) { - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ - shrink = pending->mm_geo.now - bottom; - pending->mm_geo.now = bottom; - if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) - mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); + const pgno_t largest = mdbx_find_largest( + env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next + : pending->mm_geo.next); + if (pending->mm_geo.now > largest && + pending->mm_geo.now - largest > pending->mm_geo.shrink + backlog_gap) { + const pgno_t aligner = + pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; + const pgno_t with_backlog_gap = largest + backlog_gap; + const pgno_t aligned = pgno_align2os_pgno( + env, with_backlog_gap + aligner - with_backlog_gap % aligner); + const pgno_t bottom = + (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; + if (pending->mm_geo.now > bottom) { + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ + shrink = pending->mm_geo.now - bottom; + pending->mm_geo.now = bottom; + if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) + mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); + } } } /* LY: step#1 - sync previously written/updated data-pages */ - int rc = MDBX_RESULT_TRUE; - if (env->me_sync_pending && (flags & MDBX_NOSYNC) == 0) { + int rc = *env->me_unsynced_pages ? 
MDBX_RESULT_TRUE /* carry non-steady */ + : MDBX_RESULT_FALSE /* carry steady */; + if (rc != MDBX_RESULT_FALSE && (flags & MDBX_NOSYNC) == 0) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - MDBX_meta *const steady = mdbx_meta_steady(env); + MDBX_meta *const recent_steady_meta = mdbx_meta_steady(env); if (flags & MDBX_WRITEMAP) { + const size_t usedbytes = pgno_align2os_bytes(env, pending->mm_geo.next); rc = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, flags & MDBX_MAPASYNC); if (unlikely(rc != MDBX_SUCCESS)) goto fail; + rc = MDBX_RESULT_TRUE /* carry non-steady */; if ((flags & MDBX_MAPASYNC) == 0) { - if (unlikely(pending->mm_geo.next > steady->mm_geo.now)) { - rc = mdbx_filesize_sync(env->me_fd); + if (unlikely(pending->mm_geo.next > recent_steady_meta->mm_geo.now)) { + rc = mdbx_filesync(env->me_fd, MDBX_SYNC_SIZE); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - env->me_sync_pending = 0; + rc = MDBX_RESULT_FALSE /* carry steady */; } } else { - rc = mdbx_filesync(env->me_fd, pending->mm_geo.next > steady->mm_geo.now); + rc = mdbx_filesync(env->me_fd, + (pending->mm_geo.next > recent_steady_meta->mm_geo.now) + ? MDBX_SYNC_DATA | MDBX_SYNC_SIZE + : MDBX_SYNC_DATA); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - env->me_sync_pending = 0; } } /* Steady or Weak */ - if (env->me_sync_pending == 0) { + if (rc == MDBX_RESULT_FALSE /* carry steady */) { pending->mm_datasync_sign = mdbx_meta_sign(pending); + *env->me_unsynced_pages = 0; + *env->me_unsynced_timeout = 0; } else { + assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); + const uint64_t autosync_period = *env->me_autosync_period; + if (autosync_period && *env->me_unsynced_timeout == 0) + *env->me_unsynced_timeout = mdbx_osal_monotime() + autosync_period; pending->mm_datasync_sign = (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC ? 
MDBX_DATASIGN_NONE @@ -5425,7 +5554,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; target->mm_canary = pending->mm_canary; mdbx_jitter4testing(true); - mdbx_coherent_barrier(); + mdbx_flush_noncoherent_cpu_writeback(); /* LY: 'commit' the meta */ mdbx_meta_update_end(env, target, pending->mm_txnid_b); @@ -5444,7 +5573,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, sizeof(head->mm_canary)) == 0); } target->mm_datasync_sign = pending->mm_datasync_sign; - mdbx_coherent_barrier(); + mdbx_flush_noncoherent_cpu_writeback(); mdbx_jitter4testing(true); } else { rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), @@ -5458,7 +5587,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, (uint8_t *)target - env->me_map); goto fail; } - mdbx_invalidate_cache(target, sizeof(MDBX_meta)); + mdbx_invalidate_mmap_noncoherent_cache(target, sizeof(MDBX_meta)); } /* LY: step#3 - sync meta-pages. */ @@ -5477,7 +5606,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (unlikely(rc != MDBX_SUCCESS)) goto fail; } else { - rc = mdbx_filesync(env->me_fd, false); + rc = mdbx_filesync(env->me_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (rc != MDBX_SUCCESS) goto undo; } @@ -5582,7 +5711,7 @@ int __cold mdbx_env_create(MDBX_env **penv) { mdbx_fastmutex_destroy(&env->me_dbi_lock); goto bailout; } - rc = mdbx_fastmutex_init(&env->me_lckless_wmutex); + rc = mdbx_fastmutex_init(&env->me_lckless_stub.wmutex); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_fastmutex_destroy(&env->me_remap_guard); mdbx_fastmutex_destroy(&env->me_dbi_lock); @@ -5601,14 +5730,15 @@ bailout: return rc; } -static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) { +static int __cold mdbx_env_map(MDBX_env *env, const int is_exclusive, + const size_t usedsize) { int rc = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, env->me_dbgeo.upper); if (unlikely(rc != MDBX_SUCCESS)) return rc; #ifdef 
MADV_DONTFORK - if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK)) + if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_DONTFORK) != 0)) return errno; #endif @@ -5624,22 +5754,81 @@ static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) { MADV_DONTDUMP); #endif -#ifdef MADV_REMOVE - if (usedsize && (env->me_flags & MDBX_WRITEMAP)) { - (void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize, - MADV_REMOVE); - } + if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0) { +#ifdef MADV_REMOVE_OR_FREE + const size_t used_alined2os = mdbx_roundup2(usedsize, env->me_os_psize); + if (used_alined2os < env->me_mapsize) + (void)madvise(env->me_map + used_alined2os, + env->me_mapsize - used_alined2os, MADV_REMOVE_OR_FREE); #else - (void)usedsize; + (void)usedsize; #endif + } -#if defined(MADV_RANDOM) && defined(MADV_WILLNEED) - /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ - if (madvise(env->me_map, env->me_mapsize, - (env->me_flags & MDBX_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) +#ifdef POSIX_FADV_RANDOM + /* this also checks that the file size is valid for a particular FS */ + rc = posix_fadvise(env->me_fd, 0, env->me_dbgeo.upper, POSIX_FADV_RANDOM); + if (unlikely(rc != 0)) + return rc; +#elif defined(F_RDAHEAD) + if (unlikely(fcntl(env->me_fd, F_RDAHEAD, 0) == -1)) + return errno; +#endif + +#if defined(MADV_RANDOM) + if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_RANDOM) != 0)) + return errno; +#elif defined(POSIX_MADV_RANDOM) + rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); + if (unlikely(rc != 0)) return errno; #endif + /* Turn on/off readahead. It's harmful when the DB is larger than RAM. 
*/ + if (env->me_flags & MDBX_NORDAHEAD) { +#ifdef POSIX_FADV_DONTNEED + rc = posix_fadvise(env->me_fd, 0, env->me_mapsize, POSIX_FADV_DONTNEED); + if (unlikely(rc != 0)) + return rc; +#endif +#if defined(MADV_DONTNEED) + if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_DONTNEED) != 0)) + return errno; +#elif defined(POSIX_MADV_DONTNEED) + rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_DONTNEED); + if (unlikely(rc != 0)) + return errno; +#endif + } else { +#ifdef POSIX_FADV_WILLNEED + rc = posix_fadvise(env->me_fd, 0, usedsize, POSIX_FADV_WILLNEED); + if (unlikely(rc != 0)) + return rc; +#elif defined(F_RDADVISE) + struct radvisory hint; + hint.ra_offset = 0; + hint.ra_count = usedsize; + if (unlikely(fcntl(env->me_fd, F_RDADVISE, &hint) == -1)) + return errno; +#endif +#if defined(MADV_WILLNEED) + if (unlikely(madvise(env->me_map, usedsize, MADV_WILLNEED) != 0)) + return errno; +#elif defined(POSIX_MADV_WILLNEED) + rc = posix_madvise(env->me_map, usedsize, POSIX_MADV_WILLNEED); + if (unlikely(rc != 0)) + return errno; +#endif +#if defined(_WIN32) || defined(_WIN64) + if (mdbx_PrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY hint; + hint.VirtualAddress = env->me_map; + hint.NumberOfBytes = usedsize; + (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); + } +#endif /* Windows */ + } + #ifdef USE_VALGRIND env->me_valgrind_handle = VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "mdbx"); @@ -5658,8 +5847,10 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; +#if MDBX_TXN_CHECKPID if (unlikely(env->me_pid != mdbx_getpid())) env->me_flags |= MDBX_FATAL_ERROR; +#endif /* MDBX_TXN_CHECKPID */ if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; @@ -5688,6 +5879,10 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, need_unlock = true; } MDBX_meta *head = mdbx_meta_head(env); + if 
(!inside_txn) { + env->me_txn0->mt_txnid = meta_txnid(env, head, false); + mdbx_find_oldest(env->me_txn0); + } if (pagesize < 0) pagesize = env->me_psize; @@ -5707,7 +5902,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (shrink_threshold < 0) shrink_threshold = pgno2bytes(env, head->mm_geo.shrink); - const size_t usedbytes = pgno2bytes(env, head->mm_geo.next); + const size_t usedbytes = + pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next)); if ((size_t)size_upper < usedbytes) { rc = MDBX_MAP_FULL; goto bailout; @@ -5832,6 +6028,15 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (bytes2pgno(env, shrink_threshold) > UINT16_MAX) shrink_threshold = pgno2bytes(env, UINT16_MAX); +#ifdef POSIX_FADV_RANDOM + if (env->me_fd != INVALID_HANDLE_VALUE) { + /* this also checks that the file size is valid for a particular FS */ + rc = posix_fadvise(env->me_fd, 0, env->me_dbgeo.upper, POSIX_FADV_RANDOM); + if (unlikely(rc != 0)) + goto bailout; + } +#endif + /* save user's geo-params for future open/create */ env->me_dbgeo.lower = size_lower; env->me_dbgeo.now = size_now; @@ -5900,7 +6105,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, goto bailout; head = /* base address could be changed */ mdbx_meta_head(env); } - env->me_sync_pending += env->me_psize; + *env->me_unsynced_pages += 1; mdbx_meta_set_txnid(env, &meta, mdbx_meta_txnid_stable(env, head) + 1); rc = mdbx_sync_locked(env, env->me_flags, &meta); } @@ -5964,7 +6169,7 @@ int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) { } /* Further setup required for opening an MDBX environment */ -static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { +static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { uint64_t filesize_before_mmap; MDBX_meta meta; int rc = MDBX_RESULT_FALSE; @@ -6133,9 +6338,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { } } - err = 
mdbx_env_map(env, (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) - ? 0 - : expected_bytes); + err = mdbx_env_map(env, lck_rc /* exclusive status */, expected_bytes); if (err != MDBX_SUCCESS) return err; @@ -6201,7 +6404,8 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { if (err) return err; - mdbx_invalidate_cache(env->me_map, pgno2bytes(env, NUM_METAS)); + mdbx_invalidate_mmap_noncoherent_cache(env->me_map, + pgno2bytes(env, NUM_METAS)); mdbx_ensure(env, undo_txnid == mdbx_meta_txnid_fluid(env, head)); mdbx_ensure(env, 0 == mdbx_meta_eq_mask(env)); continue; @@ -6261,7 +6465,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { mdbx_ensure(env, mdbx_meta_eq(env, &meta, head)); mdbx_meta_set_txnid(env, &meta, txnid + 1); - env->me_sync_pending += env->me_psize; + *env->me_unsynced_pages += 1; err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); if (err) { mdbx_info("error %d, while updating meta.geo: " @@ -6302,10 +6506,14 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, if (MDBX_IS_ERROR(rc)) return rc; - env->me_oldest = &env->me_oldest_stub; + env->me_oldest = &env->me_lckless_stub.oldest; + env->me_unsynced_timeout = &env->me_lckless_stub.unsynced_timeout; + env->me_autosync_period = &env->me_lckless_stub.autosync_period; + env->me_unsynced_pages = &env->me_lckless_stub.autosync_pending; + env->me_autosync_threshold = &env->me_lckless_stub.autosync_threshold; env->me_maxreaders = UINT_MAX; #ifdef MDBX_OSAL_LOCK - env->me_wmutex = &env->me_lckless_wmutex; + env->me_wmutex = &env->me_lckless_stub.wmutex; #endif mdbx_debug("lck-setup:%s%s%s", " lck-less", (env->me_flags & MDBX_RDONLY) ? 
" readonly" : "", @@ -6396,8 +6604,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC; env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT; } else { - if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC && - env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC_DEVEL) { + if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { mdbx_error("lock region has invalid magic/version"); return ((env->me_lck->mti_magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID @@ -6411,7 +6618,11 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, } mdbx_assert(env, !MDBX_IS_ERROR(rc)); - env->me_oldest = &env->me_lck->mti_oldest; + env->me_oldest = &env->me_lck->mti_oldest_reader; + env->me_unsynced_timeout = &env->me_lck->mti_unsynced_timeout; + env->me_autosync_period = &env->me_lck->mti_autosync_period; + env->me_unsynced_pages = &env->me_lck->mti_unsynced_pages; + env->me_autosync_threshold = &env->me_lck->mti_autosync_threshold; #ifdef MDBX_OSAL_LOCK env->me_wmutex = &env->me_lck->mti_wmutex; #endif @@ -6658,6 +6869,10 @@ static void __cold mdbx_env_close0(MDBX_env *env) { if (env->me_lck) mdbx_munmap(&env->me_lck_mmap); env->me_oldest = nullptr; + env->me_unsynced_timeout = nullptr; + env->me_autosync_period = nullptr; + env->me_unsynced_pages = nullptr; + env->me_autosync_threshold = nullptr; mdbx_lck_destroy(env); if (env->me_lfd != INVALID_HANDLE_VALUE) { @@ -6673,6 +6888,7 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { if (unlikely(!env)) return MDBX_EINVAL; + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; @@ -6712,11 +6928,14 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { DeleteCriticalSection(&env->me_windowsbug_lock); #else mdbx_ensure(env, - mdbx_fastmutex_destroy(&env->me_lckless_wmutex) == MDBX_SUCCESS); - mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); #endif /* Windows */ 
+#ifdef MDBX_OSAL_LOCK + mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_lckless_stub.wmutex) == + MDBX_SUCCESS); +#endif + env->me_pid = 0; env->me_signature = 0; mdbx_free(env); @@ -11253,7 +11472,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, } #ifndef MDBX_WBUF -#define MDBX_WBUF (1024 * 1024) +#define MDBX_WBUF ((size_t)1024 * 1024) #endif #define MDBX_EOF 0x10 /* mdbx_env_copyfd1() is done reading */ @@ -11281,23 +11500,7 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { uint8_t *ptr; int toggle = 0; int rc; - -#if defined(F_SETNOSIGPIPE) - /* OS X delivers SIGPIPE to the whole process, not the thread that caused it. - * Disable SIGPIPE using platform specific fcntl. */ - int enabled = 1; - if (fcntl(my->mc_fd, F_SETNOSIGPIPE, &enabled)) - my->mc_error = errno; -#endif - -#if defined(SIGPIPE) && !defined(_WIN32) && !defined(_WIN64) - sigset_t set; - sigemptyset(&set); - sigaddset(&set, SIGPIPE); - rc = pthread_sigmask(SIG_BLOCK, &set, NULL); - if (rc != 0) - my->mc_error = rc; -#endif + size_t offset = pgno2bytes(my->mc_env, NUM_METAS); mdbx_condmutex_lock(&my->mc_condmutex); while (!my->mc_error) { @@ -11309,18 +11512,12 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { ptr = my->mc_wbuf[toggle]; again: if (wsize > 0 && !my->mc_error) { - rc = mdbx_write(my->mc_fd, ptr, wsize); + rc = mdbx_pwrite(my->mc_fd, ptr, wsize, offset); if (rc != MDBX_SUCCESS) { -#if defined(SIGPIPE) && !defined(_WIN32) && !defined(_WIN64) - if (rc == EPIPE) { - /* Collect the pending SIGPIPE, otherwise (at least OS X) - * gives it to the process on thread-exit (ITS#8504). 
*/ - int tmp; - sigwait(&set, &tmp); - } -#endif my->mc_error = rc; + break; } + offset += wsize; } /* If there's an overflow page tail, write it too */ @@ -11654,25 +11851,47 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, return rc; } + mdbx_jitter4testing(false); + const size_t meta_bytes = pgno2bytes(env, NUM_METAS); /* Make a snapshot of meta-pages, * but writing ones after the data was flushed */ - memcpy(buffer, env->me_map, pgno2bytes(env, NUM_METAS)); + memcpy(buffer, env->me_map, meta_bytes); MDBX_meta *const headcopy = /* LY: get pointer to the spanshot copy */ (MDBX_meta *)(buffer + ((uint8_t *)mdbx_meta_head(env) - env->me_map)); - const uint64_t size = - mdbx_roundup2(pgno2bytes(env, headcopy->mm_geo.now), env->me_os_psize); - mdbx_txn_unlock(env); - /* Update signature to steady */ headcopy->mm_datasync_sign = mdbx_meta_sign(headcopy); + mdbx_txn_unlock(env); /* Copy the data */ - rc = mdbx_pwrite(fd, env->me_map + pgno2bytes(env, NUM_METAS), - pgno2bytes(env, read_txn->mt_next_pgno - NUM_METAS), - pgno2bytes(env, NUM_METAS)); + const uint64_t whole_size = + mdbx_roundup2(pgno2bytes(env, read_txn->mt_end_pgno), env->me_os_psize); + const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno); + mdbx_jitter4testing(false); +#if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE) + for (off_t in_offset = meta_bytes; in_offset < (off_t)used_size;) { + off_t out_offset = in_offset; + ssize_t bytes_copied = copy_file_range( + env->me_fd, &in_offset, fd, &out_offset, used_size - in_offset, 0); + if (unlikely(bytes_copied <= 0)) { + rc = bytes_copied ? errno : MDBX_ENODATA; + break; + } + } +#else + uint8_t *data_buffer = buffer + meta_bytes; + for (size_t offset = meta_bytes; offset < used_size;) { + const size_t chunk = + (MDBX_WBUF < used_size - offset) ? 
MDBX_WBUF : used_size - offset; + memcpy(data_buffer, env->me_map + offset, chunk); + rc = mdbx_pwrite(fd, data_buffer, chunk, offset); + if (unlikely(rc != MDBX_SUCCESS)) + break; + offset += chunk; + } +#endif - if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_ftruncate(fd, size); + if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) + rc = mdbx_ftruncate(fd, whole_size); return rc; } @@ -11689,8 +11908,10 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, if (unlikely(rc != MDBX_SUCCESS)) return rc; - const size_t buffer_size = pgno2bytes(env, NUM_METAS) + - ((flags & MDBX_CP_COMPACT) ? MDBX_WBUF * 2 : 0); + const size_t buffer_size = + pgno2bytes(env, NUM_METAS) + + ((flags & MDBX_CP_COMPACT) ? MDBX_WBUF * 2 : MDBX_WBUF); + uint8_t *buffer = NULL; rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); if (unlikely(rc != MDBX_SUCCESS)) @@ -11708,7 +11929,7 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, /* Firstly write a stub to meta-pages. * Now we sure to incomplete copy will not be used. 
*/ memset(buffer, -1, pgno2bytes(env, NUM_METAS)); - rc = mdbx_write(fd, buffer, pgno2bytes(env, NUM_METAS)); + rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); if (likely(rc == MDBX_SUCCESS)) { memset(buffer, 0, pgno2bytes(env, NUM_METAS)); rc = (flags & MDBX_CP_COMPACT) @@ -11718,12 +11939,15 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, mdbx_txn_abort(read_txn); if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_filesync(fd, true); + rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); /* Write actual meta */ if (likely(rc == MDBX_SUCCESS)) rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + mdbx_memalign_free(buffer); return rc; } @@ -11782,6 +12006,12 @@ int __cold mdbx_env_copy(MDBX_env *env, const char *dest_path, unsigned flags) { } int __cold mdbx_env_set_flags(MDBX_env *env, unsigned flags, int onoff) { + if (unlikely(!env)) + return MDBX_EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + if (unlikely(flags & ~CHANGEABLE)) return MDBX_EINVAL; @@ -11802,6 +12032,9 @@ int __cold mdbx_env_get_flags(MDBX_env *env, unsigned *arg) { if (unlikely(!env || !arg)) return MDBX_EINVAL; + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + *arg = env->me_flags & (CHANGEABLE | CHANGELESS); return MDBX_SUCCESS; } @@ -11809,6 +12042,10 @@ int __cold mdbx_env_get_flags(MDBX_env *env, unsigned *arg) { int __cold mdbx_env_set_userctx(MDBX_env *env, void *ctx) { if (unlikely(!env)) return MDBX_EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + env->me_userctx = ctx; return MDBX_SUCCESS; } @@ -11820,6 +12057,10 @@ void *__cold mdbx_env_get_userctx(MDBX_env *env) { int __cold mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { if (unlikely(!env)) return MDBX_EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + 
return MDBX_EBADSIGN; + #if MDBX_DEBUG env->me_assert_func = func; return MDBX_SUCCESS; @@ -11833,6 +12074,9 @@ int __cold mdbx_env_get_path(MDBX_env *env, const char **arg) { if (unlikely(!env || !arg)) return MDBX_EINVAL; + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + *arg = env->me_path; return MDBX_SUCCESS; } @@ -11841,6 +12085,9 @@ int __cold mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *arg) { if (unlikely(!env || !arg)) return MDBX_EINVAL; + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + *arg = env->me_fd; return MDBX_SUCCESS; } @@ -11850,7 +12097,8 @@ int __cold mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *arg) { * [in] db the MDBX_db record containing the stats to return. * [out] arg the address of an MDBX_stat structure to receive the stats. * Returns 0, this function always succeeds. */ -static int __cold mdbx_stat0(MDBX_env *env, MDBX_db *db, MDBX_stat *arg) { +static int __cold mdbx_stat0(const MDBX_env *env, const MDBX_db *db, + MDBX_stat *arg) { arg->ms_psize = env->me_psize; arg->ms_depth = db->md_depth; arg->ms_branch_pages = db->md_branch_pages; @@ -11861,54 +12109,108 @@ static int __cold mdbx_stat0(MDBX_env *env, MDBX_db *db, MDBX_stat *arg) { } int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *arg, size_t bytes) { - MDBX_meta *meta; + return mdbx_env_stat2(env, NULL, arg, bytes); +} - if (unlikely(env == NULL || arg == NULL)) +int __cold mdbx_env_stat2(const MDBX_env *env, const MDBX_txn *txn, + MDBX_stat *arg, size_t bytes) { + if (unlikely((env == NULL && txn == NULL) || arg == NULL)) return MDBX_EINVAL; + + if (txn) { + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + } + if (env) { + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + if (txn && unlikely(txn->mt_env != env)) + return MDBX_EINVAL; + } + if (unlikely(bytes != 
sizeof(MDBX_stat))) return MDBX_EINVAL; - meta = mdbx_meta_head(env); - return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); + const MDBX_db *db = + txn ? &txn->mt_dbs[MAIN_DBI] : &mdbx_meta_head(env)->mm_dbs[MAIN_DBI]; + return mdbx_stat0(txn ? txn->mt_env : env, db, arg); } int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { + return mdbx_env_info2(env, NULL, arg, bytes); +} - if (unlikely(env == NULL || arg == NULL)) +int __cold mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *arg, size_t bytes) { + if (unlikely((env == NULL && txn == NULL) || arg == NULL)) return MDBX_EINVAL; - if (bytes != sizeof(MDBX_envinfo)) + if (txn) { + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + } + if (env) { + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + if (txn && unlikely(txn->mt_env != env)) + return MDBX_EINVAL; + } else { + env = txn->mt_env; + } + + if (unlikely(bytes != sizeof(MDBX_envinfo))) return MDBX_EINVAL; const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); - const MDBX_meta *meta; - do { - meta = mdbx_meta_head(env); - arg->mi_recent_txnid = mdbx_meta_txnid_fluid(env, meta); + while (1) { + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) + return MDBX_PANIC; + + const MDBX_meta *const recent_meta = mdbx_meta_head(env); + arg->mi_recent_txnid = mdbx_meta_txnid_fluid(env, recent_meta); arg->mi_meta0_txnid = mdbx_meta_txnid_fluid(env, meta0); arg->mi_meta0_sign = meta0->mm_datasync_sign; arg->mi_meta1_txnid = mdbx_meta_txnid_fluid(env, meta1); arg->mi_meta1_sign = meta1->mm_datasync_sign; arg->mi_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2); arg->mi_meta2_sign = meta2->mm_datasync_sign; - arg->mi_last_pgno = meta->mm_geo.next - 1; - arg->mi_geo.lower = pgno2bytes(env, 
meta->mm_geo.lower); - arg->mi_geo.upper = pgno2bytes(env, meta->mm_geo.upper); - arg->mi_geo.current = pgno2bytes(env, meta->mm_geo.now); - arg->mi_geo.shrink = pgno2bytes(env, meta->mm_geo.shrink); - arg->mi_geo.grow = pgno2bytes(env, meta->mm_geo.grow); + + const MDBX_meta *txn_meta = recent_meta; + arg->mi_last_pgno = txn_meta->mm_geo.next - 1; + arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); + if (txn) { + arg->mi_last_pgno = txn->mt_next_pgno - 1; + arg->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno); + + const txnid_t wanna_meta_txnid = + (txn->mt_flags & MDBX_RDONLY) ? txn->mt_txnid : txn->mt_txnid - 1; + txn_meta = (arg->mi_meta0_txnid == wanna_meta_txnid) ? meta0 : txn_meta; + txn_meta = (arg->mi_meta1_txnid == wanna_meta_txnid) ? meta1 : txn_meta; + txn_meta = (arg->mi_meta2_txnid == wanna_meta_txnid) ? meta2 : txn_meta; + } + arg->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower); + arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); + arg->mi_geo.shrink = pgno2bytes(env, txn_meta->mm_geo.shrink); + arg->mi_geo.grow = pgno2bytes(env, txn_meta->mm_geo.grow); + arg->mi_mapsize = env->me_mapsize; mdbx_compiler_barrier(); - } while (unlikely(arg->mi_meta0_txnid != mdbx_meta_txnid_fluid(env, meta0) || - arg->mi_meta0_sign != meta0->mm_datasync_sign || - arg->mi_meta1_txnid != mdbx_meta_txnid_fluid(env, meta1) || - arg->mi_meta1_sign != meta1->mm_datasync_sign || - arg->mi_meta2_txnid != mdbx_meta_txnid_fluid(env, meta2) || - arg->mi_meta2_sign != meta2->mm_datasync_sign || - meta != mdbx_meta_head(env) || - arg->mi_recent_txnid != mdbx_meta_txnid_fluid(env, meta))); + if (likely(arg->mi_meta0_txnid == mdbx_meta_txnid_fluid(env, meta0) && + arg->mi_meta0_sign == meta0->mm_datasync_sign && + arg->mi_meta1_txnid == mdbx_meta_txnid_fluid(env, meta1) && + arg->mi_meta1_sign == meta1->mm_datasync_sign && + arg->mi_meta2_txnid == mdbx_meta_txnid_fluid(env, meta2) && + arg->mi_meta2_sign == meta2->mm_datasync_sign && + recent_meta == 
mdbx_meta_head(env) && + arg->mi_recent_txnid == mdbx_meta_txnid_fluid(env, recent_meta))) + break; + } arg->mi_maxreaders = env->me_maxreaders; arg->mi_numreaders = env->me_lck ? env->me_lck->mti_numreaders : INT32_MAX; @@ -12235,6 +12537,12 @@ static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { } int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { + if (unlikely(!env)) + return MDBX_EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) return MDBX_EINVAL; @@ -12437,7 +12745,6 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { txn->mt_dbs[dbi].md_entries = 0; txn->mt_dbs[dbi].md_root = P_INVALID; txn->mt_dbs[dbi].md_seq = 0; - txn->mt_flags |= MDBX_TXN_DIRTY; } @@ -12561,8 +12868,12 @@ static int __cold mdbx_pid_insert(mdbx_pid_t *ids, mdbx_pid_t pid) { } int __cold mdbx_reader_check(MDBX_env *env, int *dead) { - if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) + if (unlikely(!env)) return MDBX_EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + if (dead) *dead = 0; return mdbx_reader_check0(env, false, dead); @@ -12575,10 +12886,12 @@ int __cold mdbx_reader_check(MDBX_env *env, int *dead) { int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { mdbx_assert(env, rdt_locked >= 0); +#if MDBX_TXN_CHECKPID if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } +#endif /* MDBX_TXN_CHECKPID */ MDBX_lockinfo *const lck = env->me_lck; if (unlikely(lck == NULL)) { @@ -12671,36 +12984,51 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { } int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger) { - unsigned ret = mdbx_runtime_flags; - mdbx_runtime_flags = flags; - -#ifdef __linux__ - if (flags & MDBX_DBG_DUMP) { - int core_filter_fd = open("/proc/self/coredump_filter", O_TRUNC | O_RDWR); - if (core_filter_fd >= 0) { 
- char buf[32]; - const unsigned r = pread(core_filter_fd, buf, sizeof(buf), 0); - if (r > 0 && r < sizeof(buf)) { - buf[r] = 0; - unsigned long mask = strtoul(buf, NULL, 16); - if (mask != ULONG_MAX) { - mask |= 1 << 3 /* Dump file-backed shared mappings */; - mask |= 1 << 6 /* Dump shared huge pages */; - mask |= 1 << 8 /* Dump shared DAX pages */; - unsigned w = snprintf(buf, sizeof(buf), "0x%lx\n", mask); - if (w > 0 && w < sizeof(buf)) { - w = pwrite(core_filter_fd, buf, w, 0); - (void)w; + const int rc = mdbx_runtime_flags; + if (flags != -1) { +#if !MDBX_DEBUG + flags &= MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN; +#else + flags &= MDBX_DBG_ASSERT | MDBX_DBG_PRINT | MDBX_DBG_TRACE | + MDBX_DBG_EXTRA | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | MDBX_DBG_DUMP | + MDBX_DBG_LEGACY_MULTIOPEN; +#endif +#if defined(__linux__) || defined(__gnu_linux__) + if ((mdbx_runtime_flags ^ flags) & MDBX_DBG_DUMP) { + /* http://man7.org/linux/man-pages/man5/core.5.html */ + const unsigned long dump_bits = + 1 << 3 /* Dump file-backed shared mappings */ + | 1 << 6 /* Dump shared huge pages */ + | 1 << 8 /* Dump shared DAX pages */; + const int core_filter_fd = + open("/proc/self/coredump_filter", O_TRUNC | O_RDWR); + if (core_filter_fd != -1) { + char buf[32]; + intptr_t bytes = pread(core_filter_fd, buf, sizeof(buf), 0); + if (bytes > 0 && (size_t)bytes < sizeof(buf)) { + buf[bytes] = 0; + const unsigned long present_mask = strtoul(buf, NULL, 16); + const unsigned long wanna_mask = (flags & MDBX_DBG_DUMP) + ? 
present_mask | dump_bits + : present_mask & ~dump_bits; + if (wanna_mask != present_mask) { + bytes = snprintf(buf, sizeof(buf), "0x%lx\n", wanna_mask); + if (bytes > 0 && (size_t)bytes < sizeof(buf)) { + bytes = pwrite(core_filter_fd, buf, bytes, 0); + (void)bytes; + } } } + close(core_filter_fd); } - close(core_filter_fd); } +#endif /* Linux */ + mdbx_runtime_flags = flags; } -#endif /* __linux__ */ - mdbx_debug_logger = logger; - return ret; + if (-1 != (intptr_t)logger) + mdbx_debug_logger = logger; + return rc; } static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) { @@ -12769,7 +13097,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) { if (rc > 1) { asleep->mr_tid = 0; asleep->mr_pid = 0; - mdbx_coherent_barrier(); + mdbx_flush_noncoherent_cpu_writeback(); } } } @@ -12788,8 +13116,31 @@ int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - env->me_sync_threshold = bytes; - return env->me_map ? mdbx_env_sync(env, false) : MDBX_SUCCESS; + if (unlikely(env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR))) + return MDBX_EACCESS; + + if (unlikely(!env->me_map)) + return MDBX_EPERM; + + *env->me_autosync_threshold = bytes2pgno(env, bytes + env->me_psize - 1); + return bytes ? mdbx_env_sync(env, false) : MDBX_SUCCESS; +} + +int __cold mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { + if (unlikely(!env)) + return MDBX_EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + + if (unlikely(env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR))) + return MDBX_EACCESS; + + if (unlikely(!env->me_map)) + return MDBX_EPERM; + + *env->me_autosync_period = mdbx_osal_16dot16_to_monotime(seconds_16dot16); + return seconds_16dot16 ? 
mdbx_env_sync(env, false) : MDBX_SUCCESS; } int __cold mdbx_env_set_oomfunc(MDBX_env *env, MDBX_oom_func *oomfunc) { @@ -12811,7 +13162,7 @@ MDBX_oom_func *__cold mdbx_env_get_oomfunc(MDBX_env *env) { #ifdef __SANITIZE_THREAD__ /* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */ -__attribute__((no_sanitize_thread, noinline)) +__attribute__((__no_sanitize_thread__, __noinline__)) #endif int mdbx_txn_straggler(MDBX_txn *txn, int *percent) { @@ -12849,9 +13200,9 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) } typedef struct mdbx_walk_ctx { - MDBX_txn *mw_txn; void *mw_user; MDBX_pgvisitor_func *mw_visitor; + MDBX_cursor mw_cursor; } mdbx_walk_ctx_t; /* Depth-first tree traversal. */ @@ -12860,13 +13211,8 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, if (unlikely(pgno == P_INVALID)) return MDBX_SUCCESS; /* empty db */ - MDBX_cursor mc; - memset(&mc, 0, sizeof(mc)); - mc.mc_snum = 1; - mc.mc_txn = ctx->mw_txn; - MDBX_page *mp; - int rc = mdbx_page_get(&mc, pgno, &mp, NULL); + int rc = mdbx_page_get(&ctx->mw_cursor, pgno, &mp, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12922,7 +13268,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, MDBX_page *op; pgno_t large_pgno; memcpy(&large_pgno, NODEDATA(node), sizeof(pgno_t)); - rc = mdbx_page_get(&mc, large_pgno, &op, NULL); + rc = mdbx_page_get(&ctx->mw_cursor, large_pgno, &op, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -12934,13 +13280,14 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, const size_t over_header = PAGEHDRSZ; const size_t over_payload = NODEDSZ(node); - const size_t over_unused = pgno2bytes(ctx->mw_txn->mt_env, op->mp_pages) - - over_payload - over_header; - - rc = ctx->mw_visitor(large_pgno, op->mp_pages, ctx->mw_user, deep, dbi, - pgno2bytes(ctx->mw_txn->mt_env, op->mp_pages), - MDBX_page_large, 1, over_payload, over_header, - over_unused); + const size_t over_unused = + 
pgno2bytes(ctx->mw_cursor.mc_txn->mt_env, op->mp_pages) - + over_payload - over_header; + + rc = ctx->mw_visitor( + large_pgno, op->mp_pages, ctx->mw_user, deep, dbi, + pgno2bytes(ctx->mw_cursor.mc_txn->mt_env, op->mp_pages), + MDBX_page_large, 1, over_payload, over_header, over_unused); } break; case F_SUBDATA /* sub-db */: { @@ -12957,7 +13304,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, } break; case F_DUPDATA /* short sub-page */: { - if (NODEDSZ(node) < PAGEHDRSZ) + if (unlikely(NODEDSZ(node) < PAGEHDRSZ)) return MDBX_CORRUPTED; MDBX_page *sp = NODEDATA(node); @@ -13013,11 +13360,11 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, } rc = ctx->mw_visitor(mp->mp_pgno, 1, ctx->mw_user, deep, dbi, - ctx->mw_txn->mt_env->me_psize, type, nkeys, payload_size, - header_size, unused_size + align_bytes); + ctx->mw_cursor.mc_txn->mt_env->me_psize, type, nkeys, + payload_size, header_size, unused_size + align_bytes); if (unlikely(rc != MDBX_SUCCESS)) - return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : MDBX_SUCCESS; + return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; for (int i = 0; i < nkeys; i++) { if (type == MDBX_page_dupfixed_leaf) @@ -13026,6 +13373,11 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, MDBX_node *node = NODEPTR(mp, i); if (type == MDBX_page_branch) { rc = mdbx_env_walk(ctx, dbi, NODEPGNO(node), deep + 1); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc != MDBX_RESULT_TRUE) + return rc; + break; + } continue; } @@ -13084,7 +13436,9 @@ int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, return MDBX_THREAD_MISMATCH; mdbx_walk_ctx_t ctx; - ctx.mw_txn = txn; + memset(&ctx, 0, sizeof(ctx)); + ctx.mw_cursor.mc_snum = 1; + ctx.mw_cursor.mc_txn = txn; ctx.mw_user = user; ctx.mw_visitor = visitor; @@ -13131,11 +13485,9 @@ int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary) { txn->mt_canary.v = txn->mt_txnid; if ((txn->mt_flags & MDBX_TXN_DIRTY) == 0) { - MDBX_env *env = txn->mt_env; txn->mt_flags |= MDBX_TXN_DIRTY; - env->me_sync_pending += env->me_psize; + *txn->mt_env->me_unsynced_pages += 1; } - return MDBX_SUCCESS; } @@ -14118,7 +14470,7 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, //---------------------------------------------------------------------------- #ifdef __SANITIZE_ADDRESS__ -LIBMDBX_API __attribute__((weak)) const char *__asan_default_options() { +LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() { return "symbolize=1:allow_addr2line=1:" #ifdef _DEBUG "debug=1:" diff --git a/libs/libmdbx/src/src/osal.c b/libs/libmdbx/src/src/osal.c index e3997e2a45..7d0ff083ed 100644 --- a/libs/libmdbx/src/src/osal.c +++ b/libs/libmdbx/src/src/osal.c @@ -1,4 +1,4 @@ -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> @@ -153,13 +153,52 @@ typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 { 
/*----------------------------------------------------------------------------*/ -#if !defined(_MSC_VER) && \ +#if _POSIX_C_SOURCE > 200212 && \ /* workaround for avoid musl libc wrong prototype */ ( \ defined(__GLIBC__) || defined(__GNU_LIBRARY__)) /* Prototype should match libc runtime. ISO POSIX (2003) & LSB 1.x-3.x */ -__nothrow __noreturn void __assert_fail(const char *assertion, const char *file, - unsigned line, const char *function); -#endif /* _MSC_VER */ +__extern_C void __assert_fail(const char *assertion, const char *file, + unsigned line, const char *function) +#ifdef __THROW + __THROW +#else + __nothrow +#endif /* __THROW */ + __noreturn; + +#elif defined(__APPLE__) || defined(__MACH__) +__extern_C void __assert_rtn(const char *function, const char *file, int line, + const char *assertion) /* __nothrow */ +#ifdef __dead2 + __dead2 +#else + __noreturn +#endif /* __dead2 */ +#ifdef __disable_tail_calls + __disable_tail_calls +#endif /* __disable_tail_calls */ + ; + +#define __assert_fail(assertion, file, line, function) \ + __assert_rtn(function, file, line, assertion) +#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__NETBSD__) || defined(__bsdi__) || \ + defined(__DragonFly__) +__extern_C void __assert(const char *function, const char *file, int line, + const char *assertion) /* __nothrow */ +#ifdef __dead2 + __dead2 +#else + __noreturn +#endif /* __dead2 */ +#ifdef __disable_tail_calls + __disable_tail_calls +#endif /* __disable_tail_calls */ + ; +#define __assert_fail(assertion, file, line, function) \ + __assert(function, file, line, assertion) + +#endif /* __assert_fail */ void __cold mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, int line) { @@ -271,12 +310,15 @@ int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result) { (void)alignment; *result = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); return *result ? 
MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */; -#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L - *result = memalign(alignment, bytes); +#elif defined(_ISOC11_SOURCE) + *result = aligned_alloc(alignment, bytes); return *result ? MDBX_SUCCESS : errno; #elif _POSIX_VERSION >= 200112L *result = nullptr; return posix_memalign(result, alignment, bytes); +#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L + *result = memalign(alignment, bytes); + return *result ? MDBX_SUCCESS : errno; #else #error FIXME #endif @@ -519,16 +561,30 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, (void)exclusive; #ifdef O_CLOEXEC flags |= O_CLOEXEC; -#endif +#endif /* O_CLOEXEC */ *fd = open(pathname, flags, mode); if (*fd < 0) return errno; -#if defined(FD_CLOEXEC) && defined(F_GETFD) - flags = fcntl(*fd, F_GETFD); - if (flags >= 0) - (void)fcntl(*fd, F_SETFD, flags | FD_CLOEXEC); -#endif + +#if defined(FD_CLOEXEC) && !defined(O_CLOEXEC) + int fd_flags = fcntl(*fd, F_GETFD); + if (fd_flags != -1) + (void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC); +#endif /* FD_CLOEXEC && !O_CLOEXEC */ + + if ((flags & (O_RDONLY | O_WRONLY | O_RDWR)) == O_WRONLY) { + /* assume for MDBX_env_copy() and friends output */ +#if defined(O_DIRECT) + int fd_flags = fcntl(*fd, F_GETFD); + if (fd_flags != -1) + (void)fcntl(*fd, F_SETFL, fd_flags | O_DIRECT); +#endif /* O_DIRECT */ +#if defined(F_NOCACHE) + (void)fcntl(*fd, F_NOCACHE, 1); +#endif /* F_NOCACHE */ + } #endif + return MDBX_SUCCESS; } @@ -544,7 +600,6 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { if (bytes > MAX_WRITE) return MDBX_EINVAL; #if defined(_WIN32) || defined(_WIN64) - OVERLAPPED ov; ov.hEvent = 0; ov.Offset = (DWORD)offset; @@ -569,37 +624,43 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, uint64_t offset) { + while (true) { #if defined(_WIN32) || 
defined(_WIN64) - if (bytes > MAX_WRITE) - return ERROR_INVALID_PARAMETER; - - OVERLAPPED ov; - ov.hEvent = 0; - ov.Offset = (DWORD)offset; - ov.OffsetHigh = HIGH_DWORD(offset); + OVERLAPPED ov; + ov.hEvent = 0; + ov.Offset = (DWORD)offset; + ov.OffsetHigh = HIGH_DWORD(offset); - DWORD written; - if (likely(WriteFile(fd, buf, (DWORD)bytes, &written, &ov))) - return (bytes == written) ? MDBX_SUCCESS : MDBX_EIO /* ERROR_WRITE_FAULT */; - return GetLastError(); + DWORD written; + if (unlikely(!WriteFile(fd, buf, + (bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, + &written, &ov))) + return GetLastError(); + if (likely(bytes == written)) + return MDBX_SUCCESS; #else - int rc; - intptr_t written; - do { STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), "libmdbx requires 64-bit file I/O on 64-bit systems"); - written = pwrite(fd, buf, bytes, offset); + const intptr_t written = + pwrite(fd, buf, (bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset); if (likely(bytes == (size_t)written)) return MDBX_SUCCESS; - rc = errno; - } while (rc == EINTR); - return (written < 0) ? rc : MDBX_EIO /* Use which error code (ENOSPC)? 
*/; + if (written < 0) { + const int rc = errno; + if (rc != EINTR) + return rc; + continue; + } #endif + bytes -= written; + offset += written; + buf = (char *)buf + written; + } } int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written) { -#if defined(_WIN32) || defined(_WIN64) +#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) size_t written = 0; for (int i = 0; i < iovcnt; ++i) { int rc = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); @@ -625,57 +686,23 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, #endif } -int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { -#ifdef SIGPIPE - sigset_t set, old; - sigemptyset(&set); - sigaddset(&set, SIGPIPE); - int rc = pthread_sigmask(SIG_BLOCK, &set, &old); - if (rc != 0) - return rc; -#endif - - const char *ptr = buf; - for (;;) { - size_t chunk = (MAX_WRITE < bytes) ? MAX_WRITE : bytes; +int mdbx_filesync(mdbx_filehandle_t fd, enum mdbx_syncmode_bits mode_bits) { #if defined(_WIN32) || defined(_WIN64) - DWORD written; - if (unlikely(!WriteFile(fd, ptr, (DWORD)chunk, &written, NULL))) - return GetLastError(); + return ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) == 0 || + FlushFileBuffers(fd)) + ? MDBX_SUCCESS + : GetLastError(); #else - intptr_t written = write(fd, ptr, chunk); - if (written < 0) { - int rc = errno; -#ifdef SIGPIPE - if (rc == EPIPE) { - /* Collect the pending SIGPIPE, otherwise at least OS X - * gives it to the process on thread-exit (ITS#8504). 
*/ - int tmp; - sigwait(&set, &tmp); - written = 0; - continue; - } - pthread_sigmask(SIG_SETMASK, &old, NULL); -#endif - return rc; - } -#endif - if (likely(bytes == (size_t)written)) { -#ifdef SIGPIPE - pthread_sigmask(SIG_SETMASK, &old, NULL); -#endif - return MDBX_SUCCESS; - } - ptr += written; - bytes -= written; - } -} -int mdbx_filesync(mdbx_filehandle_t fd, bool filesize_changed) { -#if defined(_WIN32) || defined(_WIN64) - (void)filesize_changed; - return FlushFileBuffers(fd) ? MDBX_SUCCESS : GetLastError(); -#else +#if defined(__APPLE__) && \ + MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY + if (mode_bits & MDBX_SYNC_IODQ) + return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno; +#endif /* MacOS */ +#if defined(__linux__) || defined(__gnu_linux__) + if (mode_bits == MDBX_SYNC_SIZE && mdbx_linux_kernel_version >= 0x03060000) + return MDBX_SUCCESS; +#endif /* Linux */ int rc; do { #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 @@ -684,12 +711,12 @@ int mdbx_filesync(mdbx_filehandle_t fd, bool filesize_changed) { * * For more info about of a corresponding fdatasync() bug * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ - if (!filesize_changed) { + if ((mode_bits & MDBX_SYNC_SIZE) == 0) { if (fdatasync(fd) == 0) return MDBX_SUCCESS; } else #else - (void)filesize_changed; + (void)mode_bits; #endif if (fsync(fd) == 0) return MDBX_SUCCESS; @@ -699,22 +726,6 @@ int mdbx_filesync(mdbx_filehandle_t fd, bool filesize_changed) { #endif } -int mdbx_filesize_sync(mdbx_filehandle_t fd) { -#if defined(_WIN32) || defined(_WIN64) - (void)fd; - /* Nothing on Windows (i.e. 
newer 100% steady) */ - return MDBX_SUCCESS; -#else - for (;;) { - if (fsync(fd) == 0) - return MDBX_SUCCESS; - int rc = errno; - if (rc != EINTR) - return rc; - } -#endif -} - int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { #if defined(_WIN32) || defined(_WIN64) BY_HANDLE_FILE_INFORMATION info; @@ -803,8 +814,21 @@ int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async) { return MDBX_SUCCESS; return GetLastError(); #else +#ifdef __linux__ + if (async && mdbx_linux_kernel_version > 0x02061300) + /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op, + since the kernel properly tracks dirty pages and flushes them to storage + as necessary. */ + return MDBX_SUCCESS; +#endif /* Linux */ const int mode = async ? MS_ASYNC : MS_SYNC; - return (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno; + int rc = (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno; +#if defined(__APPLE__) && \ + MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY + if (rc == MDBX_SUCCESS && mode == MS_SYNC) + rc = likely(fcntl(map->fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno; +#endif /* MacOS */ + return rc; #endif } @@ -1179,11 +1203,24 @@ retry_mapview:; return rc; #else if (limit != map->length) { - void *ptr = mremap(map->address, map->length, limit, MREMAP_MAYMOVE); - if (ptr == MAP_FAILED) - return errno; +#if defined(_GNU_SOURCE) && \ + !(defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__NETBSD__) || defined(__bsdi__) || \ + defined(__DragonFly__) || defined(__APPLE__) || defined(__MACH__)) + void *ptr = mremap(map->address, map->length, limit, + /* LY: in case changing the mapping size calling code + must guarantees the absence of competing threads, and + a willingness to another base address */ + MREMAP_MAYMOVE); + if (ptr == MAP_FAILED) { + int err = errno; + return (err == EAGAIN || err == ENOMEM) ? 
MDBX_RESULT_TRUE : err; + } map->address = ptr; map->length = limit; +#else + return MDBX_RESULT_TRUE; +#endif /* mremap() <= _GNU_SOURCE && !__FreeBSD__ */ } return (flags & MDBX_RDONLY) ? MDBX_SUCCESS : mdbx_ftruncate(map->fd, size); #endif @@ -1214,3 +1251,71 @@ __cold void mdbx_osal_jitter(bool tiny) { #endif } } + +#if defined(_WIN32) || defined(_WIN64) +#elif defined(__APPLE__) || defined(__MACH__) +#include <mach/mach_time.h> +#elif defined(__linux__) || defined(__gnu_linux__) +static __cold clockid_t choice_monoclock() { + struct timespec probe; +#if defined(CLOCK_BOOTTIME) + if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0) + return CLOCK_BOOTTIME; +#elif defined(CLOCK_MONOTONIC_RAW) + if (clock_gettime(CLOCK_MONOTONIC_RAW, &probe) == 0) + return CLOCK_MONOTONIC_RAW; +#elif defined(CLOCK_MONOTONIC_COARSE) + if (clock_gettime(CLOCK_MONOTONIC_COARSE, &probe) == 0) + return CLOCK_MONOTONIC_COARSE; +#endif + return CLOCK_MONOTONIC; +} +#endif + +uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { +#if defined(_WIN32) || defined(_WIN64) + static LARGE_INTEGER performance_frequency; + if (performance_frequency.QuadPart == 0) + QueryPerformanceFrequency(&performance_frequency); + const uint64_t ratio = performance_frequency.QuadPart; +#elif defined(__APPLE__) || defined(__MACH__) + static uint64_t ratio; + if (!ratio) { + mach_timebase_info_data_t ti; + mach_timebase_info(&ti); + ratio = UINT64_C(1000000000) * ti.denom / ti.numer; + } +#else + const uint64_t ratio = UINT64_C(1000000000); +#endif + return (ratio * seconds_16dot16 + 32768) >> 16; +} + +uint64_t mdbx_osal_monotime(void) { +#if defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER counter; + counter.QuadPart = 0; + QueryPerformanceCounter(&counter); + return counter.QuadPart; +#elif defined(__APPLE__) || defined(__MACH__) + return mach_absolute_time(); +#else + +#if defined(__linux__) || defined(__gnu_linux__) + static clockid_t posix_clockid = -1; + if (unlikely(posix_clockid < 0)) + 
posix_clockid = choice_monoclock(); +#elif defined(CLOCK_MONOTONIC) +#define posix_clockid CLOCK_MONOTONIC +#else +#define posix_clockid CLOCK_REALTIME +#endif + + struct timespec ts; + if (unlikely(clock_gettime(posix_clockid, &ts) != 0)) { + ts.tv_nsec = 0; + ts.tv_sec = 0; + } + return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; +#endif +} diff --git a/libs/libmdbx/src/src/osal.h b/libs/libmdbx/src/src/osal.h index 0208a52254..bd51f34a81 100644 --- a/libs/libmdbx/src/src/osal.h +++ b/libs/libmdbx/src/src/osal.h @@ -1,4 +1,4 @@ -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> @@ -41,7 +41,6 @@ /*----------------------------------------------------------------------------*/ /* C99 includes */ - #include <inttypes.h> #include <stddef.h> #include <stdint.h> @@ -50,11 +49,32 @@ #include <assert.h> #include <fcntl.h> #include <limits.h> -#include <malloc.h> #include <stdio.h> #include <string.h> #include <time.h> +/* C11 stdalign.h */ +#if __has_include(<stdalign.h>) +#include <stdalign.h> +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define alignas(N) _Alignas(N) +#elif defined(_MSC_VER) +#define alignas(N) __declspec(align(N)) +#elif __has_attribute(__aligned__) || defined(__GNUC__) +#define alignas(N) __attribute__((__aligned__(N))) +#else +#error "FIXME: Required _alignas() or equivalent." 
+#endif + +/*----------------------------------------------------------------------------*/ +/* Systems includes */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__NETBSD__) || defined(__bsdi__) || \ + defined(__DragonFly__) || defined(__APPLE__) || defined(__MACH__) +#include <sys/cdefs.h> +#else +#include <malloc.h> #ifndef _POSIX_C_SOURCE #ifdef _POSIX_SOURCE #define _POSIX_C_SOURCE 1 @@ -62,14 +82,12 @@ #define _POSIX_C_SOURCE 0 #endif #endif +#endif /* !xBSD */ #ifndef _XOPEN_SOURCE #define _XOPEN_SOURCE 0 #endif -/*----------------------------------------------------------------------------*/ -/* Systems includes */ - #if defined(_WIN32) || defined(_WIN64) #define WIN32_LEAN_AND_MEAN #include <tlhelp32.h> @@ -174,6 +192,22 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #define SSIZE_MAX INTPTR_MAX #endif +#if !defined(MADV_DODUMP) && defined(MADV_CORE) +#define MADV_DODUMP MADV_CORE +#endif /* MADV_CORE -> MADV_DODUMP */ + +#if !defined(MADV_DONTDUMP) && defined(MADV_NOCORE) +#define MADV_DONTDUMP MADV_NOCORE +#endif /* MADV_NOCORE -> MADV_DONTDUMP */ + +#ifndef MADV_REMOVE_OR_FREE +#ifdef MADV_REMOVE +#define MADV_REMOVE_OR_FREE MADV_REMOVE +#elif defined(MADV_FREE) +#define MADV_REMOVE_OR_FREE MADV_FREE +#endif +#endif /* MADV_REMOVE_OR_FREE */ + #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ defined(i486) || defined(__i486) || defined(__i486__) || \ defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ @@ -378,14 +412,14 @@ static __inline void mdbx_memory_barrier(void) { /*----------------------------------------------------------------------------*/ /* Cache coherence and invalidation */ -#ifndef MDBX_CACHE_IS_COHERENT +#ifndef MDBX_CPU_WRITEBACK_IS_COHERENT #if defined(__ia32__) || defined(__e2k__) || defined(__hppa) || \ defined(__hppa__) -#define MDBX_CACHE_IS_COHERENT 1 +#define MDBX_CPU_WRITEBACK_IS_COHERENT 1 #else -#define 
MDBX_CACHE_IS_COHERENT 0 +#define MDBX_CPU_WRITEBACK_IS_COHERENT 0 #endif -#endif /* MDBX_CACHE_IS_COHERENT */ +#endif /* MDBX_CPU_WRITEBACK_IS_COHERENT */ #ifndef MDBX_CACHELINE_SIZE #if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) @@ -397,40 +431,54 @@ static __inline void mdbx_memory_barrier(void) { #endif #endif /* MDBX_CACHELINE_SIZE */ -#if MDBX_CACHE_IS_COHERENT -#define mdbx_coherent_barrier() mdbx_compiler_barrier() +#if MDBX_CPU_WRITEBACK_IS_COHERENT +#define mdbx_flush_noncoherent_cpu_writeback() mdbx_compiler_barrier() #else -#define mdbx_coherent_barrier() mdbx_memory_barrier() +#define mdbx_flush_noncoherent_cpu_writeback() mdbx_memory_barrier() #endif -#if defined(__mips) || defined(__mips__) || defined(__mips64) || \ - defined(__mips64) || defined(_M_MRX000) || defined(_MIPS_) -/* Only MIPS has explicit cache control */ +#if __has_include(<sys/cachectl.h>) +#include <sys/cachectl.h> +#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ + defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ + defined(__MWERKS__) || defined(__sgi) +/* MIPS should have explicit cache control */ #include <sys/cachectl.h> #endif -static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) { - mdbx_coherent_barrier(); +#ifndef MDBX_CPU_CACHE_MMAP_NONCOHERENT #if defined(__mips) || defined(__mips__) || defined(__mips64) || \ - defined(__mips64) || defined(_M_MRX000) || defined(_MIPS_) -#if defined(DCACHE) + defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ + defined(__MWERKS__) || defined(__sgi) +/* MIPS has cache coherency issues. */ +#define MDBX_CPU_CACHE_MMAP_NONCOHERENT 1 +#else +/* LY: assume no relevant mmap/dcache issues. */ +#define MDBX_CPU_CACHE_MMAP_NONCOHERENT 0 +#endif +#endif /* ndef MDBX_CPU_CACHE_MMAP_NONCOHERENT */ + +static __inline void mdbx_invalidate_mmap_noncoherent_cache(void *addr, + size_t nbytes) { +#if MDBX_CPU_CACHE_MMAP_NONCOHERENT +#ifdef DCACHE /* MIPS has cache coherency issues. 
* Note: for any nbytes >= on-chip cache size, entire is flushed. */ cacheflush(addr, nbytes, DCACHE); #else -#error "Sorry, cacheflush() for MIPS not implemented" -#endif /* __mips__ */ -#else - /* LY: assume no relevant mmap/dcache issues. */ +#error "Oops, cacheflush() not available" +#endif /* DCACHE */ +#else /* MDBX_CPU_CACHE_MMAP_NONCOHERENT */ (void)addr; (void)nbytes; -#endif +#endif /* MDBX_CPU_CACHE_MMAP_NONCOHERENT */ } /*----------------------------------------------------------------------------*/ /* libc compatibility stuff */ -#if __GLIBC_PREREQ(2, 1) +#if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ + (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else @@ -444,6 +492,10 @@ int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); /* max bytes to write in one call */ #define MAX_WRITE UINT32_C(0x3fff0000) +#if defined(__linux__) || defined(__gnu_linux__) +extern uint32_t mdbx_linux_kernel_version; +#endif /* Linux */ + /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ @@ -494,14 +546,19 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t count); int mdbx_thread_create(mdbx_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); int mdbx_thread_join(mdbx_thread_t thread); -int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync); +enum mdbx_syncmode_bits { + MDBX_SYNC_DATA = 1, + MDBX_SYNC_SIZE = 2, + MDBX_SYNC_IODQ = 4 +}; + +int mdbx_filesync(mdbx_filehandle_t fd, enum mdbx_syncmode_bits mode_bits); int mdbx_filesize_sync(mdbx_filehandle_t fd); int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); @@ -560,6 +617,8 @@ static __inline mdbx_tid_t mdbx_thread_self(void) { } void mdbx_osal_jitter(bool tiny); +uint64_t mdbx_osal_monotime(void); +uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -572,27 +631,82 @@ void mdbx_osal_jitter(bool tiny); #define MDBX_OSAL_LOCK_SIGN UINT32_C(0x8017) #endif /* MDBX_OSAL_LOCK */ -#ifdef MDBX_OSAL_LOCK -#define MDBX_OSAL_LOCK_SIZE sizeof(MDBX_OSAL_LOCK) -#else -#define MDBX_OSAL_LOCK_SIZE 0 -#endif /* MDBX_OSAL_LOCK_SIZE */ - +/// \brief Инициализация объектов синхронизации внутри текущего процесса +/// связанных с экземпляром MDBX_env. +/// \return Код ошибки или 0 в случае успеха. int mdbx_lck_init(MDBX_env *env); +/// \brief Отключение от общих межпроцесных объектов и разрушение объектов +/// синхронизации внутри текущего процесса связанных с экземпляром MDBX_env. 
+void mdbx_lck_destroy(MDBX_env *env); + +/// \brief Подключение к общим межпроцесным объектам блокировки с попыткой +/// захвата блокировки максимального уровня (разделяемой при недоступности +/// эксклюзивной). +/// В зависимости от реализации и/или платформы (Windows) может +/// захватывать блокировку не-операционного супер-уровня (например, для +/// инициализации разделяемых объектов синхронизации), которая затем будет +/// понижена до операционно-эксклюзивной или разделяемой посредством +/// явного вызова mdbx_lck_downgrade(). +/// \return +/// MDBX_RESULT_TRUE (-1) - если удалось захватить эксклюзивную блокировку и, +/// следовательно, текущий процесс является первым и единственным +/// после предыдущего использования БД. +/// MDBX_RESULT_FALSE (0) - если удалось захватить только разделяемую +/// блокировку и, следовательно, БД уже открыта и используется другими +/// процессами. +/// Иначе (не 0 и не -1) - код ошибки. int mdbx_lck_seize(MDBX_env *env); + +/// \brief Снижает уровень первоначальной захваченной блокировки до +/// операционного уровня определяемого аргументом. +/// \param +/// complete = TRUE - понижение до разделяемой блокировки. +/// complete = FALSE - понижение до эксклюзивной операционной блокировки. +/// \return Код ошибки или 0 в случае успеха. int mdbx_lck_downgrade(MDBX_env *env, bool complete); -void mdbx_lck_destroy(MDBX_env *env); +/// \brief Блокирует lck-файл и/или таблицу читателей для (де)регистрации. +/// \return Код ошибки или 0 в случае успеха. int mdbx_rdt_lock(MDBX_env *env); + +/// \brief Разблокирует lck-файл и/или таблицу читателей после (де)регистрации. void mdbx_rdt_unlock(MDBX_env *env); +/// \brief Захватывает блокировку для изменения БД (при старте пишущей +/// транзакции). Транзакции чтения при этом никак не блокируются. +/// \return Код ошибки или 0 в случае успеха. 
LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dontwait); + +/// \brief Освобождает блокировку по окончанию изменения БД (после завершения +/// пишущей транзакции). LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Устанавливает alive-флажок присутствия (индицирующую блокировку) +/// читателя для pid текущего процесса. Функции может выполнить не более +/// необходимого минимума для корректной работы mdbx_rpid_check() в других +/// процессах. +/// \return Код ошибки или 0 в случае успеха. int mdbx_rpid_set(MDBX_env *env); + +/// \brief Снимает alive-флажок присутствия (индицирующую блокировку) +/// читателя для pid текущего процесса. Функции может выполнить не более +/// необходимого минимума для корректной работы mdbx_rpid_check() в других +/// процессах. +/// \return Код ошибки или 0 в случае успеха. int mdbx_rpid_clear(MDBX_env *env); +/// \brief Проверяет жив ли процесс-читатель с заданным pid +/// по alive-флажку присутствия (индицирующей блокировку), +/// либо любым другим способом. +/// \return +/// MDBX_RESULT_TRUE (-1) - если процесс-читатель с соответствующим pid жив +/// и работает с БД (индицирующая блокировка присутствует). +/// MDBX_RESULT_FALSE (0) - если процесс-читатель с соответствующим pid +/// отсутствует или не работает с БД (индицирующая блокировка отсутствует). +/// Иначе (не 0 и не -1) - код ошибки. 
+int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); + #if defined(_WIN32) || defined(_WIN64) typedef union MDBX_srwlock { struct { @@ -618,7 +732,6 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); - extern MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, @@ -630,7 +743,6 @@ extern MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); - extern MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -639,18 +751,21 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); - extern MDBX_NtFsControlFile mdbx_NtFsControlFile; -#endif /* Windows */ +#if _WIN32_WINNT < _WIN32_WINNT_WIN8 +typedef struct _WIN32_MEMORY_RANGE_ENTRY { + PVOID VirtualAddress; + SIZE_T NumberOfBytes; +} WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY; +#endif -/* Checks reader by pid. - * - * Returns: - * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) - * MDBX_RESULT_FALSE, if pid is dead (lock acquired) - * or otherwise the errcode. 
*/ -int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); +typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( + HANDLE hProcess, ULONG_PTR NumberOfEntries, + PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); +extern MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; + +#endif /* Windows */ /*----------------------------------------------------------------------------*/ /* Atomics */ @@ -755,8 +870,8 @@ static __inline bool mdbx_atomic_compare_and_swap64(volatile uint64_t *p, /*----------------------------------------------------------------------------*/ -#if defined(_MSC_VER) && _MSC_VER >= 1900 && _MSC_VER < 1920 -/* LY: MSVC 2015/2017 has buggy/inconsistent PRIuPTR/PRIxPTR macros +#if defined(_MSC_VER) && _MSC_VER >= 1900 +/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. */ #undef PRIuPTR #undef PRIiPTR diff --git a/libs/libmdbx/src/src/tools/mdbx_chk.c b/libs/libmdbx/src/src/tools/mdbx_chk.c index eb8d2681b1..9d3b9e063c 100644 --- a/libs/libmdbx/src/src/tools/mdbx_chk.c +++ b/libs/libmdbx/src/src/tools/mdbx_chk.c @@ -20,6 +20,8 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ +/* Avoid reference to mdbx_runtime_flags from assert() */ +#define mdbx_runtime_flags (~0u) #include "../bits.h" typedef struct flagbit { @@ -73,10 +75,10 @@ typedef struct { } walk_dbi_t; struct { - walk_dbi_t dbi[MAX_DBI]; short *pagemap; uint64_t total_payload_bytes; uint64_t pgcount; + walk_dbi_t dbi[MAX_DBI]; } walk; #define dbi_free walk.dbi[FREE_DBI] @@ -91,7 +93,7 @@ MDBX_txn *txn; MDBX_envinfo envinfo; MDBX_stat envstat; size_t maxkeysize, userdb_count, skipped_subdb; -uint64_t reclaimable_pages, gc_pages, lastpgno, unused_pages; +uint64_t reclaimable_pages, gc_pages, alloc_pages, unused_pages, backed_pages; unsigned verbose; char ignore_wrong_order, quiet; const char *only_subdb; @@ -105,11 +107,7 @@ struct problem { struct problem *problems_list; uint64_t total_problems; -static void -#ifdef __GNUC__ - __attribute__((format(printf, 1, 2))) -#endif - print(const char *msg, ...) { +static void __printf_args(1, 2) print(const char *msg, ...) { if (!quiet) { va_list args; @@ -120,11 +118,7 @@ static void } } -static void -#ifdef __GNUC__ - __attribute__((format(printf, 1, 2))) -#endif - error(const char *msg, ...) { +static void __printf_args(1, 2) error(const char *msg, ...) { total_problems++; if (!quiet) { @@ -181,10 +175,8 @@ static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { return last = dbi; } -static void -#ifdef __GNUC__ - __attribute__((format(printf, 4, 5))) -#endif +static void __printf_args(4, 5) + problem_add(const char *object, uint64_t entry_number, const char *msg, const char *extra, ...) 
{ total_problems++; @@ -252,11 +244,12 @@ static size_t problems_pop(struct problem *list) { return count; } -static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, int deep, - const char *dbi_name_or_tag, size_t page_size, - MDBX_page_type_t pagetype, size_t nentries, - size_t payload_bytes, size_t header_bytes, - size_t unused_bytes) { +static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, + void *const ctx, const int deep, + const char *const dbi_name_or_tag, const size_t page_size, + const MDBX_page_type_t pagetype, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) { (void)ctx; if (deep > 42) { problem_add("deep", deep, "too large", nullptr); @@ -326,23 +319,24 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, int deep, } bool already_used = false; - do { - if (pgno >= lastpgno) - problem_add("page", pgno, "wrong page-no", + for (unsigned n = 0; n < pgnumber; ++n) { + uint64_t spanpgno = pgno + n; + if (spanpgno >= alloc_pages) + problem_add("page", spanpgno, "wrong page-no", "%s-page: %" PRIu64 " > %" PRIu64 ", deep %i", - pagetype_caption, pgno, lastpgno, deep); - else if (walk.pagemap[pgno]) { - walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[pgno] - 1]; - problem_add( - "page", pgno, (branch && coll_dbi == dbi) ? "loop" : "already used", - "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name, deep); + pagetype_caption, spanpgno, alloc_pages, deep); + else if (walk.pagemap[spanpgno]) { + walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1]; + problem_add("page", spanpgno, + (branch && coll_dbi == dbi) ? "loop" : "already used", + "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name, + deep); already_used = true; } else { - walk.pagemap[pgno] = (short)(dbi - walk.dbi + 1); + walk.pagemap[spanpgno] = (short)(dbi - walk.dbi + 1); dbi->pages.total += 1; } - ++pgno; - } while (--pgnumber); + } if (already_used) return branch ? 
MDBX_RESULT_TRUE /* avoid infinite loop/recursion */ @@ -443,28 +437,38 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, if (envinfo.mi_latter_reader_txnid > txnid) reclaimable_pages += number; - pgno_t prev = - MDBX_PNL_ASCENDING ? NUM_METAS - 1 : (pgno_t)envinfo.mi_last_pgno + 1; + pgno_t prev = MDBX_PNL_ASCENDING ? NUM_METAS - 1 : txn->mt_next_pgno; pgno_t span = 1; for (unsigned i = 0; i < number; ++i) { const pgno_t pgno = iptr[i]; - if (pgno < NUM_METAS || pgno > envinfo.mi_last_pgno) + if (pgno < NUM_METAS) + problem_add("entry", txnid, "wrong idl entry", + "pgno %" PRIaPGNO " < meta-pages %u", pgno, NUM_METAS); + else if (pgno >= backed_pages) + problem_add("entry", txnid, "wrong idl entry", + "pgno %" PRIaPGNO " > backed-pages %" PRIu64, pgno, + backed_pages); + else if (pgno >= alloc_pages) problem_add("entry", txnid, "wrong idl entry", - "%u < %" PRIaPGNO " < %" PRIu64, NUM_METAS, pgno, - envinfo.mi_last_pgno); + "pgno %" PRIaPGNO " > alloc-pages %" PRIu64, pgno, + alloc_pages - 1); else { if (MDBX_PNL_DISORDERED(prev, pgno)) { bad = " [bad sequence]"; problem_add("entry", txnid, "bad sequence", - "%" PRIaPGNO " <> %" PRIaPGNO, prev, pgno); + "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev, + (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? 
'>' : '<'), + i, pgno); } - if (walk.pagemap && walk.pagemap[pgno]) { - if (walk.pagemap[pgno] > 0) + if (walk.pagemap) { + int idx = walk.pagemap[pgno]; + if (idx == 0) + walk.pagemap[pgno] = -1; + else if (idx > 0) problem_add("page", pgno, "already used", "by %s", - walk.dbi[walk.pagemap[pgno] - 1].name); + walk.dbi[idx - 1].name); else problem_add("page", pgno, "already listed in GC", nullptr); - walk.pagemap[pgno] = -1; } } prev = pgno; @@ -700,7 +704,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, if (handler) { rc = handler(record_count, &key, &data); - if (rc) + if (MDBX_IS_ERROR(rc)) goto bailout; } @@ -733,7 +737,7 @@ bailout: } mdbx_cursor_close(mc); - return rc || problems_count; + return (rc || problems_count) ? MDBX_RESULT_TRUE : MDBX_SUCCESS; } static void usage(char *prog) { @@ -1008,7 +1012,7 @@ int main(int argc, char *argv[]) { #if defined(_WIN32) || defined(_WIN64) rc == ERROR_LOCK_VIOLATION || rc == ERROR_SHARING_VIOLATION #else - rc == EBUSY + rc == EBUSY || rc == EAGAIN #endif )) { envflags &= ~MDBX_EXCLUSIVE; @@ -1048,20 +1052,105 @@ int main(int argc, char *argv[]) { } maxkeysize = rc; - rc = mdbx_env_info(env, &envinfo, sizeof(envinfo)); + rc = mdbx_env_info2(env, txn, &envinfo, sizeof(envinfo)); if (rc) { error("mdbx_env_info failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } - rc = mdbx_env_stat(env, &envstat, sizeof(envstat)); + rc = mdbx_env_stat2(env, txn, &envstat, sizeof(envstat)); if (rc) { error("mdbx_env_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } - lastpgno = envinfo.mi_last_pgno + 1; + mdbx_filehandle_t dxb_fd; + rc = mdbx_env_get_fd(env, &dxb_fd); + if (rc) { + error("mdbx_env_get_fd failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + + uint64_t dxb_filesize = 0; +#if defined(_WIN32) || defined(_WIN64) + { + BY_HANDLE_FILE_INFORMATION info; + if (!GetFileInformationByHandle(dxb_fd, &info)) + rc = GetLastError(); + else + dxb_filesize = 
info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; + } +#else + { + struct stat st; + STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + if (fstat(dxb_fd, &st)) + rc = errno; + else + dxb_filesize = st.st_size; + } +#endif + if (rc) { + error("mdbx_filesize failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + errno = 0; + const uint64_t dxbfile_pages = dxb_filesize / envinfo.mi_dxb_pagesize; + alloc_pages = txn->mt_next_pgno; + backed_pages = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize; +#if !(defined(_WIN32) || defined(_WIN64)) + if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY && + backed_pages != dxbfile_pages) { + print(" ! backed-pages %" PRIu64 " != file-pages %" PRIu64 "\n", + backed_pages, dxbfile_pages); + ++problems_meta; + } +#endif /* !Windows */ + if (dxbfile_pages < NUM_METAS) + print(" ! file-pages %" PRIu64 " < %u\n", dxbfile_pages, NUM_METAS); + if (backed_pages < NUM_METAS) + print(" ! backed-pages %" PRIu64 " < %u\n", backed_pages, NUM_METAS); + if (backed_pages < NUM_METAS || dxbfile_pages < NUM_METAS) + goto bailout; + if (backed_pages > MAX_PAGENO) { + print(" ! backed-pages %" PRIu64 " > max-pages %" PRIaPGNO "\n", + backed_pages, MAX_PAGENO); + ++problems_meta; + backed_pages = MAX_PAGENO; + } + + if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { + if (backed_pages > dxbfile_pages) { + print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n", + backed_pages, dxbfile_pages); + ++problems_meta; + backed_pages = dxbfile_pages; + } + if (alloc_pages > backed_pages) { + print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", + alloc_pages, backed_pages); + ++problems_meta; + alloc_pages = backed_pages; + } + } else { + /* LY: DB may be shrinked by writer downto the allocated pages. */ + if (alloc_pages > backed_pages) { + print(" ! 
alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", + alloc_pages, backed_pages); + ++problems_meta; + alloc_pages = backed_pages; + } + if (alloc_pages > dxbfile_pages) { + print(" ! alloc-pages %" PRIu64 " > file-pages %" PRIu64 "\n", + alloc_pages, dxbfile_pages); + ++problems_meta; + alloc_pages = dxbfile_pages; + } + if (backed_pages > dxbfile_pages) + backed_pages = dxbfile_pages; + } if (verbose) { print(" - pagesize %u (%u system), max keysize %" PRIuPTR @@ -1094,17 +1183,17 @@ int main(int argc, char *argv[]) { print(" - performs check for meta-pages clashes\n"); if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) { - print(" - meta-%d and meta-%d are clashed\n", 0, 1); + print(" ! meta-%d and meta-%d are clashed\n", 0, 1); ++problems_meta; } if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) { - print(" - meta-%d and meta-%d are clashed\n", 1, 2); + print(" ! meta-%d and meta-%d are clashed\n", 1, 2); ++problems_meta; } if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) { - print(" - meta-%d and meta-%d are clashed\n", 2, 0); + print(" ! meta-%d and meta-%d are clashed\n", 2, 0); ++problems_meta; } @@ -1129,7 +1218,7 @@ int main(int argc, char *argv[]) { print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid); fflush(NULL); - walk.pagemap = mdbx_calloc((size_t)lastpgno, sizeof(*walk.pagemap)); + walk.pagemap = mdbx_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); if (!walk.pagemap) { rc = errno ? 
errno : MDBX_ENOMEM; error("calloc failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -1150,7 +1239,7 @@ int main(int argc, char *argv[]) { goto bailout; } - for (uint64_t n = 0; n < lastpgno; ++n) + for (uint64_t n = 0; n < alloc_pages; ++n) if (!walk.pagemap[n]) unused_pages += 1; @@ -1178,7 +1267,9 @@ int main(int argc, char *argv[]) { uint64_t all_leaf = dbi->pages.leaf + dbi->pages.leaf_dupfixed; if (all_leaf) { print(", leaf %" PRIu64, all_leaf); - if (verbose > 2) + if (verbose > 2 && + (dbi->pages.leaf_dupfixed | dbi->pages.subleaf_dupsort | + dbi->pages.subleaf_dupsort)) print(" (usual %" PRIu64 ", sub-dupsort %" PRIu64 ", dupfixed %" PRIu64 ", sub-dupfixed %" PRIu64 ")", dbi->pages.leaf, dbi->pages.subleaf_dupsort, @@ -1238,15 +1329,16 @@ int main(int argc, char *argv[]) { uint64_t value = envinfo.mi_mapsize / envstat.ms_psize; double percent = value / 100.0; print(" - space: %" PRIu64 " total pages", value); - value = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize; - print(", backed %" PRIu64 " (%.1f%%)", value, value / percent); - print(", allocated %" PRIu64 " (%.1f%%)", lastpgno, lastpgno / percent); + print(", backed %" PRIu64 " (%.1f%%)", backed_pages, + backed_pages / percent); + print(", allocated %" PRIu64 " (%.1f%%)", alloc_pages, + alloc_pages / percent); if (verbose > 1) { - value = envinfo.mi_mapsize / envstat.ms_psize - lastpgno; + value = envinfo.mi_mapsize / envstat.ms_psize - alloc_pages; print(", remained %" PRIu64 " (%.1f%%)", value, value / percent); - value = lastpgno - gc_pages; + value = alloc_pages - gc_pages; print(", used %" PRIu64 " (%.1f%%)", value, value / percent); print(", gc %" PRIu64 " (%.1f%%)", gc_pages, gc_pages / percent); @@ -1259,16 +1351,16 @@ int main(int argc, char *argv[]) { } value = - envinfo.mi_mapsize / envstat.ms_psize - lastpgno + reclaimable_pages; + envinfo.mi_mapsize / envstat.ms_psize - alloc_pages + reclaimable_pages; print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent); } if 
(problems_maindb == 0 && problems_freedb == 0) { if (!dont_traversal && (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { - if (walk.pgcount != lastpgno - gc_pages) { + if (walk.pgcount != alloc_pages - gc_pages) { error("used pages mismatch (%" PRIu64 " != %" PRIu64 ")\n", - walk.pgcount, lastpgno - gc_pages); + walk.pgcount, alloc_pages - gc_pages); } if (unused_pages != gc_pages) { error("gc pages mismatch (%" PRIu64 " != %" PRIu64 ")\n", unused_pages, diff --git a/libs/libmdbx/src/src/tools/mdbx_copy.c b/libs/libmdbx/src/src/tools/mdbx_copy.c index 9b0c833a37..f700b1785f 100644 --- a/libs/libmdbx/src/src/tools/mdbx_copy.c +++ b/libs/libmdbx/src/src/tools/mdbx_copy.c @@ -20,6 +20,8 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ +/* Avoid reference to mdbx_runtime_flags from assert() */ +#define mdbx_runtime_flags (~0u) #include "../bits.h" #if defined(_WIN32) || defined(_WIN64) diff --git a/libs/libmdbx/src/src/tools/mdbx_dump.c b/libs/libmdbx/src/src/tools/mdbx_dump.c index 07951548dd..8cb9c86e46 100644 --- a/libs/libmdbx/src/src/tools/mdbx_dump.c +++ b/libs/libmdbx/src/src/tools/mdbx_dump.c @@ -20,7 +20,10 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ +/* Avoid reference to mdbx_runtime_flags from assert() */ +#define mdbx_runtime_flags (~0u) #include "../bits.h" + #include <ctype.h> #define PRINT 1 diff --git a/libs/libmdbx/src/src/tools/mdbx_load.c b/libs/libmdbx/src/src/tools/mdbx_load.c index 9789e83a62..7bbc52b87e 100644 --- a/libs/libmdbx/src/src/tools/mdbx_load.c +++ b/libs/libmdbx/src/src/tools/mdbx_load.c @@ -20,7 +20,10 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ +/* Avoid reference to mdbx_runtime_flags from assert() */ +#define mdbx_runtime_flags (~0u) #include "../bits.h" + #include <ctype.h> #if defined(_WIN32) || defined(_WIN64) @@ -263,10 +266,9 @@ static int readline(MDBX_val *out, MDBX_val *buf) { if (mode & PRINT) { while (c2 < end) { - if (*c2 == '\\') { + if (unlikely(*c2 == '\\')) { if (c2[1] == '\\') { - c1++; - c2 += 2; + *c1++ = '\\'; } else { if (c2 + 3 > end || !isxdigit(c2[1]) || !isxdigit(c2[2])) { Eof = 1; @@ -274,8 +276,8 @@ static int readline(MDBX_val *out, MDBX_val *buf) { return EOF; } *c1++ = (char)unhex(++c2); - c2 += 2; } + c2 += 2; } else { /* copies are redundant when no escapes were used */ *c1++ = *c2++; diff --git a/libs/libmdbx/src/src/tools/mdbx_stat.c b/libs/libmdbx/src/src/tools/mdbx_stat.c index e459121d76..d4ebc2e654 100644 --- a/libs/libmdbx/src/src/tools/mdbx_stat.c +++ b/libs/libmdbx/src/src/tools/mdbx_stat.c @@ -20,6 +20,8 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ +/* Avoid reference to mdbx_runtime_flags from assert() */ +#define mdbx_runtime_flags (~0u) #include "../bits.h" #if defined(_WIN32) || defined(_WIN64) diff --git a/libs/libmdbx/src/test/append.cc b/libs/libmdbx/src/test/append.cc index c5e7e91d9e..273f68b810 100644 --- a/libs/libmdbx/src/test/append.cc +++ b/libs/libmdbx/src/test/append.cc @@ -15,11 +15,12 @@ #include "test.h" bool testcase_append::run() { - db_open(); - - txn_begin(false); - MDBX_dbi dbi = db_table_open(true); - db_table_clear(dbi); + MDBX_dbi dbi; + int err = db_open__begin__table_create_open_clean(dbi); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("append: bailout-prepare due '%s'", mdbx_strerror(err)); + return true; + } keyvalue_maker.setup(config.params, config.actor_id, 0 /* thread_number */); /* LY: тест наполнения таблиц в append-режиме, @@ -41,7 +42,10 @@ bool testcase_append::run() { simple_checksum inserted_checksum; uint64_t inserted_number = 0; uint64_t serial_count = 0; + unsigned txn_nops = 0; + uint64_t commited_inserted_number = inserted_number; + simple_checksum commited_inserted_checksum = inserted_checksum; while (should_continue()) { const keygen::serial_t serial = serial_count; if (!keyvalue_maker.increment(serial_count, 1)) { @@ -57,10 +61,19 @@ bool testcase_append::run() { if (cmp == 0 && (config.params.table_flags & MDBX_DUPSORT)) cmp = mdbx_dcmp(txn_guard.get(), dbi, &data->value, &last_data->value); - int err = mdbx_put(txn_guard.get(), dbi, &key->value, &data->value, flags); + err = mdbx_put(txn_guard.get(), dbi, &key->value, &data->value, flags); + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("append: bailout-insert due '%s'", mdbx_strerror(err)); + txn_end(true); + inserted_number = commited_inserted_number; + inserted_checksum = commited_inserted_checksum; + break; + } + if (cmp > 0) { if (unlikely(err != MDBX_SUCCESS)) failure_perror("mdbx_put(appenda-a)", err); + 
memcpy(last_key->value.iov_base, key->value.iov_base, last_key->value.iov_len = key->value.iov_len); memcpy(last_data->value.iov_base, data->value.iov_base, @@ -74,22 +87,40 @@ bool testcase_append::run() { } if (++txn_nops >= config.params.batch_write) { - txn_restart(false, false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("append: bailout-commit due '%s'", mdbx_strerror(err)); + inserted_number = commited_inserted_number; + inserted_checksum = commited_inserted_checksum; + break; + } + commited_inserted_number = inserted_number; + commited_inserted_checksum = inserted_checksum; txn_nops = 0; } report(1); } - txn_restart(false, true); + if (txn_guard) { + err = breakable_commit(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("append: bailout-commit due '%s'", mdbx_strerror(err)); + inserted_number = commited_inserted_number; + inserted_checksum = commited_inserted_checksum; + } + } //---------------------------------------------------------------------------- + txn_begin(true); cursor_open(dbi); MDBX_val check_key, check_data; - int err = + err = mdbx_cursor_get(cursor_guard.get(), &check_key, &check_data, MDBX_FIRST); - if (unlikely(err != MDBX_SUCCESS)) - failure_perror("mdbx_cursor_get(MDBX_FIRST)", err); + if (likely(inserted_number)) { + if (unlikely(err != MDBX_SUCCESS)) + failure_perror("mdbx_cursor_get(MDBX_FIRST)", err); + } simple_checksum read_checksum; uint64_t read_count = 0; @@ -115,15 +146,18 @@ bool testcase_append::run() { read_checksum.value, inserted_checksum.value); cursor_close(); + txn_end(true); //---------------------------------------------------------------------------- - if (txn_guard) - txn_end(false); if (dbi) { if (config.params.drop_table && !mode_readonly()) { txn_begin(false); db_table_drop(dbi); - txn_end(false); + err = breakable_commit(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("append: bailout-clean due '%s'", mdbx_strerror(err)); + return true; + } } else 
db_table_close(dbi); } diff --git a/libs/libmdbx/src/test/base.h b/libs/libmdbx/src/test/base.h index 0b4d26e51b..5ca134a81d 100644 --- a/libs/libmdbx/src/test/base.h +++ b/libs/libmdbx/src/test/base.h @@ -39,6 +39,10 @@ #include <SDKDDKVer.h> #endif /* WINDOWS */ +#ifdef __APPLE__ +#define _DARWIN_C_SOURCE +#endif + #include <errno.h> #include <limits.h> #include <stdio.h> diff --git a/libs/libmdbx/src/test/cases.cc b/libs/libmdbx/src/test/cases.cc index 023a80020c..a98834a457 100644 --- a/libs/libmdbx/src/test/cases.cc +++ b/libs/libmdbx/src/test/cases.cc @@ -63,8 +63,7 @@ void testcase_setup(const char *casename, actor_params ¶ms, log_notice(">>> testcase_setup(%s)", casename); configure_actor(last_space_id, ac_jitter, nullptr, params); configure_actor(last_space_id, ac_hill, nullptr, params); - configure_actor(last_space_id, ac_jitter, nullptr, params); - configure_actor(last_space_id, ac_hill, nullptr, params); + configure_actor(last_space_id, ac_ttl, nullptr, params); configure_actor(last_space_id, ac_jitter, nullptr, params); configure_actor(last_space_id, ac_hill, nullptr, params); configure_actor(last_space_id, ac_ttl, nullptr, params); diff --git a/libs/libmdbx/src/test/config.cc b/libs/libmdbx/src/test/config.cc index bfae5c14df..dd150e9a91 100644 --- a/libs/libmdbx/src/test/config.cc +++ b/libs/libmdbx/src/test/config.cc @@ -1,4 +1,4 @@ -/* +/* * Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -412,6 +412,8 @@ void dump(const char *title) { i->params.max_tables); log_info("drop table: %s\n", i->params.drop_table ? "Yes" : "No"); + log_info("ignore MDBX_MAP_FULL error: %s\n", + i->params.ignore_dbfull ? 
"Yes" : "No"); indent.pop(); } diff --git a/libs/libmdbx/src/test/config.h b/libs/libmdbx/src/test/config.h index b8a4b6827d..89889d8eb6 100644 --- a/libs/libmdbx/src/test/config.h +++ b/libs/libmdbx/src/test/config.h @@ -38,7 +38,8 @@ enum actor_status { as_running, as_successful, as_killed, - as_failed + as_failed, + as_coredump, }; const char *testcase2str(const actor_testcase); @@ -101,6 +102,22 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, bool parse_option(int argc, char *const argv[], int &narg, const char *option, int32_t &value, const int32_t minval, const int32_t maxval, const int32_t default_value = -1); + +inline bool parse_option_intptr(int argc, char *const argv[], int &narg, + const char *option, intptr_t &value, + const intptr_t minval, const intptr_t maxval, + const intptr_t default_value = -1) { + static_assert(sizeof(intptr_t) == 4 || sizeof(intptr_t) == 8, "WTF?"); + if (sizeof(intptr_t) == 8) + return parse_option(argc, argv, narg, option, + *reinterpret_cast<int64_t *>(&value), int64_t(minval), + int64_t(maxval), int64_t(default_value)); + else + return parse_option(argc, argv, narg, option, + *reinterpret_cast<int32_t *>(&value), int32_t(minval), + int32_t(maxval), int32_t(default_value)); +} + //----------------------------------------------------------------------------- #pragma pack(push, 1) @@ -248,6 +265,7 @@ struct actor_params_pod { keygen_params_pod keygen; bool drop_table; + bool ignore_dbfull; }; struct actor_config_pod { diff --git a/libs/libmdbx/src/test/darwin/LICENSE b/libs/libmdbx/src/test/darwin/LICENSE new file mode 100644 index 0000000000..6a0dd3066b --- /dev/null +++ b/libs/libmdbx/src/test/darwin/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2015, Aleksey Demakov +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/libs/libmdbx/src/test/darwin/README.md b/libs/libmdbx/src/test/darwin/README.md new file mode 100644 index 0000000000..a6a8fd1a91 --- /dev/null +++ b/libs/libmdbx/src/test/darwin/README.md @@ -0,0 +1,8 @@ +# DarwinPthreadBarrier + +A pthread_barrier_t implementation for Mac OS/X + +There is no pthread_barrier_t in Mac OS/X pthreads. This project fixes +this omission by providing a simple-minded barrier implementation based +on a pair of pthread_mutex_t and pthread_cond_t. 
+ diff --git a/libs/libmdbx/src/test/darwin/pthread_barrier.c b/libs/libmdbx/src/test/darwin/pthread_barrier.c new file mode 100644 index 0000000000..054aa00708 --- /dev/null +++ b/libs/libmdbx/src/test/darwin/pthread_barrier.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2015, Aleksey Demakov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "pthread_barrier.h" + +#include <errno.h> + +#ifdef __APPLE__ + +int pthread_barrierattr_init(pthread_barrierattr_t *attr) { + memset(attr, 0, sizeof(pthread_barrierattr_t)); + int m = pthread_mutexattr_init(&attr->mattr); + int c = pthread_condattr_init(&attr->cattr); + return m ? 
m : c; +} + +int pthread_barrierattr_destroy(pthread_barrierattr_t *attr) { + int c = pthread_condattr_destroy(&attr->cattr); + int m = pthread_mutexattr_destroy(&attr->mattr); + return m ? m : c; +} + +int pthread_barrierattr_getpshared(const pthread_barrierattr_t *__restrict attr, + int *__restrict pshared) { + return pthread_condattr_getpshared(&attr->cattr, pshared); +} + +int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared) { + int m = pthread_mutexattr_setpshared(&attr->mattr, pshared); + int c = pthread_condattr_setpshared(&attr->cattr, pshared); + return m ? m : c; +} + +int pthread_barrier_init(pthread_barrier_t *__restrict barrier, + const pthread_barrierattr_t *__restrict attr, + unsigned count) { + if (count == 0) + return errno = EINVAL; + + int rc = pthread_mutex_init(&barrier->mutex, attr ? &attr->mattr : 0); + if (rc) + return rc; + + rc = pthread_cond_init(&barrier->cond, attr ? &attr->cattr : 0); + if (rc) { + int errno_save = errno; + pthread_mutex_destroy(&barrier->mutex); + errno = errno_save; + return rc; + } + + barrier->limit = count; + barrier->count = 0; + barrier->phase = 0; + return 0; +} + +int pthread_barrier_destroy(pthread_barrier_t *barrier) { + pthread_mutex_destroy(&barrier->mutex); + pthread_cond_destroy(&barrier->cond); + return 0; +} + +int pthread_barrier_wait(pthread_barrier_t *barrier) { + int rc = pthread_mutex_lock(&barrier->mutex); + if (rc) + return rc; + + barrier->count++; + if (barrier->count >= barrier->limit) { + barrier->phase++; + barrier->count = 0; + pthread_cond_broadcast(&barrier->cond); + pthread_mutex_unlock(&barrier->mutex); + return PTHREAD_BARRIER_SERIAL_THREAD; + } else { + unsigned phase = barrier->phase; + do + pthread_cond_wait(&barrier->cond, &barrier->mutex); + while (phase == barrier->phase); + pthread_mutex_unlock(&barrier->mutex); + return 0; + } +} + +#endif /* __APPLE__ */ diff --git a/libs/libmdbx/src/test/darwin/pthread_barrier.h 
b/libs/libmdbx/src/test/darwin/pthread_barrier.h new file mode 100644 index 0000000000..efa9b9b751 --- /dev/null +++ b/libs/libmdbx/src/test/darwin/pthread_barrier.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2015, Aleksey Demakov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef PTHREAD_BARRIER_H +#define PTHREAD_BARRIER_H + +#include <pthread.h> + +#ifdef __APPLE__ + +#ifdef __cplusplus +extern "C" { +#endif + +#if !defined(PTHREAD_BARRIER_SERIAL_THREAD) +#define PTHREAD_BARRIER_SERIAL_THREAD (1) +#endif + +#if !defined(PTHREAD_PROCESS_PRIVATE) +#define PTHREAD_PROCESS_PRIVATE (42) +#endif +#if !defined(PTHREAD_PROCESS_SHARED) +#define PTHREAD_PROCESS_SHARED (43) +#endif + +typedef struct { + pthread_mutexattr_t mattr; + pthread_condattr_t cattr; +} pthread_barrierattr_t; + +typedef struct { + pthread_mutex_t mutex; + pthread_cond_t cond; + unsigned int limit; + unsigned int count; + unsigned int phase; +} pthread_barrier_t; + +int pthread_barrierattr_init(pthread_barrierattr_t *attr); +int pthread_barrierattr_destroy(pthread_barrierattr_t *attr); + +int pthread_barrierattr_getpshared(const pthread_barrierattr_t *__restrict attr, + int *__restrict pshared); +int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared); + +int pthread_barrier_init(pthread_barrier_t *__restrict barrier, + const pthread_barrierattr_t *__restrict attr, + unsigned int count); +int pthread_barrier_destroy(pthread_barrier_t *barrier); + +int pthread_barrier_wait(pthread_barrier_t *barrier); + +#ifdef __cplusplus +} +#endif + +#endif /* __APPLE__ */ + +#endif /* PTHREAD_BARRIER_H */ diff --git a/libs/libmdbx/src/test/gc.sh b/libs/libmdbx/src/test/gc.sh deleted file mode 100644 index 9c1407f53e..0000000000 --- a/libs/libmdbx/src/test/gc.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -set -euo pipefail -make check -TESTDB_PREFIX=${1:-/dev/shm/mdbx-gc-test}. 
- -function rep9 { printf "%*s" $1 '' | tr ' ' '9'; } -function join { local IFS="$1"; shift; echo "$*"; } -function bit2option { local -n arr=$1; (( ($2&(1<<$3)) != 0 )) && echo -n '+' || echo -n '-'; echo "${arr[$3]}"; } - -options=(writemap coalesce lifo) - -function bits2list { - local -n arr=$1 - local i - local list=() - for ((i=0; i<${#arr[@]}; ++i)) do - list[$i]=$(bit2option $1 $2 $i) - done - join , "${list[@]}" -} - -function probe { - echo "=============================================== $(date)" - echo "${caption}: $*" - rm -f ${TESTDB_PREFIX}* \ - && ./mdbx_test --repeat=12 --pathname=${TESTDB_PREFIX}db "$@" | lz4 > ${TESTDB_PREFIX}log.lz4 \ - && ./mdbx_chk -nvvv ${TESTDB_PREFIX}db | tee ${TESTDB_PREFIX}chk \ - && ([ ! -e ${TESTDB_PREFIX}db-copy ] || ./mdbx_chk -nvvv ${TESTDB_PREFIX}db-copy | tee ${TESTDB_PREFIX}chk-copy) \ - || (echo "FAILED"; exit 1) -} - -############################################################################### - -count=0 -for nops in {2..7}; do - for ((wbatch=nops-1; wbatch > 0; --wbatch)); do - loops=$(((333 >> nops) / nops + 3)) - for ((rep=0; rep++ < loops; )); do - for ((bits=2**${#options[@]}; --bits >= 0; )); do - seed=$(date +%N) - caption="Probe #$((++count)) int-key,w/o-dups, repeat ${rep} of ${loops}" probe \ - --pagesize=min --size=6G --table=+key.integer,-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ - --keygen.seed=${seed} basic - caption="Probe #$((++count)) int-key,with-dups, repeat ${rep} of ${loops}" probe \ - --pagesize=min --size=6G --table=+key.integer,+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ - --keygen.seed=${seed} basic - caption="Probe #$((++count)) int-key,int-data, repeat ${rep} of ${loops}" probe \ - --pagesize=min --size=6G 
--table=+key.integer,+data.integer --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ - --keygen.seed=${seed} basic - caption="Probe #$((++count)) w/o-dups, repeat ${rep} of ${loops}" probe \ - --pagesize=min --size=6G --table=-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ - --keygen.seed=${seed} basic - caption="Probe #$((++count)) with-dups, repeat ${rep} of ${loops}" probe \ - --pagesize=min --size=6G --table=+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ - --keygen.seed=${seed} basic - done - done - done -done - -echo "=== ALL DONE ====================== $(date)" diff --git a/libs/libmdbx/src/test/hill.cc b/libs/libmdbx/src/test/hill.cc index 5b083e1fcc..1b03ddf0fc 100644 --- a/libs/libmdbx/src/test/hill.cc +++ b/libs/libmdbx/src/test/hill.cc @@ -15,11 +15,12 @@ #include "test.h" bool testcase_hill::run() { - db_open(); - - txn_begin(false); - MDBX_dbi dbi = db_table_open(true); - txn_end(false); + MDBX_dbi dbi; + int err = db_open__begin__table_create_open_clean(dbi); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("hill: bailout-prepare due '%s'", mdbx_strerror(err)); + return true; + } /* LY: тест "холмиком": * - сначала наполняем таблицу циклическими CRUD-манипуляциями, @@ -59,14 +60,15 @@ bool testcase_hill::run() { : MDBX_NODUPDATA; uint64_t serial_count = 0; + uint64_t commited_serial = serial_count; unsigned txn_nops = 0; - if (!txn_guard) - txn_begin(false); while (should_continue()) { const keygen::serial_t a_serial = serial_count; - if (unlikely(!keyvalue_maker.increment(serial_count, 1))) - failure("uphill: unexpected key-space overflow"); + if 
(unlikely(!keyvalue_maker.increment(serial_count, 1))) { + log_notice("uphill: unexpected key-space overflow"); + break; + } const keygen::serial_t b_serial = serial_count; assert(b_serial > a_serial); @@ -76,26 +78,52 @@ bool testcase_hill::run() { log_trace("uphill: insert-a (age %" PRIu64 ") %" PRIu64, age_shift, a_serial); generate_pair(a_serial, a_key, a_data_1, age_shift); - int rc = mdbx_put(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, - insert_flags); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_put(insert-a.1)", rc); + err = mdbx_put(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, + insert_flags); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("uphill: bailout at insert-a due '%s'", mdbx_strerror(err)); + txn_restart(true, false); + serial_count = commited_serial; + break; + } + failure_perror("mdbx_put(insert-a.1)", err); + } if (++txn_nops >= config.params.batch_write) { - txn_restart(false, false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err)); + serial_count = commited_serial; + break; + } + commited_serial = a_serial; txn_nops = 0; } // создаем вторую запись из пары log_trace("uphill: insert-b %" PRIu64, b_serial); generate_pair(b_serial, b_key, b_data, 0); - rc = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, - insert_flags); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_put(insert-b)", rc); + err = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, + insert_flags); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("uphill: bailout at insert-b due '%s'", mdbx_strerror(err)); + txn_restart(true, false); + serial_count = commited_serial; + break; + } + failure_perror("mdbx_put(insert-b)", err); + } if (++txn_nops >= config.params.batch_write) { - txn_restart(false, 
false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err)); + serial_count = commited_serial; + break; + } + commited_serial = a_serial; txn_nops = 0; } @@ -104,25 +132,51 @@ bool testcase_hill::run() { a_serial); generate_pair(a_serial, a_key, a_data_0, 0); checkdata("uphill: update-a", dbi, a_key->value, a_data_1->value); - rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_0->value, - &a_data_1->value, update_flags); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_replace(update-a: 1->0)", rc); + err = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_0->value, + &a_data_1->value, update_flags); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("uphill: bailout at update-a due '%s'", mdbx_strerror(err)); + txn_restart(true, false); + serial_count = commited_serial; + break; + } + failure_perror("mdbx_replace(update-a: 1->0)", err); + } if (++txn_nops >= config.params.batch_write) { - txn_restart(false, false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err)); + serial_count = commited_serial; + break; + } + commited_serial = a_serial; txn_nops = 0; } // удаляем вторую запись log_trace("uphill: delete-b %" PRIu64, b_serial); checkdata("uphill: delete-b", dbi, b_key->value, b_data->value); - rc = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_del(b)", rc); + err = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("uphill: bailout at delete-b due '%s'", mdbx_strerror(err)); + txn_restart(true, false); + serial_count = commited_serial; + break; + } + failure_perror("mdbx_del(b)", err); + } if 
(++txn_nops >= config.params.batch_write) { - txn_restart(false, false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err)); + serial_count = commited_serial; + break; + } + commited_serial = a_serial; txn_nops = 0; } @@ -134,7 +188,7 @@ bool testcase_hill::run() { } } - while (serial_count > 0) { + while (serial_count > 1) { if (unlikely(!keyvalue_maker.increment(serial_count, -2))) failure("downhill: unexpected key-space underflow"); @@ -150,26 +204,48 @@ bool testcase_hill::run() { generate_pair(a_serial, a_key, a_data_0, 0); generate_pair(a_serial, a_key, a_data_1, age_shift); checkdata("downhill: update-a", dbi, a_key->value, a_data_0->value); - int rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, - &a_data_0->value, update_flags); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_put(update-a: 0->1)", rc); + err = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, + &a_data_0->value, update_flags); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("downhill: bailout at update-a due '%s'", + mdbx_strerror(err)); + txn_end(true); + break; + } + failure_perror("mdbx_put(update-a: 0->1)", err); + } if (++txn_nops >= config.params.batch_write) { - txn_restart(false, false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err)); + break; + } txn_nops = 0; } // создаем вторую запись из пары log_trace("downhill: insert-b %" PRIu64, b_serial); generate_pair(b_serial, b_key, b_data, 0); - rc = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, - insert_flags); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_put(insert-b)", rc); + err = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, + insert_flags); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == 
MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("downhill: bailout at insert-a due '%s'", + mdbx_strerror(err)); + txn_end(true); + break; + } + failure_perror("mdbx_put(insert-b)", err); + } if (++txn_nops >= config.params.batch_write) { - txn_restart(false, false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err)); + break; + } txn_nops = 0; } @@ -177,38 +253,67 @@ bool testcase_hill::run() { log_trace("downhill: delete-a (age %" PRIu64 ") %" PRIu64, age_shift, a_serial); checkdata("downhill: delete-a", dbi, a_key->value, a_data_1->value); - rc = mdbx_del(txn_guard.get(), dbi, &a_key->value, &a_data_1->value); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_del(a)", rc); + err = mdbx_del(txn_guard.get(), dbi, &a_key->value, &a_data_1->value); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("downhill: bailout at delete-a due '%s'", + mdbx_strerror(err)); + txn_end(true); + break; + } + failure_perror("mdbx_del(a)", err); + } if (++txn_nops >= config.params.batch_write) { - txn_restart(false, false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err)); + break; + } txn_nops = 0; } // удаляем вторую запись log_trace("downhill: delete-b %" PRIu64, b_serial); checkdata("downhill: delete-b", dbi, b_key->value, b_data->value); - rc = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_del(b)", rc); + err = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("downhill: bailout at delete-b due '%s'", + mdbx_strerror(err)); + txn_end(true); + break; + } + failure_perror("mdbx_del(b)", err); + } if (++txn_nops >= 
config.params.batch_write) { - txn_restart(false, false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err)); + break; + } txn_nops = 0; } report(1); } - if (txn_guard) - txn_end(false); + if (txn_guard) { + err = breakable_commit(); + if (unlikely(err != MDBX_SUCCESS)) + log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err)); + } if (dbi) { if (config.params.drop_table && !mode_readonly()) { txn_begin(false); db_table_drop(dbi); - txn_end(false); + err = breakable_commit(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("hill: bailout-clean due '%s'", mdbx_strerror(err)); + return true; + } } else db_table_close(dbi); } diff --git a/libs/libmdbx/src/test/keygen.cc b/libs/libmdbx/src/test/keygen.cc index 30cdf7a571..0110b049bf 100644 --- a/libs/libmdbx/src/test/keygen.cc +++ b/libs/libmdbx/src/test/keygen.cc @@ -184,7 +184,8 @@ bool maker::increment(serial_t &serial, int delta) const { } serial_t target = serial + (int64_t)delta; - if (target > mask(mapping.width)) { + if (target > mask(mapping.width) || + ((delta > 0) ? target < serial : target > serial)) { log_extra("keygen-increment: %" PRIu64 "%-d => %" PRIu64 ", overflow", serial, delta, target); return false; diff --git a/libs/libmdbx/src/test/long_stochastic.sh b/libs/libmdbx/src/test/long_stochastic.sh new file mode 100644 index 0000000000..954d18268a --- /dev/null +++ b/libs/libmdbx/src/test/long_stochastic.sh @@ -0,0 +1,138 @@ +#!/bin/bash +if ! which make cc c++ tee lz4 >/dev/null; then + echo "Please install the following prerequisites: make cc c++ tee lz4" >&2 + exit 1 +fi + +set -euo pipefail + +UNAME="$(uname -s 2>/dev/null || echo Unknown)" +case ${UNAME} in + Linux) + MAKE=make + if [[ ! 
-v TESTDB_DIR || -z "$TESTDB_DIR" ]]; then + TESTDB_DIR="/dev/shm/mdbx-test.$$" + fi + mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/* + if LC_ALL=C free | grep -q -i available; then + ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s [:blank:] ' ' | cut -d ' ' -f 7) / 1024)) + else + ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s [:blank:] ' ' | cut -d ' ' -f 4) / 1024)) + fi + ;; + FreeBSD) + MAKE=gmake + if [[ ! -v TESTDB_DIR || -z "$TESTDB_DIR" ]]; then + for old_test_dir in $(ls -d /tmp/mdbx-test.[0-9]*); do + umount $old_test_dir && rm -r $old_test_dir + done + TESTDB_DIR="/tmp/mdbx-test.$$" + rm -rf $TESTDB_DIR && mkdir -p $TESTDB_DIR && mount -t tmpfs tmpfs $TESTDB_DIR + else + mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/* + fi + ram_avail_mb=$(($(LC_ALL=C vmstat -s | grep -ie '[0-9] pages free$' | cut -d p -f 1) * ($(LC_ALL=C vmstat -s | grep -ie '[0-9] bytes per page$' | cut -d b -f 1) / 1024) / 1024)) + ;; + *) + echo "FIXME: ${UNAME} not supported by this script" + exit 2 + ;; +esac + +echo "=== ${ram_avail_mb}M RAM available" +ram_reserve4logs_mb=1234 +if [ $ram_avail_mb -lt $ram_reserve4logs_mb ]; then + echo "=== At least ${ram_reserve4logs_mb}Mb RAM required" + exit 3 +fi + +# +# В режимах отличных от MDBX_WRITEMAP изменения до записи в файл +# будут накапливаться в памяти, что может потребовать свободной +# памяти размером с БД. Кроме этого, в тест входит сценарий +# создания копия БД на ходу. Поэтому БД не может быть больше 1/3 +# от доступной памяти. Однако, следует учесть что malloc() будет +# не сразу возвращать выделенную память системе, а также +# предусмотреть места для логов. +# +# In non-MDBX_WRITEMAP modes, updates (dirty pages) will +# accumulate in memory before writing to the disk, which may +# require a free memory up to the size of a whole database. In +# addition, the test includes a script create a copy of the +# database on the go. Therefore, the database cannot be more 1/3 +# of available memory. 
Moreover, should be taken into account +# that malloc() will not return the allocated memory to the +# system immediately, as well some space is required for logs. +# +db_size_mb=$(((ram_avail_mb - ram_reserve4logs_mb) / 4)) +if [ $db_size_mb -gt 3072 ]; then + db_size_mb=3072 +fi +echo "=== use ${db_size_mb}M for DB" + +${MAKE} TESTDB=${TESTDB_DIR}/smoke.db TESTLOG=${TESTDB_DIR}/smoke.log check +rm -f ${TESTDB_DIR}/* + +############################################################################### + +function rep9 { printf "%*s" $1 '' | tr ' ' '9'; } +function join { local IFS="$1"; shift; echo "$*"; } +function bit2option { local -n arr=$1; (( ($2&(1<<$3)) != 0 )) && echo -n '+' || echo -n '-'; echo "${arr[$3]}"; } + +options=(writemap coalesce lifo) + +function bits2list { + local -n arr=$1 + local i + local list=() + for ((i=0; i<${#arr[@]}; ++i)) do + list[$i]=$(bit2option $1 $2 $i) + done + join , "${list[@]}" +} + +function probe { + echo "=============================================== $(date)" + echo "${caption}: $*" + rm -f ${TESTDB_DIR}/* \ + && ./mdbx_test --ignore-dbfull --repeat=42 --pathname=${TESTDB_DIR}/long.db "$@" | lz4 > ${TESTDB_DIR}/long.log.lz4 \ + && ./mdbx_chk -nvvv ${TESTDB_DIR}/long.db | tee ${TESTDB_DIR}/long-chk.log \ + && ([ ! 
-e ${TESTDB_DIR}/long.db-copy ] || ./mdbx_chk -nvvv ${TESTDB_DIR}/long.db-copy | tee ${TESTDB_DIR}/long-chk-copy.log) \ + || (echo "FAILED"; exit 1) +} + +############################################################################### + +count=0 +for nops in $(seq 2 6); do + for ((wbatch=nops-1; wbatch > 0; --wbatch)); do + loops=$(((111 >> nops) / nops + 3)) + for ((rep=0; rep++ < loops; )); do + for ((bits=2**${#options[@]}; --bits >= 0; )); do + seed=$(($(date +%s) + RANDOM)) + caption="Probe #$((++count)) int-key,w/o-dups, repeat ${rep} of ${loops}" probe \ + --pagesize=min --size-upper=${db_size_mb}M --table=+key.integer,-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} basic + caption="Probe #$((++count)) int-key,with-dups, repeat ${rep} of ${loops}" probe \ + --pagesize=min --size-upper=${db_size_mb}M --table=+key.integer,+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} basic + caption="Probe #$((++count)) int-key,int-data, repeat ${rep} of ${loops}" probe \ + --pagesize=min --size-upper=${db_size_mb}M --table=+key.integer,+data.integer --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} basic + caption="Probe #$((++count)) w/o-dups, repeat ${rep} of ${loops}" probe \ + --pagesize=min --size-upper=${db_size_mb}M --table=-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} basic + caption="Probe #$((++count)) with-dups, repeat ${rep} of ${loops}" probe \ + --pagesize=min 
--size-upper=${db_size_mb}M --table=+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \ + --keygen.seed=${seed} basic + done + done + done +done + +echo "=== ALL DONE ====================== $(date)" diff --git a/libs/libmdbx/src/test/loop.bat b/libs/libmdbx/src/test/loop.bat deleted file mode 100644 index 0e3b6271c9..0000000000 --- a/libs/libmdbx/src/test/loop.bat +++ /dev/null @@ -1,15 +0,0 @@ -@echo off - -del test.db test.db-lck - -:loop - -mdbx_test.exe --pathname=test.db --dont-cleanup-after basic > test.log -if errorlevel 1 goto fail - -mdbx_chk.exe -nvvv test.db > chk.log -if errorlevel 1 goto fail -goto loop - -:fail -echo FAILED diff --git a/libs/libmdbx/src/test/main.cc b/libs/libmdbx/src/test/main.cc index 9dc3eccbae..959359a515 100644 --- a/libs/libmdbx/src/test/main.cc +++ b/libs/libmdbx/src/test/main.cc @@ -27,6 +27,8 @@ void actor_params::set_defaults(const std::string &tmpdir) { loglevel = #ifdef NDEBUG logging::info; +#elif defined(_WIN32) || defined(_WIN64) + logging::verbose; #else logging::trace; #endif @@ -70,6 +72,7 @@ void actor_params::set_defaults(const std::string &tmpdir) { inject_writefaultn = 0; drop_table = false; + ignore_dbfull = false; max_readers = 42; max_tables = 42; @@ -179,17 +182,19 @@ int main(int argc, char *const argv[]) { params.datalen_max = datalen_max; continue; } - if (config::parse_option(argc, argv, narg, "size-lower", params.size_lower, - mdbx_limits_dbsize_min(params.pagesize), - mdbx_limits_dbsize_max(params.pagesize))) + if (config::parse_option_intptr(argc, argv, narg, "size-lower", + params.size_lower, + mdbx_limits_dbsize_min(params.pagesize), + mdbx_limits_dbsize_max(params.pagesize))) continue; - if (config::parse_option(argc, argv, narg, "size", params.size_now, - mdbx_limits_dbsize_min(params.pagesize), - mdbx_limits_dbsize_max(params.pagesize))) + if 
(config::parse_option_intptr(argc, argv, narg, "size-upper", + params.size_upper, + mdbx_limits_dbsize_min(params.pagesize), + mdbx_limits_dbsize_max(params.pagesize))) continue; - if (config::parse_option(argc, argv, narg, "size-upper", params.size_upper, - mdbx_limits_dbsize_min(params.pagesize), - mdbx_limits_dbsize_max(params.pagesize))) + if (config::parse_option_intptr(argc, argv, narg, "size", params.size_now, + mdbx_limits_dbsize_min(params.pagesize), + mdbx_limits_dbsize_max(params.pagesize))) continue; if (config::parse_option( argc, argv, narg, "shrink-threshold", params.shrink_threshold, 0, @@ -288,6 +293,9 @@ int main(int argc, char *const argv[]) { continue; if (config::parse_option(argc, argv, narg, "drop", params.drop_table)) continue; + if (config::parse_option(argc, argv, narg, "ignore-dbfull", + params.ignore_dbfull)) + continue; if (config::parse_option(argc, argv, narg, "dump-config", global::config::dump_config)) continue; diff --git a/libs/libmdbx/src/test/osal-unix.cc b/libs/libmdbx/src/test/osal-unix.cc index fd691e354f..0157bace23 100644 --- a/libs/libmdbx/src/test/osal-unix.cc +++ b/libs/libmdbx/src/test/osal-unix.cc @@ -21,6 +21,10 @@ #include <sys/wait.h> #include <unistd.h> +#ifdef __APPLE__ +#include "darwin/pthread_barrier.c" +#endif + struct shared_t { pthread_barrier_t barrier; pthread_mutex_t mutex; @@ -199,7 +203,9 @@ retry: if (WIFEXITED(status)) childs[pid] = (WEXITSTATUS(status) == EXIT_SUCCESS) ? 
as_successful : as_failed; - else if (WIFSIGNALED(status) || WCOREDUMP(status)) + else if (WCOREDUMP(status)) + childs[pid] = as_coredump; + else if (WIFSIGNALED(status)) childs[pid] = as_killed; else if (WIFSTOPPED(status)) childs[pid] = as_debuging; @@ -216,7 +222,7 @@ retry: if (ts.tv_sec == 0 && ts.tv_nsec == 0) ts.tv_nsec = 1; if (nanosleep(&ts, &ts) == 0) { - /* timeout and no signal fomr child */ + /* timeout and no signal from child */ pid = 0; return 0; } diff --git a/libs/libmdbx/src/test/osal-windows.cc b/libs/libmdbx/src/test/osal-windows.cc index 5858e89530..975d8268f5 100644 --- a/libs/libmdbx/src/test/osal-windows.cc +++ b/libs/libmdbx/src/test/osal-windows.cc @@ -312,14 +312,22 @@ actor_status osal_actor_info(const mdbx_pid_t pid) { case EXIT_SUCCESS: status = as_successful; break; - // case EXCEPTION_BREAKPOINT: + case EXCEPTION_BREAKPOINT: case EXCEPTION_SINGLE_STEP: status = as_debuging; break; case STATUS_CONTROL_C_EXIT: - case EXCEPTION_NONCONTINUABLE_EXCEPTION: status = as_killed; break; + case EXCEPTION_ACCESS_VIOLATION: + case EXCEPTION_ARRAY_BOUNDS_EXCEEDED: + case EXCEPTION_DATATYPE_MISALIGNMENT: + case EXCEPTION_STACK_OVERFLOW: + case EXCEPTION_INVALID_DISPOSITION: + case EXCEPTION_ILLEGAL_INSTRUCTION: + case EXCEPTION_NONCONTINUABLE_EXCEPTION: + status = as_coredump; + break; default: status = as_failed; break; diff --git a/libs/libmdbx/src/test/test.cc b/libs/libmdbx/src/test/test.cc index cf61f1eeee..e9e925e9a9 100644 --- a/libs/libmdbx/src/test/test.cc +++ b/libs/libmdbx/src/test/test.cc @@ -55,6 +55,8 @@ const char *status2str(actor_status status) { return "killed"; case as_failed: return "failed"; + case as_coredump: + return "coredump"; } } @@ -137,6 +139,8 @@ void testcase::db_open() { if (!db_guard) db_prepare(); + + jitter_delay(true); int rc = mdbx_env_open(db_guard.get(), config.params.pathname_db.c_str(), (unsigned)config.params.mode_flags, 0640); if (unlikely(rc != MDBX_SUCCESS)) @@ -170,20 +174,42 @@ void 
testcase::txn_begin(bool readonly, unsigned flags) { flags); } +int testcase::breakable_commit() { + int rc = MDBX_SUCCESS; + log_trace(">> txn_commit"); + assert(txn_guard); + + MDBX_txn *txn = txn_guard.release(); + txn_inject_writefault(txn); + int err = mdbx_txn_commit(txn); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + rc = err; + err = mdbx_txn_abort(txn); + if (unlikely(err != MDBX_SUCCESS && err != MDBX_THREAD_MISMATCH)) + failure_perror("mdbx_txn_abort()", err); + } else + failure_perror("mdbx_txn_commit()", err); + } + + log_trace("<< txn_commit: %s", rc ? "failed" : "Ok"); + return rc; +} + void testcase::txn_end(bool abort) { log_trace(">> txn_end(%s)", abort ? "abort" : "commit"); assert(txn_guard); MDBX_txn *txn = txn_guard.release(); if (abort) { - int rc = mdbx_txn_abort(txn); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_txn_abort()", rc); + int err = mdbx_txn_abort(txn); + if (unlikely(err != MDBX_SUCCESS && err != MDBX_THREAD_MISMATCH)) + failure_perror("mdbx_txn_abort()", err); } else { txn_inject_writefault(txn); - int rc = mdbx_txn_commit(txn); - if (unlikely(rc != MDBX_SUCCESS)) - failure_perror("mdbx_txn_commit()", rc); + int err = mdbx_txn_commit(txn); + if (unlikely(err != MDBX_SUCCESS)) + failure_perror("mdbx_txn_commit()", err); } log_trace("<< txn_end(%s)", abort ? 
"abort" : "commit"); @@ -211,6 +237,16 @@ void testcase::cursor_close() { log_trace("<< cursor_close()"); } +int testcase::breakable_restart() { + int rc = MDBX_SUCCESS; + if (txn_guard) + rc = breakable_commit(); + if (cursor_guard) + cursor_close(); + txn_begin(false, 0); + return rc; +} + void testcase::txn_restart(bool abort, bool readonly, unsigned flags) { if (txn_guard) txn_end(abort); @@ -394,6 +430,28 @@ void testcase::update_canary(uint64_t increment) { log_trace("<< update_canary: sequence = %" PRIu64, canary_now.y); } +int testcase::db_open__begin__table_create_open_clean(MDBX_dbi &dbi) { + db_open(); + + int err, retry_left = 42; + for (;;) { + txn_begin(false); + dbi = db_table_open(true); + db_table_clear(dbi); + err = breakable_commit(); + if (likely(err == MDBX_SUCCESS)) { + txn_begin(false); + return MDBX_SUCCESS; + } + if (--retry_left == 0) + break; + jitter_delay(true); + } + log_notice("db_begin_table_create_open_clean: bailout due '%s'", + mdbx_strerror(err)); + return err; +} + MDBX_dbi testcase::db_table_open(bool create) { log_trace(">> testcase::db_table_create"); @@ -513,22 +571,27 @@ bool test_execute(const actor_config &config_const) { if (!test->setup()) { log_notice("test setup failed"); return false; - } else if (!test->run()) { + } + if (!test->run()) { log_notice("test failed"); return false; - } else if (!test->teardown()) { + } + if (!test->teardown()) { log_notice("test teardown failed"); return false; - } else { - if (config.params.nrepeat == 1) - log_info("test successed"); - else if (config.params.nrepeat == 1) + } + + if (config.params.nrepeat == 1) + log_info("test successed"); + else { + if (config.params.nrepeat) log_info("test successed (iteration %zi of %zi)", iter, size_t(config.params.nrepeat)); else log_info("test successed (iteration %zi)", iter); config.params.keygen.seed += INT32_C(0xA4F4D37B); } + } while (config.params.nrepeat == 0 || iter < config.params.nrepeat); return true; } catch (const std::exception 
&pipets) { diff --git a/libs/libmdbx/src/test/test.h b/libs/libmdbx/src/test/test.h index 117a66f93c..fb5ad4ee83 100644 --- a/libs/libmdbx/src/test/test.h +++ b/libs/libmdbx/src/test/test.h @@ -105,7 +105,9 @@ protected: void db_open(); void db_close(); void txn_begin(bool readonly, unsigned flags = 0); + int breakable_commit(); void txn_end(bool abort); + int breakable_restart(); void txn_restart(bool abort, bool readonly, unsigned flags = 0); void cursor_open(unsigned dbi); void cursor_close(); @@ -121,6 +123,7 @@ protected: void db_table_drop(MDBX_dbi handle); void db_table_clear(MDBX_dbi handle); void db_table_close(MDBX_dbi handle); + int db_open__begin__table_create_open_clean(MDBX_dbi &dbi); bool wait4start(); void report(size_t nops_done); diff --git a/libs/libmdbx/src/test/ttl.cc b/libs/libmdbx/src/test/ttl.cc index 1ecfd0c7e5..b2650f0c13 100644 --- a/libs/libmdbx/src/test/ttl.cc +++ b/libs/libmdbx/src/test/ttl.cc @@ -29,12 +29,12 @@ static unsigned edge2count(uint64_t edge, unsigned count_max) { } bool testcase_ttl::run() { - db_open(); - - txn_begin(false); - MDBX_dbi dbi = db_table_open(true); - db_table_clear(dbi); - txn_end(false); + MDBX_dbi dbi; + int err = db_open__begin__table_create_open_clean(dbi); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("ttl: bailout-prepare due '%s'", mdbx_strerror(err)); + return true; + } /* LY: тест "эмуляцией time-to-live": * - организуется "скользящее окно", которое двигается вперед вдоль @@ -53,11 +53,26 @@ bool testcase_ttl::run() { */ /* LY: для параметризации используем подходящие параметры, которые не имеют - * здесь смысла в первоначальном значении */ - const unsigned window_max = - (config.params.batch_read > 999) ? config.params.batch_read : 1000; - const unsigned count_max = - (config.params.batch_write > 999) ? config.params.batch_write : 1000; + * здесь смысла в первоначальном значении. 
*/ + const unsigned window_max_lower = +#ifdef __APPLE__ + 333; +#else + 999; +#endif + const unsigned count_max_lower = +#ifdef __APPLE__ + 333; +#else + 999; +#endif + + const unsigned window_max = (config.params.batch_read > window_max_lower) + ? config.params.batch_read + : window_max_lower; + const unsigned count_max = (config.params.batch_write > count_max_lower) + ? config.params.batch_write + : count_max_lower; log_info("ttl: using `batch_read` value %u for window_max", window_max); log_info("ttl: using `batch_write` value %u for count_max", count_max); @@ -73,15 +88,13 @@ bool testcase_ttl::run() { std::deque<std::pair<uint64_t, unsigned>> fifo; uint64_t serial = 0; while (should_continue()) { - if (!txn_guard) - txn_begin(false); const uint64_t salt = prng64_white(seed) /* mdbx_txn_id(txn_guard.get()) */; const unsigned window_width = edge2window(salt, window_max); - const unsigned head_count = edge2count(salt, count_max); - log_info("ttl: step #%zu (serial %" PRIu64 - ", window %u, count %u) salt %" PRIu64, - nops_completed, serial, window_width, head_count, salt); + unsigned head_count = edge2count(salt, count_max); + log_verbose("ttl: step #%zu (serial %" PRIu64 + ", window %u, count %u) salt %" PRIu64, + nops_completed, serial, window_width, head_count, salt); if (window_width) { while (fifo.size() > window_width) { @@ -93,9 +106,14 @@ bool testcase_ttl::run() { for (unsigned n = 0; n < tail_count; ++n) { log_trace("ttl: remove-tail %" PRIu64, serial); generate_pair(tail_serial); - int err = mdbx_del(txn_guard.get(), dbi, &key->value, &data->value); - if (unlikely(err != MDBX_SUCCESS)) + err = mdbx_del(txn_guard.get(), dbi, &key->value, &data->value); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("ttl: tail-bailout due '%s'", mdbx_strerror(err)); + goto bailout; + } failure_perror("mdbx_del(tail)", err); + } if (unlikely(!keyvalue_maker.increment(tail_serial, 1))) failure("ttl: 
unexpected key-space overflow on the tail"); } @@ -106,30 +124,54 @@ bool testcase_ttl::run() { fifo.clear(); } - txn_restart(false, false); + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("ttl: bailout at commit due '%s'", mdbx_strerror(err)); + break; + } fifo.push_front(std::make_pair(serial, head_count)); - + retry: for (unsigned n = 0; n < head_count; ++n) { log_trace("ttl: insert-head %" PRIu64, serial); generate_pair(serial); - int err = mdbx_put(txn_guard.get(), dbi, &key->value, &data->value, - insert_flags); - if (unlikely(err != MDBX_SUCCESS)) + err = mdbx_put(txn_guard.get(), dbi, &key->value, &data->value, + insert_flags); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { + log_notice("ttl: head-insert skip due '%s'", mdbx_strerror(err)); + txn_restart(true, false); + serial = fifo.front().first; + fifo.front().second = head_count = n; + goto retry; + } failure_perror("mdbx_put(head)", err); + } - if (unlikely(!keyvalue_maker.increment(serial, 1))) - failure("uphill: unexpected key-space overflow"); + if (unlikely(!keyvalue_maker.increment(serial, 1))) { + log_notice("ttl: unexpected key-space overflow"); + goto bailout; + } + } + err = breakable_restart(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("ttl: head-commit skip due '%s'", mdbx_strerror(err)); + serial = fifo.front().first; + fifo.pop_front(); } - - txn_end(false); report(1); } +bailout: + txn_end(true); if (dbi) { if (config.params.drop_table && !mode_readonly()) { txn_begin(false); db_table_drop(dbi); - txn_end(false); + err = breakable_commit(); + if (unlikely(err != MDBX_SUCCESS)) { + log_notice("ttl: bailout-clean due '%s'", mdbx_strerror(err)); + return true; + } } else db_table_close(dbi); } diff --git a/libs/libmdbx/src/test/utils.cc b/libs/libmdbx/src/test/utils.cc index ddf47a4cd9..d9b3538b99 100644 --- a/libs/libmdbx/src/test/utils.cc +++ b/libs/libmdbx/src/test/utils.cc @@ -17,6 +17,9 @@ 
#if defined(HAVE_IEEE754_H) || __has_include(<ieee754.h>) #include <ieee754.h> #endif +#if defined(__APPLE__) || defined(__MACH__) +#include <mach/mach_time.h> +#endif /* defined(__APPLE__) || defined(__MACH__) */ std::string format(const char *fmt, ...) { va_list ap, ones; @@ -353,7 +356,7 @@ void jitter_delay(bool extra) { cpu_relax(); if (dice > 2) { unsigned us = entropy_white() & - (extra ? 0xfffff /* 1.05 s */ : 0x3ff /* 1 ms */); + (extra ? 0xffff /* 656 ms */ : 0x3ff /* 1 ms */); log_trace("== jitter.delay: %0.6f", us / 1000000.0); osal_udelay(us); } diff --git a/libs/libmdbx/src/test/utils.h b/libs/libmdbx/src/test/utils.h index efda8394fc..d1b859acd4 100644 --- a/libs/libmdbx/src/test/utils.h +++ b/libs/libmdbx/src/test/utils.h @@ -26,9 +26,14 @@ #endif #if __GNUC_PREREQ(4, 4) || defined(__clang__) +#ifndef bswap64 #define bswap64(v) __builtin_bswap64(v) +#endif +#ifndef bswap32 #define bswap32(v) __builtin_bswap32(v) -#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) +#endif +#if (__GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)) && \ + !defined(bswap16) #define bswap16(v) __builtin_bswap16(v) #endif @@ -184,52 +189,9 @@ static __inline uint64_t rot64(uint64_t v, unsigned s) { } #endif /* rot64 */ -#ifndef mul_32x32_64 -static __inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { - return a * (uint64_t)b; -} -#endif /* mul_32x32_64 */ - -#ifndef mul_64x64_128 - -static __inline unsigned add_with_carry(uint64_t *sum, uint64_t addend) { - *sum += addend; - return (*sum < addend) ? 
1u : 0u; -} - -static __inline uint64_t mul_64x64_128(uint64_t a, uint64_t b, uint64_t *h) { -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - __uint128_t r = (__uint128_t)a * (__uint128_t)b; - /* modern GCC could nicely optimize this */ - *h = r >> 64; - return r; -#elif defined(mul_64x64_high) - *h = mul_64x64_high(a, b); - return a * b; -#else - /* performs 64x64 to 128 bit multiplication */ - uint64_t ll = mul_32x32_64((uint32_t)a, (uint32_t)b); - uint64_t lh = mul_32x32_64(a >> 32, (uint32_t)b); - uint64_t hl = mul_32x32_64((uint32_t)a, b >> 32); - *h = mul_32x32_64(a >> 32, b >> 32) + (lh >> 32) + (hl >> 32) + - add_with_carry(&ll, lh << 32) + add_with_carry(&ll, hl << 32); - return ll; -#endif -} - -#endif /* mul_64x64_128() */ - -#ifndef mul_64x64_high -static __inline uint64_t mul_64x64_high(uint64_t a, uint64_t b) { - uint64_t h; - mul_64x64_128(a, b, &h); - return h; -} -#endif /* mul_64x64_high */ - static __inline bool is_power2(size_t x) { return (x & (x - 1)) == 0; } +#undef roundup2 static __inline size_t roundup2(size_t value, size_t granularity) { assert(is_power2(granularity)); return (value + granularity - 1) & ~(granularity - 1); @@ -285,18 +247,20 @@ struct simple_checksum { simple_checksum() : value(0) {} - void push(uint32_t data) { + void push(const uint32_t &data) { value += data * UINT64_C(9386433910765580089) + 1; value ^= value >> 41; value *= UINT64_C(0xBD9CACC22C6E9571); } - void push(uint64_t data) { + void push(const uint64_t &data) { push((uint32_t)data); push((uint32_t)(data >> 32)); } - void push(bool data) { push(data ? UINT32_C(0x780E) : UINT32_C(0xFA18E)); } + void push(const bool data) { + push(data ? 
UINT32_C(0x780E) : UINT32_C(0xFA18E)); + } void push(const void *ptr, size_t bytes) { const uint8_t *data = (const uint8_t *)ptr; @@ -309,7 +273,7 @@ struct simple_checksum { void push(const std::string &str) { push(str.data(), str.size()); } void push(unsigned salt, const MDBX_val &val) { - push(val.iov_len); + push(unsigned(val.iov_len)); push(salt); push(val.iov_base, val.iov_len); } |