summaryrefslogtreecommitdiff
path: root/libs/libmdbx
diff options
context:
space:
mode:
Diffstat (limited to 'libs/libmdbx')
-rw-r--r--libs/libmdbx/src/.travis.yml1
-rw-r--r--libs/libmdbx/src/Makefile75
-rw-r--r--libs/libmdbx/src/README-RU.md257
-rw-r--r--libs/libmdbx/src/README.md539
-rw-r--r--libs/libmdbx/src/appveyor.yml2
-rw-r--r--libs/libmdbx/src/dll.vcxproj4
-rw-r--r--libs/libmdbx/src/libmdbx.cflags1
-rw-r--r--libs/libmdbx/src/libmdbx.cxxflags1
-rw-r--r--libs/libmdbx/src/libmdbx.files25
-rw-r--r--libs/libmdbx/src/libmdbx.includes2
-rw-r--r--libs/libmdbx/src/mdbx.h57
-rw-r--r--libs/libmdbx/src/packages/rpm/CMakeLists.txt6
-rw-r--r--libs/libmdbx/src/src/bits.h319
-rw-r--r--libs/libmdbx/src/src/defs.h84
-rw-r--r--libs/libmdbx/src/src/lck-linux.c428
-rw-r--r--libs/libmdbx/src/src/lck-posix.c293
-rw-r--r--libs/libmdbx/src/src/lck-windows.c35
-rw-r--r--libs/libmdbx/src/src/mdbx.c868
-rw-r--r--libs/libmdbx/src/src/osal.c311
-rw-r--r--libs/libmdbx/src/src/osal.h211
-rw-r--r--libs/libmdbx/src/src/tools/mdbx_chk.c218
-rw-r--r--libs/libmdbx/src/src/tools/mdbx_copy.c2
-rw-r--r--libs/libmdbx/src/src/tools/mdbx_dump.c3
-rw-r--r--libs/libmdbx/src/src/tools/mdbx_load.c10
-rw-r--r--libs/libmdbx/src/src/tools/mdbx_stat.c2
-rw-r--r--libs/libmdbx/src/test/append.cc62
-rw-r--r--libs/libmdbx/src/test/base.h4
-rw-r--r--libs/libmdbx/src/test/cases.cc3
-rw-r--r--libs/libmdbx/src/test/config.cc4
-rw-r--r--libs/libmdbx/src/test/config.h20
-rw-r--r--libs/libmdbx/src/test/darwin/LICENSE24
-rw-r--r--libs/libmdbx/src/test/darwin/README.md8
-rw-r--r--libs/libmdbx/src/test/darwin/pthread_barrier.c110
-rw-r--r--libs/libmdbx/src/test/darwin/pthread_barrier.h83
-rw-r--r--libs/libmdbx/src/test/gc.sh66
-rw-r--r--libs/libmdbx/src/test/hill.cc205
-rw-r--r--libs/libmdbx/src/test/keygen.cc3
-rw-r--r--libs/libmdbx/src/test/long_stochastic.sh138
-rw-r--r--libs/libmdbx/src/test/loop.bat15
-rw-r--r--libs/libmdbx/src/test/main.cc26
-rw-r--r--libs/libmdbx/src/test/osal-unix.cc10
-rw-r--r--libs/libmdbx/src/test/osal-windows.cc12
-rw-r--r--libs/libmdbx/src/test/test.cc87
-rw-r--r--libs/libmdbx/src/test/test.h3
-rw-r--r--libs/libmdbx/src/test/ttl.cc100
-rw-r--r--libs/libmdbx/src/test/utils.cc5
-rw-r--r--libs/libmdbx/src/test/utils.h62
47 files changed, 3420 insertions, 1384 deletions
diff --git a/libs/libmdbx/src/.travis.yml b/libs/libmdbx/src/.travis.yml
index 9359084060..e46991f597 100644
--- a/libs/libmdbx/src/.travis.yml
+++ b/libs/libmdbx/src/.travis.yml
@@ -7,6 +7,7 @@ compiler:
os:
- linux
+- osx
script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all check; fi
diff --git a/libs/libmdbx/src/Makefile b/libs/libmdbx/src/Makefile
index 145b7fadcc..2d261bc7c3 100644
--- a/libs/libmdbx/src/Makefile
+++ b/libs/libmdbx/src/Makefile
@@ -1,4 +1,4 @@
-# GNU Makefile for libmdbx, https://github.com/leo-yuriev/libmdbx
+# GNU Makefile for libmdbx, https://abf.io/erthink/libmdbx
########################################################################
# Configuration. The compiler options must enable threaded compilation.
@@ -23,17 +23,18 @@ suffix ?=
CC ?= gcc
CXX ?= g++
+LD ?= ld
CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden
-XCFLAGS ?= -DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1
+XCFLAGS ?= -DNDEBUG=1 -DLIBMDBX_EXPORTS=1
CFLAGS += -D_GNU_SOURCE=1 -std=gnu11 -pthread $(XCFLAGS)
CXXFLAGS = -std=c++11 $(filter-out -std=gnu11,$(CFLAGS))
TESTDB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.db
TESTLOG ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.log
# LY: '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old
-LDFLAGS ?= -Wl,--gc-sections,-z,relro,-O,--no-as-needed,-lrt
-EXE_LDFLAGS ?= -pthread -lrt
+LDFLAGS ?= $(shell $(LD) --help 2>/dev/null | grep -q -- --gc-sections && echo '-Wl,--gc-sections,-z,relro,-O1')$(shell $(LD) --help 2>/dev/null | grep -q -- -dead_strip && echo '-Wl,-dead_strip')
+EXE_LDFLAGS ?= -pthread
# LY: just for benchmarking
IOARENA ?= $(shell \
@@ -44,16 +45,55 @@ NN ?= 25000000
########################################################################
+ifdef MSVC
+ UNAME := Windows
+ LCK_IMPL := windows
+ TEST_OSAL := windows
+ TEST_ITER := 42
+else
+ UNAME := $(shell uname -s 2>/dev/null || echo Unknown)
+ define uname2lck
+ case "$(UNAME)" in
+ Linux) echo linux;;
+ CYGWIN*|MINGW*|MSYS*|Windows*) echo windows;;
+ *) echo posix;;
+ esac
+ endef
+ define uname2osal
+ case "$(UNAME)" in
+ CYGWIN*|MINGW*|MSYS*|Windows*) echo windows;;
+ *) echo unix;;
+ esac
+ endef
+ define uname2titer
+ case "$(UNAME)" in
+ Darwin*|Mach*) echo 3;;
+ *) echo 42;;
+ esac
+ endef
+ define uname2suffix
+ case "$(UNAME)" in
+ Darwin*|Mach*) echo dylib;;
+ CYGWIN*|MINGW*|MSYS*|Windows*) echo dll;;
+ *) echo so;;
+ esac
+ endef
+ LCK_IMPL := $(shell $(uname2lck))
+ TEST_OSAL := $(shell $(uname2osal))
+ TEST_ITER := $(shell $(uname2titer))
+ SO_SUFFIX := $(shell $(uname2suffix))
+endif
+
HEADERS := mdbx.h
-LIBRARIES := libmdbx.a libmdbx.so
+LIBRARIES := libmdbx.a libmdbx.$(SO_SUFFIX)
TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk
MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1
SHELL := /bin/bash
-CORE_SRC := $(filter-out src/lck-windows.c, $(wildcard src/*.c))
+CORE_SRC := src/lck-$(LCK_IMPL).c $(filter-out $(wildcard src/lck-*.c), $(wildcard src/*.c))
CORE_INC := $(wildcard src/*.h)
CORE_OBJ := $(patsubst %.c,%.o,$(CORE_SRC))
-TEST_SRC := $(filter-out test/osal-windows.cc, $(wildcard test/*.cc))
+TEST_SRC := test/osal-$(TEST_OSAL).cc $(filter-out $(wildcard test/osal-*.cc), $(wildcard test/*.cc))
TEST_INC := $(wildcard test/*.h)
TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC))
@@ -61,10 +101,10 @@ TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC))
all: $(LIBRARIES) $(TOOLS) mdbx_test example
-mdbx: libmdbx.a libmdbx.so
+mdbx: libmdbx.a libmdbx.$(SO_SUFFIX)
-example: mdbx.h tutorial/sample-mdbx.c libmdbx.so
- $(CC) $(CFLAGS) -I. tutorial/sample-mdbx.c ./libmdbx.so -o example
+example: mdbx.h tutorial/sample-mdbx.c libmdbx.$(SO_SUFFIX)
+ $(CC) $(CFLAGS) -I. tutorial/sample-mdbx.c ./libmdbx.$(SO_SUFFIX) -o example
tools: $(TOOLS)
@@ -82,16 +122,19 @@ clean:
rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err src/*.o test/*.o
check: all
- rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --repeat=42 --pathname=$(TESTDB) --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \
+ rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --repeat=$(TEST_ITER) --pathname=$(TESTDB) --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \
&& ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy
check-singleprocess: all
- rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --repeat=42 --pathname=$(TESTDB) --dont-cleanup-after --hill --copy | tee -a $(TESTLOG) | tail -n 42) \
+ rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; \
+ ./mdbx_test --repeat=4 --pathname=$(TESTDB) --dont-cleanup-after --hill && \
+ ./mdbx_test --repeat=2 --pathname=$(TESTDB) --dont-cleanup-before --dont-cleanup-after --copy \
+ | tee -a $(TESTLOG) | tail -n 42) \
&& ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy
check-fault: all
rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --pathname=$(TESTDB) --inject-writefault=42 --dump-config --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \
- && ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy
+ ; ./mdbx_chk -vvnw $(TESTDB) && ([ ! -e $(TESTDB)-copy ] || ./mdbx_chk -vvn $(TESTDB)-copy)
define core-rule
$(patsubst %.c,%.o,$(1)): $(1) $(CORE_INC) mdbx.h Makefile
@@ -110,13 +153,13 @@ $(foreach file,$(TEST_SRC),$(eval $(call test-rule,$(file))))
libmdbx.a: $(CORE_OBJ)
$(AR) rs $@ $?
-libmdbx.so: $(CORE_OBJ)
+libmdbx.$(SO_SUFFIX): $(CORE_OBJ)
$(CC) $(CFLAGS) -save-temps $^ -pthread -shared $(LDFLAGS) -o $@
mdbx_%: src/tools/mdbx_%.c libmdbx.a
$(CC) $(CFLAGS) $^ $(EXE_LDFLAGS) -o $@
-mdbx_test: $(TEST_OBJ) libmdbx.so
+mdbx_test: $(TEST_OBJ) libmdbx.$(SO_SUFFIX)
$(CXX) $(CXXFLAGS) $(TEST_OBJ) -Wl,-rpath . -L . -l mdbx $(EXE_LDFLAGS) -o $@
###############################################################################
@@ -142,7 +185,7 @@ bench-$(1)_$(2).txt: $(3) $(IOARENA) Makefile
endef
-$(eval $(call bench-rule,mdbx,$(NN),libmdbx.so))
+$(eval $(call bench-rule,mdbx,$(NN),libmdbx.$(SO_SUFFIX)))
$(eval $(call bench-rule,sophia,$(NN)))
$(eval $(call bench-rule,leveldb,$(NN)))
diff --git a/libs/libmdbx/src/README-RU.md b/libs/libmdbx/src/README-RU.md
index 97888cbf88..e040b5c4bb 100644
--- a/libs/libmdbx/src/README-RU.md
+++ b/libs/libmdbx/src/README-RU.md
@@ -1,56 +1,55 @@
+### The [repository now only mirrored on the Github](https://abf.io/erthink/libmdbx) due to illegal discriminatory restrictions for Russian Crimea and for sovereign crimeans.
+<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences -->
+-----
+
libmdbx
======================================
-**The revised and extended descendant of [Symas LMDB](https://symas.com/lmdb/).**
-
-*The Future will Positive. Всё будет хорошо.*
-[![Build Status](https://travis-ci.org/leo-yuriev/libmdbx.svg?branch=master)](https://travis-ci.org/leo-yuriev/libmdbx)
-[![Build status](https://ci.appveyor.com/api/projects/status/ue94mlopn50dqiqg/branch/master?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master)
-[![Coverity Scan Status](https://scan.coverity.com/projects/12915/badge.svg)](https://scan.coverity.com/projects/reopen-libmdbx)
+Доработанный и расширенный потомок [Lightning Memory-Mapped Database](https://ru.bmstu.wiki/LMDB_(Lightning_Memory-Mapped_Database)) (aka _LMDB_).
+English version is [here](README.md).
-English version [by Google](https://translate.googleusercontent.com/translate_c?act=url&ie=UTF8&sl=ru&tl=en&u=https://github.com/leo-yuriev/libmdbx/tree/master)
-and [by Yandex](https://translate.yandex.ru/translate?url=https%3A%2F%2Fgithub.com%2FReOpen%2Flibmdbx%2Ftree%2Fmaster&lang=ru-en).
+_libmdbx_ превосходит LMDB по возможностям и надежности, не уступая в
+производительности. _libmdbx_ работает на Linux, FreeBSD, MacOS X и
+других ОС соответствующих POSIX.1-2008, а также поддерживает Windows в
+качестве дополнительной платформы.
-### Project Status
+Отдельно ведётся не-публичная разработка следующей версии, в которой
+будет кардинальное изменение как API, так и формата базы данных. Цель
+этой революции - обеспечение более четкого и надежного API, добавление
+новых функций, а также наделение базы данных новыми свойствами.
-**Сейчас MDBX _активно перерабатывается_** предстоит
-большое изменение как API, так и формата базы данных. К сожалению,
-обновление приведет к потере совместимости с предыдущими версиями.
+*Всё будет хорошо. The Future will (be) [Positive](https://www.ptsecurity.ru).*
-Цель этой революции - обеспечение более четкого надежного API и
-добавление новых функции, а также наделение базы данных новыми
-свойствами.
-
-В настоящее время MDBX предназначена для Linux, а также поддерживает
-Windows (начиная с Windows Server 2008) в качестве дополнительной
-платформы. Поддержка других ОС может быть обеспечена на коммерческой
-основе. Однако такие усовершенствования (т. е. pull-requests) могут быть
-приняты в мейнстрим только в том случае, если будет доступен
-соответствующий публичный и бесплатный сервис непрерывной интеграции
-(aka Continuous Integration).
+[![Build Status](https://travis-ci.org/leo-yuriev/libmdbx.svg?branch=master)](https://travis-ci.org/leo-yuriev/libmdbx)
+[![Build status](https://ci.appveyor.com/api/projects/status/ue94mlopn50dqiqg/branch/master?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master)
+[![Coverity Scan Status](https://scan.coverity.com/projects/12915/badge.svg)](https://scan.coverity.com/projects/reopen-libmdbx)
## Содержание
- [Обзор](#Обзор)
- - [Сравнение с другими СУБД](#Сравнение-с-другими-СУБД)
- - [История & Acknowledgments](#История)
-- [Основные свойства](#Основные-свойства)
-- [Доработки и усовершенствования относительно LMDB](#Доработки-и-усовершенствования-относительно-lmdb)
-- [Недостатки и Компромиссы](#Недостатки-и-Компромиссы)
- - [Проблема долгих чтений](#Проблема-долгих-чтений)
- - [Сохранность данных в режиме асинхронной фиксации](#Сохранность-данных-в-режиме-асинхронной-фиксации)
+ - [Сравнение с другими базами данных](#Сравнение-с-другими-базами-данных)
+ - [История & Выражение признательности](#История)
+- [Описание](#Описание)
+ - [Ключевые свойства](#Ключевые-свойства)
+ - [Доработки и усовершенствования относительно LMDB](#Доработки-и-усовершенствования-относительно-lmdb)
+ - [Недостатки и Компромиссы](#Недостатки-и-Компромиссы)
+ - [Проблема долгих чтений](#Проблема-долгих-чтений)
+ - [Сохранность данных в режиме асинхронной фиксации](#Сохранность-данных-в-режиме-асинхронной-фиксации)
+- [Использование](#Использование)
+ - [Сборка](#Сборка)
+ - [Привязки к другим языкам](#Привязки-к-другим-языкам)
- [Сравнение производительности](#Сравнение-производительности)
- - [Интегральная производительность](#Интегральная-производительность)
- - [Масштабируемость чтения](#Масштабируемость-чтения)
- - [Синхронная фиксация](#Синхронная-фиксация)
- - [Отложенная фиксация](#Отложенная-фиксация)
- - [Асинхронная фиксация](#Асинхронная-фиксация)
- - [Потребление ресурсов](#Потребление-ресурсов)
+ - [Интегральная производительность](#Интегральная-производительность)
+ - [Масштабируемость чтения](#Масштабируемость-чтения)
+ - [Синхронная фиксация](#Синхронная-фиксация)
+ - [Отложенная фиксация](#Отложенная-фиксация)
+ - [Асинхронная фиксация](#Асинхронная-фиксация)
+ - [Потребление ресурсов](#Потребление-ресурсов)
+-----
## Обзор
_libmdbx_ - это встраиваемый key-value движок хранения со специфическим
набором свойств и возможностей, ориентированный на создание уникальных
-легковесных решений с предельной производительностью под Linux и
-Windows.
+легковесных решений с предельной производительностью.
_libmdbx_ позволяет множеству процессов совместно читать и обновлять
несколько key-value таблиц с соблюдением
@@ -79,11 +78,12 @@ _libmdbx_ не использует
сценарии использования.
-### Сравнение с другими СУБД
-Ввиду того, что в _libmdbx_ сейчас происходит революция, я посчитал
-лучшим решением ограничится здесь ссылкой на [главу Comparison with
-other databases](https://github.com/coreos/bbolt#comparison-with-other-databases)
-в описании _BoltDB_.
+### Сравнение с другими базами данных
+
+На данный момент, пожалуйста, обратитесь к [главе "сравнение BoltDB с
+другими базами
+данных"](https://github.com/coreos/bbolt#comparison-with-other-databases),
+которая также (в основном) применима к MDBX.
### История
@@ -104,16 +104,20 @@ Tables](https://github.com/leo-yuriev/libfpta), aka ["Позитивные
Technologies](https://www.ptsecurity.ru).
-#### Acknowledgments
-Howard Chu (Symas Corporation) - the author of LMDB, from which
-originated the MDBX in 2015.
+### Выражение признательности
+
+Говард Чу (Howard Chu) <hyc@openldap.org> является автором движка LMDB, от
+которого в 2015 году произошел MDBX.
+
+Мартин Хеденфальк (Martin Hedenfalk) <martin@bzero.se> является автором кода
+`btree.c`, который использовался для начала разработки LMDB.
-Martin Hedenfalk <martin@bzero.se> - the author of `btree.c` code, which
-was used for begin development of LMDB.
+-----
+Описание
+========
-Основные свойства
-=================
+## Ключевые свойства
_libmdbx_ наследует все ключевые возможности и особенности своего
прародителя
@@ -168,8 +172,7 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N).
кэшированием. Всё необходимое штатно выполняет ядро ОС.
-Доработки и усовершенствования относительно LMDB
-================================================
+## Доработки и усовершенствования относительно LMDB
1. Автоматическое динамическое управление размером БД согласно
параметрам задаваемым функцией `mdbx_env_set_geometry()`, включая шаг
@@ -328,7 +331,25 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N).
> - попытки повторного освобождения памяти;
> - повреждение памяти и ошибки сегментации.
---------------------------------------------------------------------------------
+32. На **MacOS X** для синхронизации данных с диском _по-умолчанию_
+используется системная функция `fcntl(F_FULLFSYNC)`, так как [только
+этим гарантируется сохранность
+данных](https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fsync.2.html)
+при сбое электропитания. К сожалению, в сценариях с высокой
+интенсивностью пишущих транзакций, использование `F_FULLFSYNC` приводит
+к существенной деградации производительности в сравнении с LMDB, где
+используется системная функция `fsync()`. Поэтому _libmdbx_ позволяет
+переопределить это поведение определением опции
+`MDBX_OSX_SPEED_INSTEADOF_DURABILITY=1` при сборке библиотеки.
+
+33. На **Windows** _libmdbx_ использует файловые блокировки
+`LockFileEx()`, так как это позволяет размещать БД на сетевых дисках, а
+также обеспечивает защиту от некомпетентных действий пользователя
+([защиту от
+дурака](https://ru.wikipedia.org/wiki/%D0%97%D0%B0%D1%89%D0%B8%D1%82%D0%B0_%D0%BE%D1%82_%D0%B4%D1%83%D1%80%D0%B0%D0%BA%D0%B0)).
+Поэтому _libmdbx_ может немного отставать в тестах производительности от
+LMDB, где используются именованные мьютексы.
+
## Недостатки и Компромиссы
@@ -386,7 +407,7 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N).
> Подробности ниже.
-#### Проблема долгих чтений
+### Проблема долгих чтений
*Следует отметить*, что проблема "сборки мусора" так или иначе
существует во всех СУБД (Vacuum в PostgreSQL). Однако в случае _libmdbx_
и LMDB она проявляется более остро, прежде всего из-за высокой
@@ -452,7 +473,7 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N).
за счет эффективной работы [BBWC](https://en.wikipedia.org/wiki/BBWC)
при включении `LIFO RECLAIM` в _libmdbx_.
-#### Сохранность данных в режиме асинхронной фиксации
+### Сохранность данных в режиме асинхронной фиксации
При работе в режиме `WRITEMAP+MAPSYNC` запись измененных страниц
выполняется ядром ОС, что имеет ряд преимуществ. Так например, при крахе
приложения, ядро ОС сохранит все изменения.
@@ -515,6 +536,119 @@ Amplification Factor) и RAF (Read Amplification Factor) также Olog(N).
--------------------------------------------------------------------------------
+Использование
+=============
+
+## Сборка
+
+Для сборки на всех платформах кроме Windows вам потребуются не-дремучие
+версии: GNU Make, [bash](https://ru.wikipedia.org/wiki/Bash), компиляторы C и C++ совместимые с GCC или CLANG.
+
+Исторически сборка _libmdbx_ основывается на одном
+[Makefile](https://ru.wikipedia.org/wiki/Makefile), что предполагает
+разные рецепты сборки в зависимости от целевой платформы. В следующих
+версиях планируется переход на использование
+[CMake](https://ru.wikipedia.org/wiki/CMake), с отказом от поддержки
+других инструментов.
+
+#### Выгрузка DSO/DLL и деструкторы Thread-Local-Storage объектов
+При сборке _libmdbx_ в виде разделяемой библиотеки, либо использовании
+статической _libmdbx_ в составе другой динамической библиотеки,
+желательно убедиться, что ваша система обеспечивает корректность вызова
+деструкторов Thread-Local-Storage объектов при выгрузке динамических
+библиотек.
+
+Если это не так, то при выгрузке динамической библиотеки с _libmdbx_
+внутри возможна либо утечка ресурсов, либо падения из-за вызова
+деструкторов из уже выгруженного DSO/DLL объекта. Проблема может
+проявляться только в многопоточном приложении, которое производит
+выгрузку разделяемых динамических библиотек с кодом _libmdbx_ внутри,
+после использования _libmdbx_. Заведомо известно, что TLS-деструкторы
+корректно обслуживаются:
+
+- На всех актуальных версиях Windows (Windows 7 и последующих).
+
+- На системах c функцией
+[`__cxa_thread_atexit_impl()`](https://sourceware.org/glibc/wiki/Destructor%20support%20for%20thread_local%20variables)
+в стандартной библиотеке C. В том числе на системах с GNU libc версии
+2.18 и выше.
+
+- На системах с libpthread/nptl из состава GNU libc с исправлением
+ошибок [#21031](https://sourceware.org/bugzilla/show_bug.cgi?id=21031) и
+[#21032](https://sourceware.org/bugzilla/show_bug.cgi?id=21032), либо
+где нет подобных ошибок в реализации pthreads.
+
+### Linux и другие платформы с GNU Make
+Для сборки библиотеки достаточно выполнить `make all` в директории с
+исходными текстами, а для выполнения базовых тестов `make check`.
+
+Если установленный в системе `make` не является GNU Make, то при попытке
+сборки будет масса ошибок от make. В этом случае, возможно, вместо
+`make` вам следует использовать `gmake`, либо даже `gnu-make` и т.п.
+
+### FreeBSD и родственные платформы
+Как правило, на таких системах по-умолчанию используется Berkeley Make.
+А GNU Make вызывается командой `gmake` или может отсутствовать. Кроме
+этого может отсутствовать [`bash`](https://ru.wikipedia.org/wiki/Bash).
+
+Вам необходимо установить требуемые компоненты: GNU Make, bash,
+компиляторы C и C++ совместимые с GCC или CLANG. После этого для сборки
+библиотеки достаточно выполнить `gmake all` (или `make all`) в
+директории с исходными текстами, а для выполнения базовых тестов `gmake
+check` (или `make check`).
+
+### Windows
+Для сборки _libmdbx_ для ОС Windows рекомендуется использовать [Microsoft
+Visual Studio](https://ru.wikipedia.org/wiki/Microsoft_Visual_Studio),
+но не такие инструменты как MinGW, MSYS или Cygwin. Для этого в набор
+исходных кодов _libmdbx_ входят соответствующие файлы проектов
+совместимые с Visual Studio 2015, Windows SDK для Windows 8.1 и более
+поздними версиями. Достаточно открыть `mdbx.sln` и выполнить сборку
+библиотеки.
+
+Для сборки с более новыми версиями SDK или Visual Studio должно быть
+достаточно выполнить "Retarget solution". Для сборки под старые версии
+Windows (например Windows XP) или более старыми компиляторами вам
+потребуется самостоятельно преобразовать или воссоздать файлы проектов.
+
+Сборка посредством MinGW, MSYS или Cygwin потенциально возможна. Однако,
+эти сценарии не тестируются и вероятно потребуют от вас доработки
+`Makefile`. Следует отметить, что в _libmdbx_ предприняты усилия для
+устранения runtime зависимостей от CRT и других библиотек Visual Studio.
+Для этого достаточно при сборке определить опцию `MDBX_AVOID_CRT`.
+
+Пример запуска базового сценария тестирования можно найти в
+[CI-сценарии](appveyor.yml) для [AppVeyor](https://www.appveyor.com/).
+Для выполнения [сценария длительного стохастического
+тестирования](test/long_stochastic.sh) потребуется
+[`bash`](https://ru.wikipedia.org/wiki/Bash), а само тестирование
+рекомендуется выполнять с размещением тестовых данных на
+[RAM-диске](https://ru.wikipedia.org/wiki/RAM-%D0%B4%D0%B8%D1%81%D0%BA).
+
+### MacOS X
+Актуальные [нативные сборочные
+инструменты](https://ru.wikipedia.org/wiki/Xcode) для MacOS X включают
+GNU Make, CLANG и устаревшую версию bash. Поэтому для сборки библиотеки
+достаточно выполнить `make all` в директории с исходными текстами, а для
+выполнения базовых тестов `make check`. Если же что-то пойдет не так, то
+рекомендуется установить [Homebrew](https://brew.sh/) и попробовать ещё
+раз.
+
+Для выполнения [сценария длительного стохастического
+тестирования](test/long_stochastic.sh) потребуется установка актуальной
+(не устаревшей) версии [`bash`](https://ru.wikipedia.org/wiki/Bash). Для
+этого рекомендуется установить [Homebrew](https://brew.sh/), а затем
+выполнить `brew install bash`.
+
+## Привязки к другим языкам
+
+ | Runtime | GitHub | Author |
+ | -------- | ------ | ------ |
+ | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) |
+ | .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) |
+
+--------------------------------------------------------------------------------
+
Сравнение производительности
============================
@@ -526,9 +660,8 @@ SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Гб.
сценарии тестирования [доступны на
github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015).
---------------------------------------------------------------------------------
-### Интегральная производительность
+## Интегральная производительность
Показана соотнесенная сумма ключевых показателей производительности в трёх
бенчмарках:
@@ -555,7 +688,7 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015).
--------------------------------------------------------------------------------
-### Масштабируемость чтения
+## Масштабируемость чтения
Для каждого движка показана суммарная производительность при
одновременном выполнении запросов чтения/поиска в 1-2-4-8 потоков на
@@ -565,7 +698,7 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015).
--------------------------------------------------------------------------------
-### Синхронная фиксация
+## Синхронная фиксация
- Линейная шкала слева и темные прямоугольники соответствуют количеству
транзакций в секунду, усредненному за всё время теста.
@@ -591,7 +724,7 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015).
--------------------------------------------------------------------------------
-### Отложенная фиксация
+## Отложенная фиксация
- Линейная шкала слева и темные прямоугольники соответствуют количеству
транзакций в секунду, усредненному за всё время теста.
@@ -622,7 +755,7 @@ _libmdbx_ при этом не ведет WAL, а передает весь ко
--------------------------------------------------------------------------------
-### Асинхронная фиксация
+## Асинхронная фиксация
- Линейная шкала слева и темные прямоугольники соответствуют количеству
транзакций в секунду, усредненному за всё время теста.
@@ -652,7 +785,7 @@ _libmdbx_ при этом не ведет WAL, а передает весь ко
--------------------------------------------------------------------------------
-### Потребление ресурсов
+## Потребление ресурсов
Показана соотнесенная сумма использованных ресурсов в ходе бенчмарка в
режиме отложенной фиксации:
diff --git a/libs/libmdbx/src/README.md b/libs/libmdbx/src/README.md
index 78cc906763..a5b420f35b 100644
--- a/libs/libmdbx/src/README.md
+++ b/libs/libmdbx/src/README.md
@@ -1,159 +1,139 @@
+### The [repository now only mirrored on the Github](https://abf.io/erthink/libmdbx) due to illegal discriminatory restrictions for Russian Crimea and for sovereign crimeans.
+<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences -->
+-----
+
libmdbx
======================================
-**Revised and extended descendant of [Symas LMDB](https://symas.com/lmdb/).**
+Revised and extended descendant of [Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) (aka _LMDB_).
+Русскоязычная версия [здесь](README-RU.md).
-*The Future will be positive.*
-[![Build Status](https://travis-ci.org/leo-yuriev/libmdbx.svg?branch=master)](https://travis-ci.org/leo-yuriev/libmdbx)
-[![Build status](https://ci.appveyor.com/api/projects/status/ue94mlopn50dqiqg/branch/master?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master)
-[![Coverity Scan Status](https://scan.coverity.com/projects/12915/badge.svg)](https://scan.coverity.com/projects/reopen-libmdbx)
+_libmdbx_ is superior to LMDB in terms of features and reliability, not
+inferior in performance. _libmdbx_ works on Linux, FreeBSD, MacOS X and
+other systems compliant with POSIX.1-2008, and also supports Windows as a
+complementary platform.
-## Project Status for now
-
- - The stable versions
- ([_stable/0.0_](https://github.com/leo-yuriev/libmdbx/tree/stable/0.0)
- and
- [_stable/0.1_](https://github.com/leo-yuriev/libmdbx/tree/stable/0.1)
- branches) of _MDBX_ are frozen, i.e. no new features or API changes, but
- only bug fixes.
-
- - The next version
- ([_devel_](https://github.com/leo-yuriev/libmdbx/tree/devel) branch)
- **is under active non-public development**, i.e. current API and set of
- features are extreme volatile.
-
- - The immediate goal of development is formation of the stable API and
- the stable internal database format, which allows realise all PLANNED
- FEATURES:
- 1. Integrity check by [Merkle tree](https://en.wikipedia.org/wiki/Merkle_tree);
- 2. Support for [raw block devices](https://en.wikipedia.org/wiki/Raw_device);
- 3. Separate place (HDD) for large data items;
- 4. Using "[Roaring bitmaps](http://roaringbitmap.org/about/)" inside garbage collector;
- 5. Non-sequential reclaiming, like PostgreSQL's [Vacuum](https://www.postgresql.org/docs/9.1/static/sql-vacuum.html);
- 6. [Asynchronous lazy data flushing](https://sites.fas.harvard.edu/~cs265/papers/kathuria-2008.pdf) to disk(s);
- 7. etc...
-
-Don't miss libmdbx for other runtimes.
-
-| Runtime | GitHub | Author |
-| ------------- | ------------- | ------------- |
-| JVM | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) |
-| .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) |
+The next version is under active non-public development, which will
+radically change both the API and the database format. The goal of this
+revolution is to provide a clearer and more reliable API, add more
+features and new database properties.
------
+*The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.*
-Nowadays MDBX intended for Linux, and support Windows (since Windows
-Server 2008) as a complementary platform. Support for other OS could be
-implemented on commercial basis. However such enhancements (i.e. pull
-requests) could be accepted in mainstream only when corresponding public
-and free Continuous Integration service will be available.
+[![Build Status](https://travis-ci.org/leo-yuriev/libmdbx.svg?branch=master)](https://travis-ci.org/leo-yuriev/libmdbx)
+[![Build status](https://ci.appveyor.com/api/projects/status/ue94mlopn50dqiqg/branch/master?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master)
+[![Coverity Scan Status](https://scan.coverity.com/projects/12915/badge.svg)](https://scan.coverity.com/projects/reopen-libmdbx)
-## Contents
+## Table of Contents
- [Overview](#overview)
- - [Comparison with other DBs](#comparison-with-other-dbs)
- - [History & Acknowledgments](#history)
-- [Main features](#main-features)
-- [Improvements over LMDB](#improvements-over-lmdb)
-- [Gotchas](#gotchas)
- - [Long-time read transactions problem](#long-time-read-transactions-problem)
- - [Data safety in async-write-mode](#data-safety-in-async-write-mode)
+ - [Comparison with other databases](#comparison-with-other-databases)
+ - [History & Acknowledgments](#history)
+- [Description](#description)
+ - [Key features](#key-features)
+ - [Improvements over LMDB](#improvements-over-lmdb)
+ - [Gotchas](#gotchas)
+ - [Problem of long-time reading](#problem-of-long-time-reading)
+ - [Durability in asynchronous writing mode](#durability-in-asynchronous-writing-mode)
+- [Usage](#usage)
+ - [Building](#building)
+ - [Bindings](#bindings)
- [Performance comparison](#performance-comparison)
- - [Integral performance](#integral-performance)
- - [Read scalability](#read-scalability)
- - [Sync-write mode](#sync-write-mode)
- - [Lazy-write mode](#lazy-write-mode)
- - [Async-write mode](#async-write-mode)
- - [Cost comparison](#cost-comparison)
+ - [Integral performance](#integral-performance)
+ - [Read scalability](#read-scalability)
+ - [Sync-write mode](#sync-write-mode)
+ - [Lazy-write mode](#lazy-write-mode)
+ - [Async-write mode](#async-write-mode)
+ - [Cost comparison](#cost-comparison)
+-----
## Overview
_libmdbx_ is an embedded lightweight key-value database engine oriented
-for performance under Linux and Windows.
+for performance.
_libmdbx_ allows multiple processes to read and update several key-value
tables concurrently, while being
[ACID](https://en.wikipedia.org/wiki/ACID)-compliant, with minimal
-overhead and operation cost of Olog(N).
-
-_libmdbx_ provides
-[serializability](https://en.wikipedia.org/wiki/Serializability) and
-consistency of data after crash. Read-write transactions don't block
-read-only transactions and are
-[serialized](https://en.wikipedia.org/wiki/Serializability) by
-[mutex](https://en.wikipedia.org/wiki/Mutual_exclusion).
+overhead and O(log N) operation cost.
-_libmdbx_
+_libmdbx_ enforces
+[serializability](https://en.wikipedia.org/wiki/Serializability) for
+writers by a single
+[mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) and affords
[wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom)
-provides parallel read transactions without atomic operations or
-synchronization primitives.
+for parallel readers without atomic/interlocked operations, while
+writing and reading transactions do not block each other.
+
+_libmdbx_ can guarantee consistency after a crash, depending on the operation
+mode.
_libmdbx_ uses [B+Trees](https://en.wikipedia.org/wiki/B%2B_tree) and
-[mmap](https://en.wikipedia.org/wiki/Memory-mapped_file), doesn't use
-[WAL](https://en.wikipedia.org/wiki/Write-ahead_logging). This might
-have caveats for some workloads.
+[Memory-Mapping](https://en.wikipedia.org/wiki/Memory-mapped_file),
+doesn't use [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging)
+which might be a caveat for some workloads.
-### Comparison with other DBs
-Because _libmdbx_ is currently overhauled, I think it's better to just
-link [chapter of Comparison with other
-databases](https://github.com/coreos/bbolt#comparison-with-other-databases)
-here.
+### Comparison with other databases
+For now please refer to [chapter of "BoltDB comparison with other
+databases"](https://github.com/coreos/bbolt#comparison-with-other-databases)
+which is also (mostly) applicable to MDBX.
### History
The _libmdbx_ design is based on [Lightning Memory-Mapped
Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database).
Initial development was going in
-[ReOpenLDAP](https://github.com/leo-yuriev/ReOpenLDAP) project, about a
-year later it received separate development effort and in autumn 2015
-was isolated to separate project, which was [presented at Highload++
-2015 conference](http://www.highload.ru/2015/abstracts/1831.html).
+[ReOpenLDAP](https://github.com/leo-yuriev/ReOpenLDAP) project. About a
+year later libmdbx was isolated to separate project, which was
+[presented at Highload++ 2015
+conference](http://www.highload.ru/2015/abstracts/1831.html).
-Since early 2017 _libmdbx_ is used in [Fast PositiveTables](https://github.com/leo-yuriev/libfpta),
-by [Positive Technologies](https://www.ptsecurity.com).
+Since early 2017 _libmdbx_ is used in [Fast Positive Tables](https://github.com/leo-yuriev/libfpta),
+and development is funded by [Positive Technologies](https://www.ptsecurity.com).
-#### Acknowledgments
-Howard Chu (Symas Corporation) - the author of LMDB, from which
+### Acknowledgments
+Howard Chu <hyc@openldap.org> is the author of LMDB, from which
originated the MDBX in 2015.
-Martin Hedenfalk <martin@bzero.se> - the author of `btree.c` code, which
+Martin Hedenfalk <martin@bzero.se> is the author of `btree.c` code, which
was used to begin development of LMDB.
+-----
+
+Description
+===========
-Main features
-=============
+## Key features
-_libmdbx_ inherits all keys features and characteristics from
+_libmdbx_ inherits all features and characteristics from
[LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database):
-1. Data is stored in ordered map, keys are always sorted, range lookups
-are supported.
+1. Key-value pairs are stored in ordered map(s), keys are always sorted,
+range lookups are supported.
-2. Data is [mmaped](https://en.wikipedia.org/wiki/Memory-mapped_file) to
-memory of each worker DB process, read transactions are zero-copy.
+2. Data is [memory-mapped](https://en.wikipedia.org/wiki/Memory-mapped_file)
+into each worker DB process, and could be accessed zero-copy from transactions.
3. Transactions are
-[ACID](https://en.wikipedia.org/wiki/ACID)-compliant, thanks to
+[ACID](https://en.wikipedia.org/wiki/ACID)-compliant, through
[MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control)
and [CoW](https://en.wikipedia.org/wiki/Copy-on-write). Writes are
strongly serialized and aren't blocked by reads, transactions can't
conflict with each other. Reads are guaranteed to get only committed data
([relaxing serializability](https://en.wikipedia.org/wiki/Serializability#Relaxing_serializability)).
-4. Reads and queries are
+4. Read transactions are
[non-blocking](https://en.wikipedia.org/wiki/Non-blocking_algorithm),
-don't use [atomic
-operations](https://en.wikipedia.org/wiki/Linearizability#High-level_atomic_operations).
+don't use [atomic operations](https://en.wikipedia.org/wiki/Linearizability#High-level_atomic_operations).
Readers don't block each other and aren't blocked by writers. Read
performance scales linearly with CPU core count.
- > Though "connect to DB" (start of first read transaction in thread) and
- > "disconnect from DB" (shutdown or thread termination) requires to
- > acquire a lock to register/unregister current thread from "readers
- > table"
+ > Nonetheless, "connect to DB" (starting the first read transaction in a thread) and
+ > "disconnect from DB" (closing DB or thread termination) requires a lock
+ > acquisition to register/unregister at the "readers table".
5. Keys with multiple values are stored efficiently without key
-duplication, sorted by value, including integers (reasonable for
+duplication, sorted by value, including integers (valuable for
secondary indexes).
-6. Efficient operation on short fixed length keys, including integer
-ones.
+6. Efficient operation on short fixed length keys,
+including 32/64-bit integer types.
7. [WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write
Amplification Factor) and RAF (Read Amplification Factor) are O(log N).
@@ -161,14 +141,12 @@ Amplification Factor) и RAF (Read Amplification Factor) are Olog(N).
8. No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) and
transaction journal. In case of a crash no recovery needed. No need for
regular maintenance. Backups can be made on the fly on working DB
- without freezing writers.
+without freezing writers.
-9. No custom memory management, all done with standard OS syscalls.
+9. No additional memory management, all done by basic OS services.
---------------------------------------------------------------------------------
-Improvements over LMDB
-======================
+## Improvements over LMDB
1. Automatic dynamic DB size management according to the parameters
specified by `mdbx_env_set_geometry()` function. Including
@@ -192,21 +170,23 @@ additional resources for that.
[BBWC](https://en.wikipedia.org/wiki/Disk_buffer#Write_acceleration)
this may greatly improve write performance.
-4. Fast estimation of range query result size via functions `mdbx_estimate_range()`,
-`mdbx_estimate_move()` and `mdbx_estimate_distance()`. E.g. for selection the
-optimal query execution plan.
+4. Fast estimation of range query result size via functions
+`mdbx_estimate_range()`, `mdbx_estimate_move()` and
+`mdbx_estimate_distance()`. E.g. for selecting the optimal query
+execution plan.
5. `mdbx_chk` tool for DB integrity check.
-6. Support for keys and values of zero length, including sorted
-duplicates.
+6. Support for keys and values of zero length, including multi-values
+(aka sorted duplicates).
-7. Ability to assign up to 3 markers to commiting transaction with
-`mdbx_canary_put()` and then get them in read transaction by
-`mdbx_canary_get()`.
+7. Ability to assign up to 3 persistent 64-bit markers to committing
+transaction with `mdbx_canary_put()` and then get them in read
+transaction by `mdbx_canary_get()`.
8. Ability to update or delete record and get previous value via
-`mdbx_replace()`. Also can update specific multi-value.
+`mdbx_replace()`. Also allows updating a specific item of a multi-value
+with the same key.
9. Sequence generation via `mdbx_dbi_sequence()`.
@@ -228,30 +208,29 @@ duplicates.
* abort current write transaction with returning error code.
-11. Ability to open DB in exclusive mode with `MDBX_EXCLUSIVE` flag.
+11. Ability to open DB in exclusive mode by `MDBX_EXCLUSIVE` flag.
-12. Ability to get how far current read-only snapshot is from latest
-version of the DB by `mdbx_txn_straggler()`.
+12. Ability to get how far current read-transaction snapshot lags
+from the latest version of the DB by `mdbx_txn_straggler()`.
-13. Ability to explicitly request update of present record without
-creating new record. Implemented as `MDBX_CURRENT` flag for
-`mdbx_put()`.
+13. Ability to explicitly update the existing record, rather than inserting
+a new one. Implemented as `MDBX_CURRENT` flag for `mdbx_put()`.
14. Fixed `mdbx_cursor_count()`, which returns correct count of
-duplicated for all table types and any cursor position.
+duplicates (aka multi-values) for all cases and any cursor position.
15. `mdbx_env_info()` to getting additional info, including number of
-the oldest snapshot of DB, which is used by one of the readers.
+the oldest snapshot of DB, which is used by any of the readers.
16. `mdbx_del()` doesn't ignore additional argument (specifier) `data`
for tables without duplicates (without flag `MDBX_DUPSORT`), if `data`
is not null then always uses it to verify record, which is being
deleted.
-17. Ability to open dbi-table with simultaneous setup of comparators for
-keys and values, via `mdbx_dbi_open_ex()`.
+17. Ability to open dbi-table with simultaneous and race-free setup
+of comparators for keys and values, via `mdbx_dbi_open_ex()`.
-18. `mdbx_is_dirty()`to find out if key or value is on dirty page, that
+18. `mdbx_is_dirty()` to find out if given key or value is on dirty page, which is
useful to avoid copy-out before updates.
19. Correct update of current record in `MDBX_CURRENT` mode of
@@ -266,21 +245,21 @@ useful to avoid copy-out before updates.
22. Ability to get value by key and duplicates count by `mdbx_get_ex()`.
23. Functions `mdbx_cursor_on_first()` and `mdbx_cursor_on_last()`,
-which allows to know if cursor is currently on first or last position
+which allow checking whether the cursor is currently on first or last position
respectively.
-24. Automatic creation of synchronization points (flush changes to
-persistent storage) when changes reach set threshold (threshold can be
-set by `mdbx_env_set_syncbytes()`).
+24. Automatic creation of steady commit-points (flushing data to the
+disk) when the volume of changes reaches a threshold, which can be
+set by `mdbx_env_set_syncbytes()`.
25. Control over debugging and receiving of debugging messages via
`mdbx_setup_debug()`.
-26. Function `mdbx_env_pgwalk()` for page-walking all pages in DB.
+26. Function `mdbx_env_pgwalk()` for page-walking the DB.
-27. Three meta-pages instead of two, this allows to guarantee
-consistently update weak sync-points without risking to corrupt last
-steady sync-point.
+27. Three meta-pages instead of two, that allows to guarantee
+consistency of data when updating weak commit-points without the
+risk of damaging the last steady commit-point.
28. Guarantee of DB integrity in `WRITEMAP+MAPSYNC` mode:
> Current _libmdbx_ gives a choice of safe async-write mode (default)
@@ -292,9 +271,9 @@ steady sync-point.
creation of steady synchronization point) via `mdbx_env_close_ex()`.
30. If read transaction is aborted via `mdbx_txn_abort()` or
-`mdbx_txn_reset()` then DBI-handles, which were opened in it, aren't
-closed or deleted. This allows to avoid several types of hard-to-debug
-errors.
+`mdbx_txn_reset()` then DBI-handles, which were opened during it,
+will not be closed or deleted. In several cases this allows
+to avoid hard-to-debug errors.
31. All cursors in all read and write transactions can be reused by
`mdbx_cursor_renew()` and MUST be freed explicitly.
@@ -307,13 +286,29 @@ errors.
> - double-free;
> - memory corruption and segfaults.
---------------------------------------------------------------------------------
+
+32. On **Mac OS X** the `fcntl(F_FULLFSYNC)` syscall is used _by
+default_ to synchronize data with the disk, as this is [the only way to
+guarantee data
+durability](https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fsync.2.html)
+in case of power failure. Unfortunately, in scenarios with high write
+intensity, the use of `F_FULLFSYNC` significantly degrades performance
+compared to LMDB, where the `fsync()` syscall is used. Therefore,
+_libmdbx_ allows you to override this behavior by defining the
+`MDBX_OSX_SPEED_INSTEADOF_DURABILITY=1` option while building the library.
+
+33. On **Windows** the `LockFileEx()` syscall is used for locking, since
+it allows placing the database on network drives, and provides protection
+against incompetent user actions (aka
+[poka-yoke](https://en.wikipedia.org/wiki/Poka-yoke)). Therefore
+_libmdbx_ may lag slightly in performance tests behind LMDB, where
+named mutexes are used.
+
## Gotchas
-1. At one moment there can be only one writer. But this allows to
-serialize writes and eliminate any possibility of conflict or logical
-errors during transaction rollback.
+1. There cannot be more than one writer at a time. This allows serializing
+updates and eliminates any possibility of conflicts, deadlocks or logical errors.
2. No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) means
relatively big [WAF](https://en.wikipedia.org/wiki/Write_amplification)
@@ -346,7 +341,7 @@ performance bottleneck in `MAPASYNC` mode.
> storage then it's much more preferable to use `std::map`.
-4. LMDB has a problem of long-time readers which degrades performance
+4. _LMDB_ has a problem of long-time readers which degrades performance
and bloats DB.
> _libmdbx_ addresses that, details below.
@@ -357,56 +352,41 @@ of data.
> Details below.
-#### Long-time read transactions problem
+### Problem of long-time reading
Garbage collection problem exists in all databases one way or another
(e.g. VACUUM in PostgreSQL). But in _libmdbx_ and LMDB it's even more
-important because of high performance and deliberate simplification of
-internals with emphasis on performance.
+discernible because of high transaction rate and intentional internals
+simplification in favor of performance.
+
+Understanding the problem requires some explanation, but can be
+difficult for quick perception. So it is reasonable
+to simplify this as follows:
-* Altering data during long read operation may exhaust available space
-on persistent storage.
+* Massive altering of data during a parallel long read operation may
+exhaust the free DB space.
-* If available space is exhausted then any attempt to update data
-results in `MAP_FULL` error until long read operation ends.
+* If the available space is exhausted, any attempt to update the data
+will cause a "MAP_FULL" error until a long read transaction is
+completed.
-* Main examples of long readers is hot backup and debugging of client
-application which actively uses read transactions.
+* A good example of long readers is a hot backup or debugging of
+a client application while retaining an active read transaction.
* In _LMDB_ this results in degraded performance of all operations of
-syncing data to persistent storage.
-
-* _libmdbx_ has a mechanism which aborts such operations and `LIFO RECLAIM`
-mode which addresses performance degradation.
-
-Read operations operate only over snapshot of DB which is consistent on
-the moment when read transaction started. This snapshot doesn't change
-throughout the transaction but this leads to inability to reclaim the
-pages until read transaction ends.
-
-In _LMDB_ this leads to a problem that memory pages, allocated for
-operations during long read, will be used for operations and won't be
-reclaimed until DB process terminates. In _LMDB_ they are used in
-[FIFO](https://en.wikipedia.org/wiki/FIFO_(computing_and_electronics))
-manner, which causes increased page count and less chance of cache hit
-during I/O. In other words: one long-time reader can impact performance
-of all database until it'll be reopened.
-
-_libmdbx_ addresses the problem, details below. Illustrations to this
-problem can be found in the
-[presentation](http://www.slideshare.net/leoyuriev/lmdb). There is also
-example of performance increase thanks to
-[BBWC](https://en.wikipedia.org/wiki/Disk_buffer#Write_acceleration)
-when `LIFO RECLAIM` enabled in _libmdbx_.
-
-#### Data safety in async-write mode
-In `WRITEMAP+MAPSYNC` mode dirty pages are written to persistent storage
-by kernel. This means that in case of application crash OS kernel will
-write all dirty data to disk and nothing will be lost. But in case of
-hardware malfunction or OS kernel fatal error only some dirty data might
-be synced to disk, and there is high probability that pages with
-metadata saved, will point to non-saved, hence non-existent, data pages.
-In such situation, DB is completely corrupted and can't be repaired even
-if there was full sync before the crash via `mdbx_env_sync().
+writing data to persistent storage.
+
+* _libmdbx_ has the `OOM-KICK` mechanism which allows aborting such
+operations and the `LIFO RECLAIM` mode which addresses performance
+degradation.
+
+### Durability in asynchronous writing mode
+In `WRITEMAP+MAPSYNC` mode updated (aka dirty) pages are written to
+persistent storage by the OS kernel. This means that if the application
+fails, the OS kernel will finish writing all updated data to disk and
+nothing will be lost. However, in the case of hardware malfunction or OS
+kernel fatal error, only some updated data can be written to disk and
+the database structure is likely to be destroyed. In such situation, DB
+is completely corrupted and can't be repaired.
_libmdbx_ addresses this by fully reimplementing write path of data:
@@ -414,39 +394,154 @@ _libmdbx_ addresses this by fully reimplementing write path of data:
instead their shadow copies are used and their updates are synced after
data is flushed to disk.
-* During transaction commit _libmdbx_ marks synchronization points as
-steady or weak depending on how much synchronization needed between RAM
-and persistent storage, e.g. in `WRITEMAP+MAPSYNC` commited transactions
-are marked as weak, but during explicit data synchronization - as
-steady.
+* During transaction commit _libmdbx_ marks it as a steady or weak
+depending on synchronization status between RAM and persistent storage.
+For instance, in the `WRITEMAP+MAPSYNC` mode committed transactions
+are marked as weak by default, but as steady after explicit data flushes.
* _libmdbx_ maintains three separate meta-pages instead of two. This
-allows to commit transaction with steady or weak synchronization point
-without losing two previous synchronization points (one of them can be
-steady, and second - weak). This allows to order weak and steady
-synchronization points in any order without losing consistency in case
-of system crash.
-
-* During DB open _libmdbx_ rollbacks to the last steady synchronization
-point, this guarantees database integrity.
-
-For data safety pages which form database snapshot with steady
-synchronization point must not be updated until next steady
-synchronization point. So last steady synchronization point creates
-"long-time read" effect. The only difference that in case of memory
-exhaustion the problem will be immediately addressed by flushing changes
-to persistent storage and forming new steady synchronization point.
+allows to commit transaction as steady or weak without losing two
+previous commit points (one of them can be steady, and another
+weak). Thus, after a fatal system failure, it will be possible to
+rollback to the last steady commit point.
+
+* During DB open _libmdbx_ rollbacks to the last steady commit point,
+this guarantees database integrity after a crash. However, if the
+database is opened in read-only mode, such rollback cannot be performed,
+and the `MDBX_WANNA_RECOVERY` error will be returned.
+
+For data integrity the pages which form a database snapshot with a steady
+commit point must not be updated until the next steady commit point.
+Therefore the last steady commit point creates an effect analogous to
+"long-time read". The only difference is that now in case of space
+exhaustion the problem will be immediately addressed by writing changes
+to disk and forming the new steady commit point.
+
+So in async-write mode _libmdbx_ will always use new pages until the
+free DB space will be exhausted or `mdbx_env_sync()` will be invoked,
+and the total write traffic to the disk will be the same as in
+sync-write mode.
+
+Currently libmdbx gives a choice between a safe async-write mode
+(default) and `UTTERLY_NOSYNC` mode which may lead to DB corruption
+after a system crash, i.e. like the LMDB.
+
+The next version of _libmdbx_ will automatically create steady commit
+points in async-write mode upon completion of data transfer to the disk.
-So in async-write mode _libmdbx_ will always use new pages until memory
-is exhausted or `mdbx_env_sync()` is invoked. Total disk usage will be
-almost the same as in sync-write mode.
+--------------------------------------------------------------------------------
-Current _libmdbx_ gives a choice of safe async-write mode (default) and
-`UTTERLY_NOSYNC` mode which may result in full DB corruption during
-system crash as with LMDB.
+Usage
+=====
+
+## Building
+
+To build on all platforms except Windows the prerequisites are the
+same: non-obsolete versions of GNU Make,
+[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)), C and C++
+compilers compatible with GCC or CLANG. On Windows you will need only:
+Microsoft Visual Studio 2015 or later, Windows SDK for Windows 8 or
+later.
+
+Historically, the libmdbx build is based on a single
+[Makefile](https://en.wikipedia.org/wiki/Makefile) which assumes
+different recipes depending on target platform. In the next versions, it
+is planned to switch to [CMake](https://en.wikipedia.org/wiki/CMake),
+with the refusal to support other tools.
+
+#### DSO/DLL unloading and destructors of Thread-Local-Storage objects
+When building _libmdbx_ as a shared library or using static _libmdbx_ as a
+part of another dynamic library, it is advisable to make sure that your
+system ensures that destructors of Thread-Local-Storage objects are
+called correctly when unloading dynamic libraries.
+
+If this is not the case, then unloading a dynamic-link library with
+_libmdbx_ code inside, can result in either a resource leak or a crash
+due to calling destructors from an already unloaded DSO/DLL object. The
+problem can only manifest in a multithreaded application, which unloads
+shared dynamic libraries with _libmdbx_ code inside
+after using _libmdbx_. It is known that TLS-destructors are properly
+maintained in the following cases:
+
+- On all modern versions of Windows (Windows 7 and later).
+
+- On systems with the
+[`__cxa_thread_atexit_impl()`](https://sourceware.org/glibc/wiki/Destructor%20support%20for%20thread_local%20variables)
+function in the standard C library, including systems with GNU libc
+version 2.18 and later.
+
+- On systems with libpthread/nptl from GNU libc with bug fixes
+[#21031](https://sourceware.org/bugzilla/show_bug.cgi?id=21031) and
+[#21032](https://sourceware.org/bugzilla/show_bug.cgi?id=21032), or
+where there are no similar bugs in the pthreads implementation.
+
+### Linux and other platforms with GNU Make
+To build the library it is enough to execute `make all` in the directory
+of source code, and `make check` to execute the basic tests.
+
+If the `make` installed on the system is not GNU Make, there will be a
+lot of errors from make when trying to build. In this case, perhaps you
+should use `gmake` instead of `make`, or even `gnu-make`, etc.
+
+### FreeBSD and related platforms
+As a rule, in such systems, the default is to use Berkeley Make. And GNU
+Make is called by the gmake command or may be missing. In addition,
+[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)) may be absent.
+
+You need to install the required components: GNU Make, bash, C and C++
+compilers compatible with GCC or CLANG. After that, to build the
+library, it is enough to execute `gmake all` (or `make all`) in the
+directory with source code, and `gmake check` (or `make check`) to run
+the basic tests.
+
+### Windows
+For building _libmdbx_ on Windows the [Microsoft Visual
+Studio](https://en.wikipedia.org/wiki/Microsoft_Visual_Studio) is
+recommended, but not tools such as MinGW, MSYS, or Cygwin. To do this,
+the libmdbx source code includes the set of appropriate project files
+that are compatible with Visual Studio 2015, the Windows SDK for Windows
+8.1, and later. Just open `mdbx.sln` in Visual Studio and build the
+library.
+
+To build with newer versions of the SDK or Visual Studio, it should be
+sufficient to execute "Retarget solution". To build for older versions
+of Windows (such as Windows XP) or by older compilers, you will need to
+convert or recreate the corresponding project files yourself.
+
+Building by MinGW, MSYS or Cygwin is potentially possible. However,
+these scripts are not tested and will probably require you to modify the
+Makefile. It should be noted that efforts were made in _libmdbx_ to avoid
+runtime dependencies on the CRT and other Visual Studio libraries.
+For this it is enough to define `MDBX_AVOID_CRT` during the build.
+
+An example of running a basic test script can be found in the
+[CI-script](appveyor.yml) for [AppVeyor](https://www.appveyor.com/). To
+run the [long stochastic test scenario](test/long_stochastic.sh),
+[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)) is required, and
+such testing is recommended with the test data placed on the
+[RAM-disk](https://en.wikipedia.org/wiki/RAM_drive).
+
+### MacOS X
+Current [native build tools](https://en.wikipedia.org/wiki/Xcode) for
+MacOS X include GNU Make, CLANG and an outdated version of bash.
+Therefore, to build the library, it is enough to run `make all` in the
+directory with source code, and run `make check` to execute the base
+tests. If something goes wrong, it is recommended to install
+[Homebrew](https://brew.sh/) and try again.
+
+To run the [long stochastic test scenario](test/long_stochastic.sh), you
+will need to install the current (not outdated) version of
+[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)). To do this, we
+recommend that you install [Homebrew](https://brew.sh/) and then execute
+`brew install bash`.
+
+## Bindings
+
+ | Runtime | GitHub | Author |
+ | -------- | ------ | ------ |
+ | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) |
+ | .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) |
-Next version of _libmdbx_ will create steady synchronization points
-automatically in async-write mode.
--------------------------------------------------------------------------------
@@ -458,9 +553,7 @@ and multiple [scripts](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015)
runs on Lenovo Carbon-2 laptop, i7-4600U 2.1 GHz, 8 Gb RAM,
SSD SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Gb.
---------------------------------------------------------------------------------
-
-### Integral performance
+## Integral performance
Here showed sum of performance metrics in 3 benchmarks:
@@ -486,7 +579,7 @@ Here showed sum of performance metrics in 3 benchmarks:
--------------------------------------------------------------------------------
-### Read Scalability
+## Read Scalability
Summary performance with concurrent read/search queries in 1-2-4-8
threads on 4 CPU cores machine.
@@ -495,7 +588,7 @@ threads on 4 CPU cores machine.
--------------------------------------------------------------------------------
-### Sync-write mode
+## Sync-write mode
- Linear scale on left and dark rectangles mean arithmetic mean
transactions per second;
@@ -517,7 +610,7 @@ and after full run the database contains 10,000 small key-value records.
--------------------------------------------------------------------------------
-### Lazy-write mode
+## Lazy-write mode
- Linear scale on left and dark rectangles mean arithmetic mean of
thousands transactions per second;
@@ -544,7 +637,7 @@ records.
--------------------------------------------------------------------------------
-### Async-write mode
+## Async-write mode
- Linear scale on left and dark rectangles mean arithmetic mean of
thousands transactions per second;
@@ -568,7 +661,7 @@ and after full run the database contains 10,000 small key-value records.
--------------------------------------------------------------------------------
-### Cost comparison
+## Cost comparison
Summary of used resources during lazy-write mode benchmarks:
diff --git a/libs/libmdbx/src/appveyor.yml b/libs/libmdbx/src/appveyor.yml
index ca70d96265..0c0dd8a6a6 100644
--- a/libs/libmdbx/src/appveyor.yml
+++ b/libs/libmdbx/src/appveyor.yml
@@ -1,4 +1,4 @@
-version: 0.2.0.{build}
+version: 0.3.2.{build}
environment:
matrix:
diff --git a/libs/libmdbx/src/dll.vcxproj b/libs/libmdbx/src/dll.vcxproj
index c0743890ce..5464c15875 100644
--- a/libs/libmdbx/src/dll.vcxproj
+++ b/libs/libmdbx/src/dll.vcxproj
@@ -99,7 +99,7 @@
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
- <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions);MDBX_DEBUG=1</PreprocessorDefinitions>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<WarningLevel>EnableAllWarnings</WarningLevel>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
@@ -162,7 +162,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>EnableAllWarnings</WarningLevel>
- <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions);MDBX_DEBUG=1</PreprocessorDefinitions>
+ <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<StringPooling>true</StringPooling>
<TreatWarningAsError>true</TreatWarningAsError>
diff --git a/libs/libmdbx/src/libmdbx.cflags b/libs/libmdbx/src/libmdbx.cflags
new file mode 100644
index 0000000000..bb8c4c3337
--- /dev/null
+++ b/libs/libmdbx/src/libmdbx.cflags
@@ -0,0 +1 @@
+-std=c11 \ No newline at end of file
diff --git a/libs/libmdbx/src/libmdbx.cxxflags b/libs/libmdbx/src/libmdbx.cxxflags
new file mode 100644
index 0000000000..6435dfce2f
--- /dev/null
+++ b/libs/libmdbx/src/libmdbx.cxxflags
@@ -0,0 +1 @@
+-std=c++17 \ No newline at end of file
diff --git a/libs/libmdbx/src/libmdbx.files b/libs/libmdbx/src/libmdbx.files
index f147554a2a..cdea405b5d 100644
--- a/libs/libmdbx/src/libmdbx.files
+++ b/libs/libmdbx/src/libmdbx.files
@@ -1,26 +1,20 @@
-CMakeLists.txt
-README-RU.md
-pcrf_test/CMakeLists.txt
-src/tools/CMakeLists.txt
-test/CMakeLists.txt
-test/append.cc
-test/copy.cc
-test/ttl.cc
-tutorial/CMakeLists.txt
-tutorial/sample-mdbx.c
AUTHORS
+CMakeLists.txt
LICENSE
Makefile
+README-RU.md
README.md
TODO.md
mdbx.h
src/bits.h
src/defs.h
+src/lck-linux.c
src/lck-posix.c
src/lck-windows.c
src/mdbx.c
src/osal.c
src/osal.h
+src/tools/CMakeLists.txt
src/tools/mdbx_chk.c
src/tools/mdbx_copy.1
src/tools/mdbx_copy.c
@@ -33,11 +27,16 @@ src/tools/mdbx_stat.c
src/tools/wingetopt.c
src/tools/wingetopt.h
src/version.c
+test/CMakeLists.txt
test/actor.cc
+test/append.cc
test/base.h
+test/cases.cc
test/chrono.cc
test/chrono.h
+test/config.cc
test/config.h
+test/copy.cc
test/dead.cc
test/hill.cc
test/jitter.cc
@@ -46,16 +45,18 @@ test/keygen.h
test/log.cc
test/log.h
test/main.cc
-test/config.cc
-test/cases.cc
test/osal-unix.cc
test/osal-windows.cc
test/osal.h
+test/pcrf/CMakeLists.txt
test/test.cc
test/test.h
test/try.cc
+test/ttl.cc
test/utils.cc
test/utils.h
+tutorial/CMakeLists.txt
tutorial/README.md
tutorial/sample-bdb.txt
tutorial/sample-mdb.txt
+tutorial/sample-mdbx.c
diff --git a/libs/libmdbx/src/libmdbx.includes b/libs/libmdbx/src/libmdbx.includes
index 0b5ade5690..eb512a01ce 100644
--- a/libs/libmdbx/src/libmdbx.includes
+++ b/libs/libmdbx/src/libmdbx.includes
@@ -2,5 +2,3 @@
src
src/tools
test
-pcrf_test
-tutorial
diff --git a/libs/libmdbx/src/mdbx.h b/libs/libmdbx/src/mdbx.h
index 8f3b1575dc..67b725139a 100644
--- a/libs/libmdbx/src/mdbx.h
+++ b/libs/libmdbx/src/mdbx.h
@@ -117,7 +117,11 @@ typedef DWORD mdbx_tid_t;
typedef int mdbx_filehandle_t;
typedef pid_t mdbx_pid_t;
typedef pthread_t mdbx_tid_t;
+#ifdef ENODATA
#define MDBX_ENODATA ENODATA
+#else
+#define MDBX_ENODATA -1
+#endif
#define MDBX_EINVAL EINVAL
#define MDBX_EACCESS EACCES
#define MDBX_ENOMEM ENOMEM
@@ -149,8 +153,8 @@ typedef pthread_t mdbx_tid_t;
#else
#define __dll_export
#endif
-#elif defined(__GNUC__) || __has_attribute(visibility)
-#define __dll_export __attribute__((visibility("default")))
+#elif defined(__GNUC__) || __has_attribute(__visibility__)
+#define __dll_export __attribute__((__visibility__("default")))
#else
#define __dll_export
#endif
@@ -736,6 +740,8 @@ LIBMDBX_API int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
* [out] stat The address of an MDBX_stat structure where the statistics
* will be copied */
LIBMDBX_API int mdbx_env_stat(MDBX_env *env, MDBX_stat *stat, size_t bytes);
+LIBMDBX_API int mdbx_env_stat2(const MDBX_env *env, const MDBX_txn *txn,
+ MDBX_stat *stat, size_t bytes);
/* Return information about the MDBX environment.
*
@@ -743,6 +749,8 @@ LIBMDBX_API int mdbx_env_stat(MDBX_env *env, MDBX_stat *stat, size_t bytes);
* [out] stat The address of an MDBX_envinfo structure
* where the information will be copied */
LIBMDBX_API int mdbx_env_info(MDBX_env *env, MDBX_envinfo *info, size_t bytes);
+LIBMDBX_API int mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn,
+ MDBX_envinfo *info, size_t bytes);
/* Flush the data buffers to disk.
*
@@ -1578,14 +1586,15 @@ LIBMDBX_API char *mdbx_dkey(const MDBX_val *key, char *const buf,
LIBMDBX_API int mdbx_env_close_ex(MDBX_env *env, int dont_sync);
-/* Set threshold to force flush the data buffers to disk,
+/* Sets threshold to force flush the data buffers to disk,
* even of MDBX_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC flags
- * in the environment.
+ * in the environment. The value affects all processes which operate with the
+ * given DB until the last process closes the DB or a new value is set.
*
* Data is always written to disk when mdbx_txn_commit() is called,
* but the operating system may keep it buffered. MDBX always flushes
* the OS buffers upon commit as well, unless the environment was
- * opened with MDBX_NOSYNC or in part MDBX_NOMETASYNC.
+ * opened with MDBX_NOSYNC, MDBX_MAPASYNC or in part MDBX_NOMETASYNC.
*
* The default is 0, than mean no any threshold checked, and no additional
* flush will be made.
@@ -1597,6 +1606,32 @@ LIBMDBX_API int mdbx_env_close_ex(MDBX_env *env, int dont_sync);
* Returns A non-zero error value on failure and 0 on success. */
LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes);
+/* Sets relative period since the last unsteady commit to force flush the data
+ * buffers to disk regardless of MDBX_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC
+ * flags in the environment. The value affects all processes which operate
+ * with the given DB until the last process closes the DB or a new value is set.
+ *
+ * Data is always written to disk when mdbx_txn_commit() is called,
+ * but the operating system may keep it buffered. MDBX always flushes
+ * the OS buffers upon commit as well, unless the environment was
+ * opened with MDBX_NOSYNC, MDBX_MAPASYNC or in part MDBX_NOMETASYNC.
+ *
+ * The set period is not checked asynchronously, but only inside the functions
+ * mdbx_txn_commit() and mdbx_env_sync(). Therefore, in cases where transactions
+ * are committed infrequently and/or irregularly, polling by mdbx_env_sync() may
+ * be a reasonable solution to timeout enforcement.
+ *
+ * The default is 0, which means no timeout is checked and no additional
+ * flush will be made.
+ *
+ * [in] env An environment handle returned by mdbx_env_create()
+ * [in] seconds_16dot16 The period in 1/65536 of second when a synchronous
+ * flush would be made since the last unsteady commit.
+ *
+ * Returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_set_syncperiod(MDBX_env *env,
+ unsigned seconds_16dot16);
+
/* Returns a lag of the reading for the given transaction.
*
* Returns an information for estimate how much given read-only
@@ -1654,6 +1689,7 @@ LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDBX_env *env);
#define MDBX_DBG_AUDIT 16
#define MDBX_DBG_JITTER 32
#define MDBX_DBG_DUMP 64
+#define MDBX_DBG_LEGACY_MULTIOPEN 128
typedef void MDBX_debug_func(int type, const char *function, int line,
const char *msg, va_list args);
@@ -1675,11 +1711,12 @@ typedef enum {
#define MDBX_PGWALK_GC ((const char *)((ptrdiff_t)-1))
#define MDBX_PGWALK_META ((const char *)((ptrdiff_t)-2))
-typedef int MDBX_pgvisitor_func(uint64_t pgno, unsigned number, void *ctx,
- int deep, const char *dbi, size_t page_size,
- MDBX_page_type_t type, size_t nentries,
- size_t payload_bytes, size_t header_bytes,
- size_t unused_bytes);
+typedef int
+MDBX_pgvisitor_func(const uint64_t pgno, const unsigned number, void *const ctx,
+ const int deep, const char *const dbi,
+ const size_t page_size, const MDBX_page_type_t type,
+ const size_t nentries, const size_t payload_bytes,
+ const size_t header_bytes, const size_t unused_bytes);
LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor,
void *ctx);
diff --git a/libs/libmdbx/src/packages/rpm/CMakeLists.txt b/libs/libmdbx/src/packages/rpm/CMakeLists.txt
index 55fbe964fb..5949e9f0ac 100644
--- a/libs/libmdbx/src/packages/rpm/CMakeLists.txt
+++ b/libs/libmdbx/src/packages/rpm/CMakeLists.txt
@@ -3,8 +3,8 @@ set(TARGET mdbx)
project(${TARGET})
set(MDBX_VERSION_MAJOR 0)
-set(MDBX_VERSION_MINOR 2)
-set(MDBX_VERSION_RELEASE 0)
+set(MDBX_VERSION_MINOR 3)
+set(MDBX_VERSION_RELEASE 1)
set(MDBX_VERSION_REVISION 0)
set(MDBX_VERSION_STRING ${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}.${MDBX_VERSION_RELEASE})
@@ -83,7 +83,7 @@ set(${TARGET}_SRC
mdbx.h
src/bits.h
src/defs.h
- src/lck-posix.c
+ src/lck-linux.c
src/mdbx.c
src/osal.c
src/osal.h
diff --git a/libs/libmdbx/src/src/bits.h b/libs/libmdbx/src/src/bits.h
index ad335b6b65..fd36cac918 100644
--- a/libs/libmdbx/src/src/bits.h
+++ b/libs/libmdbx/src/src/bits.h
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
@@ -15,30 +15,37 @@
/* *INDENT-OFF* */
/* clang-format off */
+/* In case the MDBX_DEBUG is undefined set it corresponding to NDEBUG */
#ifndef MDBX_DEBUG
+#ifdef NDEBUG
# define MDBX_DEBUG 0
+#else
+# define MDBX_DEBUG 1
+#endif
#endif
+/* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */
#if MDBX_DEBUG
# undef NDEBUG
#endif
+#define MDBX_OSX_WANNA_DURABILITY 0 /* using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
+#define MDBX_OSX_WANNA_SPEED 1 /* using fsync() with chance of data lost on power failure */
+#ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY
+#define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY
+#endif
+
/*----------------------------------------------------------------------------*/
/* Should be defined before any includes */
-#ifndef _GNU_SOURCE
-# define _GNU_SOURCE 1
-#endif
-#ifndef _POSIX_C_SOURCE
-# define _POSIX_C_SOURCE 200112L
-#endif
-#ifndef _XOPEN_SOURCE
-# define _XOPEN_SOURCE 500
-#endif
#ifndef _FILE_OFFSET_BITS
# define _FILE_OFFSET_BITS 64
#endif
+#ifdef __APPLE__
+#define _DARWIN_C_SOURCE
+#endif
+
#ifdef _MSC_VER
# if _MSC_VER < 1400
# error "Microsoft Visual C++ 8.0 (Visual Studio 2005) or later version is required"
@@ -156,7 +163,7 @@
/* The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 2
/* The version number for a database's lockfile format. */
-#define MDBX_LOCK_VERSION 2
+#define MDBX_LOCK_VERSION 3
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@@ -198,74 +205,6 @@ typedef uint16_t indx_t;
/* Core structures for database and shared memory (i.e. format definition) */
#pragma pack(push, 1)
-/* Reader Lock Table
- *
- * Readers don't acquire any locks for their data access. Instead, they
- * simply record their transaction ID in the reader table. The reader
- * mutex is needed just to find an empty slot in the reader table. The
- * slot's address is saved in thread-specific data so that subsequent
- * read transactions started by the same thread need no further locking to
- * proceed.
- *
- * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
- * No reader table is used if the database is on a read-only filesystem.
- *
- * Since the database uses multi-version concurrency control, readers don't
- * actually need any locking. This table is used to keep track of which
- * readers are using data from which old transactions, so that we'll know
- * when a particular old transaction is no longer in use. Old transactions
- * that have discarded any data pages can then have those pages reclaimed
- * for use by a later write transaction.
- *
- * The lock table is constructed such that reader slots are aligned with the
- * processor's cache line size. Any slot is only ever used by one thread.
- * This alignment guarantees that there will be no contention or cache
- * thrashing as threads update their own slot info, and also eliminates
- * any need for locking when accessing a slot.
- *
- * A writer thread will scan every slot in the table to determine the oldest
- * outstanding reader transaction. Any freed pages older than this will be
- * reclaimed by the writer. The writer doesn't use any locks when scanning
- * this table. This means that there's no guarantee that the writer will
- * see the most up-to-date reader info, but that's not required for correct
- * operation - all we need is to know the upper bound on the oldest reader,
- * we don't care at all about the newest reader. So the only consequence of
- * reading stale information here is that old pages might hang around a
- * while longer before being reclaimed. That's actually good anyway, because
- * the longer we delay reclaiming old pages, the more likely it is that a
- * string of contiguous pages can be found after coalescing old pages from
- * many old transactions together. */
-
-/* The actual reader record, with cacheline padding. */
-typedef struct MDBX_reader {
- /* Current Transaction ID when this transaction began, or (txnid_t)-1.
- * Multiple readers that start at the same time will probably have the
- * same ID here. Again, it's not important to exclude them from
- * anything; all we need to know is which version of the DB they
- * started from so we can avoid overwriting any data used in that
- * particular version. */
- volatile txnid_t mr_txnid;
-
- /* The information we store in a single slot of the reader table.
- * In addition to a transaction ID, we also record the process and
- * thread ID that owns a slot, so that we can detect stale information,
- * e.g. threads or processes that went away without cleaning up.
- *
- * NOTE: We currently don't check for stale records.
- * We simply re-init the table when we know that we're the only process
- * opening the lock file. */
-
- /* The process ID of the process owning this reader txn. */
- volatile mdbx_pid_t mr_pid;
- /* The thread ID of the thread owning this txn. */
- volatile mdbx_tid_t mr_tid;
-
- /* cache line alignment */
- uint8_t pad[MDBX_CACHELINE_SIZE -
- (sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t)) %
- MDBX_CACHELINE_SIZE];
-} MDBX_reader;
-
/* Information about a single database in the environment. */
typedef struct MDBX_db {
uint16_t md_flags; /* see mdbx_dbi_open */
@@ -324,6 +263,12 @@ typedef struct MDBX_meta {
/* txnid that committed this page, the second of a two-phase-update pair */
volatile txnid_t mm_txnid_b;
+
+ /* Number of non-meta pages which were put in GC after COW. May be 0 in case
+ * DB was previously handled by libmdbx without corresponding feature.
+ * This value together with mr_snapshot_pages_retired allows fast estimation
+ * of "how much reader is restraining GC recycling". */
+ uint64_t mm_pages_retired;
} MDBX_meta;
/* Common header for all page types. The page type depends on mp_flags.
@@ -412,6 +357,84 @@ typedef struct MDBX_page {
#define MAX_MAPSIZE MAX_MAPSIZE32
#endif /* MDBX_WORDBITS */
+#pragma pack(pop)
+
+/* Reader Lock Table
+ *
+ * Readers don't acquire any locks for their data access. Instead, they
+ * simply record their transaction ID in the reader table. The reader
+ * mutex is needed just to find an empty slot in the reader table. The
+ * slot's address is saved in thread-specific data so that subsequent
+ * read transactions started by the same thread need no further locking to
+ * proceed.
+ *
+ * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
+ * No reader table is used if the database is on a read-only filesystem.
+ *
+ * Since the database uses multi-version concurrency control, readers don't
+ * actually need any locking. This table is used to keep track of which
+ * readers are using data from which old transactions, so that we'll know
+ * when a particular old transaction is no longer in use. Old transactions
+ * that have discarded any data pages can then have those pages reclaimed
+ * for use by a later write transaction.
+ *
+ * The lock table is constructed such that reader slots are aligned with the
+ * processor's cache line size. Any slot is only ever used by one thread.
+ * This alignment guarantees that there will be no contention or cache
+ * thrashing as threads update their own slot info, and also eliminates
+ * any need for locking when accessing a slot.
+ *
+ * A writer thread will scan every slot in the table to determine the oldest
+ * outstanding reader transaction. Any freed pages older than this will be
+ * reclaimed by the writer. The writer doesn't use any locks when scanning
+ * this table. This means that there's no guarantee that the writer will
+ * see the most up-to-date reader info, but that's not required for correct
+ * operation - all we need is to know the upper bound on the oldest reader,
+ * we don't care at all about the newest reader. So the only consequence of
+ * reading stale information here is that old pages might hang around a
+ * while longer before being reclaimed. That's actually good anyway, because
+ * the longer we delay reclaiming old pages, the more likely it is that a
+ * string of contiguous pages can be found after coalescing old pages from
+ * many old transactions together. */
+
+/* The actual reader record, with cacheline padding. */
+typedef struct MDBX_reader {
+ /* Current Transaction ID when this transaction began, or (txnid_t)-1.
+ * Multiple readers that start at the same time will probably have the
+ * same ID here. Again, it's not important to exclude them from
+ * anything; all we need to know is which version of the DB they
+ * started from so we can avoid overwriting any data used in that
+ * particular version. */
+ volatile txnid_t mr_txnid;
+
+ /* The information we store in a single slot of the reader table.
+ * In addition to a transaction ID, we also record the process and
+ * thread ID that owns a slot, so that we can detect stale information,
+ * e.g. threads or processes that went away without cleaning up.
+ *
+ * NOTE: We currently don't check for stale records.
+ * We simply re-init the table when we know that we're the only process
+ * opening the lock file. */
+
+ /* The thread ID of the thread owning this txn. */
+ union {
+ volatile mdbx_tid_t mr_tid;
+ volatile uint64_t mr_tid_u64;
+ };
+ /* The process ID of the process owning this reader txn. */
+ union {
+ volatile mdbx_pid_t mr_pid;
+ volatile uint32_t mr_pid_u32;
+ };
+ /* The number of pages used in the reader's MVCC snapshot,
+ * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
+ volatile pgno_t mr_snapshot_pages_used;
+ /* Number of retired pages at the time this reader starts transaction. So,
+ * at any time the difference mm_pages_retired - mr_snapshot_pages_retired
+ * will give the number of pages which this reader is restraining from reuse. */
+ volatile uint64_t mr_snapshot_pages_retired;
+} MDBX_reader;
+
/* The header for the reader table (a memory-mapped lock file). */
typedef struct MDBX_lockinfo {
/* Stamp identifying this as an MDBX file.
@@ -424,68 +447,62 @@ typedef struct MDBX_lockinfo {
/* Flags which environment was opened. */
volatile uint32_t mti_envmode;
+ /* Threshold of un-synced-with-disk pages for auto-sync feature,
+ * zero means no-threshold, i.e. auto-sync is disabled. */
+ volatile pgno_t mti_autosync_threshold;
+ /* Period for timed auto-sync feature, i.e. at every steady checkpoint
+ * mti_unsynced_timeout is set to the current_time + mti_autosync_period.
+ * The time value is represented in a suitable system-dependent form, for
+ * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
+ * Zero means timed auto-sync is disabled. */
+ volatile uint64_t mti_autosync_period;
+
+ alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
#ifdef MDBX_OSAL_LOCK
- /* Mutex protecting write-txn. */
- union {
- MDBX_OSAL_LOCK mti_wmutex;
- uint8_t pad_mti_wmutex[MDBX_OSAL_LOCK_SIZE % sizeof(size_t)];
- };
+ /* Mutex protecting write-txn. */
+ MDBX_OSAL_LOCK mti_wmutex;
#endif
-#define MDBX_lockinfo_SIZE_A \
- (8 /* mti_magic_and_version */ + 4 /* mti_os_and_format */ + \
- 4 /* mti_envmode */ + MDBX_OSAL_LOCK_SIZE /* mti_wmutex */ + \
- MDBX_OSAL_LOCK_SIZE % sizeof(size_t) /* pad_mti_wmutex */)
- /* cache-line alignment */
- uint8_t
- pad_a[MDBX_CACHELINE_SIZE - MDBX_lockinfo_SIZE_A % MDBX_CACHELINE_SIZE];
-
- /* The number of slots that have been used in the reader table.
- * This always records the maximum count, it is not decremented
- * when readers release their slots. */
- volatile unsigned mti_numreaders;
+ volatile txnid_t mti_oldest_reader;
-#ifdef MDBX_OSAL_LOCK
- /* Mutex protecting readers registration access to this table. */
- union {
- MDBX_OSAL_LOCK mti_rmutex;
- uint8_t pad_mti_rmutex[MDBX_OSAL_LOCK_SIZE % sizeof(size_t)];
- };
-#endif
+ /* Timestamp for auto-sync feature, i.e. the steady checkpoint should be
+ * created at the first commit that is not earlier than this timestamp.
+ * The time value is represented in a suitable system-dependent form, for
+ * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
+ * Zero means timed auto-sync is not pending. */
+ volatile uint64_t mti_unsynced_timeout;
- volatile txnid_t mti_oldest;
- volatile uint32_t mti_readers_refresh_flag;
+ /* Number of un-synced-with-disk pages for auto-sync feature. */
+ volatile pgno_t mti_unsynced_pages;
-#define MDBX_lockinfo_SIZE_B \
- (sizeof(unsigned) /* mti_numreaders */ + \
- MDBX_OSAL_LOCK_SIZE /* mti_rmutex */ + sizeof(txnid_t) /* mti_oldest */ + \
- sizeof(uint32_t) /* mti_readers_refresh_flag */ + \
- MDBX_OSAL_LOCK_SIZE % sizeof(size_t) /* pad_mti_rmutex */)
+ alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
- /* cache-line alignment */
- uint8_t
- pad_b[MDBX_CACHELINE_SIZE - MDBX_lockinfo_SIZE_B % MDBX_CACHELINE_SIZE];
+#ifdef MDBX_OSAL_LOCK
+ /* Mutex protecting readers registration access to this table. */
+ MDBX_OSAL_LOCK mti_rmutex;
+#endif
- MDBX_reader mti_readers[1];
+ /* The number of slots that have been used in the reader table.
+ * This always records the maximum count, it is not decremented
+ * when readers release their slots. */
+ volatile unsigned mti_numreaders;
+ volatile unsigned mti_readers_refresh_flag;
+ alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
+ MDBX_reader mti_readers[1];
} MDBX_lockinfo;
-#pragma pack(pop)
-
-#define MDBX_LOCKINFO_WHOLE_SIZE \
- ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \
- ~((size_t)MDBX_CACHELINE_SIZE - 1))
-
/* Lockfile format signature: version, features and field layout */
#define MDBX_LOCK_FORMAT \
- ((MDBX_OSAL_LOCK_SIGN << 16) + \
- (uint16_t)(MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1))
+ (MDBX_OSAL_LOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 + \
+ (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 + \
+ (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 + \
+ (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 29)
#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
-#define MDBX_LOCK_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
#ifndef MDBX_ASSUME_MALLOC_OVERHEAD
#define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u)
@@ -768,8 +785,6 @@ struct MDBX_env {
unsigned me_psize2log; /* log2 of DB page size */
unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */
unsigned me_maxreaders; /* size of the reader table */
- /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */
- unsigned me_close_readers;
mdbx_fastmutex_t me_dbi_lock;
MDBX_dbi me_numdbs; /* number of DBs opened */
MDBX_dbi me_maxdbs; /* size of the DB table */
@@ -798,14 +813,24 @@ struct MDBX_env {
unsigned me_maxgc_ov1page;
/* Max size of a node on a page */
unsigned me_nodemax;
- unsigned me_maxkey_limit; /* max size of a key */
- mdbx_pid_t me_live_reader; /* have liveness lock in reader table */
- void *me_userctx; /* User-settable context */
- size_t me_sync_pending; /* Total dirty/non-sync'ed bytes
- * since the last mdbx_env_sync() */
- size_t me_sync_threshold; /* Treshold of above to force synchronous flush */
+ unsigned me_maxkey_limit; /* max size of a key */
+ mdbx_pid_t me_live_reader; /* have liveness lock in reader table */
+ void *me_userctx; /* User-settable context */
+ volatile uint64_t *me_unsynced_timeout;
+ volatile uint64_t *me_autosync_period;
+ volatile pgno_t *me_unsynced_pages;
+ volatile pgno_t *me_autosync_threshold;
MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */
- txnid_t me_oldest_stub;
+ struct {
+#ifdef MDBX_OSAL_LOCK
+ MDBX_OSAL_LOCK wmutex;
+#endif
+ txnid_t oldest;
+ uint64_t unsynced_timeout;
+ uint64_t autosync_period;
+ pgno_t autosync_pending;
+ pgno_t autosync_threshold;
+ } me_lckless_stub;
#if MDBX_DEBUG
MDBX_assert_func *me_assert_func; /* Callback for assertion failures */
#endif
@@ -826,7 +851,6 @@ struct MDBX_env {
/* Workaround for LockFileEx and WriteFile multithread bug */
CRITICAL_SECTION me_windowsbug_lock;
#else
- mdbx_fastmutex_t me_lckless_wmutex;
mdbx_fastmutex_t me_remap_guard;
#endif
};
@@ -840,22 +864,16 @@ typedef struct MDBX_ntxn {
/*----------------------------------------------------------------------------*/
/* Debug and Logging stuff */
+#ifndef mdbx_runtime_flags /* avoid override from tools */
extern int mdbx_runtime_flags;
+#endif
extern MDBX_debug_func *mdbx_debug_logger;
extern txnid_t mdbx_debug_edge;
void mdbx_debug_log(int type, const char *function, int line, const char *fmt,
- ...)
-#if defined(__GNUC__) || __has_attribute(format)
- __attribute__((format(printf, 4, 5)))
-#endif
- ;
+ ...) __printf_args(4, 5);
-void mdbx_panic(const char *fmt, ...)
-#if defined(__GNUC__) || __has_attribute(format)
- __attribute__((format(printf, 1, 2)))
-#endif
- ;
+void mdbx_panic(const char *fmt, ...) __printf_args(1, 2);
#if MDBX_DEBUG
@@ -1212,10 +1230,23 @@ static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) {
}
static __inline void mdbx_jitter4testing(bool tiny) {
-#ifndef NDEBUG
+#if MDBX_DEBUG
if (MDBX_DBG_JITTER & mdbx_runtime_flags)
mdbx_osal_jitter(tiny);
#else
(void)tiny;
#endif
}
+
+/* Controls PID checking to detect DB environment reuse after fork() */
+#ifndef MDBX_TXN_CHECKPID
+#if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64)
+/* PID check could be omitted:
+ * - on Linux when madvise(MADV_DONTFORK) is available, i.e. after the fork()
+ * mapped pages will not be available for child process.
+ * - in Windows where fork() is not available. */
+#define MDBX_TXN_CHECKPID 0
+#else
+#define MDBX_TXN_CHECKPID 1
+#endif
+#endif /* MDBX_TXN_CHECKPID */
diff --git a/libs/libmdbx/src/src/defs.h b/libs/libmdbx/src/src/defs.h
index a672b8bdf0..622f1c13b6 100644
--- a/libs/libmdbx/src/src/defs.h
+++ b/libs/libmdbx/src/src/defs.h
@@ -104,8 +104,8 @@
/*----------------------------------------------------------------------------*/
#ifndef __always_inline
-# if defined(__GNUC__) || __has_attribute(always_inline)
-# define __always_inline __inline __attribute__((always_inline))
+# if defined(__GNUC__) || __has_attribute(__always_inline__)
+# define __always_inline __inline __attribute__((__always_inline__))
# elif defined(_MSC_VER)
# define __always_inline __forceinline
# else
@@ -114,8 +114,8 @@
#endif /* __always_inline */
#ifndef __noinline
-# if defined(__GNUC__) || __has_attribute(noinline)
-# define __noinline __attribute__((noinline))
+# if defined(__GNUC__) || __has_attribute(__noinline__)
+# define __noinline __attribute__((__noinline__))
# elif defined(_MSC_VER)
# define __noinline __declspec(noinline)
# elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
@@ -126,16 +126,16 @@
#endif /* __noinline */
#ifndef __must_check_result
-# if defined(__GNUC__) || __has_attribute(warn_unused_result)
-# define __must_check_result __attribute__((warn_unused_result))
+# if defined(__GNUC__) || __has_attribute(__warn_unused_result__)
+# define __must_check_result __attribute__((__warn_unused_result__))
# else
# define __must_check_result
# endif
#endif /* __must_check_result */
#ifndef __deprecated
-# if defined(__GNUC__) || __has_attribute(deprecated)
-# define __deprecated __attribute__((deprecated))
+# if defined(__GNUC__) || __has_attribute(__deprecated__)
+# define __deprecated __attribute__((__deprecated__))
# elif defined(_MSC_VER)
# define __deprecated __declspec(deprecated)
# else
@@ -163,8 +163,8 @@
#endif /* __noop */
#ifndef __fallthrough
-# if __GNUC_PREREQ(7, 0) || __has_attribute(fallthrough)
-# define __fallthrough __attribute__((fallthrough))
+# if __GNUC_PREREQ(7, 0) || __has_attribute(__fallthrough__)
+# define __fallthrough __attribute__((__fallthrough__))
# else
# define __fallthrough __noop()
# endif
@@ -189,8 +189,8 @@
#endif /* __prefetch */
#ifndef __noreturn
-# if defined(__GNUC__) || __has_attribute(noreturn)
-# define __noreturn __attribute__((noreturn))
+# if defined(__GNUC__) || __has_attribute(__noreturn__)
+# define __noreturn __attribute__((__noreturn__))
# elif defined(_MSC_VER)
# define __noreturn __declspec(noreturn)
# else
@@ -199,8 +199,14 @@
#endif /* __noreturn */
#ifndef __nothrow
-# if defined(__GNUC__) || __has_attribute(nothrow)
-# define __nothrow __attribute__((nothrow))
+# if defined(__cplusplus)
+# if __cplusplus < 201703L
+# define __nothrow throw()
+# else
+# define __nothrow noexcept(true)
+# endif /* __cplusplus */
+# elif defined(__GNUC__) || __has_attribute(__nothrow__)
+# define __nothrow __attribute__((__nothrow__))
# elif defined(_MSC_VER) && defined(__cplusplus)
# define __nothrow __declspec(nothrow)
# else
@@ -214,8 +220,8 @@
* Such a function can be subject to common subexpression elimination
* and loop optimization just as an arithmetic operator would be.
* These functions should be declared with the attribute pure. */
-# if defined(__GNUC__) || __has_attribute(pure)
-# define __pure_function __attribute__((pure))
+# if defined(__GNUC__) || __has_attribute(__pure__)
+# define __pure_function __attribute__((__pure__))
# else
# define __pure_function
# endif
@@ -231,27 +237,27 @@
* data pointed to must not be declared const. Likewise, a function
* that calls a non-const function usually must not be const.
* It does not make sense for a const function to return void. */
-# if defined(__GNUC__) || __has_attribute(const)
-# define __const_function __attribute__((const))
+# if defined(__GNUC__) || __has_attribute(__const__)
+# define __const_function __attribute__((__const__))
# else
# define __const_function
# endif
#endif /* __const_function */
-#ifndef __dll_hidden
-# if defined(__GNUC__) || __has_attribute(visibility)
-# define __hidden __attribute__((visibility("hidden")))
+#ifndef __hidden
+# if defined(__GNUC__) || __has_attribute(__visibility__)
+# define __hidden __attribute__((__visibility__("hidden")))
# else
# define __hidden
# endif
-#endif /* __dll_hidden */
+#endif /* __hidden */
#ifndef __optimize
# if defined(__OPTIMIZE__)
-# if defined(__clang__) && !__has_attribute(optimize)
+# if defined(__clang__) && !__has_attribute(__optimize__)
# define __optimize(ops)
-# elif defined(__GNUC__) || __has_attribute(optimize)
-# define __optimize(ops) __attribute__((optimize(ops)))
+# elif defined(__GNUC__) || __has_attribute(__optimize__)
+# define __optimize(ops) __attribute__((__optimize__(ops)))
# else
# define __optimize(ops)
# endif
@@ -263,12 +269,13 @@
#ifndef __hot
# if defined(__OPTIMIZE__)
# if defined(__e2k__)
-# define __hot __attribute__((hot)) __optimize(3)
-# elif defined(__clang__) && !__has_attribute(hot)
+# define __hot __attribute__((__hot__)) __optimize(3)
+# elif defined(__clang__) && !__has_attribute(__hot__) \
+ && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__)) /* clang lacking __hot__: use a section instead */
/* just put frequently used functions in separate section */
-# define __hot __attribute__((section("text.hot"))) __optimize("O3")
-# elif defined(__GNUC__) || __has_attribute(hot)
-# define __hot __attribute__((hot)) __optimize("O3")
+# define __hot __attribute__((__section__("text.hot"))) __optimize("O3")
+# elif defined(__GNUC__) || __has_attribute(__hot__)
+# define __hot __attribute__((__hot__)) __optimize("O3")
# else
# define __hot __optimize("O3")
# endif
@@ -280,12 +287,13 @@
#ifndef __cold
# if defined(__OPTIMIZE__)
# if defined(__e2k__)
-# define __cold __attribute__((cold)) __optimize(1)
-# elif defined(__clang__) && !__has_attribute(cold)
+# define __cold __attribute__((__cold__)) __optimize(1)
+# elif defined(__clang__) && !__has_attribute(cold) \
+ && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__))
/* just put infrequently used functions in separate section */
-# define __cold __attribute__((section("text.unlikely"))) __optimize("Os")
+# define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os")
# elif defined(__GNUC__) || __has_attribute(cold)
-# define __cold __attribute__((cold)) __optimize("Os")
+# define __cold __attribute__((__cold__)) __optimize("Os")
# else
# define __cold __optimize("Os")
# endif
@@ -295,8 +303,8 @@
#endif /* __cold */
#ifndef __flatten
-# if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(flatten))
-# define __flatten __attribute__((flatten))
+# if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__))
+# define __flatten __attribute__((__flatten__))
# else
# define __flatten
# endif
@@ -338,9 +346,9 @@ typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__TC__)));
# define mdbx_func_ "<mdbx_unknown>"
#endif
-#if defined(__GNUC__) || __has_attribute(format)
+#if defined(__GNUC__) || __has_attribute(__format__)
#define __printf_args(format_index, first_arg) \
- __attribute__((format(printf, format_index, first_arg)))
+ __attribute__((__format__(printf, format_index, first_arg)))
#else
#define __printf_args(format_index, first_arg)
#endif
diff --git a/libs/libmdbx/src/src/lck-linux.c b/libs/libmdbx/src/src/lck-linux.c
new file mode 100644
index 0000000000..925941b229
--- /dev/null
+++ b/libs/libmdbx/src/src/lck-linux.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
+ * and other libmdbx authors: please see AUTHORS file.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in the file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * <http://www.OpenLDAP.org/license.html>.
+ */
+
+#if !(defined(__linux__) || defined(__gnu_linux__))
+#error "This implementation of locking only supports Linux,\
+ where is no interaction between the types of lock placed\
+ by flock() and fcntl()."
+#endif
+
+#include "./bits.h"
+#include <sys/utsname.h>
+
+/* Some platforms define the EOWNERDEAD error code
+ * even though they don't support Robust Mutexes.
+ * Compile with -DMDBX_USE_ROBUST=0. */
+#ifndef MDBX_USE_ROBUST
+/* Howard Chu: Android currently lacks Robust Mutex support */
+#if defined(EOWNERDEAD) && \
+ !defined(__ANDROID__) /* LY: glibc before 2.10 has a troubles \
+ with Robust Mutex too. */ \
+ && (!defined(__GLIBC__) || __GLIBC_PREREQ(2, 10) || \
+ _POSIX_C_SOURCE >= 200809L)
+#define MDBX_USE_ROBUST 1
+#else
+#define MDBX_USE_ROBUST 0
+#endif
+#endif /* MDBX_USE_ROBUST */
+
+/*----------------------------------------------------------------------------*/
+/* global constructor/destructor */
+
+uint32_t mdbx_linux_kernel_version; /* packed kernel release: up to four numeric fields, one byte each, first field in bits 31..24 */
+static __cold __attribute__((__constructor__)) void
+mdbx_global_constructor(void) { /* fills mdbx_linux_kernel_version from uname(), then calls mdbx_rthc_global_init() */
+ struct utsname buffer;
+ if (uname(&buffer) == 0) {
+ int i = 0; /* number of numeric fields consumed so far (at most 4) */
+ char *p = buffer.release;
+ while (*p && i < 4) {
+ if (*p >= '0' && *p <= '9') {
+ long number = strtol(p, &p, 10); /* strtol also advances p past the digits */
+ if (number > 0) {
+ if (number > 255)
+ number = 255; /* clamp so the field fits into a single byte */
+ mdbx_linux_kernel_version += number << (24 - i * 8);
+ }
+ ++i;
+ } else {
+ ++p; /* skip any non-digit character between the numeric fields */
+ }
+ }
+ }
+
+ mdbx_rthc_global_init();
+}
+
+static __cold __attribute__((__destructor__)) void
+mdbx_global_destructor(void) { /* destructor counterpart: only calls mdbx_rthc_global_dtor() */
+ mdbx_rthc_global_dtor();
+}
+
+/*----------------------------------------------------------------------------*/
+/* lck */
+
+/* Description of the locking implementation for Linux:
+ *
+ * The lck-file is mapped into memory; the readers table is kept in it and
+ * the shared posix-mutexes (futex) are placed there. By means of
+ * these mutexes (see struct MDBX_lockinfo) the following are implemented:
+ * - Locking of the readers table for registration,
+ * i.e. the functions mdbx_rdt_lock() and mdbx_rdt_unlock().
+ * - Locking of the DB for write transactions,
+ * i.e. the functions mdbx_txn_lock() and mdbx_txn_unlock().
+ *
+ * The remaining functionality is implemented separately via file locks:
+ * - Initial seizing of the DB in exclusive/shared mode and the subsequent
+ * switch to operational mode, functions mdbx_lck_seize() and mdbx_lck_downgrade().
+ * - Checking for the presence of reader processes,
+ * i.e. the functions mdbx_rpid_set(), mdbx_rpid_clear() and mdbx_rpid_check().
+ *
+ * Two kinds of file locks are used, flock() and fcntl(F_SETLK),
+ * both for the lck-file and for the main DB file:
+ * - To track reader processes, single-byte
+ * range locks on the lck-file are taken via fcntl(F_SETLK); the
+ * pid of the reader process is used as the offset.
+ * - For the initial seizing and for shared/exclusive locking, a
+ * combination of flock() and an fcntl(F_SETLK) lock on one byte of the lck-file
+ * at offset zero is used (offset zero is not used by the reader-tracking
+ * mechanism, since the pid of a user process in Linux
+ * is always greater than 0).
+ * - In addition, an flock() lock on the main DB file is used when operating
+ * in modes without an lck-file, both read-only and exclusive.
+ * - flock() and fcntl(F_SETLK) locks work independently in Linux. Therefore
+ * combining them makes it possible to prevent sharing of the DB
+ * over NFS, which fcntl(F_SETLK) permits, while also guarding
+ * against the non-atomicity of flock() when switching between the exclusive
+ * and atomic (sic) lock modes.
+ */
+
+static int op_setlk, op_setlkw, op_getlk; /* fcntl() lock commands; all zero until choice_fcntl() runs */
+static void __cold choice_fcntl(void) { /* picks OFD or classic POSIX record-lock commands, once */
+  assert(!op_setlk && !op_setlkw && !op_getlk);
+#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK)
+  if (mdbx_linux_kernel_version >=
+          0x03100000 /* OFD locks are available since 3.15, but engages here
+                        only for 3.16 and later kernels (LTS) for reliability reasons */
+      && (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0) {
+    op_setlk = F_OFD_SETLK;
+    op_setlkw = F_OFD_SETLKW;
+    op_getlk = F_OFD_GETLK;
+    return;
+  }
+#endif /* OFD locks */
+  op_setlk = F_SETLK; /* fallback: classic POSIX record-lock commands */
+  op_setlkw = F_SETLKW;
+  op_getlk = F_GETLK;
+}
+
+#ifndef OFF_T_MAX /* largest lock length used here: off_t maximum with the low 16 bits cleared */
+#define OFF_T_MAX \
+ ((off_t)((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(int64_t)0xffff))
+#endif /* the mask is 64-bit wide so a 32-bit size_t cannot truncate a 64-bit off_t value */
+#define LCK_WHOLE OFF_T_MAX
+
+static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset,
+ off_t len) { /* issues one fcntl() lock command `cmd` of type `lck` for `len` bytes at `offset`; returns 0 or errno, or MDBX_RESULT_* for op_getlk */
+ for (;;) { /* repeated only when fcntl() was interrupted (EINTR) */
+ struct flock lock_op;
+ memset(&lock_op, 0, sizeof(lock_op));
+ lock_op.l_type = lck;
+ lock_op.l_whence = SEEK_SET;
+ lock_op.l_start = offset;
+ lock_op.l_len = len;
+ if (fcntl(fd, cmd, &lock_op) == 0) {
+ if (cmd == op_getlk) {
+ /* Query only, used to check a reader by pid. Returns:
+ * MDBX_RESULT_TRUE - a conflicting lock is reported (pid is live)
+ * MDBX_RESULT_FALSE - F_UNLCK is reported, no conflict (pid is dead). */
+ return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE
+ : MDBX_RESULT_TRUE;
+ }
+ return 0;
+ }
+ int rc = errno;
+ if (rc != EINTR || cmd == op_setlkw) /* EINTR is retried, except for op_setlkw where it is returned to the caller */
+ return rc;
+ }
+}
+
+static __inline int mdbx_lck_exclusive(int lfd, bool fallback2shared) { /* flock(LOCK_EX) plus fcntl write-lock of byte 0; returns 0 or errno */
+  assert(lfd != INVALID_HANDLE_VALUE);
+  if (flock(lfd, LOCK_EX | LOCK_NB)) /* non-blocking attempt */
+    return errno;
+  int rc = mdbx_lck_op(lfd, op_setlk, F_WRLCK, 0, 1);
+  if (rc != 0 && fallback2shared) {
+    while (flock(lfd, LOCK_SH)) { /* fall back to a shared flock, retrying on EINTR */
+      int err = errno; /* separate name so the outer rc is not shadowed */
+      if (err != EINTR)
+        return err;
+    }
+  }
+  return rc; /* result of the fcntl step, also after a successful fallback */
+}
+
+static __inline int mdbx_lck_shared(int lfd) { /* shared flock() plus a read-lock of byte 0 via op_setlkw; returns 0 or errno */
+ assert(lfd != INVALID_HANDLE_VALUE);
+ while (flock(lfd, LOCK_SH)) { /* retried while interrupted by EINTR */
+ int rc = errno;
+ if (rc != EINTR)
+ return rc;
+ }
+ return mdbx_lck_op(lfd, op_setlkw, F_RDLCK, 0, 1);
+}
+
+int mdbx_lck_downgrade(MDBX_env *env, bool complete) { /* with `complete` takes the shared lock on me_lfd, otherwise does nothing */
+ assert(env->me_lfd != INVALID_HANDLE_VALUE);
+ return complete ? mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS;
+}
+
+int mdbx_rpid_set(MDBX_env *env) { /* write-locks the single byte of me_lfd at offset me_pid (non-waiting op_setlk) */
+ assert(env->me_lfd != INVALID_HANDLE_VALUE);
+ assert(env->me_pid > 0);
+ return mdbx_lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1);
+}
+
+int mdbx_rpid_clear(MDBX_env *env) { /* unlocks the single byte of me_lfd at offset me_pid */
+ assert(env->me_lfd != INVALID_HANDLE_VALUE);
+ assert(env->me_pid > 0);
+ return mdbx_lck_op(env->me_lfd, op_setlkw, F_UNLCK, env->me_pid, 1);
+}
+
+int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { /* op_getlk query on the byte at offset pid: MDBX_RESULT_TRUE if a lock is held there, MDBX_RESULT_FALSE if not, else errno */
+ assert(env->me_lfd != INVALID_HANDLE_VALUE);
+ assert(pid > 0);
+ return mdbx_lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1);
+}
+
+/*---------------------------------------------------------------------------*/
+
+static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
+ const int rc); /* forward declaration; used by the mdbx_robust_*() helpers before its definition */
+
+int __cold mdbx_lck_init(MDBX_env *env) { /* initializes mti_rmutex and mti_wmutex in env->me_lck; returns 0 or a pthread error code */
+ pthread_mutexattr_t ma;
+ int rc = pthread_mutexattr_init(&ma);
+ if (rc)
+ return rc;
+
+ rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); /* mutexes are shared between processes */
+ if (rc)
+ goto bailout;
+
+#if MDBX_USE_ROBUST
+#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \
+ !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L
+ rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); /* older glibc: _np spelling */
+#else
+ rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
+#endif
+ if (rc)
+ goto bailout;
+#endif /* MDBX_USE_ROBUST */
+
+#if _POSIX_C_SOURCE >= 199506L && !defined(MDBX_SAFE4QEMU)
+ rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
+ if (rc == ENOTSUP) /* priority inheritance unsupported: fall back to PTHREAD_PRIO_NONE */
+ rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE);
+ if (rc)
+ goto bailout;
+#endif /* PTHREAD_PRIO_INHERIT */
+
+ rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
+ if (rc)
+ goto bailout;
+
+ rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma);
+ if (rc)
+ goto bailout;
+ rc = pthread_mutex_init(&env->me_lck->mti_wmutex, &ma);
+
+bailout: /* reached on success too: the attribute object is always destroyed */
+ pthread_mutexattr_destroy(&ma);
+ return rc;
+}
+
+void __cold mdbx_lck_destroy(MDBX_env *env) { /* destroys both shared mutexes, but only if the exclusive lock on me_lfd can be taken */
+ if (env->me_lfd != INVALID_HANDLE_VALUE) {
+ /* try to get exclusive access */
+ if (env->me_lck && mdbx_lck_exclusive(env->me_lfd, false) == 0) {
+ mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_);
+ int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex);
+ if (rc == 0)
+ rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex);
+ assert(rc == 0);
+ (void)rc; /* rc is otherwise unused when assert() is compiled out */
+ /* file locks will be released (by the kernel)
+ * when me_lfd is closed */
+ }
+ }
+}
+
+static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { /* pthread_mutex_lock(); any non-zero result is passed through mdbx_mutex_failed() */
+ int rc = pthread_mutex_lock(mutex);
+ if (unlikely(rc != 0))
+ rc = mdbx_mutex_failed(env, mutex, rc);
+ return rc;
+}
+
+static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) { /* pthread_mutex_trylock(); EBUSY is reported as MDBX_BUSY */
+ int rc = pthread_mutex_trylock(mutex);
+ if (unlikely(rc != 0 && rc != EBUSY)) /* EBUSY is not routed through mdbx_mutex_failed() */
+ rc = mdbx_mutex_failed(env, mutex, rc);
+ return (rc != EBUSY) ? rc : MDBX_BUSY;
+}
+
+static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) { /* pthread_mutex_unlock(); any non-zero result is passed through mdbx_mutex_failed() */
+ int rc = pthread_mutex_unlock(mutex);
+ if (unlikely(rc != 0))
+ rc = mdbx_mutex_failed(env, mutex, rc);
+ return rc;
+}
+
+int mdbx_rdt_lock(MDBX_env *env) { /* locks mti_rmutex; returns the mdbx_robust_lock() result unchanged */
+ mdbx_trace(">>");
+ int rc = mdbx_robust_lock(env, &env->me_lck->mti_rmutex);
+ mdbx_trace("<< rc %d", rc);
+ return rc;
+}
+
+void mdbx_rdt_unlock(MDBX_env *env) { /* unlocks mti_rmutex; calls mdbx_panic() if the result is an error */
+ mdbx_trace(">>");
+ int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex);
+ mdbx_trace("<< rc %d", rc);
+ if (unlikely(MDBX_IS_ERROR(rc)))
+ mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc);
+}
+
+int mdbx_txn_lock(MDBX_env *env, bool dontwait) { /* locks env->me_wmutex: trylock when dontwait is set, blocking lock otherwise */
+ mdbx_trace(">>");
+ int rc = dontwait ? mdbx_robust_trylock(env, env->me_wmutex)
+ : mdbx_robust_lock(env, env->me_wmutex);
+ mdbx_trace("<< rc %d", rc);
+ return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS; /* every non-error result is collapsed to MDBX_SUCCESS */
+}
+
+void mdbx_txn_unlock(MDBX_env *env) { /* unlocks env->me_wmutex; calls mdbx_panic() if the result is an error */
+ mdbx_trace(">>");
+ int rc = mdbx_robust_unlock(env, env->me_wmutex);
+ mdbx_trace("<< rc %d", rc);
+ if (unlikely(MDBX_IS_ERROR(rc)))
+ mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc);
+}
+
+static int __cold internal_seize_lck(int lfd) { /* returns MDBX_RESULT_TRUE if exclusive was obtained, MDBX_RESULT_FALSE if only shared, else an error code */
+ assert(lfd != INVALID_HANDLE_VALUE);
+
+ /* try exclusive access */
+ int rc = mdbx_lck_exclusive(lfd, false);
+ if (rc == 0)
+ /* got exclusive */
+ return MDBX_RESULT_TRUE;
+ if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { /* the error codes treated here as "lock is held by someone else" */
+ /* get shared access */
+ rc = mdbx_lck_shared(lfd);
+ if (rc == 0) {
+ /* got shared, try exclusive again */
+ rc = mdbx_lck_exclusive(lfd, true);
+ if (rc == 0)
+ /* now got exclusive */
+ return MDBX_RESULT_TRUE;
+ if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK)
+ /* exclusive is unavailable, but the shared lock is kept */
+ return MDBX_RESULT_FALSE;
+ }
+ }
+ assert(MDBX_IS_ERROR(rc));
+ return rc;
+}
+
+int __cold mdbx_lck_seize(MDBX_env *env) { /* initial lock acquisition; returns MDBX_RESULT_TRUE (exclusive), MDBX_RESULT_FALSE (shared) or an error code */
+ assert(env->me_fd != INVALID_HANDLE_VALUE);
+ if (unlikely(op_setlk == 0)) /* fcntl commands not chosen yet */
+ choice_fcntl();
+
+ if (env->me_lfd == INVALID_HANDLE_VALUE) {
+ /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
+ int rc = mdbx_lck_op(env->me_fd, op_setlk,
+ (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
+ LCK_WHOLE); /* locks the range [0, LCK_WHOLE) of the data file itself */
+ if (rc != 0) {
+ mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc);
+ return rc;
+ }
+ return MDBX_RESULT_TRUE;
+ }
+
+ if ((env->me_flags & MDBX_RDONLY) == 0) {
+ /* Check that no other process operates in without-lck mode. */
+ int rc = mdbx_lck_op(env->me_fd, op_setlk, F_WRLCK, env->me_pid, 1);
+ if (rc != 0) {
+ mdbx_error("%s(%s) failed: errcode %u", mdbx_func_,
+ "lock-against-without-lck", rc);
+ return rc;
+ }
+ }
+
+ return internal_seize_lck(env->me_lfd);
+}
+
+static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
+ const int err) { /* handles a failed mutex (un)lock: EOWNERDEAD recovery when robust mutexes are enabled, otherwise error reporting */
+ int rc = err;
+#if MDBX_USE_ROBUST
+ if (err == EOWNERDEAD) {
+ /* We own the mutex. Clean up after dead previous owner. */
+
+ int rlocked = (env->me_lck && mutex == &env->me_lck->mti_rmutex); /* non-zero when the failed mutex is the readers-table one */
+ rc = MDBX_SUCCESS;
+ if (!rlocked) {
+ if (unlikely(env->me_txn)) {
+ /* env is hosed if the dead thread was ours */
+ env->me_flags |= MDBX_FATAL_ERROR;
+ env->me_txn = NULL;
+ rc = MDBX_PANIC;
+ }
+ }
+ mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
+ (rc ? "this process' env is hosed" : "recovering"));
+
+ int check_rc = mdbx_reader_check0(env, rlocked, NULL);
+ check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; /* MDBX_SUCCESS from the check is reported as MDBX_RESULT_TRUE */
+
+#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \
+ !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L
+ int mreco_rc = pthread_mutex_consistent_np(mutex);
+#else
+ int mreco_rc = pthread_mutex_consistent(mutex);
+#endif
+ check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; /* a failure to mark the mutex consistent overrides the check result */
+
+ if (unlikely(mreco_rc))
+ mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc));
+
+ rc = (rc == MDBX_SUCCESS) ? check_rc : rc; /* MDBX_PANIC set above takes precedence */
+ if (MDBX_IS_ERROR(rc))
+ pthread_mutex_unlock(mutex); /* on error the mutex is released again before returning */
+ return rc;
+ }
+#else
+ (void)mutex;
+#endif /* MDBX_USE_ROBUST */
+
+ mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err));
+ if (rc != EDEADLK) /* every error except EDEADLK marks the environment as fatally broken */
+ env->me_flags |= MDBX_FATAL_ERROR;
+ return rc;
+}
diff --git a/libs/libmdbx/src/src/lck-posix.c b/libs/libmdbx/src/src/lck-posix.c
index e8d4539a1b..3b93157980 100644
--- a/libs/libmdbx/src/src/lck-posix.c
+++ b/libs/libmdbx/src/src/lck-posix.c
@@ -18,11 +18,7 @@
* even though they don't support Robust Mutexes.
* Compile with -DMDBX_USE_ROBUST=0. */
#ifndef MDBX_USE_ROBUST
-/* Howard Chu: Android currently lacks Robust Mutex support */
-#if defined(EOWNERDEAD) && \
- !defined(__ANDROID__) /* LY: glibc before 2.10 has a troubles \
- with Robust Mutex too. */ \
- && __GLIBC_PREREQ(2, 10)
+#if (defined(EOWNERDEAD) || _POSIX_C_SOURCE >= 200809L) && !defined(__APPLE__)
#define MDBX_USE_ROBUST 1
#else
#define MDBX_USE_ROBUST 0
@@ -32,95 +28,199 @@
/*----------------------------------------------------------------------------*/
/* rthc */
-static __cold __attribute__((constructor)) void mdbx_global_constructor(void) {
+static __cold __attribute__((__constructor__)) void
+mdbx_global_constructor(void) {
mdbx_rthc_global_init();
}
-static __cold __attribute__((destructor)) void mdbx_global_destructor(void) {
+static __cold __attribute__((__destructor__)) void
+mdbx_global_destructor(void) {
mdbx_rthc_global_dtor();
}
/*----------------------------------------------------------------------------*/
/* lck */
+/* Description of the locking implementation for POSIX:
+ *
+ * The lck-file is mapped into memory; the readers table is kept in it and
+ * the shared posix-mutexes (futex) are placed there. By means of
+ * these mutexes (see struct MDBX_lockinfo) the following are implemented:
+ * - Locking of the readers table for registration,
+ * i.e. the functions mdbx_rdt_lock() and mdbx_rdt_unlock().
+ * - Locking of the DB for write transactions,
+ * i.e. the functions mdbx_txn_lock() and mdbx_txn_unlock().
+ *
+ * The remaining functionality is implemented separately via file locks:
+ * - Initial seizing of the DB in exclusive/shared mode and the subsequent
+ * switch to operational mode, functions mdbx_lck_seize() and mdbx_lck_downgrade().
+ * - Checking for the presence of reader processes,
+ * i.e. the functions mdbx_rpid_set(), mdbx_rpid_clear() and mdbx_rpid_check().
+ *
+ * Only fcntl(F_SETLK) is used for file locking, because:
+ * - lockf() handles exclusive locks only and requires
+ * the file to be opened in RW mode.
+ * - flock() does not guarantee atomicity when changing locks
+ * and operates only on the whole file.
+ * - To track reader processes, single-byte
+ * range locks on the lck-file are taken via fcntl(F_SETLK); the
+ * pid of the reader process is used as the offset.
+ * - For the initial seizing and for shared/exclusive mode, the main
+ * DB file is locked and, on success, the lck-file.
+ */
+
#ifndef OFF_T_MAX
-#define OFF_T_MAX (sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX)
+#define OFF_T_MAX \
+ ((off_t)((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(int64_t)0xffff))
+#endif /* off_t maximum with the low 16 bits cleared; 64-bit mask so a 32-bit size_t cannot truncate it */
+#ifndef PID_T_MAX
+#define PID_T_MAX INT_MAX
#endif
-#define LCK_WHOLE OFF_T_MAX
-static int mdbx_lck_op(mdbx_filehandle_t fd, int op, short lck, off_t offset,
+#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK)
+#define OP_SETLK F_OFD_SETLK
+#define OP_SETLKW F_OFD_SETLKW
+#define OP_GETLK F_OFD_GETLK
+#else
+#define OP_SETLK F_SETLK
+#define OP_SETLKW F_SETLKW
+#define OP_GETLK F_GETLK
+#endif /* OFD locks */
+
+static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset,
off_t len) {
for (;;) {
- int rc;
struct flock lock_op;
memset(&lock_op, 0, sizeof(lock_op));
lock_op.l_type = lck;
lock_op.l_whence = SEEK_SET;
lock_op.l_start = offset;
lock_op.l_len = len;
- if ((rc = fcntl(fd, op, &lock_op)) == 0) {
- if (op == F_GETLK && lock_op.l_type != F_UNLCK)
- rc = -lock_op.l_pid;
- } else if ((rc = errno) == EINTR) {
- continue;
- }
- return rc;
- }
-}
-
-static __inline int mdbx_lck_exclusive(int lfd, bool fallback2shared) {
- assert(lfd != INVALID_HANDLE_VALUE);
- if (flock(lfd, LOCK_EX | LOCK_NB))
- return errno;
- int rc = mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1);
- if (rc != 0 && fallback2shared) {
- while (flock(lfd, LOCK_SH)) {
- int rc = errno;
- if (rc != EINTR)
- return rc;
+ if (fcntl(fd, cmd, &lock_op) == 0) {
+ if (cmd == OP_GETLK) {
+ /* Checks reader by pid. Returns:
+ * MDBX_RESULT_TRUE - if pid is live (unable to acquire lock)
+ * MDBX_RESULT_FALSE - if pid is dead (lock acquired). */
+ return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE
+ : MDBX_RESULT_TRUE;
+ }
+ return 0;
}
- }
- return rc;
-}
-
-static __inline int mdbx_lck_shared(int lfd) {
- assert(lfd != INVALID_HANDLE_VALUE);
- while (flock(lfd, LOCK_SH)) {
int rc = errno;
- if (rc != EINTR)
+ if (rc != EINTR || cmd == OP_SETLKW) /* compare with the command callers really pass (F_OFD_SETLKW when OFD locks are used) */
return rc;
}
- return mdbx_lck_op(lfd, F_SETLKW, F_RDLCK, 0, 1);
-}
-
-int mdbx_lck_downgrade(MDBX_env *env, bool complete) {
- assert(env->me_lfd != INVALID_HANDLE_VALUE);
- return complete ? mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS;
}
int mdbx_rpid_set(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
- return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid, 1);
+ assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX);
+ return mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, env->me_pid, 1);
}
int mdbx_rpid_clear(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
- return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid, 1);
+ assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX);
+ return mdbx_lck_op(env->me_lfd, OP_SETLKW, F_UNLCK, env->me_pid, 1);
}
-/* Checks reader by pid.
- *
- * Returns:
- * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock)
- * MDBX_RESULT_FALSE, if pid is dead (lock acquired)
- * or otherwise the errcode. */
int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
- int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid, 1);
- if (rc == 0)
- return MDBX_RESULT_FALSE;
- if (rc < 0 && -rc == pid)
+ assert(pid > 0 && pid <= PID_T_MAX);
+ assert(PID_T_MAX < OFF_T_MAX);
+ return mdbx_lck_op(env->me_lfd, OP_GETLK, F_WRLCK, pid, 1);
+}
+
+int __cold mdbx_lck_seize(MDBX_env *env) { /* initial lock acquisition via fcntl only; returns MDBX_RESULT_TRUE (exclusive), MDBX_RESULT_FALSE (shared) or an error code */
+ assert(env->me_fd != INVALID_HANDLE_VALUE);
+ assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX);
+
+ if (env->me_lfd == INVALID_HANDLE_VALUE) {
+ /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
+ int rc = mdbx_lck_op(env->me_fd, OP_SETLK,
+ (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
+ OFF_T_MAX);
+ if (rc != 0) {
+ mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc);
+ return rc;
+ }
+ return MDBX_RESULT_TRUE;
+ }
+
+ /* try exclusive access */
+ int rc = mdbx_lck_op(env->me_fd, OP_SETLK,
+ (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
+ OFF_T_MAX);
+ if (rc == 0) {
+ continue_exclusive: /* also entered by goto from the retry in the shared path below */
+ /* got dxb-exclusive, continue lck-exclusive */
+ rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_WRLCK, 0, OFF_T_MAX);
+ if (rc == 0) {
+ /* got both exclusive */
+ return MDBX_RESULT_TRUE;
+ }
+ mdbx_error("%s(%s) failed: errcode %u", mdbx_func_,
+ "lck-after-dxb-exclusive", rc);
+ assert(MDBX_IS_ERROR(rc));
+ goto bailout;
+ }
+
+ if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { /* the error codes treated here as "lock is held by someone else" */
+ rc = mdbx_lck_op(env->me_fd, OP_SETLKW,
+ (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
+ env->me_pid, 1); /* shared mode on the data file: lock only the byte at offset me_pid */
+ if (rc == 0) {
+ /* got dxb-shared, try again dxb-exclusive */
+ rc = mdbx_lck_op(env->me_fd, OP_SETLK,
+ (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
+ OFF_T_MAX);
+ if (rc == 0)
+ goto continue_exclusive;
+
+ /* continue lck-shared */
+ rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1);
+ if (rc == 0) {
+ /* got both dxb and lck shared lock */
+ return MDBX_RESULT_FALSE;
+ }
+ mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck-shared", rc);
+ } else {
+ mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb-shared", rc);
+ }
+ assert(MDBX_IS_ERROR(rc));
+ }
+
+bailout: /* failure path: unlock the whole range of both files, then return the error */
+ (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
+ (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
+ assert(MDBX_IS_ERROR(rc));
+ return rc;
+}
+
+int mdbx_lck_downgrade(MDBX_env *env, bool complete) { /* reduces the lck-file lock to a read-lock of byte 0; with `complete` also locks the data-file byte at me_pid */
+ assert(env->me_lfd != INVALID_HANDLE_VALUE);
+ int rc = mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 1, OFF_T_MAX - 1); /* unlock everything after byte 0 */
+ if (rc == 0)
+ rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1); /* byte 0 is then requested as a read lock */
+ if (unlikely(rc != 0)) {
+ mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck", rc);
+ goto bailout;
+ }
+ if (complete) {
+ rc = mdbx_lck_op(env->me_fd, OP_SETLK,
+ (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
+ env->me_pid, 1);
+ if (unlikely(rc != 0)) {
+ mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb", rc);
+ goto bailout;
+ }
+ }
+ return MDBX_SUCCESS;
+
+bailout: /* failure path: unlock the whole range of both files, then return the error */
+ (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
+ (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
+ assert(MDBX_IS_ERROR(rc));
+ return rc;
+}
@@ -140,11 +240,7 @@ int __cold mdbx_lck_init(MDBX_env *env) {
goto bailout;
#if MDBX_USE_ROBUST
-#if __GLIBC_PREREQ(2, 12)
rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
-#else
- rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
-#endif
if (rc)
goto bailout;
#endif /* MDBX_USE_ROBUST */
@@ -172,19 +268,28 @@ bailout:
}
void __cold mdbx_lck_destroy(MDBX_env *env) {
+ /* File locks would be released (by the kernel) when the file descriptors
+ * are closed. But to avoid a false-positive EDEADLK from the kernel,
+ * locks should be released here explicitly and in the proper order. */
if (env->me_lfd != INVALID_HANDLE_VALUE) {
/* try get exclusive access */
- if (env->me_lck && mdbx_lck_exclusive(env->me_lfd, false) == 0) {
+ if (env->me_lck &&
+ mdbx_lck_op(env->me_fd, OP_SETLK,
+ (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
+ OFF_T_MAX) == 0 &&
+ mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, 0, OFF_T_MAX) == 0) { /* mutexes are destroyed only when both whole-range locks succeed */
mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_);
int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex);
if (rc == 0)
rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex);
assert(rc == 0);
(void)rc;
- /* file locks would be released (by kernel)
- * while the me_lfd will be closed */
+ msync(env->me_lck, env->me_os_psize, MS_ASYNC); /* asynchronous flush of the first me_os_psize bytes of the lck mapping */
}
+ (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); /* the lck-file is unlocked first */
}
+ if (env->me_fd != INVALID_HANDLE_VALUE)
+ (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); /* then the data file */
}
static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) {
@@ -239,64 +344,6 @@ void mdbx_txn_unlock(MDBX_env *env) {
mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc);
}
-static int __cold internal_seize_lck(int lfd) {
- assert(lfd != INVALID_HANDLE_VALUE);
-
- /* try exclusive access */
- int rc = mdbx_lck_exclusive(lfd, false);
- if (rc == 0)
- /* got exclusive */
- return MDBX_RESULT_TRUE;
- if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) {
- /* get shared access */
- rc = mdbx_lck_shared(lfd);
- if (rc == 0) {
- /* got shared, try exclusive again */
- rc = mdbx_lck_exclusive(lfd, true);
- if (rc == 0)
- /* now got exclusive */
- return MDBX_RESULT_TRUE;
- if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK)
- /* unable exclusive, but stay shared */
- return MDBX_RESULT_FALSE;
- }
- }
- assert(MDBX_IS_ERROR(rc));
- return rc;
-}
-
-int __cold mdbx_lck_seize(MDBX_env *env) {
- assert(env->me_fd != INVALID_HANDLE_VALUE);
-
- if (env->me_lfd == INVALID_HANDLE_VALUE) {
- /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
- int rc = mdbx_lck_op(env->me_fd, F_SETLK,
- (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
- LCK_WHOLE);
- if (rc != 0) {
- mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc);
- return rc;
- }
- return MDBX_RESULT_TRUE;
- }
-
- if ((env->me_flags & MDBX_RDONLY) == 0) {
- /* Check that another process don't operates in without-lck mode. */
- int rc = mdbx_lck_op(env->me_fd, F_SETLK, F_WRLCK, env->me_pid, 1);
- if (rc != 0) {
- mdbx_error("%s(%s) failed: errcode %u", mdbx_func_,
- "lock-against-without-lck", rc);
- return rc;
- }
- }
-
- return internal_seize_lck(env->me_lfd);
-}
-
-#if !__GLIBC_PREREQ(2, 12) && !defined(pthread_mutex_consistent)
-#define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex)
-#endif
-
static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
const int err) {
int rc = err;
@@ -331,6 +378,8 @@ static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
pthread_mutex_unlock(mutex);
return rc;
}
+#else
+ (void)mutex;
#endif /* MDBX_USE_ROBUST */
mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err));
diff --git a/libs/libmdbx/src/src/lck-windows.c b/libs/libmdbx/src/src/lck-windows.c
index 822ba9c293..34a6e5a5dc 100644
--- a/libs/libmdbx/src/src/lck-windows.c
+++ b/libs/libmdbx/src/src/lck-windows.c
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
@@ -102,7 +102,7 @@ static
# ifdef _WIN64
const
# endif
- PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((section(".CRT$XLB"), used)) = mdbx_dll_callback;
+ PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_dll_callback;
#else
# error FIXME
#endif
@@ -179,7 +179,7 @@ void mdbx_txn_unlock(MDBX_env *env) {
#define LCK_LO_OFFSET 0
#define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders)
#define LCK_UP_OFFSET LCK_LO_LEN
-#define LCK_UP_LEN (MDBX_LOCKINFO_WHOLE_SIZE - LCK_UP_OFFSET)
+#define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET)
#define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN
#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN
@@ -414,7 +414,9 @@ int mdbx_lck_seize(MDBX_env *env) {
assert(env->me_fd != INVALID_HANDLE_VALUE);
if (env->me_flags & MDBX_EXCLUSIVE)
- return MDBX_RESULT_TRUE /* files were must be opened non-shareable */;
+ return MDBX_RESULT_TRUE /* nope since files were must be opened
+ non-shareable */
+ ;
if (env->me_lfd == INVALID_HANDLE_VALUE) {
/* LY: without-lck mode (e.g. on read-only filesystem) */
@@ -459,7 +461,8 @@ int mdbx_lck_downgrade(MDBX_env *env, bool complete) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
if (env->me_flags & MDBX_EXCLUSIVE)
- return MDBX_SUCCESS /* files were must be opened non-shareable */;
+ return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
+ ;
/* 1) must be at E-E (exclusive-write) */
if (!complete) {
@@ -664,6 +667,7 @@ MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
+MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_NtFsControlFile mdbx_NtFsControlFile;
static void mdbx_winnt_import(void) {
@@ -688,21 +692,14 @@ static void mdbx_winnt_import(void) {
mdbx_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive;
}
- mdbx_GetFileInformationByHandleEx =
- (MDBX_GetFileInformationByHandleEx)GetProcAddress(
- hKernel32dll, "GetFileInformationByHandleEx");
+#define GET_KERNEL32_PROC(ENTRY) \
+ mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(hKernel32dll, #ENTRY)
- mdbx_GetVolumeInformationByHandleW =
- (MDBX_GetVolumeInformationByHandleW)GetProcAddress(
- hKernel32dll, "GetVolumeInformationByHandleW");
-
- mdbx_GetFinalPathNameByHandleW =
- (MDBX_GetFinalPathNameByHandleW)GetProcAddress(
- hKernel32dll, "GetFinalPathNameByHandleW");
-
- mdbx_SetFileInformationByHandle =
- (MDBX_SetFileInformationByHandle)GetProcAddress(
- hKernel32dll, "SetFileInformationByHandle");
+ GET_KERNEL32_PROC(GetFileInformationByHandleEx);
+ GET_KERNEL32_PROC(GetVolumeInformationByHandleW);
+ GET_KERNEL32_PROC(GetFinalPathNameByHandleW);
+ GET_KERNEL32_PROC(SetFileInformationByHandle);
+ GET_KERNEL32_PROC(PrefetchVirtualMemory);
const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");
mdbx_NtFsControlFile =
diff --git a/libs/libmdbx/src/src/mdbx.c b/libs/libmdbx/src/src/mdbx.c
index 7a5e73b7f1..35d8ae1b68 100644
--- a/libs/libmdbx/src/src/mdbx.c
+++ b/libs/libmdbx/src/src/mdbx.c
@@ -291,7 +291,17 @@ typedef struct rthc_entry_t {
static CRITICAL_SECTION rthc_critical_section;
#else
int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, void *dso_symbol)
- __attribute__((weak));
+ __attribute__((__weak__));
+#ifdef __APPLE__ /* FIXME: Thread-Local Storage destructors & DSO-unloading */
+int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj,
+ void *dso_symbol) { /* Apple-only definition of the symbol declared weak just above */
+ (void)dtor;
+ (void)obj;
+ (void)dso_symbol; /* all three arguments are deliberately unused */
+ return -1; /* stub: nothing is registered, -1 is returned unconditionally */
+}
+#endif /* __APPLE__ */
+
static pthread_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t mdbx_rthc_cond = PTHREAD_COND_INITIALIZER;
static mdbx_thread_key_t mdbx_rthc_key;
@@ -515,9 +525,9 @@ __cold void mdbx_rthc_global_dtor(void) {
mdbx_thread_key_delete(key);
for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end;
++rthc) {
- mdbx_trace("== [%i] = key %u, %p ... %p, rthc %p (%+i), "
+ mdbx_trace("== [%i] = key %zu, %p ... %p, rthc %p (%+i), "
"rthc-pid %i, current-pid %i",
- i, key, rthc_table[i].begin, rthc_table[i].end, rthc,
+ i, (size_t)key, rthc_table[i].begin, rthc_table[i].end, rthc,
(int)(rthc - rthc_table[i].begin), rthc->mr_pid, self_pid);
if (rthc->mr_pid == self_pid) {
rthc->mr_pid = 0;
@@ -553,8 +563,8 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin,
return rc;
mdbx_rthc_lock();
- mdbx_trace(">> key 0x%x, rthc_count %u, rthc_limit %u", *key, rthc_count,
- rthc_limit);
+ mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (size_t)*key,
+ rthc_count, rthc_limit);
if (rthc_count == rthc_limit) {
rthc_entry_t *new_table =
mdbx_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table,
@@ -568,13 +578,14 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin,
rthc_table = new_table;
rthc_limit *= 2;
}
- mdbx_trace("== [%i] = key %u, %p ... %p", rthc_count, *key, begin, end);
+ mdbx_trace("== [%i] = key %zu, %p ... %p", rthc_count, (size_t)*key, begin,
+ end);
rthc_table[rthc_count].key = *key;
rthc_table[rthc_count].begin = begin;
rthc_table[rthc_count].end = end;
++rthc_count;
- mdbx_trace("<< key 0x%x, rthc_count %u, rthc_limit %u", *key, rthc_count,
- rthc_limit);
+ mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)*key,
+ rthc_count, rthc_limit);
mdbx_rthc_unlock();
return MDBX_SUCCESS;
@@ -587,8 +598,8 @@ bailout:
__cold void mdbx_rthc_remove(const mdbx_thread_key_t key) {
mdbx_thread_key_delete(key);
mdbx_rthc_lock();
- mdbx_trace(">> key 0x%x, rthc_count %u, rthc_limit %u", key, rthc_count,
- rthc_limit);
+ mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (size_t)key,
+ rthc_count, rthc_limit);
for (unsigned i = 0; i < rthc_count; ++i) {
if (key == rthc_table[i].key) {
@@ -614,8 +625,8 @@ __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) {
}
}
- mdbx_trace("<< key 0x%x, rthc_count %u, rthc_limit %u", key, rthc_count,
- rthc_limit);
+ mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key,
+ rthc_count, rthc_limit);
mdbx_rthc_unlock();
}
@@ -1331,7 +1342,7 @@ const char *__cold mdbx_strerror_r(int errnum, char *buf, size_t buflen) {
errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen,
NULL);
return size ? buf : NULL;
-#elif defined(_GNU_SOURCE)
+#elif defined(_GNU_SOURCE) && defined(__GLIBC__)
/* GNU-specific */
msg = strerror_r(errnum, buf, buflen);
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600)
@@ -2008,7 +2019,7 @@ static __inline void mdbx_meta_update_begin(const MDBX_env *env,
mdbx_assert(env, meta->mm_txnid_a < txnid && meta->mm_txnid_b < txnid);
meta->mm_txnid_a = txnid;
(void)env;
- mdbx_coherent_barrier();
+ mdbx_flush_noncoherent_cpu_writeback();
}
static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta,
@@ -2019,7 +2030,7 @@ static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta,
mdbx_jitter4testing(true);
meta->mm_txnid_b = txnid;
- mdbx_coherent_barrier();
+ mdbx_flush_noncoherent_cpu_writeback();
}
static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta,
@@ -2139,7 +2150,9 @@ static __hot MDBX_meta *mdbx_meta_head(const MDBX_env *env) {
static __hot txnid_t mdbx_reclaiming_detent(const MDBX_env *env) {
if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC))
- return env->me_txn->mt_txnid - 1;
+ return likely(env->me_txn0->mt_owner == mdbx_thread_self())
+ ? env->me_txn0->mt_txnid - 1
+ : mdbx_meta_txnid_fluid(env, mdbx_meta_head(env));
return mdbx_meta_txnid_stable(env, mdbx_meta_steady(env));
}
@@ -2160,13 +2173,13 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) {
mdbx_tassert(txn, (txn->mt_flags & MDBX_RDONLY) == 0);
MDBX_env *env = txn->mt_env;
const txnid_t edge = mdbx_reclaiming_detent(env);
- mdbx_tassert(txn, edge <= txn->mt_txnid - 1);
+ mdbx_tassert(txn, edge <= txn->mt_txnid);
MDBX_lockinfo *const lck = env->me_lck;
- if (unlikely(env->me_lck == NULL /* exclusive mode */))
- return env->me_oldest_stub = edge;
+ if (unlikely(lck == NULL /* exclusive mode */))
+ return env->me_lckless_stub.oldest = edge;
- const txnid_t last_oldest = lck->mti_oldest;
+ const txnid_t last_oldest = lck->mti_oldest_reader;
mdbx_tassert(txn, edge >= last_oldest);
if (likely(last_oldest == edge))
return edge;
@@ -2179,7 +2192,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) {
txnid_t oldest = edge;
lck->mti_readers_refresh_flag = nothing_changed;
- mdbx_coherent_barrier();
+ mdbx_flush_noncoherent_cpu_writeback();
const unsigned snap_nreaders = lck->mti_numreaders;
for (unsigned i = 0; i < snap_nreaders; ++i) {
if (lck->mti_readers[i].mr_pid) {
@@ -2195,12 +2208,38 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) {
if (oldest != last_oldest) {
mdbx_notice("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, oldest);
- mdbx_tassert(txn, oldest >= lck->mti_oldest);
- lck->mti_oldest = oldest;
+ mdbx_tassert(txn, oldest >= lck->mti_oldest_reader);
+ lck->mti_oldest_reader = oldest;
}
return oldest;
}
+/* Find largest mvcc-snapshot still referenced: returns max of `largest` and qualifying readers' mr_snapshot_pages_used. */
+static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) {
+ MDBX_lockinfo *const lck = env->me_lck;
+ if (likely(lck != NULL /* NULL would mean exclusive mode: no reader table to scan */)) {
+ const unsigned snap_nreaders = lck->mti_numreaders;
+ for (unsigned i = 0; i < snap_nreaders; ++i) {
+ retry: /* re-read slot i until both sampled fields are stable */
+ if (lck->mti_readers[i].mr_pid) { /* slots with zero mr_pid are skipped */
+ /* mdbx_jitter4testing(true); */
+ const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages_used;
+ const txnid_t snap_txnid = lck->mti_readers[i].mr_txnid;
+ mdbx_memory_barrier(); /* the re-check below detects a slot changed between the two reads */
+ if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages_used ||
+ snap_txnid != lck->mti_readers[i].mr_txnid))
+ goto retry;
+ if (largest < snap_pages &&
+ lck->mti_oldest_reader <= /* ignore pending updates */ snap_txnid &&
+ snap_txnid <= env->me_txn0->mt_txnid) /* txnid within [mti_oldest_reader, me_txn0->mt_txnid] */
+ largest = snap_pages;
+ }
+ }
+ }
+
+ return largest; /* unchanged input when no qualifying reader exceeds it */
+}
+
/* Add a page to the txn's dirty list */
static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) {
int (*const adder)(MDBX_DPL, pgno_t pgno, MDBX_page * page) =
@@ -2252,8 +2291,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno,
* 2) At least on Windows 10 1803 the entire mapped section is unavailable
* for short time during NtExtendSection() or VirtualAlloc() execution.
*
- * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED!
- */
+ * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */
array_onstack.limit = ARRAY_LENGTH(array_onstack.handles);
array_onstack.count = 0;
suspended = &array_onstack;
@@ -2286,7 +2324,7 @@ bailout:
env->me_dbgeo.upper = limit_bytes;
if (env->me_txn) {
mdbx_tassert(env->me_txn, size_pgno >= env->me_txn->mt_next_pgno);
- env->me_txn->mt_end_pgno = size_pgno;
+ env->me_txn->mt_end_pgno = env->me_txn0->mt_end_pgno = size_pgno;
}
#ifdef USE_VALGRIND
if (prev_mapsize != env->me_mapsize || prev_mapaddr != env->me_map) {
@@ -2985,42 +3023,47 @@ __cold static int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) {
return rc;
}
- MDBX_meta *head = mdbx_meta_head(env);
- if (!META_IS_STEADY(head) || env->me_sync_pending) {
-
- if (force || (env->me_sync_threshold &&
- env->me_sync_pending >= env->me_sync_threshold))
+ const MDBX_meta *head = mdbx_meta_head(env);
+ pgno_t unsynced_pages = *env->me_unsynced_pages;
+ if (!META_IS_STEADY(head) || unsynced_pages) {
+ const pgno_t autosync_threshold = *env->me_autosync_threshold;
+ const uint64_t unsynced_timeout = *env->me_unsynced_timeout;
+ if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) ||
+ (unsynced_timeout && mdbx_osal_monotime() >= unsynced_timeout))
flags &= MDBX_WRITEMAP /* clear flags for full steady sync */;
- if (outside_txn &&
- env->me_sync_pending >
- pgno2bytes(env, 16 /* FIXME: define threshold */) &&
- (flags & MDBX_NOSYNC) == 0) {
- mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
- const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next);
+ if (outside_txn) {
+ if (unsynced_pages > /* FIXME: define threshold */ 16 &&
+ (flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0) {
+ mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
+ const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next);
- mdbx_txn_unlock(env);
+ mdbx_txn_unlock(env);
- /* LY: pre-sync without holding lock to reduce latency for writer(s) */
- int rc = (flags & MDBX_WRITEMAP)
- ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes,
- flags & MDBX_MAPASYNC)
- : mdbx_filesync(env->me_fd, false);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
+ /* LY: pre-sync without holding lock to reduce latency for writer(s) */
+ int rc = (flags & MDBX_WRITEMAP)
+ ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false)
+ : mdbx_filesync(env->me_fd, MDBX_SYNC_DATA);
+ if (unlikely(rc != MDBX_SUCCESS))
+ return rc;
- rc = mdbx_txn_lock(env, nonblock);
- if (unlikely(rc != MDBX_SUCCESS))
- return rc;
+ rc = mdbx_txn_lock(env, nonblock);
+ if (unlikely(rc != MDBX_SUCCESS))
+ return rc;
- /* LY: head may be changed. */
- head = mdbx_meta_head(env);
+ /* LY: head and unsynced_pages may be changed. */
+ head = mdbx_meta_head(env);
+ unsynced_pages = *env->me_unsynced_pages;
+ }
+ env->me_txn0->mt_txnid = meta_txnid(env, head, false);
+ mdbx_find_oldest(env->me_txn0);
}
- if (!META_IS_STEADY(head) || env->me_sync_pending) {
- mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIuPTR,
+ if (!META_IS_STEADY(head) ||
+ ((flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0 && unsynced_pages)) {
+ mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO,
container_of(head, MDBX_page, mp_data)->mp_pgno,
- mdbx_durable_str(head), env->me_sync_pending);
+ mdbx_durable_str(head), unsynced_pages);
MDBX_meta meta = *head;
int rc = mdbx_sync_locked(env, flags | MDBX_SHRINK_ALLOWED, &meta);
if (unlikely(rc != MDBX_SUCCESS)) {
@@ -3129,18 +3172,27 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
MDBX_env *env = txn->mt_env;
int rc;
+#if MDBX_TXN_CHECKPID
if (unlikely(env->me_pid != mdbx_getpid())) {
env->me_flags |= MDBX_FATAL_ERROR;
return MDBX_PANIC;
}
+#endif /* MDBX_TXN_CHECKPID */
- STATIC_ASSERT(sizeof(MDBX_reader) == MDBX_CACHELINE_SIZE);
+ STATIC_ASSERT(sizeof(MDBX_reader) == 32);
+#ifdef MDBX_OSAL_LOCK
+ STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wmutex) % MDBX_CACHELINE_SIZE == 0);
+ STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rmutex) % MDBX_CACHELINE_SIZE == 0);
+#else
+ STATIC_ASSERT(
+ offsetof(MDBX_lockinfo, mti_oldest_reader) % MDBX_CACHELINE_SIZE == 0);
STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE ==
0);
+#endif
STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE ==
0);
- pgno_t upper_pgno = 0;
+ pgno_t upper_limit_pgno = 0;
if (flags & MDBX_TXN_RDONLY) {
txn->mt_flags = MDBX_TXN_RDONLY;
MDBX_reader *r = txn->mt_ro_reader;
@@ -3207,17 +3259,15 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
r = &env->me_lck->mti_readers[slot];
/* Claim the reader slot, carefully since other code
* uses the reader table un-mutexed: First reset the
- * slot, next publish it in mtb.mti_numreaders. After
+ * slot, next publish it in lck->mti_numreaders. After
* that, it is safe for mdbx_env_close() to touch it.
* When it will be closed, we can finally claim it. */
r->mr_pid = 0;
r->mr_txnid = ~(txnid_t)0;
r->mr_tid = tid;
- mdbx_coherent_barrier();
+ mdbx_flush_noncoherent_cpu_writeback();
if (slot == nreaders)
env->me_lck->mti_numreaders = ++nreaders;
- if (env->me_close_readers < nreaders)
- env->me_close_readers = nreaders;
r->mr_pid = env->me_pid;
mdbx_rdt_unlock(env);
@@ -3232,14 +3282,16 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
mdbx_jitter4testing(false);
const txnid_t snap = mdbx_meta_txnid_fluid(env, meta);
mdbx_jitter4testing(false);
- if (r) {
+ if (likely(r)) {
+ r->mr_snapshot_pages_used = meta->mm_geo.next;
r->mr_txnid = snap;
mdbx_jitter4testing(false);
mdbx_assert(env, r->mr_pid == mdbx_getpid());
mdbx_assert(env, r->mr_tid == mdbx_thread_self());
mdbx_assert(env, r->mr_txnid == snap);
- mdbx_coherent_barrier();
+ mdbx_compiler_barrier();
env->me_lck->mti_readers_refresh_flag = true;
+ mdbx_flush_noncoherent_cpu_writeback();
}
mdbx_jitter4testing(true);
@@ -3247,7 +3299,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
txn->mt_txnid = snap;
txn->mt_next_pgno = meta->mm_geo.next;
txn->mt_end_pgno = meta->mm_geo.now;
- upper_pgno = meta->mm_geo.upper;
+ upper_limit_pgno = meta->mm_geo.upper;
memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db));
txn->mt_canary = meta->mm_canary;
@@ -3269,6 +3321,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
mdbx_assert(env, txn->mt_txnid >= *env->me_oldest);
txn->mt_ro_reader = r;
txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
+ mdbx_ensure(env, txn->mt_txnid >=
+ /* paranoia is appropriate here */ *env->me_oldest);
} else {
/* Not yet touching txn == env->me_txn0, it may be active */
mdbx_jitter4testing(false);
@@ -3317,7 +3371,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
/* Moved to here to avoid a data race in read TXNs */
txn->mt_next_pgno = meta->mm_geo.next;
txn->mt_end_pgno = meta->mm_geo.now;
- upper_pgno = meta->mm_geo.upper;
+ upper_limit_pgno = meta->mm_geo.upper;
}
/* Setup db info */
@@ -3338,12 +3392,13 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
} else {
const size_t size = pgno2bytes(env, txn->mt_end_pgno);
if (unlikely(size > env->me_mapsize)) {
- if (upper_pgno > MAX_PAGENO ||
- bytes2pgno(env, pgno2bytes(env, upper_pgno)) != upper_pgno) {
+ if (upper_limit_pgno > MAX_PAGENO ||
+ bytes2pgno(env, pgno2bytes(env, upper_limit_pgno)) !=
+ upper_limit_pgno) {
rc = MDBX_MAP_RESIZED;
goto bailout;
}
- rc = mdbx_mapresize(env, txn->mt_end_pgno, upper_pgno);
+ rc = mdbx_mapresize(env, txn->mt_end_pgno, upper_limit_pgno);
if (rc != MDBX_SUCCESS) {
if (rc == MDBX_RESULT_TRUE)
rc = MDBX_MAP_RESIZED;
@@ -3407,8 +3462,10 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags,
if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
return MDBX_EBADSIGN;
+#if MDBX_TXN_CHECKPID
if (unlikely(env->me_pid != mdbx_getpid()))
env->me_flags |= MDBX_FATAL_ERROR;
+#endif /* MDBX_TXN_CHECKPID */
if (unlikely(env->me_flags & MDBX_FATAL_ERROR))
return MDBX_PANIC;
@@ -3606,10 +3663,12 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) {
MDBX_env *env = txn->mt_env;
static const char *const names[] = MDBX_END_NAMES;
+#if MDBX_TXN_CHECKPID
if (unlikely(txn->mt_env->me_pid != mdbx_getpid())) {
env->me_flags |= MDBX_FATAL_ERROR;
return MDBX_PANIC;
}
+#endif /* MDBX_TXN_CHECKPID */
mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO
"/%" PRIaPGNO,
@@ -3618,13 +3677,21 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) {
(void *)env, txn->mt_dbs[MAIN_DBI].md_root,
txn->mt_dbs[FREE_DBI].md_root);
+ mdbx_ensure(env, txn->mt_txnid >=
+ /* paranoia is appropriate here */ *env->me_oldest);
if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) {
#if defined(_WIN32) || defined(_WIN64)
if (txn->mt_flags & MDBX_SHRINK_ALLOWED)
mdbx_srwlock_ReleaseShared(&env->me_remap_guard);
#endif
if (txn->mt_ro_reader) {
+ mdbx_ensure(env, /* paranoia is appropriate here */
+ txn->mt_txnid == txn->mt_ro_reader->mr_txnid &&
+ txn->mt_ro_reader->mr_txnid >=
+ env->me_lck->mti_oldest_reader);
+ txn->mt_ro_reader->mr_snapshot_pages_used = 0;
txn->mt_ro_reader->mr_txnid = ~(txnid_t)0;
+ mdbx_memory_barrier();
env->me_lck->mti_readers_refresh_flag = true;
if (mode & MDBX_END_SLOT) {
if ((env->me_flags & MDBX_ENV_TXKEY) == 0)
@@ -3632,7 +3699,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) {
txn->mt_ro_reader = NULL;
}
}
- mdbx_coherent_barrier();
+ mdbx_flush_noncoherent_cpu_writeback();
txn->mt_numdbs = 0; /* prevent further DBI activity */
txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED;
txn->mt_owner = 0;
@@ -3741,7 +3808,7 @@ static __inline int mdbx_backlog_size(MDBX_txn *txn) {
? MDBX_PNL_SIZE(txn->mt_env->me_reclaimed_pglist) +
txn->mt_loose_count
: 0;
- return reclaimed_and_loose + txn->mt_end_pgno - txn->mt_next_pgno;
+ return reclaimed_and_loose;
}
static __inline int mdbx_backlog_extragap(MDBX_env *env) {
@@ -3754,7 +3821,9 @@ static __inline int mdbx_backlog_extragap(MDBX_env *env) {
* in mdbx_page_alloc() during a deleting, when freeDB tree is unbalanced. */
static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) {
/* LY: extra page(s) for b-tree rebalancing */
- const int extra = mdbx_backlog_extragap(txn->mt_env);
+ const int extra =
+ mdbx_backlog_extragap(txn->mt_env) +
+ MDBX_PNL_SIZEOF(txn->mt_befree_pages) / txn->mt_env->me_maxkey_limit;
if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) {
mc->mc_flags &= ~C_RECLAIMING;
@@ -3762,11 +3831,10 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) {
if (unlikely(rc))
return rc;
- int backlog;
- while (unlikely((backlog = mdbx_backlog_size(txn)) < extra)) {
+ while (unlikely(mdbx_backlog_size(txn) < extra)) {
rc = mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC);
if (unlikely(rc)) {
- if (unlikely(rc != MDBX_NOTFOUND))
+ if (rc != MDBX_NOTFOUND)
return rc;
break;
}
@@ -3777,6 +3845,20 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) {
return MDBX_SUCCESS;
}
+static void mdbx_prep_backlog_data(MDBX_txn *txn, MDBX_cursor *mc,
+ size_t bytes) { /* Best-effort top-up of the page backlog before storing `bytes` of data; returns nothing. */
+ const int wanna = /* target: OVPAGES(bytes) (presumably the overflow-page count) plus the extra gap */
+ (int)OVPAGES(txn->mt_env, bytes) + mdbx_backlog_extragap(txn->mt_env);
+ if (unlikely(wanna > mdbx_backlog_size(txn))) {
+ mc->mc_flags &= ~C_RECLAIMING; /* cleared only around the allocation loop, set again below */
+ do {
+ if (mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC) != MDBX_SUCCESS)
+ break; /* any failure just stops the top-up; the error code is dropped */
+ } while (wanna > mdbx_backlog_size(txn));
+ mc->mc_flags |= C_RECLAIMING;
+ }
+}
+
/* Count all the pages in each DB and in the freelist and make sure
* it matches the actual number of pages being used.
* All named DBs must be open for a correct count. */
@@ -4108,7 +4190,7 @@ retry:
mc.mc_flags &= ~C_RECLAIMING;
rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY);
mc.mc_flags |= C_RECLAIMING;
- if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND))
+ if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND)
goto bailout;
}
/* Write to last page of freeDB */
@@ -4116,6 +4198,7 @@ retry:
key.iov_base = &txn->mt_txnid;
do {
data.iov_len = MDBX_PNL_SIZEOF(txn->mt_befree_pages);
+ mdbx_prep_backlog_data(txn, &mc, data.iov_len);
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
@@ -4325,6 +4408,7 @@ retry:
data.iov_len = (chunk + 1) * sizeof(pgno_t);
mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk,
settled + 1, settled + chunk + 1, reservation_gc_id);
+ mdbx_prep_backlog_data(txn, &mc, data.iov_len);
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE);
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true));
if (unlikely(rc != MDBX_SUCCESS))
@@ -4422,9 +4506,10 @@ retry:
if (unlikely(chunk > left)) {
mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk,
left, fill_gc_id);
- if (loop < 5 || chunk - left > env->me_maxgc_ov1page) {
+ if ((loop < 5 && chunk - left > loop / 2) ||
+ chunk - left > env->me_maxgc_ov1page) {
data.iov_len = (left + 1) * sizeof(pgno_t);
- if (loop < 21)
+ if (loop < 7)
mc.mc_flags &= ~C_GCFREEZE;
}
chunk = left;
@@ -4545,8 +4630,7 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) {
}
dp->mp_flags &= ~P_DIRTY;
dp->mp_validator = 0 /* TODO */;
- env->me_sync_pending +=
- IS_OVERFLOW(dp) ? pgno2bytes(env, dp->mp_pages) : env->me_psize;
+ *env->me_unsynced_pages += IS_OVERFLOW(dp) ? dp->mp_pages : 1;
}
goto done;
}
@@ -4567,8 +4651,9 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) {
dp->mp_flags &= ~P_DIRTY;
dp->mp_validator = 0 /* TODO */;
pos = pgno2bytes(env, pgno);
- size = IS_OVERFLOW(dp) ? pgno2bytes(env, dp->mp_pages) : env->me_psize;
- env->me_sync_pending += size;
+ const unsigned npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1;
+ *env->me_unsynced_pages += npages;
+ size = pgno2bytes(env, npages);
}
/* Write up to MDBX_COMMIT_PAGES dirty pages at a time. */
if (pos != next_pos || n == MDBX_COMMIT_PAGES || wsize + size > MAX_WRITE) {
@@ -4579,6 +4664,17 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) {
mdbx_debug("Write error: %s", mdbx_strerror(rc));
return rc;
}
+
+#if MDBX_CPU_CACHE_MMAP_NONCOHERENT
+#if defined(__linux__) || defined(__gnu_linux__)
+ if (linux_kernel_version >= 0x02060b00)
+ /* Linux kernels older than version 2.6.11 ignore the addr and nbytes
+ * arguments, making this function fairly expensive. Therefore, the
+ * whole cache is always flushed. */
+#endif /* Linux */
+ mdbx_invalidate_mmap_noncoherent_cache(env->me_map + wpos, wsize);
+#endif /* MDBX_CPU_CACHE_MMAP_NONCOHERENT */
+
n = 0;
}
if (i > pagecount)
@@ -4594,7 +4690,16 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) {
n++;
}
- mdbx_invalidate_cache(env->me_map, pgno2bytes(env, txn->mt_next_pgno));
+#if MDBX_CPU_CACHE_MMAP_NONCOHERENT && \
+ (defined(__linux__) || defined(__gnu_linux__))
+ if (linux_kernel_version < 0x02060b00) {
+ /* Linux kernels older than version 2.6.11 ignore the addr and nbytes
+ * arguments, making this function fairly expensive. Therefore, the whole
+ * cache is always flushed. */
+ mdbx_invalidate_mmap_noncoherent_cache(env->me_map,
+ pgno2bytes(env, txn->mt_next_pgno));
+ }
+#endif /* MDBX_CPU_CACHE_MMAP_NONCOHERENT && Linux */
for (i = keep; ++i <= pagecount;) {
dp = dl[i].ptr;
@@ -4665,10 +4770,12 @@ int mdbx_txn_commit(MDBX_txn *txn) {
return MDBX_THREAD_MISMATCH;
MDBX_env *env = txn->mt_env;
+#if MDBX_TXN_CHECKPID
if (unlikely(env->me_pid != mdbx_getpid())) {
env->me_flags |= MDBX_FATAL_ERROR;
return MDBX_PANIC;
}
+#endif /* MDBX_TXN_CHECKPID */
if (txn->mt_child) {
rc = mdbx_txn_commit(txn->mt_child);
@@ -5281,12 +5388,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_assert(env,
pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS));
mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0);
- mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0);
+ mdbx_assert(env, !META_IS_STEADY(head) || *env->me_unsynced_pages != 0);
mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now);
- const size_t usedbytes = pgno_align2os_bytes(env, pending->mm_geo.next);
- if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold)
- flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
+ if (flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) {
+ /* Check auto-sync conditions */
+ const pgno_t autosync_threshold = *env->me_autosync_threshold;
+ const uint64_t unsynced_timeout = *env->me_unsynced_timeout;
+ if ((autosync_threshold && *env->me_unsynced_pages >= autosync_threshold) ||
+ (unsynced_timeout && mdbx_osal_monotime() >= unsynced_timeout))
+ flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
+ }
/* LY: check conditions to shrink datafile */
const pgno_t backlog_gap =
@@ -5295,51 +5407,68 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink &&
pending->mm_geo.now - pending->mm_geo.next >
pending->mm_geo.shrink + backlog_gap) {
- const pgno_t aligner =
- pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink;
- const pgno_t with_backlog_gap = pending->mm_geo.next + backlog_gap;
- const pgno_t aligned = pgno_align2os_pgno(
- env, with_backlog_gap + aligner - with_backlog_gap % aligner);
- const pgno_t bottom =
- (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
- if (pending->mm_geo.now > bottom) {
- flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
- shrink = pending->mm_geo.now - bottom;
- pending->mm_geo.now = bottom;
- if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a)
- mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1);
+ const pgno_t largest = mdbx_find_largest(
+ env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next
+ : pending->mm_geo.next);
+ if (pending->mm_geo.now > largest &&
+ pending->mm_geo.now - largest > pending->mm_geo.shrink + backlog_gap) {
+ const pgno_t aligner =
+ pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink;
+ const pgno_t with_backlog_gap = largest + backlog_gap;
+ const pgno_t aligned = pgno_align2os_pgno(
+ env, with_backlog_gap + aligner - with_backlog_gap % aligner);
+ const pgno_t bottom =
+ (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
+ if (pending->mm_geo.now > bottom) {
+ flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
+ shrink = pending->mm_geo.now - bottom;
+ pending->mm_geo.now = bottom;
+ if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a)
+ mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1);
+ }
}
}
/* LY: step#1 - sync previously written/updated data-pages */
- int rc = MDBX_RESULT_TRUE;
- if (env->me_sync_pending && (flags & MDBX_NOSYNC) == 0) {
+ int rc = *env->me_unsynced_pages ? MDBX_RESULT_TRUE /* carry non-steady */
+ : MDBX_RESULT_FALSE /* carry steady */;
+ if (rc != MDBX_RESULT_FALSE && (flags & MDBX_NOSYNC) == 0) {
mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
- MDBX_meta *const steady = mdbx_meta_steady(env);
+ MDBX_meta *const recent_steady_meta = mdbx_meta_steady(env);
if (flags & MDBX_WRITEMAP) {
+ const size_t usedbytes = pgno_align2os_bytes(env, pending->mm_geo.next);
rc = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, flags & MDBX_MAPASYNC);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
+ rc = MDBX_RESULT_TRUE /* carry non-steady */;
if ((flags & MDBX_MAPASYNC) == 0) {
- if (unlikely(pending->mm_geo.next > steady->mm_geo.now)) {
- rc = mdbx_filesize_sync(env->me_fd);
+ if (unlikely(pending->mm_geo.next > recent_steady_meta->mm_geo.now)) {
+ rc = mdbx_filesync(env->me_fd, MDBX_SYNC_SIZE);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
- env->me_sync_pending = 0;
+ rc = MDBX_RESULT_FALSE /* carry steady */;
}
} else {
- rc = mdbx_filesync(env->me_fd, pending->mm_geo.next > steady->mm_geo.now);
+ rc = mdbx_filesync(env->me_fd,
+ (pending->mm_geo.next > recent_steady_meta->mm_geo.now)
+ ? MDBX_SYNC_DATA | MDBX_SYNC_SIZE
+ : MDBX_SYNC_DATA);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
- env->me_sync_pending = 0;
}
}
/* Steady or Weak */
- if (env->me_sync_pending == 0) {
+ if (rc == MDBX_RESULT_FALSE /* carry steady */) {
pending->mm_datasync_sign = mdbx_meta_sign(pending);
+ *env->me_unsynced_pages = 0;
+ *env->me_unsynced_timeout = 0;
} else {
+ assert(rc == MDBX_RESULT_TRUE /* carry non-steady */);
+ const uint64_t autosync_period = *env->me_autosync_period;
+ if (autosync_period && *env->me_unsynced_timeout == 0)
+ *env->me_unsynced_timeout = mdbx_osal_monotime() + autosync_period;
pending->mm_datasync_sign =
(flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC
? MDBX_DATASIGN_NONE
@@ -5425,7 +5554,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI];
target->mm_canary = pending->mm_canary;
mdbx_jitter4testing(true);
- mdbx_coherent_barrier();
+ mdbx_flush_noncoherent_cpu_writeback();
/* LY: 'commit' the meta */
mdbx_meta_update_end(env, target, pending->mm_txnid_b);
@@ -5444,7 +5573,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
sizeof(head->mm_canary)) == 0);
}
target->mm_datasync_sign = pending->mm_datasync_sign;
- mdbx_coherent_barrier();
+ mdbx_flush_noncoherent_cpu_writeback();
mdbx_jitter4testing(true);
} else {
rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta),
@@ -5458,7 +5587,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
(uint8_t *)target - env->me_map);
goto fail;
}
- mdbx_invalidate_cache(target, sizeof(MDBX_meta));
+ mdbx_invalidate_mmap_noncoherent_cache(target, sizeof(MDBX_meta));
}
/* LY: step#3 - sync meta-pages. */
@@ -5477,7 +5606,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
} else {
- rc = mdbx_filesync(env->me_fd, false);
+ rc = mdbx_filesync(env->me_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
if (rc != MDBX_SUCCESS)
goto undo;
}
@@ -5582,7 +5711,7 @@ int __cold mdbx_env_create(MDBX_env **penv) {
mdbx_fastmutex_destroy(&env->me_dbi_lock);
goto bailout;
}
- rc = mdbx_fastmutex_init(&env->me_lckless_wmutex);
+ rc = mdbx_fastmutex_init(&env->me_lckless_stub.wmutex);
if (unlikely(rc != MDBX_SUCCESS)) {
mdbx_fastmutex_destroy(&env->me_remap_guard);
mdbx_fastmutex_destroy(&env->me_dbi_lock);
@@ -5601,14 +5730,15 @@ bailout:
return rc;
}
-static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) {
+static int __cold mdbx_env_map(MDBX_env *env, const int is_exclusive,
+ const size_t usedsize) {
int rc = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now,
env->me_dbgeo.upper);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
#ifdef MADV_DONTFORK
- if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK))
+ if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_DONTFORK) != 0))
return errno;
#endif
@@ -5624,22 +5754,81 @@ static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) {
MADV_DONTDUMP);
#endif
-#ifdef MADV_REMOVE
- if (usedsize && (env->me_flags & MDBX_WRITEMAP)) {
- (void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize,
- MADV_REMOVE);
- }
+ if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0) {
+#ifdef MADV_REMOVE_OR_FREE
+ const size_t used_alined2os = mdbx_roundup2(usedsize, env->me_os_psize);
+ if (used_alined2os < env->me_mapsize)
+ (void)madvise(env->me_map + used_alined2os,
+ env->me_mapsize - used_alined2os, MADV_REMOVE_OR_FREE);
#else
- (void)usedsize;
+ (void)usedsize;
#endif
+ }
-#if defined(MADV_RANDOM) && defined(MADV_WILLNEED)
- /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */
- if (madvise(env->me_map, env->me_mapsize,
- (env->me_flags & MDBX_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED))
+#ifdef POSIX_FADV_RANDOM
+ /* this also checks that the file size is valid for a particular FS */
+ rc = posix_fadvise(env->me_fd, 0, env->me_dbgeo.upper, POSIX_FADV_RANDOM);
+ if (unlikely(rc != 0))
+ return rc;
+#elif defined(F_RDAHEAD)
+ if (unlikely(fcntl(env->me_fd, F_RDAHEAD, 0) == -1))
+ return errno;
+#endif
+
+#if defined(MADV_RANDOM)
+ if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_RANDOM) != 0))
+ return errno;
+#elif defined(POSIX_MADV_RANDOM)
+ rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
+ if (unlikely(rc != 0))
return errno;
#endif
+ /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */
+ if (env->me_flags & MDBX_NORDAHEAD) {
+#ifdef POSIX_FADV_DONTNEED
+ rc = posix_fadvise(env->me_fd, 0, env->me_mapsize, POSIX_FADV_DONTNEED);
+ if (unlikely(rc != 0))
+ return rc;
+#endif
+#if defined(MADV_DONTNEED)
+ if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_DONTNEED) != 0))
+ return errno;
+#elif defined(POSIX_MADV_DONTNEED)
+ rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_DONTNEED);
+ if (unlikely(rc != 0))
+ return errno;
+#endif
+ } else {
+#ifdef POSIX_FADV_WILLNEED
+ rc = posix_fadvise(env->me_fd, 0, usedsize, POSIX_FADV_WILLNEED);
+ if (unlikely(rc != 0))
+ return rc;
+#elif defined(F_RDADVISE)
+ struct radvisory hint;
+ hint.ra_offset = 0;
+ hint.ra_count = usedsize;
+ if (unlikely(fcntl(env->me_fd, F_RDADVISE, &hint) == -1))
+ return errno;
+#endif
+#if defined(MADV_WILLNEED)
+ if (unlikely(madvise(env->me_map, usedsize, MADV_WILLNEED) != 0))
+ return errno;
+#elif defined(POSIX_MADV_WILLNEED)
+ rc = posix_madvise(env->me_map, usedsize, POSIX_MADV_WILLNEED);
+ if (unlikely(rc != 0))
+ return errno;
+#endif
+#if defined(_WIN32) || defined(_WIN64)
+ if (mdbx_PrefetchVirtualMemory) {
+ WIN32_MEMORY_RANGE_ENTRY hint;
+ hint.VirtualAddress = env->me_map;
+ hint.NumberOfBytes = usedsize;
+ (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0);
+ }
+#endif /* Windows */
+ }
+
#ifdef USE_VALGRIND
env->me_valgrind_handle =
VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "mdbx");
@@ -5658,8 +5847,10 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
return MDBX_EBADSIGN;
+#if MDBX_TXN_CHECKPID
if (unlikely(env->me_pid != mdbx_getpid()))
env->me_flags |= MDBX_FATAL_ERROR;
+#endif /* MDBX_TXN_CHECKPID */
if (unlikely(env->me_flags & MDBX_FATAL_ERROR))
return MDBX_PANIC;
@@ -5688,6 +5879,10 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
need_unlock = true;
}
MDBX_meta *head = mdbx_meta_head(env);
+ if (!inside_txn) {
+ env->me_txn0->mt_txnid = meta_txnid(env, head, false);
+ mdbx_find_oldest(env->me_txn0);
+ }
if (pagesize < 0)
pagesize = env->me_psize;
@@ -5707,7 +5902,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
if (shrink_threshold < 0)
shrink_threshold = pgno2bytes(env, head->mm_geo.shrink);
- const size_t usedbytes = pgno2bytes(env, head->mm_geo.next);
+ const size_t usedbytes =
+ pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next));
if ((size_t)size_upper < usedbytes) {
rc = MDBX_MAP_FULL;
goto bailout;
@@ -5832,6 +6028,15 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
if (bytes2pgno(env, shrink_threshold) > UINT16_MAX)
shrink_threshold = pgno2bytes(env, UINT16_MAX);
+#ifdef POSIX_FADV_RANDOM
+ if (env->me_fd != INVALID_HANDLE_VALUE) {
+ /* this also checks that the file size is valid for a particular FS */
+ rc = posix_fadvise(env->me_fd, 0, env->me_dbgeo.upper, POSIX_FADV_RANDOM);
+ if (unlikely(rc != 0))
+ goto bailout;
+ }
+#endif
+
/* save user's geo-params for future open/create */
env->me_dbgeo.lower = size_lower;
env->me_dbgeo.now = size_now;
@@ -5900,7 +6105,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
goto bailout;
head = /* base address could be changed */ mdbx_meta_head(env);
}
- env->me_sync_pending += env->me_psize;
+ *env->me_unsynced_pages += 1;
mdbx_meta_set_txnid(env, &meta, mdbx_meta_txnid_stable(env, head) + 1);
rc = mdbx_sync_locked(env, env->me_flags, &meta);
}
@@ -5964,7 +6169,7 @@ int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) {
}
/* Further setup required for opening an MDBX environment */
-static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) {
+static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
uint64_t filesize_before_mmap;
MDBX_meta meta;
int rc = MDBX_RESULT_FALSE;
@@ -6133,9 +6338,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) {
}
}
- err = mdbx_env_map(env, (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE)
- ? 0
- : expected_bytes);
+ err = mdbx_env_map(env, lck_rc /* exclusive status */, expected_bytes);
if (err != MDBX_SUCCESS)
return err;
@@ -6201,7 +6404,8 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) {
if (err)
return err;
- mdbx_invalidate_cache(env->me_map, pgno2bytes(env, NUM_METAS));
+ mdbx_invalidate_mmap_noncoherent_cache(env->me_map,
+ pgno2bytes(env, NUM_METAS));
mdbx_ensure(env, undo_txnid == mdbx_meta_txnid_fluid(env, head));
mdbx_ensure(env, 0 == mdbx_meta_eq_mask(env));
continue;
@@ -6261,7 +6465,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) {
mdbx_ensure(env, mdbx_meta_eq(env, &meta, head));
mdbx_meta_set_txnid(env, &meta, txnid + 1);
- env->me_sync_pending += env->me_psize;
+ *env->me_unsynced_pages += 1;
err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta);
if (err) {
mdbx_info("error %d, while updating meta.geo: "
@@ -6302,10 +6506,14 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
if (MDBX_IS_ERROR(rc))
return rc;
- env->me_oldest = &env->me_oldest_stub;
+ env->me_oldest = &env->me_lckless_stub.oldest;
+ env->me_unsynced_timeout = &env->me_lckless_stub.unsynced_timeout;
+ env->me_autosync_period = &env->me_lckless_stub.autosync_period;
+ env->me_unsynced_pages = &env->me_lckless_stub.autosync_pending;
+ env->me_autosync_threshold = &env->me_lckless_stub.autosync_threshold;
env->me_maxreaders = UINT_MAX;
#ifdef MDBX_OSAL_LOCK
- env->me_wmutex = &env->me_lckless_wmutex;
+ env->me_wmutex = &env->me_lckless_stub.wmutex;
#endif
mdbx_debug("lck-setup:%s%s%s", " lck-less",
(env->me_flags & MDBX_RDONLY) ? " readonly" : "",
@@ -6396,8 +6604,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC;
env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT;
} else {
- if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC &&
- env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC_DEVEL) {
+ if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC) {
mdbx_error("lock region has invalid magic/version");
return ((env->me_lck->mti_magic_and_version >> 8) != MDBX_MAGIC)
? MDBX_INVALID
@@ -6411,7 +6618,11 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
}
mdbx_assert(env, !MDBX_IS_ERROR(rc));
- env->me_oldest = &env->me_lck->mti_oldest;
+ env->me_oldest = &env->me_lck->mti_oldest_reader;
+ env->me_unsynced_timeout = &env->me_lck->mti_unsynced_timeout;
+ env->me_autosync_period = &env->me_lck->mti_autosync_period;
+ env->me_unsynced_pages = &env->me_lck->mti_unsynced_pages;
+ env->me_autosync_threshold = &env->me_lck->mti_autosync_threshold;
#ifdef MDBX_OSAL_LOCK
env->me_wmutex = &env->me_lck->mti_wmutex;
#endif
@@ -6658,6 +6869,10 @@ static void __cold mdbx_env_close0(MDBX_env *env) {
if (env->me_lck)
mdbx_munmap(&env->me_lck_mmap);
env->me_oldest = nullptr;
+ env->me_unsynced_timeout = nullptr;
+ env->me_autosync_period = nullptr;
+ env->me_unsynced_pages = nullptr;
+ env->me_autosync_threshold = nullptr;
mdbx_lck_destroy(env);
if (env->me_lfd != INVALID_HANDLE_VALUE) {
@@ -6673,6 +6888,7 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) {
if (unlikely(!env))
return MDBX_EINVAL;
+
if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
return MDBX_EBADSIGN;
@@ -6712,11 +6928,14 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) {
DeleteCriticalSection(&env->me_windowsbug_lock);
#else
mdbx_ensure(env,
- mdbx_fastmutex_destroy(&env->me_lckless_wmutex) == MDBX_SUCCESS);
- mdbx_ensure(env,
mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS);
#endif /* Windows */
+#ifdef MDBX_OSAL_LOCK
+ mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_lckless_stub.wmutex) ==
+ MDBX_SUCCESS);
+#endif
+
env->me_pid = 0;
env->me_signature = 0;
mdbx_free(env);
@@ -11253,7 +11472,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data,
}
#ifndef MDBX_WBUF
-#define MDBX_WBUF (1024 * 1024)
+#define MDBX_WBUF ((size_t)1024 * 1024)
#endif
#define MDBX_EOF 0x10 /* mdbx_env_copyfd1() is done reading */
@@ -11281,23 +11500,7 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) {
uint8_t *ptr;
int toggle = 0;
int rc;
-
-#if defined(F_SETNOSIGPIPE)
- /* OS X delivers SIGPIPE to the whole process, not the thread that caused it.
- * Disable SIGPIPE using platform specific fcntl. */
- int enabled = 1;
- if (fcntl(my->mc_fd, F_SETNOSIGPIPE, &enabled))
- my->mc_error = errno;
-#endif
-
-#if defined(SIGPIPE) && !defined(_WIN32) && !defined(_WIN64)
- sigset_t set;
- sigemptyset(&set);
- sigaddset(&set, SIGPIPE);
- rc = pthread_sigmask(SIG_BLOCK, &set, NULL);
- if (rc != 0)
- my->mc_error = rc;
-#endif
+ size_t offset = pgno2bytes(my->mc_env, NUM_METAS);
mdbx_condmutex_lock(&my->mc_condmutex);
while (!my->mc_error) {
@@ -11309,18 +11512,12 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) {
ptr = my->mc_wbuf[toggle];
again:
if (wsize > 0 && !my->mc_error) {
- rc = mdbx_write(my->mc_fd, ptr, wsize);
+ rc = mdbx_pwrite(my->mc_fd, ptr, wsize, offset);
if (rc != MDBX_SUCCESS) {
-#if defined(SIGPIPE) && !defined(_WIN32) && !defined(_WIN64)
- if (rc == EPIPE) {
- /* Collect the pending SIGPIPE, otherwise (at least OS X)
- * gives it to the process on thread-exit (ITS#8504). */
- int tmp;
- sigwait(&set, &tmp);
- }
-#endif
my->mc_error = rc;
+ break;
}
+ offset += wsize;
}
/* If there's an overflow page tail, write it too */
@@ -11654,25 +11851,47 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
return rc;
}
+ mdbx_jitter4testing(false);
+ const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
/* Make a snapshot of meta-pages,
* but writing ones after the data was flushed */
- memcpy(buffer, env->me_map, pgno2bytes(env, NUM_METAS));
+ memcpy(buffer, env->me_map, meta_bytes);
MDBX_meta *const headcopy = /* LY: get pointer to the spanshot copy */
(MDBX_meta *)(buffer + ((uint8_t *)mdbx_meta_head(env) - env->me_map));
- const uint64_t size =
- mdbx_roundup2(pgno2bytes(env, headcopy->mm_geo.now), env->me_os_psize);
- mdbx_txn_unlock(env);
-
/* Update signature to steady */
headcopy->mm_datasync_sign = mdbx_meta_sign(headcopy);
+ mdbx_txn_unlock(env);
/* Copy the data */
- rc = mdbx_pwrite(fd, env->me_map + pgno2bytes(env, NUM_METAS),
- pgno2bytes(env, read_txn->mt_next_pgno - NUM_METAS),
- pgno2bytes(env, NUM_METAS));
+ const uint64_t whole_size =
+ mdbx_roundup2(pgno2bytes(env, read_txn->mt_end_pgno), env->me_os_psize);
+ const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno);
+ mdbx_jitter4testing(false);
+#if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE)
+ for (off_t in_offset = meta_bytes; in_offset < (off_t)used_size;) {
+ off_t out_offset = in_offset;
+ ssize_t bytes_copied = copy_file_range(
+ env->me_fd, &in_offset, fd, &out_offset, used_size - in_offset, 0);
+ if (unlikely(bytes_copied <= 0)) {
+ rc = bytes_copied ? errno : MDBX_ENODATA;
+ break;
+ }
+ }
+#else
+ uint8_t *data_buffer = buffer + meta_bytes;
+ for (size_t offset = meta_bytes; offset < used_size;) {
+ const size_t chunk =
+ (MDBX_WBUF < used_size - offset) ? MDBX_WBUF : used_size - offset;
+ memcpy(data_buffer, env->me_map + offset, chunk);
+ rc = mdbx_pwrite(fd, data_buffer, chunk, offset);
+ if (unlikely(rc != MDBX_SUCCESS))
+ break;
+ offset += chunk;
+ }
+#endif
- if (likely(rc == MDBX_SUCCESS))
- rc = mdbx_ftruncate(fd, size);
+ if (likely(rc == MDBX_SUCCESS) && whole_size != used_size)
+ rc = mdbx_ftruncate(fd, whole_size);
return rc;
}
@@ -11689,8 +11908,10 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
if (unlikely(rc != MDBX_SUCCESS))
return rc;
- const size_t buffer_size = pgno2bytes(env, NUM_METAS) +
- ((flags & MDBX_CP_COMPACT) ? MDBX_WBUF * 2 : 0);
+ const size_t buffer_size =
+ pgno2bytes(env, NUM_METAS) +
+ ((flags & MDBX_CP_COMPACT) ? MDBX_WBUF * 2 : MDBX_WBUF);
+
uint8_t *buffer = NULL;
rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer);
if (unlikely(rc != MDBX_SUCCESS))
@@ -11708,7 +11929,7 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
/* Firstly write a stub to meta-pages.
* Now we sure to incomplete copy will not be used. */
memset(buffer, -1, pgno2bytes(env, NUM_METAS));
- rc = mdbx_write(fd, buffer, pgno2bytes(env, NUM_METAS));
+ rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0);
if (likely(rc == MDBX_SUCCESS)) {
memset(buffer, 0, pgno2bytes(env, NUM_METAS));
rc = (flags & MDBX_CP_COMPACT)
@@ -11718,12 +11939,15 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
mdbx_txn_abort(read_txn);
if (likely(rc == MDBX_SUCCESS))
- rc = mdbx_filesync(fd, true);
+ rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
/* Write actual meta */
if (likely(rc == MDBX_SUCCESS))
rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0);
+ if (likely(rc == MDBX_SUCCESS))
+ rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
+
mdbx_memalign_free(buffer);
return rc;
}
@@ -11782,6 +12006,12 @@ int __cold mdbx_env_copy(MDBX_env *env, const char *dest_path, unsigned flags) {
}
int __cold mdbx_env_set_flags(MDBX_env *env, unsigned flags, int onoff) {
+ if (unlikely(!env))
+ return MDBX_EINVAL;
+
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+
if (unlikely(flags & ~CHANGEABLE))
return MDBX_EINVAL;
@@ -11802,6 +12032,9 @@ int __cold mdbx_env_get_flags(MDBX_env *env, unsigned *arg) {
if (unlikely(!env || !arg))
return MDBX_EINVAL;
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+
*arg = env->me_flags & (CHANGEABLE | CHANGELESS);
return MDBX_SUCCESS;
}
@@ -11809,6 +12042,10 @@ int __cold mdbx_env_get_flags(MDBX_env *env, unsigned *arg) {
int __cold mdbx_env_set_userctx(MDBX_env *env, void *ctx) {
if (unlikely(!env))
return MDBX_EINVAL;
+
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+
env->me_userctx = ctx;
return MDBX_SUCCESS;
}
@@ -11820,6 +12057,10 @@ void *__cold mdbx_env_get_userctx(MDBX_env *env) {
int __cold mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) {
if (unlikely(!env))
return MDBX_EINVAL;
+
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+
#if MDBX_DEBUG
env->me_assert_func = func;
return MDBX_SUCCESS;
@@ -11833,6 +12074,9 @@ int __cold mdbx_env_get_path(MDBX_env *env, const char **arg) {
if (unlikely(!env || !arg))
return MDBX_EINVAL;
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+
*arg = env->me_path;
return MDBX_SUCCESS;
}
@@ -11841,6 +12085,9 @@ int __cold mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *arg) {
if (unlikely(!env || !arg))
return MDBX_EINVAL;
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+
*arg = env->me_fd;
return MDBX_SUCCESS;
}
@@ -11850,7 +12097,8 @@ int __cold mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *arg) {
* [in] db the MDBX_db record containing the stats to return.
* [out] arg the address of an MDBX_stat structure to receive the stats.
* Returns 0, this function always succeeds. */
-static int __cold mdbx_stat0(MDBX_env *env, MDBX_db *db, MDBX_stat *arg) {
+static int __cold mdbx_stat0(const MDBX_env *env, const MDBX_db *db,
+ MDBX_stat *arg) {
arg->ms_psize = env->me_psize;
arg->ms_depth = db->md_depth;
arg->ms_branch_pages = db->md_branch_pages;
@@ -11861,54 +12109,108 @@ static int __cold mdbx_stat0(MDBX_env *env, MDBX_db *db, MDBX_stat *arg) {
}
int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *arg, size_t bytes) {
- MDBX_meta *meta;
+ return mdbx_env_stat2(env, NULL, arg, bytes);
+}
- if (unlikely(env == NULL || arg == NULL))
+int __cold mdbx_env_stat2(const MDBX_env *env, const MDBX_txn *txn,
+ MDBX_stat *arg, size_t bytes) {
+ if (unlikely((env == NULL && txn == NULL) || arg == NULL))
return MDBX_EINVAL;
+
+ if (txn) {
+ if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
+ return MDBX_EBADSIGN;
+ if (unlikely(txn->mt_owner != mdbx_thread_self()))
+ return MDBX_THREAD_MISMATCH;
+ }
+ if (env) {
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+ if (txn && unlikely(txn->mt_env != env))
+ return MDBX_EINVAL;
+ }
+
if (unlikely(bytes != sizeof(MDBX_stat)))
return MDBX_EINVAL;
- meta = mdbx_meta_head(env);
- return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg);
+ const MDBX_db *db =
+ txn ? &txn->mt_dbs[MAIN_DBI] : &mdbx_meta_head(env)->mm_dbs[MAIN_DBI];
+ return mdbx_stat0(txn ? txn->mt_env : env, db, arg);
}
int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) {
+ return mdbx_env_info2(env, NULL, arg, bytes);
+}
- if (unlikely(env == NULL || arg == NULL))
+int __cold mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn,
+ MDBX_envinfo *arg, size_t bytes) {
+ if (unlikely((env == NULL && txn == NULL) || arg == NULL))
return MDBX_EINVAL;
- if (bytes != sizeof(MDBX_envinfo))
+ if (txn) {
+ if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
+ return MDBX_EBADSIGN;
+ if (unlikely(txn->mt_owner != mdbx_thread_self()))
+ return MDBX_THREAD_MISMATCH;
+ }
+ if (env) {
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+ if (txn && unlikely(txn->mt_env != env))
+ return MDBX_EINVAL;
+ } else {
+ env = txn->mt_env;
+ }
+
+ if (unlikely(bytes != sizeof(MDBX_envinfo)))
return MDBX_EINVAL;
const MDBX_meta *const meta0 = METAPAGE(env, 0);
const MDBX_meta *const meta1 = METAPAGE(env, 1);
const MDBX_meta *const meta2 = METAPAGE(env, 2);
- const MDBX_meta *meta;
- do {
- meta = mdbx_meta_head(env);
- arg->mi_recent_txnid = mdbx_meta_txnid_fluid(env, meta);
+ while (1) {
+ if (unlikely(env->me_flags & MDBX_FATAL_ERROR))
+ return MDBX_PANIC;
+
+ const MDBX_meta *const recent_meta = mdbx_meta_head(env);
+ arg->mi_recent_txnid = mdbx_meta_txnid_fluid(env, recent_meta);
arg->mi_meta0_txnid = mdbx_meta_txnid_fluid(env, meta0);
arg->mi_meta0_sign = meta0->mm_datasync_sign;
arg->mi_meta1_txnid = mdbx_meta_txnid_fluid(env, meta1);
arg->mi_meta1_sign = meta1->mm_datasync_sign;
arg->mi_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2);
arg->mi_meta2_sign = meta2->mm_datasync_sign;
- arg->mi_last_pgno = meta->mm_geo.next - 1;
- arg->mi_geo.lower = pgno2bytes(env, meta->mm_geo.lower);
- arg->mi_geo.upper = pgno2bytes(env, meta->mm_geo.upper);
- arg->mi_geo.current = pgno2bytes(env, meta->mm_geo.now);
- arg->mi_geo.shrink = pgno2bytes(env, meta->mm_geo.shrink);
- arg->mi_geo.grow = pgno2bytes(env, meta->mm_geo.grow);
+
+ const MDBX_meta *txn_meta = recent_meta;
+ arg->mi_last_pgno = txn_meta->mm_geo.next - 1;
+ arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now);
+ if (txn) {
+ arg->mi_last_pgno = txn->mt_next_pgno - 1;
+ arg->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno);
+
+ const txnid_t wanna_meta_txnid =
+ (txn->mt_flags & MDBX_RDONLY) ? txn->mt_txnid : txn->mt_txnid - 1;
+ txn_meta = (arg->mi_meta0_txnid == wanna_meta_txnid) ? meta0 : txn_meta;
+ txn_meta = (arg->mi_meta1_txnid == wanna_meta_txnid) ? meta1 : txn_meta;
+ txn_meta = (arg->mi_meta2_txnid == wanna_meta_txnid) ? meta2 : txn_meta;
+ }
+ arg->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower);
+ arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper);
+ arg->mi_geo.shrink = pgno2bytes(env, txn_meta->mm_geo.shrink);
+ arg->mi_geo.grow = pgno2bytes(env, txn_meta->mm_geo.grow);
+
arg->mi_mapsize = env->me_mapsize;
mdbx_compiler_barrier();
- } while (unlikely(arg->mi_meta0_txnid != mdbx_meta_txnid_fluid(env, meta0) ||
- arg->mi_meta0_sign != meta0->mm_datasync_sign ||
- arg->mi_meta1_txnid != mdbx_meta_txnid_fluid(env, meta1) ||
- arg->mi_meta1_sign != meta1->mm_datasync_sign ||
- arg->mi_meta2_txnid != mdbx_meta_txnid_fluid(env, meta2) ||
- arg->mi_meta2_sign != meta2->mm_datasync_sign ||
- meta != mdbx_meta_head(env) ||
- arg->mi_recent_txnid != mdbx_meta_txnid_fluid(env, meta)));
+ if (likely(arg->mi_meta0_txnid == mdbx_meta_txnid_fluid(env, meta0) &&
+ arg->mi_meta0_sign == meta0->mm_datasync_sign &&
+ arg->mi_meta1_txnid == mdbx_meta_txnid_fluid(env, meta1) &&
+ arg->mi_meta1_sign == meta1->mm_datasync_sign &&
+ arg->mi_meta2_txnid == mdbx_meta_txnid_fluid(env, meta2) &&
+ arg->mi_meta2_sign == meta2->mm_datasync_sign &&
+ recent_meta == mdbx_meta_head(env) &&
+ arg->mi_recent_txnid == mdbx_meta_txnid_fluid(env, recent_meta)))
+ break;
+ }
arg->mi_maxreaders = env->me_maxreaders;
arg->mi_numreaders = env->me_lck ? env->me_lck->mti_numreaders : INT32_MAX;
@@ -12235,6 +12537,12 @@ static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) {
}
int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) {
+ if (unlikely(!env))
+ return MDBX_EINVAL;
+
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+
if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs))
return MDBX_EINVAL;
@@ -12437,7 +12745,6 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) {
txn->mt_dbs[dbi].md_entries = 0;
txn->mt_dbs[dbi].md_root = P_INVALID;
txn->mt_dbs[dbi].md_seq = 0;
-
txn->mt_flags |= MDBX_TXN_DIRTY;
}
@@ -12561,8 +12868,12 @@ static int __cold mdbx_pid_insert(mdbx_pid_t *ids, mdbx_pid_t pid) {
}
int __cold mdbx_reader_check(MDBX_env *env, int *dead) {
- if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE))
+ if (unlikely(!env))
return MDBX_EINVAL;
+
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+
if (dead)
*dead = 0;
return mdbx_reader_check0(env, false, dead);
@@ -12575,10 +12886,12 @@ int __cold mdbx_reader_check(MDBX_env *env, int *dead) {
int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) {
mdbx_assert(env, rdt_locked >= 0);
+#if MDBX_TXN_CHECKPID
if (unlikely(env->me_pid != mdbx_getpid())) {
env->me_flags |= MDBX_FATAL_ERROR;
return MDBX_PANIC;
}
+#endif /* MDBX_TXN_CHECKPID */
MDBX_lockinfo *const lck = env->me_lck;
if (unlikely(lck == NULL)) {
@@ -12671,36 +12984,51 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) {
}
int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger) {
- unsigned ret = mdbx_runtime_flags;
- mdbx_runtime_flags = flags;
-
-#ifdef __linux__
- if (flags & MDBX_DBG_DUMP) {
- int core_filter_fd = open("/proc/self/coredump_filter", O_TRUNC | O_RDWR);
- if (core_filter_fd >= 0) {
- char buf[32];
- const unsigned r = pread(core_filter_fd, buf, sizeof(buf), 0);
- if (r > 0 && r < sizeof(buf)) {
- buf[r] = 0;
- unsigned long mask = strtoul(buf, NULL, 16);
- if (mask != ULONG_MAX) {
- mask |= 1 << 3 /* Dump file-backed shared mappings */;
- mask |= 1 << 6 /* Dump shared huge pages */;
- mask |= 1 << 8 /* Dump shared DAX pages */;
- unsigned w = snprintf(buf, sizeof(buf), "0x%lx\n", mask);
- if (w > 0 && w < sizeof(buf)) {
- w = pwrite(core_filter_fd, buf, w, 0);
- (void)w;
+ const int rc = mdbx_runtime_flags;
+ if (flags != -1) {
+#if !MDBX_DEBUG
+ flags &= MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN;
+#else
+ flags &= MDBX_DBG_ASSERT | MDBX_DBG_PRINT | MDBX_DBG_TRACE |
+ MDBX_DBG_EXTRA | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | MDBX_DBG_DUMP |
+ MDBX_DBG_LEGACY_MULTIOPEN;
+#endif
+#if defined(__linux__) || defined(__gnu_linux__)
+ if ((mdbx_runtime_flags ^ flags) & MDBX_DBG_DUMP) {
+ /* http://man7.org/linux/man-pages/man5/core.5.html */
+ const unsigned long dump_bits =
+ 1 << 3 /* Dump file-backed shared mappings */
+ | 1 << 6 /* Dump shared huge pages */
+ | 1 << 8 /* Dump shared DAX pages */;
+ const int core_filter_fd =
+ open("/proc/self/coredump_filter", O_TRUNC | O_RDWR);
+ if (core_filter_fd != -1) {
+ char buf[32];
+ intptr_t bytes = pread(core_filter_fd, buf, sizeof(buf), 0);
+ if (bytes > 0 && (size_t)bytes < sizeof(buf)) {
+ buf[bytes] = 0;
+ const unsigned long present_mask = strtoul(buf, NULL, 16);
+ const unsigned long wanna_mask = (flags & MDBX_DBG_DUMP)
+ ? present_mask | dump_bits
+ : present_mask & ~dump_bits;
+ if (wanna_mask != present_mask) {
+ bytes = snprintf(buf, sizeof(buf), "0x%lx\n", wanna_mask);
+ if (bytes > 0 && (size_t)bytes < sizeof(buf)) {
+ bytes = pwrite(core_filter_fd, buf, bytes, 0);
+ (void)bytes;
+ }
}
}
+ close(core_filter_fd);
}
- close(core_filter_fd);
}
+#endif /* Linux */
+ mdbx_runtime_flags = flags;
}
-#endif /* __linux__ */
- mdbx_debug_logger = logger;
- return ret;
+ if (-1 != (intptr_t)logger)
+ mdbx_debug_logger = logger;
+ return rc;
}
static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) {
@@ -12769,7 +13097,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) {
if (rc > 1) {
asleep->mr_tid = 0;
asleep->mr_pid = 0;
- mdbx_coherent_barrier();
+ mdbx_flush_noncoherent_cpu_writeback();
}
}
}
@@ -12788,8 +13116,31 @@ int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes) {
if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
return MDBX_EBADSIGN;
- env->me_sync_threshold = bytes;
- return env->me_map ? mdbx_env_sync(env, false) : MDBX_SUCCESS;
+ if (unlikely(env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)))
+ return MDBX_EACCESS;
+
+ if (unlikely(!env->me_map))
+ return MDBX_EPERM;
+
+ *env->me_autosync_threshold = bytes2pgno(env, bytes + env->me_psize - 1);
+ return bytes ? mdbx_env_sync(env, false) : MDBX_SUCCESS;
+}
+
+int __cold mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) {
+ if (unlikely(!env))
+ return MDBX_EINVAL;
+
+ if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
+ return MDBX_EBADSIGN;
+
+ if (unlikely(env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)))
+ return MDBX_EACCESS;
+
+ if (unlikely(!env->me_map))
+ return MDBX_EPERM;
+
+ *env->me_autosync_period = mdbx_osal_16dot16_to_monotime(seconds_16dot16);
+ return seconds_16dot16 ? mdbx_env_sync(env, false) : MDBX_SUCCESS;
}
int __cold mdbx_env_set_oomfunc(MDBX_env *env, MDBX_oom_func *oomfunc) {
@@ -12811,7 +13162,7 @@ MDBX_oom_func *__cold mdbx_env_get_oomfunc(MDBX_env *env) {
#ifdef __SANITIZE_THREAD__
/* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */
-__attribute__((no_sanitize_thread, noinline))
+__attribute__((__no_sanitize_thread__, __noinline__))
#endif
int mdbx_txn_straggler(MDBX_txn *txn, int *percent)
{
@@ -12849,9 +13200,9 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent)
}
typedef struct mdbx_walk_ctx {
- MDBX_txn *mw_txn;
void *mw_user;
MDBX_pgvisitor_func *mw_visitor;
+ MDBX_cursor mw_cursor;
} mdbx_walk_ctx_t;
/* Depth-first tree traversal. */
@@ -12860,13 +13211,8 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi,
if (unlikely(pgno == P_INVALID))
return MDBX_SUCCESS; /* empty db */
- MDBX_cursor mc;
- memset(&mc, 0, sizeof(mc));
- mc.mc_snum = 1;
- mc.mc_txn = ctx->mw_txn;
-
MDBX_page *mp;
- int rc = mdbx_page_get(&mc, pgno, &mp, NULL);
+ int rc = mdbx_page_get(&ctx->mw_cursor, pgno, &mp, NULL);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
@@ -12922,7 +13268,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi,
MDBX_page *op;
pgno_t large_pgno;
memcpy(&large_pgno, NODEDATA(node), sizeof(pgno_t));
- rc = mdbx_page_get(&mc, large_pgno, &op, NULL);
+ rc = mdbx_page_get(&ctx->mw_cursor, large_pgno, &op, NULL);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
@@ -12934,13 +13280,14 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi,
const size_t over_header = PAGEHDRSZ;
const size_t over_payload = NODEDSZ(node);
- const size_t over_unused = pgno2bytes(ctx->mw_txn->mt_env, op->mp_pages) -
- over_payload - over_header;
-
- rc = ctx->mw_visitor(large_pgno, op->mp_pages, ctx->mw_user, deep, dbi,
- pgno2bytes(ctx->mw_txn->mt_env, op->mp_pages),
- MDBX_page_large, 1, over_payload, over_header,
- over_unused);
+ const size_t over_unused =
+ pgno2bytes(ctx->mw_cursor.mc_txn->mt_env, op->mp_pages) -
+ over_payload - over_header;
+
+ rc = ctx->mw_visitor(
+ large_pgno, op->mp_pages, ctx->mw_user, deep, dbi,
+ pgno2bytes(ctx->mw_cursor.mc_txn->mt_env, op->mp_pages),
+ MDBX_page_large, 1, over_payload, over_header, over_unused);
} break;
case F_SUBDATA /* sub-db */: {
@@ -12957,7 +13304,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi,
} break;
case F_DUPDATA /* short sub-page */: {
- if (NODEDSZ(node) < PAGEHDRSZ)
+ if (unlikely(NODEDSZ(node) < PAGEHDRSZ))
return MDBX_CORRUPTED;
MDBX_page *sp = NODEDATA(node);
@@ -13013,11 +13360,11 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi,
}
rc = ctx->mw_visitor(mp->mp_pgno, 1, ctx->mw_user, deep, dbi,
- ctx->mw_txn->mt_env->me_psize, type, nkeys, payload_size,
- header_size, unused_size + align_bytes);
+ ctx->mw_cursor.mc_txn->mt_env->me_psize, type, nkeys,
+ payload_size, header_size, unused_size + align_bytes);
if (unlikely(rc != MDBX_SUCCESS))
- return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : MDBX_SUCCESS;
+ return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
for (int i = 0; i < nkeys; i++) {
if (type == MDBX_page_dupfixed_leaf)
@@ -13026,6 +13373,11 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi,
MDBX_node *node = NODEPTR(mp, i);
if (type == MDBX_page_branch) {
rc = mdbx_env_walk(ctx, dbi, NODEPGNO(node), deep + 1);
+ if (unlikely(rc != MDBX_SUCCESS)) {
+ if (rc != MDBX_RESULT_TRUE)
+ return rc;
+ break;
+ }
continue;
}
@@ -13084,7 +13436,9 @@ int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor,
return MDBX_THREAD_MISMATCH;
mdbx_walk_ctx_t ctx;
- ctx.mw_txn = txn;
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.mw_cursor.mc_snum = 1;
+ ctx.mw_cursor.mc_txn = txn;
ctx.mw_user = user;
ctx.mw_visitor = visitor;
@@ -13131,11 +13485,9 @@ int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary) {
txn->mt_canary.v = txn->mt_txnid;
if ((txn->mt_flags & MDBX_TXN_DIRTY) == 0) {
- MDBX_env *env = txn->mt_env;
txn->mt_flags |= MDBX_TXN_DIRTY;
- env->me_sync_pending += env->me_psize;
+ *txn->mt_env->me_unsynced_pages += 1;
}
-
return MDBX_SUCCESS;
}
@@ -14118,7 +14470,7 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data,
//----------------------------------------------------------------------------
#ifdef __SANITIZE_ADDRESS__
-LIBMDBX_API __attribute__((weak)) const char *__asan_default_options() {
+LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() {
return "symbolize=1:allow_addr2line=1:"
#ifdef _DEBUG
"debug=1:"
diff --git a/libs/libmdbx/src/src/osal.c b/libs/libmdbx/src/src/osal.c
index e3997e2a45..7d0ff083ed 100644
--- a/libs/libmdbx/src/src/osal.c
+++ b/libs/libmdbx/src/src/osal.c
@@ -1,4 +1,4 @@
-/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
+/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
/*
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
@@ -153,13 +153,52 @@ typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 {
/*----------------------------------------------------------------------------*/
-#if !defined(_MSC_VER) && \
+#if _POSIX_C_SOURCE > 200212 && \
/* workaround for avoid musl libc wrong prototype */ ( \
defined(__GLIBC__) || defined(__GNU_LIBRARY__))
/* Prototype should match libc runtime. ISO POSIX (2003) & LSB 1.x-3.x */
-__nothrow __noreturn void __assert_fail(const char *assertion, const char *file,
- unsigned line, const char *function);
-#endif /* _MSC_VER */
+__extern_C void __assert_fail(const char *assertion, const char *file,
+ unsigned line, const char *function)
+#ifdef __THROW
+ __THROW
+#else
+ __nothrow
+#endif /* __THROW */
+ __noreturn;
+
+#elif defined(__APPLE__) || defined(__MACH__)
+__extern_C void __assert_rtn(const char *function, const char *file, int line,
+ const char *assertion) /* __nothrow */
+#ifdef __dead2
+ __dead2
+#else
+ __noreturn
+#endif /* __dead2 */
+#ifdef __disable_tail_calls
+ __disable_tail_calls
+#endif /* __disable_tail_calls */
+ ;
+
+#define __assert_fail(assertion, file, line, function) \
+ __assert_rtn(function, file, line, assertion)
+#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+ defined(__BSD__) || defined(__NETBSD__) || defined(__bsdi__) || \
+ defined(__DragonFly__)
+__extern_C void __assert(const char *function, const char *file, int line,
+ const char *assertion) /* __nothrow */
+#ifdef __dead2
+ __dead2
+#else
+ __noreturn
+#endif /* __dead2 */
+#ifdef __disable_tail_calls
+ __disable_tail_calls
+#endif /* __disable_tail_calls */
+ ;
+#define __assert_fail(assertion, file, line, function) \
+ __assert(function, file, line, assertion)
+
+#endif /* __assert_fail */
void __cold mdbx_assert_fail(const MDBX_env *env, const char *msg,
const char *func, int line) {
@@ -271,12 +310,15 @@ int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result) {
(void)alignment;
*result = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
return *result ? MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */;
-#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L
- *result = memalign(alignment, bytes);
+#elif defined(_ISOC11_SOURCE)
+ *result = aligned_alloc(alignment, bytes);
return *result ? MDBX_SUCCESS : errno;
#elif _POSIX_VERSION >= 200112L
*result = nullptr;
return posix_memalign(result, alignment, bytes);
+#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L
+ *result = memalign(alignment, bytes);
+ return *result ? MDBX_SUCCESS : errno;
#else
#error FIXME
#endif
@@ -519,16 +561,30 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode,
(void)exclusive;
#ifdef O_CLOEXEC
flags |= O_CLOEXEC;
-#endif
+#endif /* O_CLOEXEC */
*fd = open(pathname, flags, mode);
if (*fd < 0)
return errno;
-#if defined(FD_CLOEXEC) && defined(F_GETFD)
- flags = fcntl(*fd, F_GETFD);
- if (flags >= 0)
- (void)fcntl(*fd, F_SETFD, flags | FD_CLOEXEC);
-#endif
+
+#if defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
+ int fd_flags = fcntl(*fd, F_GETFD);
+ if (fd_flags != -1)
+ (void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC);
+#endif /* FD_CLOEXEC && !O_CLOEXEC */
+
+ if ((flags & (O_RDONLY | O_WRONLY | O_RDWR)) == O_WRONLY) {
+ /* assume for MDBX_env_copy() and friends output */
+#if defined(O_DIRECT)
+ int fd_flags = fcntl(*fd, F_GETFD);
+ if (fd_flags != -1)
+ (void)fcntl(*fd, F_SETFL, fd_flags | O_DIRECT);
+#endif /* O_DIRECT */
+#if defined(F_NOCACHE)
+ (void)fcntl(*fd, F_NOCACHE, 1);
+#endif /* F_NOCACHE */
+ }
#endif
+
return MDBX_SUCCESS;
}
@@ -544,7 +600,6 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) {
if (bytes > MAX_WRITE)
return MDBX_EINVAL;
#if defined(_WIN32) || defined(_WIN64)
-
OVERLAPPED ov;
ov.hEvent = 0;
ov.Offset = (DWORD)offset;
@@ -569,37 +624,43 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) {
int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes,
uint64_t offset) {
+ while (true) {
#if defined(_WIN32) || defined(_WIN64)
- if (bytes > MAX_WRITE)
- return ERROR_INVALID_PARAMETER;
-
- OVERLAPPED ov;
- ov.hEvent = 0;
- ov.Offset = (DWORD)offset;
- ov.OffsetHigh = HIGH_DWORD(offset);
+ OVERLAPPED ov;
+ ov.hEvent = 0;
+ ov.Offset = (DWORD)offset;
+ ov.OffsetHigh = HIGH_DWORD(offset);
- DWORD written;
- if (likely(WriteFile(fd, buf, (DWORD)bytes, &written, &ov)))
- return (bytes == written) ? MDBX_SUCCESS : MDBX_EIO /* ERROR_WRITE_FAULT */;
- return GetLastError();
+ DWORD written;
+ if (unlikely(!WriteFile(fd, buf,
+ (bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE,
+ &written, &ov)))
+ return GetLastError();
+ if (likely(bytes == written))
+ return MDBX_SUCCESS;
#else
- int rc;
- intptr_t written;
- do {
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
- written = pwrite(fd, buf, bytes, offset);
+ const intptr_t written =
+ pwrite(fd, buf, (bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset);
if (likely(bytes == (size_t)written))
return MDBX_SUCCESS;
- rc = errno;
- } while (rc == EINTR);
- return (written < 0) ? rc : MDBX_EIO /* Use which error code (ENOSPC)? */;
+ if (written < 0) {
+ const int rc = errno;
+ if (rc != EINTR)
+ return rc;
+ continue;
+ }
#endif
+ bytes -= written;
+ offset += written;
+ buf = (char *)buf + written;
+ }
}
int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt,
uint64_t offset, size_t expected_written) {
-#if defined(_WIN32) || defined(_WIN64)
+#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__)
size_t written = 0;
for (int i = 0; i < iovcnt; ++i) {
int rc = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
@@ -625,57 +686,23 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt,
#endif
}
-int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) {
-#ifdef SIGPIPE
- sigset_t set, old;
- sigemptyset(&set);
- sigaddset(&set, SIGPIPE);
- int rc = pthread_sigmask(SIG_BLOCK, &set, &old);
- if (rc != 0)
- return rc;
-#endif
-
- const char *ptr = buf;
- for (;;) {
- size_t chunk = (MAX_WRITE < bytes) ? MAX_WRITE : bytes;
+int mdbx_filesync(mdbx_filehandle_t fd, enum mdbx_syncmode_bits mode_bits) {
#if defined(_WIN32) || defined(_WIN64)
- DWORD written;
- if (unlikely(!WriteFile(fd, ptr, (DWORD)chunk, &written, NULL)))
- return GetLastError();
+ return ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) == 0 ||
+ FlushFileBuffers(fd))
+ ? MDBX_SUCCESS
+ : GetLastError();
#else
- intptr_t written = write(fd, ptr, chunk);
- if (written < 0) {
- int rc = errno;
-#ifdef SIGPIPE
- if (rc == EPIPE) {
- /* Collect the pending SIGPIPE, otherwise at least OS X
- * gives it to the process on thread-exit (ITS#8504). */
- int tmp;
- sigwait(&set, &tmp);
- written = 0;
- continue;
- }
- pthread_sigmask(SIG_SETMASK, &old, NULL);
-#endif
- return rc;
- }
-#endif
- if (likely(bytes == (size_t)written)) {
-#ifdef SIGPIPE
- pthread_sigmask(SIG_SETMASK, &old, NULL);
-#endif
- return MDBX_SUCCESS;
- }
- ptr += written;
- bytes -= written;
- }
-}
-int mdbx_filesync(mdbx_filehandle_t fd, bool filesize_changed) {
-#if defined(_WIN32) || defined(_WIN64)
- (void)filesize_changed;
- return FlushFileBuffers(fd) ? MDBX_SUCCESS : GetLastError();
-#else
+#if defined(__APPLE__) && \
+ MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY
+ if (mode_bits & MDBX_SYNC_IODQ)
+ return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno;
+#endif /* MacOS */
+#if defined(__linux__) || defined(__gnu_linux__)
+ if (mode_bits == MDBX_SYNC_SIZE && mdbx_linux_kernel_version >= 0x03060000)
+ return MDBX_SUCCESS;
+#endif /* Linux */
int rc;
do {
#if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0
@@ -684,12 +711,12 @@ int mdbx_filesync(mdbx_filehandle_t fd, bool filesize_changed) {
*
* For more info about of a corresponding fdatasync() bug
* see http://www.spinics.net/lists/linux-ext4/msg33714.html */
- if (!filesize_changed) {
+ if ((mode_bits & MDBX_SYNC_SIZE) == 0) {
if (fdatasync(fd) == 0)
return MDBX_SUCCESS;
} else
#else
- (void)filesize_changed;
+ (void)mode_bits;
#endif
if (fsync(fd) == 0)
return MDBX_SUCCESS;
@@ -699,22 +726,6 @@ int mdbx_filesync(mdbx_filehandle_t fd, bool filesize_changed) {
#endif
}
-int mdbx_filesize_sync(mdbx_filehandle_t fd) {
-#if defined(_WIN32) || defined(_WIN64)
- (void)fd;
- /* Nothing on Windows (i.e. newer 100% steady) */
- return MDBX_SUCCESS;
-#else
- for (;;) {
- if (fsync(fd) == 0)
- return MDBX_SUCCESS;
- int rc = errno;
- if (rc != EINTR)
- return rc;
- }
-#endif
-}
-
int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) {
#if defined(_WIN32) || defined(_WIN64)
BY_HANDLE_FILE_INFORMATION info;
@@ -803,8 +814,21 @@ int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async) {
return MDBX_SUCCESS;
return GetLastError();
#else
+#ifdef __linux__
+ if (async && mdbx_linux_kernel_version > 0x02061300)
+ /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op,
+ since the kernel properly tracks dirty pages and flushes them to storage
+ as necessary. */
+ return MDBX_SUCCESS;
+#endif /* Linux */
const int mode = async ? MS_ASYNC : MS_SYNC;
- return (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno;
+ int rc = (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno;
+#if defined(__APPLE__) && \
+ MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY
+ if (rc == MDBX_SUCCESS && mode == MS_SYNC)
+ rc = likely(fcntl(map->fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno;
+#endif /* MacOS */
+ return rc;
#endif
}
@@ -1179,11 +1203,24 @@ retry_mapview:;
return rc;
#else
if (limit != map->length) {
- void *ptr = mremap(map->address, map->length, limit, MREMAP_MAYMOVE);
- if (ptr == MAP_FAILED)
- return errno;
+#if defined(_GNU_SOURCE) && \
+ !(defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+ defined(__BSD__) || defined(__NETBSD__) || defined(__bsdi__) || \
+ defined(__DragonFly__) || defined(__APPLE__) || defined(__MACH__))
+ void *ptr = mremap(map->address, map->length, limit,
+ /* LY: in case changing the mapping size calling code
+ must guarantees the absence of competing threads, and
+ a willingness to another base address */
+ MREMAP_MAYMOVE);
+ if (ptr == MAP_FAILED) {
+ int err = errno;
+ return (err == EAGAIN || err == ENOMEM) ? MDBX_RESULT_TRUE : err;
+ }
map->address = ptr;
map->length = limit;
+#else
+ return MDBX_RESULT_TRUE;
+#endif /* mremap() <= _GNU_SOURCE && !__FreeBSD__ */
}
return (flags & MDBX_RDONLY) ? MDBX_SUCCESS : mdbx_ftruncate(map->fd, size);
#endif
@@ -1214,3 +1251,71 @@ __cold void mdbx_osal_jitter(bool tiny) {
#endif
}
}
+
+#if defined(_WIN32) || defined(_WIN64)
+#elif defined(__APPLE__) || defined(__MACH__)
+#include <mach/mach_time.h>
+#elif defined(__linux__) || defined(__gnu_linux__)
+static __cold clockid_t choice_monoclock() {
+ struct timespec probe;
+#if defined(CLOCK_BOOTTIME)
+ if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0)
+ return CLOCK_BOOTTIME;
+#elif defined(CLOCK_MONOTONIC_RAW)
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &probe) == 0)
+ return CLOCK_MONOTONIC_RAW;
+#elif defined(CLOCK_MONOTONIC_COARSE)
+ if (clock_gettime(CLOCK_MONOTONIC_COARSE, &probe) == 0)
+ return CLOCK_MONOTONIC_COARSE;
+#endif
+ return CLOCK_MONOTONIC;
+}
+#endif
+
+uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) {
+#if defined(_WIN32) || defined(_WIN64)
+ static LARGE_INTEGER performance_frequency;
+ if (performance_frequency.QuadPart == 0)
+ QueryPerformanceFrequency(&performance_frequency);
+ const uint64_t ratio = performance_frequency.QuadPart;
+#elif defined(__APPLE__) || defined(__MACH__)
+ static uint64_t ratio;
+ if (!ratio) {
+ mach_timebase_info_data_t ti;
+ mach_timebase_info(&ti);
+ ratio = UINT64_C(1000000000) * ti.denom / ti.numer;
+ }
+#else
+ const uint64_t ratio = UINT64_C(1000000000);
+#endif
+ return (ratio * seconds_16dot16 + 32768) >> 16;
+}
+
+uint64_t mdbx_osal_monotime(void) {
+#if defined(_WIN32) || defined(_WIN64)
+ LARGE_INTEGER counter;
+ counter.QuadPart = 0;
+ QueryPerformanceCounter(&counter);
+ return counter.QuadPart;
+#elif defined(__APPLE__) || defined(__MACH__)
+ return mach_absolute_time();
+#else
+
+#if defined(__linux__) || defined(__gnu_linux__)
+ static clockid_t posix_clockid = -1;
+ if (unlikely(posix_clockid < 0))
+ posix_clockid = choice_monoclock();
+#elif defined(CLOCK_MONOTONIC)
+#define posix_clockid CLOCK_MONOTONIC
+#else
+#define posix_clockid CLOCK_REALTIME
+#endif
+
+ struct timespec ts;
+ if (unlikely(clock_gettime(posix_clockid, &ts) != 0)) {
+ ts.tv_nsec = 0;
+ ts.tv_sec = 0;
+ }
+ return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec;
+#endif
+}
diff --git a/libs/libmdbx/src/src/osal.h b/libs/libmdbx/src/src/osal.h
index 0208a52254..bd51f34a81 100644
--- a/libs/libmdbx/src/src/osal.h
+++ b/libs/libmdbx/src/src/osal.h
@@ -1,4 +1,4 @@
-/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
+/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
/*
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
@@ -41,7 +41,6 @@
/*----------------------------------------------------------------------------*/
/* C99 includes */
-
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -50,11 +49,32 @@
#include <assert.h>
#include <fcntl.h>
#include <limits.h>
-#include <malloc.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
+/* C11 stdalign.h */
+#if __has_include(<stdalign.h>)
+#include <stdalign.h>
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#define alignas(N) _Alignas(N)
+#elif defined(_MSC_VER)
+#define alignas(N) __declspec(align(N))
+#elif __has_attribute(__aligned__) || defined(__GNUC__)
+#define alignas(N) __attribute__((__aligned__(N)))
+#else
+#error "FIXME: Required _alignas() or equivalent."
+#endif
+
+/*----------------------------------------------------------------------------*/
+/* Systems includes */
+
+#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+ defined(__BSD__) || defined(__NETBSD__) || defined(__bsdi__) || \
+ defined(__DragonFly__) || defined(__APPLE__) || defined(__MACH__)
+#include <sys/cdefs.h>
+#else
+#include <malloc.h>
#ifndef _POSIX_C_SOURCE
#ifdef _POSIX_SOURCE
#define _POSIX_C_SOURCE 1
@@ -62,14 +82,12 @@
#define _POSIX_C_SOURCE 0
#endif
#endif
+#endif /* !xBSD */
#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE 0
#endif
-/*----------------------------------------------------------------------------*/
-/* Systems includes */
-
#if defined(_WIN32) || defined(_WIN64)
#define WIN32_LEAN_AND_MEAN
#include <tlhelp32.h>
@@ -174,6 +192,22 @@ typedef pthread_mutex_t mdbx_fastmutex_t;
#define SSIZE_MAX INTPTR_MAX
#endif
+#if !defined(MADV_DODUMP) && defined(MADV_CORE)
+#define MADV_DODUMP MADV_CORE
+#endif /* MADV_CORE -> MADV_DODUMP */
+
+#if !defined(MADV_DONTDUMP) && defined(MADV_NOCORE)
+#define MADV_DONTDUMP MADV_NOCORE
+#endif /* MADV_NOCORE -> MADV_DONTDUMP */
+
+#ifndef MADV_REMOVE_OR_FREE
+#ifdef MADV_REMOVE
+#define MADV_REMOVE_OR_FREE MADV_REMOVE
+#elif defined(MADV_FREE)
+#define MADV_REMOVE_OR_FREE MADV_FREE
+#endif
+#endif /* MADV_REMOVE_OR_FREE */
+
#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
defined(i486) || defined(__i486) || defined(__i486__) || \
defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \
@@ -378,14 +412,14 @@ static __inline void mdbx_memory_barrier(void) {
/*----------------------------------------------------------------------------*/
/* Cache coherence and invalidation */
-#ifndef MDBX_CACHE_IS_COHERENT
+#ifndef MDBX_CPU_WRITEBACK_IS_COHERENT
#if defined(__ia32__) || defined(__e2k__) || defined(__hppa) || \
defined(__hppa__)
-#define MDBX_CACHE_IS_COHERENT 1
+#define MDBX_CPU_WRITEBACK_IS_COHERENT 1
#else
-#define MDBX_CACHE_IS_COHERENT 0
+#define MDBX_CPU_WRITEBACK_IS_COHERENT 0
#endif
-#endif /* MDBX_CACHE_IS_COHERENT */
+#endif /* MDBX_CPU_WRITEBACK_IS_COHERENT */
#ifndef MDBX_CACHELINE_SIZE
#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE)
@@ -397,40 +431,54 @@ static __inline void mdbx_memory_barrier(void) {
#endif
#endif /* MDBX_CACHELINE_SIZE */
-#if MDBX_CACHE_IS_COHERENT
-#define mdbx_coherent_barrier() mdbx_compiler_barrier()
+#if MDBX_CPU_WRITEBACK_IS_COHERENT
+#define mdbx_flush_noncoherent_cpu_writeback() mdbx_compiler_barrier()
#else
-#define mdbx_coherent_barrier() mdbx_memory_barrier()
+#define mdbx_flush_noncoherent_cpu_writeback() mdbx_memory_barrier()
#endif
-#if defined(__mips) || defined(__mips__) || defined(__mips64) || \
- defined(__mips64) || defined(_M_MRX000) || defined(_MIPS_)
-/* Only MIPS has explicit cache control */
+#if __has_include(<sys/cachectl.h>)
+#include <sys/cachectl.h>
+#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \
+ defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \
+ defined(__MWERKS__) || defined(__sgi)
+/* MIPS should have explicit cache control */
#include <sys/cachectl.h>
#endif
-static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) {
- mdbx_coherent_barrier();
+#ifndef MDBX_CPU_CACHE_MMAP_NONCOHERENT
#if defined(__mips) || defined(__mips__) || defined(__mips64) || \
- defined(__mips64) || defined(_M_MRX000) || defined(_MIPS_)
-#if defined(DCACHE)
+ defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \
+ defined(__MWERKS__) || defined(__sgi)
+/* MIPS has cache coherency issues. */
+#define MDBX_CPU_CACHE_MMAP_NONCOHERENT 1
+#else
+/* LY: assume no relevant mmap/dcache issues. */
+#define MDBX_CPU_CACHE_MMAP_NONCOHERENT 0
+#endif
+#endif /* ndef MDBX_CPU_CACHE_MMAP_NONCOHERENT */
+
+static __inline void mdbx_invalidate_mmap_noncoherent_cache(void *addr,
+ size_t nbytes) {
+#if MDBX_CPU_CACHE_MMAP_NONCOHERENT
+#ifdef DCACHE
/* MIPS has cache coherency issues.
* Note: for any nbytes >= on-chip cache size, entire is flushed. */
cacheflush(addr, nbytes, DCACHE);
#else
-#error "Sorry, cacheflush() for MIPS not implemented"
-#endif /* __mips__ */
-#else
- /* LY: assume no relevant mmap/dcache issues. */
+#error "Oops, cacheflush() not available"
+#endif /* DCACHE */
+#else /* MDBX_CPU_CACHE_MMAP_NONCOHERENT */
(void)addr;
(void)nbytes;
-#endif
+#endif /* MDBX_CPU_CACHE_MMAP_NONCOHERENT */
}
/*----------------------------------------------------------------------------*/
/* libc compatibility stuff */
-#if __GLIBC_PREREQ(2, 1)
+#if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \
+ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE))
#define mdbx_asprintf asprintf
#define mdbx_vasprintf vasprintf
#else
@@ -444,6 +492,10 @@ int mdbx_vasprintf(char **strp, const char *fmt, va_list ap);
/* max bytes to write in one call */
#define MAX_WRITE UINT32_C(0x3fff0000)
+#if defined(__linux__) || defined(__gnu_linux__)
+extern uint32_t mdbx_linux_kernel_version;
+#endif /* Linux */
+
/* Get the size of a memory page for the system.
* This is the basic size that the platform's memory manager uses, and is
* fundamental to the use of memory-mapped files. */
@@ -494,14 +546,19 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt,
int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset);
int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count,
uint64_t offset);
-int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t count);
int mdbx_thread_create(mdbx_thread_t *thread,
THREAD_RESULT(THREAD_CALL *start_routine)(void *),
void *arg);
int mdbx_thread_join(mdbx_thread_t thread);
-int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync);
+enum mdbx_syncmode_bits {
+ MDBX_SYNC_DATA = 1,
+ MDBX_SYNC_SIZE = 2,
+ MDBX_SYNC_IODQ = 4
+};
+
+int mdbx_filesync(mdbx_filehandle_t fd, enum mdbx_syncmode_bits mode_bits);
int mdbx_filesize_sync(mdbx_filehandle_t fd);
int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length);
int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos);
@@ -560,6 +617,8 @@ static __inline mdbx_tid_t mdbx_thread_self(void) {
}
void mdbx_osal_jitter(bool tiny);
+uint64_t mdbx_osal_monotime(void);
+uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16);
/*----------------------------------------------------------------------------*/
/* lck stuff */
@@ -572,27 +631,82 @@ void mdbx_osal_jitter(bool tiny);
#define MDBX_OSAL_LOCK_SIGN UINT32_C(0x8017)
#endif /* MDBX_OSAL_LOCK */
-#ifdef MDBX_OSAL_LOCK
-#define MDBX_OSAL_LOCK_SIZE sizeof(MDBX_OSAL_LOCK)
-#else
-#define MDBX_OSAL_LOCK_SIZE 0
-#endif /* MDBX_OSAL_LOCK_SIZE */
-
+/// \brief Инициализация объектов синхронизации внутри текущего процесса
+/// связанных с экземпляром MDBX_env.
+/// \return Код ошибки или 0 в случае успеха.
int mdbx_lck_init(MDBX_env *env);
+/// \brief Отключение от общих межпроцесных объектов и разрушение объектов
+/// синхронизации внутри текущего процесса связанных с экземпляром MDBX_env.
+void mdbx_lck_destroy(MDBX_env *env);
+
+/// \brief Подключение к общим межпроцесным объектам блокировки с попыткой
+/// захвата блокировки максимального уровня (разделяемой при недоступности
+/// эксклюзивной).
+/// В зависимости от реализации и/или платформы (Windows) может
+/// захватывать блокировку не-операционного супер-уровня (например, для
+/// инициализации разделяемых объектов синхронизации), которая затем будет
+/// понижена до операционно-эксклюзивной или разделяемой посредством
+/// явного вызова mdbx_lck_downgrade().
+/// \return
+/// MDBX_RESULT_TRUE (-1) - если удалось захватить эксклюзивную блокировку и,
+/// следовательно, текущий процесс является первым и единственным
+/// после предыдущего использования БД.
+/// MDBX_RESULT_FALSE (0) - если удалось захватить только разделяемую
+/// блокировку и, следовательно, БД уже открыта и используется другими
+/// процессами.
+/// Иначе (не 0 и не -1) - код ошибки.
int mdbx_lck_seize(MDBX_env *env);
+
+/// \brief Снижает уровень первоначальной захваченной блокировки до
+/// операционного уровня определяемого аргументом.
+/// \param
+/// complete = TRUE - понижение до разделяемой блокировки.
+/// complete = FALSE - понижение до эксклюзивной операционной блокировки.
+/// \return Код ошибки или 0 в случае успеха.
int mdbx_lck_downgrade(MDBX_env *env, bool complete);
-void mdbx_lck_destroy(MDBX_env *env);
+/// \brief Блокирует lck-файл и/или таблицу читателей для (де)регистрации.
+/// \return Код ошибки или 0 в случае успеха.
int mdbx_rdt_lock(MDBX_env *env);
+
+/// \brief Разблокирует lck-файл и/или таблицу читателей после (де)регистрации.
void mdbx_rdt_unlock(MDBX_env *env);
+/// \brief Захватывает блокировку для изменения БД (при старте пишущей
+/// транзакции). Транзакции чтения при этом никак не блокируются.
+/// \return Код ошибки или 0 в случае успеха.
LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dontwait);
+
+/// \brief Освобождает блокировку по окончанию изменения БД (после завершения
+/// пишущей транзакции).
LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env);
+/// \brief Устанавливает alive-флажок присутствия (индицирующую блокировку)
+/// читателя для pid текущего процесса. Функции может выполнить не более
+/// необходимого минимума для корректной работы mdbx_rpid_check() в других
+/// процессах.
+/// \return Код ошибки или 0 в случае успеха.
int mdbx_rpid_set(MDBX_env *env);
+
+/// \brief Снимает alive-флажок присутствия (индицирующую блокировку)
+/// читателя для pid текущего процесса. Функции может выполнить не более
+/// необходимого минимума для корректной работы mdbx_rpid_check() в других
+/// процессах.
+/// \return Код ошибки или 0 в случае успеха.
int mdbx_rpid_clear(MDBX_env *env);
+/// \brief Проверяет жив ли процесс-читатель с заданным pid
+/// по alive-флажку присутствия (индицирующей блокировку),
+/// либо любым другим способом.
+/// \return
+/// MDBX_RESULT_TRUE (-1) - если процесс-читатель с соответствующим pid жив
+/// и работает с БД (индицирующая блокировка присутствует).
+/// MDBX_RESULT_FALSE (0) - если процесс-читатель с соответствующим pid
+/// отсутствует или не работает с БД (индицирующая блокировка отсутствует).
+/// Иначе (не 0 и не -1) - код ошибки.
+int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid);
+
#if defined(_WIN32) || defined(_WIN64)
typedef union MDBX_srwlock {
struct {
@@ -618,7 +732,6 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)(
_Out_opt_ LPDWORD lpMaximumComponentLength,
_Out_opt_ LPDWORD lpFileSystemFlags,
_Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize);
-
extern MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile,
@@ -630,7 +743,6 @@ extern MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)(
_In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass,
_Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize);
-
extern MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)(
@@ -639,18 +751,21 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)(
OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode,
IN OUT PVOID InputBuffer, IN ULONG InputBufferLength,
OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength);
-
extern MDBX_NtFsControlFile mdbx_NtFsControlFile;
-#endif /* Windows */
+#if _WIN32_WINNT < _WIN32_WINNT_WIN8
+typedef struct _WIN32_MEMORY_RANGE_ENTRY {
+ PVOID VirtualAddress;
+ SIZE_T NumberOfBytes;
+} WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY;
+#endif
-/* Checks reader by pid.
- *
- * Returns:
- * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock)
- * MDBX_RESULT_FALSE, if pid is dead (lock acquired)
- * or otherwise the errcode. */
-int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid);
+typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
+ HANDLE hProcess, ULONG_PTR NumberOfEntries,
+ PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
+extern MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
+
+#endif /* Windows */
/*----------------------------------------------------------------------------*/
/* Atomics */
@@ -755,8 +870,8 @@ static __inline bool mdbx_atomic_compare_and_swap64(volatile uint64_t *p,
/*----------------------------------------------------------------------------*/
-#if defined(_MSC_VER) && _MSC_VER >= 1900 && _MSC_VER < 1920
-/* LY: MSVC 2015/2017 has buggy/inconsistent PRIuPTR/PRIxPTR macros
+#if defined(_MSC_VER) && _MSC_VER >= 1900
+/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
* for internal format-args checker. */
#undef PRIuPTR
#undef PRIiPTR
diff --git a/libs/libmdbx/src/src/tools/mdbx_chk.c b/libs/libmdbx/src/src/tools/mdbx_chk.c
index eb8d2681b1..9d3b9e063c 100644
--- a/libs/libmdbx/src/src/tools/mdbx_chk.c
+++ b/libs/libmdbx/src/src/tools/mdbx_chk.c
@@ -20,6 +20,8 @@
#pragma warning(disable : 4996) /* The POSIX name is deprecated... */
#endif /* _MSC_VER (warnings) */
+/* Avoid reference to mdbx_runtime_flags from assert() */
+#define mdbx_runtime_flags (~0u)
#include "../bits.h"
typedef struct flagbit {
@@ -73,10 +75,10 @@ typedef struct {
} walk_dbi_t;
struct {
- walk_dbi_t dbi[MAX_DBI];
short *pagemap;
uint64_t total_payload_bytes;
uint64_t pgcount;
+ walk_dbi_t dbi[MAX_DBI];
} walk;
#define dbi_free walk.dbi[FREE_DBI]
@@ -91,7 +93,7 @@ MDBX_txn *txn;
MDBX_envinfo envinfo;
MDBX_stat envstat;
size_t maxkeysize, userdb_count, skipped_subdb;
-uint64_t reclaimable_pages, gc_pages, lastpgno, unused_pages;
+uint64_t reclaimable_pages, gc_pages, alloc_pages, unused_pages, backed_pages;
unsigned verbose;
char ignore_wrong_order, quiet;
const char *only_subdb;
@@ -105,11 +107,7 @@ struct problem {
struct problem *problems_list;
uint64_t total_problems;
-static void
-#ifdef __GNUC__
- __attribute__((format(printf, 1, 2)))
-#endif
- print(const char *msg, ...) {
+static void __printf_args(1, 2) print(const char *msg, ...) {
if (!quiet) {
va_list args;
@@ -120,11 +118,7 @@ static void
}
}
-static void
-#ifdef __GNUC__
- __attribute__((format(printf, 1, 2)))
-#endif
- error(const char *msg, ...) {
+static void __printf_args(1, 2) error(const char *msg, ...) {
total_problems++;
if (!quiet) {
@@ -181,10 +175,8 @@ static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) {
return last = dbi;
}
-static void
-#ifdef __GNUC__
- __attribute__((format(printf, 4, 5)))
-#endif
+static void __printf_args(4, 5)
+
problem_add(const char *object, uint64_t entry_number, const char *msg,
const char *extra, ...) {
total_problems++;
@@ -252,11 +244,12 @@ static size_t problems_pop(struct problem *list) {
return count;
}
-static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, int deep,
- const char *dbi_name_or_tag, size_t page_size,
- MDBX_page_type_t pagetype, size_t nentries,
- size_t payload_bytes, size_t header_bytes,
- size_t unused_bytes) {
+static int pgvisitor(const uint64_t pgno, const unsigned pgnumber,
+ void *const ctx, const int deep,
+ const char *const dbi_name_or_tag, const size_t page_size,
+ const MDBX_page_type_t pagetype, const size_t nentries,
+ const size_t payload_bytes, const size_t header_bytes,
+ const size_t unused_bytes) {
(void)ctx;
if (deep > 42) {
problem_add("deep", deep, "too large", nullptr);
@@ -326,23 +319,24 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, int deep,
}
bool already_used = false;
- do {
- if (pgno >= lastpgno)
- problem_add("page", pgno, "wrong page-no",
+ for (unsigned n = 0; n < pgnumber; ++n) {
+ uint64_t spanpgno = pgno + n;
+ if (spanpgno >= alloc_pages)
+ problem_add("page", spanpgno, "wrong page-no",
"%s-page: %" PRIu64 " > %" PRIu64 ", deep %i",
- pagetype_caption, pgno, lastpgno, deep);
- else if (walk.pagemap[pgno]) {
- walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[pgno] - 1];
- problem_add(
- "page", pgno, (branch && coll_dbi == dbi) ? "loop" : "already used",
- "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name, deep);
+ pagetype_caption, spanpgno, alloc_pages, deep);
+ else if (walk.pagemap[spanpgno]) {
+ walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1];
+ problem_add("page", spanpgno,
+ (branch && coll_dbi == dbi) ? "loop" : "already used",
+ "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name,
+ deep);
already_used = true;
} else {
- walk.pagemap[pgno] = (short)(dbi - walk.dbi + 1);
+ walk.pagemap[spanpgno] = (short)(dbi - walk.dbi + 1);
dbi->pages.total += 1;
}
- ++pgno;
- } while (--pgnumber);
+ }
if (already_used)
return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */
@@ -443,28 +437,38 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key,
if (envinfo.mi_latter_reader_txnid > txnid)
reclaimable_pages += number;
- pgno_t prev =
- MDBX_PNL_ASCENDING ? NUM_METAS - 1 : (pgno_t)envinfo.mi_last_pgno + 1;
+ pgno_t prev = MDBX_PNL_ASCENDING ? NUM_METAS - 1 : txn->mt_next_pgno;
pgno_t span = 1;
for (unsigned i = 0; i < number; ++i) {
const pgno_t pgno = iptr[i];
- if (pgno < NUM_METAS || pgno > envinfo.mi_last_pgno)
+ if (pgno < NUM_METAS)
+ problem_add("entry", txnid, "wrong idl entry",
+ "pgno %" PRIaPGNO " < meta-pages %u", pgno, NUM_METAS);
+ else if (pgno >= backed_pages)
+ problem_add("entry", txnid, "wrong idl entry",
+ "pgno %" PRIaPGNO " > backed-pages %" PRIu64, pgno,
+ backed_pages);
+ else if (pgno >= alloc_pages)
problem_add("entry", txnid, "wrong idl entry",
- "%u < %" PRIaPGNO " < %" PRIu64, NUM_METAS, pgno,
- envinfo.mi_last_pgno);
+ "pgno %" PRIaPGNO " > alloc-pages %" PRIu64, pgno,
+ alloc_pages - 1);
else {
if (MDBX_PNL_DISORDERED(prev, pgno)) {
bad = " [bad sequence]";
problem_add("entry", txnid, "bad sequence",
- "%" PRIaPGNO " <> %" PRIaPGNO, prev, pgno);
+ "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev,
+ (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'),
+ i, pgno);
}
- if (walk.pagemap && walk.pagemap[pgno]) {
- if (walk.pagemap[pgno] > 0)
+ if (walk.pagemap) {
+ int idx = walk.pagemap[pgno];
+ if (idx == 0)
+ walk.pagemap[pgno] = -1;
+ else if (idx > 0)
problem_add("page", pgno, "already used", "by %s",
- walk.dbi[walk.pagemap[pgno] - 1].name);
+ walk.dbi[idx - 1].name);
else
problem_add("page", pgno, "already listed in GC", nullptr);
- walk.pagemap[pgno] = -1;
}
}
prev = pgno;
@@ -700,7 +704,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler,
if (handler) {
rc = handler(record_count, &key, &data);
- if (rc)
+ if (MDBX_IS_ERROR(rc))
goto bailout;
}
@@ -733,7 +737,7 @@ bailout:
}
mdbx_cursor_close(mc);
- return rc || problems_count;
+ return (rc || problems_count) ? MDBX_RESULT_TRUE : MDBX_SUCCESS;
}
static void usage(char *prog) {
@@ -1008,7 +1012,7 @@ int main(int argc, char *argv[]) {
#if defined(_WIN32) || defined(_WIN64)
rc == ERROR_LOCK_VIOLATION || rc == ERROR_SHARING_VIOLATION
#else
- rc == EBUSY
+ rc == EBUSY || rc == EAGAIN
#endif
)) {
envflags &= ~MDBX_EXCLUSIVE;
@@ -1048,20 +1052,105 @@ int main(int argc, char *argv[]) {
}
maxkeysize = rc;
- rc = mdbx_env_info(env, &envinfo, sizeof(envinfo));
+ rc = mdbx_env_info2(env, txn, &envinfo, sizeof(envinfo));
if (rc) {
error("mdbx_env_info failed, error %d %s\n", rc, mdbx_strerror(rc));
goto bailout;
}
- rc = mdbx_env_stat(env, &envstat, sizeof(envstat));
+ rc = mdbx_env_stat2(env, txn, &envstat, sizeof(envstat));
if (rc) {
error("mdbx_env_stat failed, error %d %s\n", rc, mdbx_strerror(rc));
goto bailout;
}
- lastpgno = envinfo.mi_last_pgno + 1;
+ mdbx_filehandle_t dxb_fd;
+ rc = mdbx_env_get_fd(env, &dxb_fd);
+ if (rc) {
+ error("mdbx_env_get_fd failed, error %d %s\n", rc, mdbx_strerror(rc));
+ goto bailout;
+ }
+
+ uint64_t dxb_filesize = 0;
+#if defined(_WIN32) || defined(_WIN64)
+ {
+ BY_HANDLE_FILE_INFORMATION info;
+ if (!GetFileInformationByHandle(dxb_fd, &info))
+ rc = GetLastError();
+ else
+ dxb_filesize = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32;
+ }
+#else
+ {
+ struct stat st;
+ STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t),
+ "libmdbx requires 64-bit file I/O on 64-bit systems");
+ if (fstat(dxb_fd, &st))
+ rc = errno;
+ else
+ dxb_filesize = st.st_size;
+ }
+#endif
+ if (rc) {
+ error("mdbx_filesize failed, error %d %s\n", rc, mdbx_strerror(rc));
+ goto bailout;
+ }
+
errno = 0;
+ const uint64_t dxbfile_pages = dxb_filesize / envinfo.mi_dxb_pagesize;
+ alloc_pages = txn->mt_next_pgno;
+ backed_pages = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize;
+#if !(defined(_WIN32) || defined(_WIN64))
+ if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY &&
+ backed_pages != dxbfile_pages) {
+ print(" ! backed-pages %" PRIu64 " != file-pages %" PRIu64 "\n",
+ backed_pages, dxbfile_pages);
+ ++problems_meta;
+ }
+#endif /* !Windows */
+ if (dxbfile_pages < NUM_METAS)
+ print(" ! file-pages %" PRIu64 " < %u\n", dxbfile_pages, NUM_METAS);
+ if (backed_pages < NUM_METAS)
+ print(" ! backed-pages %" PRIu64 " < %u\n", backed_pages, NUM_METAS);
+ if (backed_pages < NUM_METAS || dxbfile_pages < NUM_METAS)
+ goto bailout;
+ if (backed_pages > MAX_PAGENO) {
+ print(" ! backed-pages %" PRIu64 " > max-pages %" PRIaPGNO "\n",
+ backed_pages, MAX_PAGENO);
+ ++problems_meta;
+ backed_pages = MAX_PAGENO;
+ }
+
+ if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) {
+ if (backed_pages > dxbfile_pages) {
+ print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n",
+ backed_pages, dxbfile_pages);
+ ++problems_meta;
+ backed_pages = dxbfile_pages;
+ }
+ if (alloc_pages > backed_pages) {
+ print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n",
+ alloc_pages, backed_pages);
+ ++problems_meta;
+ alloc_pages = backed_pages;
+ }
+ } else {
+ /* LY: DB may be shrunk by the writer down to the allocated pages. */
+ if (alloc_pages > backed_pages) {
+ print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n",
+ alloc_pages, backed_pages);
+ ++problems_meta;
+ alloc_pages = backed_pages;
+ }
+ if (alloc_pages > dxbfile_pages) {
+ print(" ! alloc-pages %" PRIu64 " > file-pages %" PRIu64 "\n",
+ alloc_pages, dxbfile_pages);
+ ++problems_meta;
+ alloc_pages = dxbfile_pages;
+ }
+ if (backed_pages > dxbfile_pages)
+ backed_pages = dxbfile_pages;
+ }
if (verbose) {
print(" - pagesize %u (%u system), max keysize %" PRIuPTR
@@ -1094,17 +1183,17 @@ int main(int argc, char *argv[]) {
print(" - performs check for meta-pages clashes\n");
if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign,
envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) {
- print(" - meta-%d and meta-%d are clashed\n", 0, 1);
+ print(" ! meta-%d and meta-%d are clashed\n", 0, 1);
++problems_meta;
}
if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign,
envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) {
- print(" - meta-%d and meta-%d are clashed\n", 1, 2);
+ print(" ! meta-%d and meta-%d are clashed\n", 1, 2);
++problems_meta;
}
if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign,
envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) {
- print(" - meta-%d and meta-%d are clashed\n", 2, 0);
+ print(" ! meta-%d and meta-%d are clashed\n", 2, 0);
++problems_meta;
}
@@ -1129,7 +1218,7 @@ int main(int argc, char *argv[]) {
print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid);
fflush(NULL);
- walk.pagemap = mdbx_calloc((size_t)lastpgno, sizeof(*walk.pagemap));
+ walk.pagemap = mdbx_calloc((size_t)backed_pages, sizeof(*walk.pagemap));
if (!walk.pagemap) {
rc = errno ? errno : MDBX_ENOMEM;
error("calloc failed, error %d %s\n", rc, mdbx_strerror(rc));
@@ -1150,7 +1239,7 @@ int main(int argc, char *argv[]) {
goto bailout;
}
- for (uint64_t n = 0; n < lastpgno; ++n)
+ for (uint64_t n = 0; n < alloc_pages; ++n)
if (!walk.pagemap[n])
unused_pages += 1;
@@ -1178,7 +1267,9 @@ int main(int argc, char *argv[]) {
uint64_t all_leaf = dbi->pages.leaf + dbi->pages.leaf_dupfixed;
if (all_leaf) {
print(", leaf %" PRIu64, all_leaf);
- if (verbose > 2)
+ /* Print the per-kind leaf breakdown only when at least one of the
+ * non-"usual" leaf counters (dupfixed, sub-dupsort, sub-dupfixed) is
+ * non-zero; otherwise the breakdown would just repeat the total. */
+ if (verbose > 2 &&
+ (dbi->pages.leaf_dupfixed | dbi->pages.subleaf_dupsort |
+ dbi->pages.subleaf_dupfixed))
print(" (usual %" PRIu64 ", sub-dupsort %" PRIu64
", dupfixed %" PRIu64 ", sub-dupfixed %" PRIu64 ")",
dbi->pages.leaf, dbi->pages.subleaf_dupsort,
@@ -1238,15 +1329,16 @@ int main(int argc, char *argv[]) {
uint64_t value = envinfo.mi_mapsize / envstat.ms_psize;
double percent = value / 100.0;
print(" - space: %" PRIu64 " total pages", value);
- value = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize;
- print(", backed %" PRIu64 " (%.1f%%)", value, value / percent);
- print(", allocated %" PRIu64 " (%.1f%%)", lastpgno, lastpgno / percent);
+ print(", backed %" PRIu64 " (%.1f%%)", backed_pages,
+ backed_pages / percent);
+ print(", allocated %" PRIu64 " (%.1f%%)", alloc_pages,
+ alloc_pages / percent);
if (verbose > 1) {
- value = envinfo.mi_mapsize / envstat.ms_psize - lastpgno;
+ value = envinfo.mi_mapsize / envstat.ms_psize - alloc_pages;
print(", remained %" PRIu64 " (%.1f%%)", value, value / percent);
- value = lastpgno - gc_pages;
+ value = alloc_pages - gc_pages;
print(", used %" PRIu64 " (%.1f%%)", value, value / percent);
print(", gc %" PRIu64 " (%.1f%%)", gc_pages, gc_pages / percent);
@@ -1259,16 +1351,16 @@ int main(int argc, char *argv[]) {
}
value =
- envinfo.mi_mapsize / envstat.ms_psize - lastpgno + reclaimable_pages;
+ envinfo.mi_mapsize / envstat.ms_psize - alloc_pages + reclaimable_pages;
print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent);
}
if (problems_maindb == 0 && problems_freedb == 0) {
if (!dont_traversal &&
(envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) {
- if (walk.pgcount != lastpgno - gc_pages) {
+ if (walk.pgcount != alloc_pages - gc_pages) {
error("used pages mismatch (%" PRIu64 " != %" PRIu64 ")\n",
- walk.pgcount, lastpgno - gc_pages);
+ walk.pgcount, alloc_pages - gc_pages);
}
if (unused_pages != gc_pages) {
error("gc pages mismatch (%" PRIu64 " != %" PRIu64 ")\n", unused_pages,
diff --git a/libs/libmdbx/src/src/tools/mdbx_copy.c b/libs/libmdbx/src/src/tools/mdbx_copy.c
index 9b0c833a37..f700b1785f 100644
--- a/libs/libmdbx/src/src/tools/mdbx_copy.c
+++ b/libs/libmdbx/src/src/tools/mdbx_copy.c
@@ -20,6 +20,8 @@
#pragma warning(disable : 4996) /* The POSIX name is deprecated... */
#endif /* _MSC_VER (warnings) */
+/* Avoid reference to mdbx_runtime_flags from assert() */
+#define mdbx_runtime_flags (~0u)
#include "../bits.h"
#if defined(_WIN32) || defined(_WIN64)
diff --git a/libs/libmdbx/src/src/tools/mdbx_dump.c b/libs/libmdbx/src/src/tools/mdbx_dump.c
index 07951548dd..8cb9c86e46 100644
--- a/libs/libmdbx/src/src/tools/mdbx_dump.c
+++ b/libs/libmdbx/src/src/tools/mdbx_dump.c
@@ -20,7 +20,10 @@
#pragma warning(disable : 4996) /* The POSIX name is deprecated... */
#endif /* _MSC_VER (warnings) */
+/* Avoid reference to mdbx_runtime_flags from assert() */
+#define mdbx_runtime_flags (~0u)
#include "../bits.h"
+
#include <ctype.h>
#define PRINT 1
diff --git a/libs/libmdbx/src/src/tools/mdbx_load.c b/libs/libmdbx/src/src/tools/mdbx_load.c
index 9789e83a62..7bbc52b87e 100644
--- a/libs/libmdbx/src/src/tools/mdbx_load.c
+++ b/libs/libmdbx/src/src/tools/mdbx_load.c
@@ -20,7 +20,10 @@
#pragma warning(disable : 4996) /* The POSIX name is deprecated... */
#endif /* _MSC_VER (warnings) */
+/* Avoid reference to mdbx_runtime_flags from assert() */
+#define mdbx_runtime_flags (~0u)
#include "../bits.h"
+
#include <ctype.h>
#if defined(_WIN32) || defined(_WIN64)
@@ -263,10 +266,9 @@ static int readline(MDBX_val *out, MDBX_val *buf) {
if (mode & PRINT) {
while (c2 < end) {
- if (*c2 == '\\') {
+ if (unlikely(*c2 == '\\')) {
if (c2[1] == '\\') {
- c1++;
- c2 += 2;
+ *c1++ = '\\';
} else {
if (c2 + 3 > end || !isxdigit(c2[1]) || !isxdigit(c2[2])) {
Eof = 1;
@@ -274,8 +276,8 @@ static int readline(MDBX_val *out, MDBX_val *buf) {
return EOF;
}
*c1++ = (char)unhex(++c2);
- c2 += 2;
}
+ c2 += 2;
} else {
/* copies are redundant when no escapes were used */
*c1++ = *c2++;
diff --git a/libs/libmdbx/src/src/tools/mdbx_stat.c b/libs/libmdbx/src/src/tools/mdbx_stat.c
index e459121d76..d4ebc2e654 100644
--- a/libs/libmdbx/src/src/tools/mdbx_stat.c
+++ b/libs/libmdbx/src/src/tools/mdbx_stat.c
@@ -20,6 +20,8 @@
#pragma warning(disable : 4996) /* The POSIX name is deprecated... */
#endif /* _MSC_VER (warnings) */
+/* Avoid reference to mdbx_runtime_flags from assert() */
+#define mdbx_runtime_flags (~0u)
#include "../bits.h"
#if defined(_WIN32) || defined(_WIN64)
diff --git a/libs/libmdbx/src/test/append.cc b/libs/libmdbx/src/test/append.cc
index c5e7e91d9e..273f68b810 100644
--- a/libs/libmdbx/src/test/append.cc
+++ b/libs/libmdbx/src/test/append.cc
@@ -15,11 +15,12 @@
#include "test.h"
bool testcase_append::run() {
- db_open();
-
- txn_begin(false);
- MDBX_dbi dbi = db_table_open(true);
- db_table_clear(dbi);
+ MDBX_dbi dbi;
+ int err = db_open__begin__table_create_open_clean(dbi);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("append: bailout-prepare due '%s'", mdbx_strerror(err));
+ return true;
+ }
keyvalue_maker.setup(config.params, config.actor_id, 0 /* thread_number */);
/* LY: тест наполнения таблиц в append-режиме,
@@ -41,7 +42,10 @@ bool testcase_append::run() {
simple_checksum inserted_checksum;
uint64_t inserted_number = 0;
uint64_t serial_count = 0;
+
unsigned txn_nops = 0;
+ uint64_t commited_inserted_number = inserted_number;
+ simple_checksum commited_inserted_checksum = inserted_checksum;
while (should_continue()) {
const keygen::serial_t serial = serial_count;
if (!keyvalue_maker.increment(serial_count, 1)) {
@@ -57,10 +61,19 @@ bool testcase_append::run() {
if (cmp == 0 && (config.params.table_flags & MDBX_DUPSORT))
cmp = mdbx_dcmp(txn_guard.get(), dbi, &data->value, &last_data->value);
- int err = mdbx_put(txn_guard.get(), dbi, &key->value, &data->value, flags);
+ err = mdbx_put(txn_guard.get(), dbi, &key->value, &data->value, flags);
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("append: bailout-insert due '%s'", mdbx_strerror(err));
+ txn_end(true);
+ inserted_number = commited_inserted_number;
+ inserted_checksum = commited_inserted_checksum;
+ break;
+ }
+
if (cmp > 0) {
if (unlikely(err != MDBX_SUCCESS))
failure_perror("mdbx_put(appenda-a)", err);
+
memcpy(last_key->value.iov_base, key->value.iov_base,
last_key->value.iov_len = key->value.iov_len);
memcpy(last_data->value.iov_base, data->value.iov_base,
@@ -74,22 +87,40 @@ bool testcase_append::run() {
}
if (++txn_nops >= config.params.batch_write) {
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("append: bailout-commit due '%s'", mdbx_strerror(err));
+ inserted_number = commited_inserted_number;
+ inserted_checksum = commited_inserted_checksum;
+ break;
+ }
+ commited_inserted_number = inserted_number;
+ commited_inserted_checksum = inserted_checksum;
txn_nops = 0;
}
report(1);
}
- txn_restart(false, true);
+ if (txn_guard) {
+ err = breakable_commit();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("append: bailout-commit due '%s'", mdbx_strerror(err));
+ inserted_number = commited_inserted_number;
+ inserted_checksum = commited_inserted_checksum;
+ }
+ }
//----------------------------------------------------------------------------
+ txn_begin(true);
cursor_open(dbi);
MDBX_val check_key, check_data;
- int err =
+ err =
mdbx_cursor_get(cursor_guard.get(), &check_key, &check_data, MDBX_FIRST);
- if (unlikely(err != MDBX_SUCCESS))
- failure_perror("mdbx_cursor_get(MDBX_FIRST)", err);
+ if (likely(inserted_number)) {
+ if (unlikely(err != MDBX_SUCCESS))
+ failure_perror("mdbx_cursor_get(MDBX_FIRST)", err);
+ }
simple_checksum read_checksum;
uint64_t read_count = 0;
@@ -115,15 +146,18 @@ bool testcase_append::run() {
read_checksum.value, inserted_checksum.value);
cursor_close();
+ txn_end(true);
//----------------------------------------------------------------------------
- if (txn_guard)
- txn_end(false);
if (dbi) {
if (config.params.drop_table && !mode_readonly()) {
txn_begin(false);
db_table_drop(dbi);
- txn_end(false);
+ err = breakable_commit();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("append: bailout-clean due '%s'", mdbx_strerror(err));
+ return true;
+ }
} else
db_table_close(dbi);
}
diff --git a/libs/libmdbx/src/test/base.h b/libs/libmdbx/src/test/base.h
index 0b4d26e51b..5ca134a81d 100644
--- a/libs/libmdbx/src/test/base.h
+++ b/libs/libmdbx/src/test/base.h
@@ -39,6 +39,10 @@
#include <SDKDDKVer.h>
#endif /* WINDOWS */
+#ifdef __APPLE__
+#define _DARWIN_C_SOURCE
+#endif
+
#include <errno.h>
#include <limits.h>
#include <stdio.h>
diff --git a/libs/libmdbx/src/test/cases.cc b/libs/libmdbx/src/test/cases.cc
index 023a80020c..a98834a457 100644
--- a/libs/libmdbx/src/test/cases.cc
+++ b/libs/libmdbx/src/test/cases.cc
@@ -63,8 +63,7 @@ void testcase_setup(const char *casename, actor_params &params,
log_notice(">>> testcase_setup(%s)", casename);
configure_actor(last_space_id, ac_jitter, nullptr, params);
configure_actor(last_space_id, ac_hill, nullptr, params);
- configure_actor(last_space_id, ac_jitter, nullptr, params);
- configure_actor(last_space_id, ac_hill, nullptr, params);
+ configure_actor(last_space_id, ac_ttl, nullptr, params);
configure_actor(last_space_id, ac_jitter, nullptr, params);
configure_actor(last_space_id, ac_hill, nullptr, params);
configure_actor(last_space_id, ac_ttl, nullptr, params);
diff --git a/libs/libmdbx/src/test/config.cc b/libs/libmdbx/src/test/config.cc
index bfae5c14df..dd150e9a91 100644
--- a/libs/libmdbx/src/test/config.cc
+++ b/libs/libmdbx/src/test/config.cc
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright 2017-2019 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
@@ -412,6 +412,8 @@ void dump(const char *title) {
i->params.max_tables);
log_info("drop table: %s\n", i->params.drop_table ? "Yes" : "No");
+ log_info("ignore MDBX_MAP_FULL error: %s\n",
+ i->params.ignore_dbfull ? "Yes" : "No");
indent.pop();
}
diff --git a/libs/libmdbx/src/test/config.h b/libs/libmdbx/src/test/config.h
index b8a4b6827d..89889d8eb6 100644
--- a/libs/libmdbx/src/test/config.h
+++ b/libs/libmdbx/src/test/config.h
@@ -38,7 +38,8 @@ enum actor_status {
as_running,
as_successful,
as_killed,
- as_failed
+ as_failed,
+ as_coredump,
};
const char *testcase2str(const actor_testcase);
@@ -101,6 +102,22 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option,
bool parse_option(int argc, char *const argv[], int &narg, const char *option,
int32_t &value, const int32_t minval, const int32_t maxval,
const int32_t default_value = -1);
+
+/* Parse an option whose value is an intptr_t by forwarding to the
+ * parse_option() overload for the fixed-width integer of the same size.
+ *
+ * The value is passed through a local of the exact fixed-width type and
+ * copied back afterwards instead of reinterpret_cast-ing &value: intptr_t
+ * and int64_t/int32_t may be distinct types of equal size, and writing
+ * through such a casted reference violates the strict-aliasing rules.
+ *
+ * Returns whatever the underlying parse_option() overload returns; `value`
+ * ends up holding whatever that overload left in the proxy. */
+inline bool parse_option_intptr(int argc, char *const argv[], int &narg,
+                                const char *option, intptr_t &value,
+                                const intptr_t minval, const intptr_t maxval,
+                                const intptr_t default_value = -1) {
+  static_assert(sizeof(intptr_t) == 4 || sizeof(intptr_t) == 8, "WTF?");
+  if (sizeof(intptr_t) == 8) {
+    int64_t proxy = int64_t(value);
+    const bool rc =
+        parse_option(argc, argv, narg, option, proxy, int64_t(minval),
+                     int64_t(maxval), int64_t(default_value));
+    value = intptr_t(proxy); /* copy back even if untouched: a no-op then */
+    return rc;
+  } else {
+    int32_t proxy = int32_t(value);
+    const bool rc =
+        parse_option(argc, argv, narg, option, proxy, int32_t(minval),
+                     int32_t(maxval), int32_t(default_value));
+    value = intptr_t(proxy); /* copy back even if untouched: a no-op then */
+    return rc;
+  }
+}
+
//-----------------------------------------------------------------------------
#pragma pack(push, 1)
@@ -248,6 +265,7 @@ struct actor_params_pod {
keygen_params_pod keygen;
bool drop_table;
+ bool ignore_dbfull;
};
struct actor_config_pod {
diff --git a/libs/libmdbx/src/test/darwin/LICENSE b/libs/libmdbx/src/test/darwin/LICENSE
new file mode 100644
index 0000000000..6a0dd3066b
--- /dev/null
+++ b/libs/libmdbx/src/test/darwin/LICENSE
@@ -0,0 +1,24 @@
+Copyright (c) 2015, Aleksey Demakov
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/libs/libmdbx/src/test/darwin/README.md b/libs/libmdbx/src/test/darwin/README.md
new file mode 100644
index 0000000000..a6a8fd1a91
--- /dev/null
+++ b/libs/libmdbx/src/test/darwin/README.md
@@ -0,0 +1,8 @@
+# DarwinPthreadBarrier
+
+A pthread_barrier_t implementation for Mac OS X
+
+There is no pthread_barrier_t in Mac OS X pthreads. This project fixes
+this omission by providing a simple-minded barrier implementation based
+on a pair of pthread_mutex_t and pthread_cond_t.
+
diff --git a/libs/libmdbx/src/test/darwin/pthread_barrier.c b/libs/libmdbx/src/test/darwin/pthread_barrier.c
new file mode 100644
index 0000000000..054aa00708
--- /dev/null
+++ b/libs/libmdbx/src/test/darwin/pthread_barrier.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2015, Aleksey Demakov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "pthread_barrier.h"
+
+#include <errno.h>
+
+#ifdef __APPLE__
+
+/* Initialize a barrier attribute object by initializing its embedded mutex
+ * and condition-variable attributes.
+ *
+ * Returns 0 on success, otherwise the error code of the initializer that
+ * failed. On failure nothing is left initialized, so the caller must not
+ * call pthread_barrierattr_destroy() on `attr`. */
+int pthread_barrierattr_init(pthread_barrierattr_t *attr) {
+  /* No memset() here: the structure consists only of the two members set up
+   * below, and this file does not include <string.h>. */
+  int rc = pthread_mutexattr_init(&attr->mattr);
+  if (rc)
+    return rc;
+
+  rc = pthread_condattr_init(&attr->cattr);
+  if (rc)
+    pthread_mutexattr_destroy(&attr->mattr); /* undo the half-done init */
+  return rc;
+}
+
+/* Release both embedded attribute objects: the condition-variable attributes
+ * first, then the mutex attributes. Both are always destroyed; when both
+ * calls fail, the mutex-attribute error code is the one returned. */
+int pthread_barrierattr_destroy(pthread_barrierattr_t *attr) {
+  const int cond_rc = pthread_condattr_destroy(&attr->cattr);
+  const int mutex_rc = pthread_mutexattr_destroy(&attr->mattr);
+  if (mutex_rc)
+    return mutex_rc;
+  return cond_rc;
+}
+
+/* Report the process-shared setting as stored in the embedded
+ * condition-variable attributes; the result code is that of
+ * pthread_condattr_getpshared(). */
+int pthread_barrierattr_getpshared(const pthread_barrierattr_t *__restrict attr,
+                                   int *__restrict pshared) {
+  const pthread_condattr_t *cond_attr = &attr->cattr;
+  return pthread_condattr_getpshared(cond_attr, pshared);
+}
+
+/* Apply the process-shared setting to both embedded attribute objects.
+ * Both setters are always called; when both fail, the mutex-attribute error
+ * code is the one returned. */
+int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared) {
+  const int mutex_rc = pthread_mutexattr_setpshared(&attr->mattr, pshared);
+  const int cond_rc = pthread_condattr_setpshared(&attr->cattr, pshared);
+  return mutex_rc != 0 ? mutex_rc : cond_rc;
+}
+
+/* Set up a barrier that trips once `count` threads have called
+ * pthread_barrier_wait().
+ *
+ * `attr` may be null, in which case the mutex and the condition variable are
+ * created with default attributes. Returns 0 on success; EINVAL (also stored
+ * in errno) when `count` is zero; otherwise the error code of the failed
+ * mutex/condition-variable initializer. */
+int pthread_barrier_init(pthread_barrier_t *__restrict barrier,
+                         const pthread_barrierattr_t *__restrict attr,
+                         unsigned count) {
+  const pthread_mutexattr_t *mutex_attr = attr ? &attr->mattr : 0;
+  const pthread_condattr_t *cond_attr = attr ? &attr->cattr : 0;
+
+  if (count == 0) {
+    errno = EINVAL;
+    return EINVAL;
+  }
+
+  int rc = pthread_mutex_init(&barrier->mutex, mutex_attr);
+  if (rc == 0) {
+    rc = pthread_cond_init(&barrier->cond, cond_attr);
+    if (rc == 0) {
+      barrier->limit = count;
+      barrier->count = 0;
+      barrier->phase = 0;
+    } else {
+      /* Roll back the mutex without letting the cleanup disturb errno. */
+      const int saved_errno = errno;
+      pthread_mutex_destroy(&barrier->mutex);
+      errno = saved_errno;
+    }
+  }
+  return rc;
+}
+
+/* Destroy the barrier's mutex and condition variable. The results of the
+ * underlying destroy calls are ignored; the function always returns 0. */
+int pthread_barrier_destroy(pthread_barrier_t *barrier) {
+  (void)pthread_mutex_destroy(&barrier->mutex);
+  (void)pthread_cond_destroy(&barrier->cond);
+  return 0;
+}
+
+/* Block until `limit` threads have reached the barrier.
+ *
+ * The thread whose arrival completes the group advances the phase, resets
+ * the arrival counter, wakes all waiters and gets
+ * PTHREAD_BARRIER_SERIAL_THREAD; every other thread gets 0 once released.
+ * If the mutex cannot be locked, that error code is returned instead. */
+int pthread_barrier_wait(pthread_barrier_t *barrier) {
+  const int lock_rc = pthread_mutex_lock(&barrier->mutex);
+  if (lock_rc)
+    return lock_rc;
+
+  int result = 0;
+  barrier->count++;
+  if (barrier->count < barrier->limit) {
+    /* Not the last arrival: sleep until the phase changes. Re-checking the
+     * phase after each wake-up keeps the thread waiting if it was woken
+     * before the barrier actually tripped. */
+    const unsigned entry_phase = barrier->phase;
+    do {
+      pthread_cond_wait(&barrier->cond, &barrier->mutex);
+    } while (barrier->phase == entry_phase);
+  } else {
+    /* Last arrival: open the barrier for everyone and start a new phase. */
+    barrier->phase++;
+    barrier->count = 0;
+    pthread_cond_broadcast(&barrier->cond);
+    result = PTHREAD_BARRIER_SERIAL_THREAD;
+  }
+  pthread_mutex_unlock(&barrier->mutex);
+  return result;
+}
+
+#endif /* __APPLE__ */
diff --git a/libs/libmdbx/src/test/darwin/pthread_barrier.h b/libs/libmdbx/src/test/darwin/pthread_barrier.h
new file mode 100644
index 0000000000..efa9b9b751
--- /dev/null
+++ b/libs/libmdbx/src/test/darwin/pthread_barrier.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2015, Aleksey Demakov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PTHREAD_BARRIER_H
+#define PTHREAD_BARRIER_H
+
+#include <pthread.h>
+
+#ifdef __APPLE__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(PTHREAD_BARRIER_SERIAL_THREAD)
+#define PTHREAD_BARRIER_SERIAL_THREAD (1)
+#endif
+
+#if !defined(PTHREAD_PROCESS_PRIVATE)
+#define PTHREAD_PROCESS_PRIVATE (42)
+#endif
+#if !defined(PTHREAD_PROCESS_SHARED)
+#define PTHREAD_PROCESS_SHARED (43)
+#endif
+
+typedef struct { /* barrier attributes: one attribute object per primitive the barrier is built from */
+ pthread_mutexattr_t mattr; /* handed to pthread_mutex_init() by pthread_barrier_init() */
+ pthread_condattr_t cattr; /* handed to pthread_cond_init() by pthread_barrier_init() */
+} pthread_barrierattr_t;
+
+typedef struct { /* barrier emulated with a mutex and a condition variable */
+ pthread_mutex_t mutex; /* held while count and phase are examined/updated in pthread_barrier_wait() */
+ pthread_cond_t cond; /* waiters sleep here; broadcast when the barrier trips */
+ unsigned int limit; /* number of arrivals that trips the barrier (the count given to pthread_barrier_init()) */
+ unsigned int count; /* arrivals so far in the current phase; reset to 0 when the barrier trips */
+ unsigned int phase; /* incremented each time the barrier trips; waiters loop until it changes */
+} pthread_barrier_t;
+
+int pthread_barrierattr_init(pthread_barrierattr_t *attr);
+int pthread_barrierattr_destroy(pthread_barrierattr_t *attr);
+
+int pthread_barrierattr_getpshared(const pthread_barrierattr_t *__restrict attr,
+ int *__restrict pshared);
+int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared);
+
+int pthread_barrier_init(pthread_barrier_t *__restrict barrier,
+ const pthread_barrierattr_t *__restrict attr,
+ unsigned int count);
+int pthread_barrier_destroy(pthread_barrier_t *barrier);
+
+int pthread_barrier_wait(pthread_barrier_t *barrier);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __APPLE__ */
+
+#endif /* PTHREAD_BARRIER_H */
diff --git a/libs/libmdbx/src/test/gc.sh b/libs/libmdbx/src/test/gc.sh
deleted file mode 100644
index 9c1407f53e..0000000000
--- a/libs/libmdbx/src/test/gc.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-make check
-TESTDB_PREFIX=${1:-/dev/shm/mdbx-gc-test}.
-
-function rep9 { printf "%*s" $1 '' | tr ' ' '9'; }
-function join { local IFS="$1"; shift; echo "$*"; }
-function bit2option { local -n arr=$1; (( ($2&(1<<$3)) != 0 )) && echo -n '+' || echo -n '-'; echo "${arr[$3]}"; }
-
-options=(writemap coalesce lifo)
-
-function bits2list {
- local -n arr=$1
- local i
- local list=()
- for ((i=0; i<${#arr[@]}; ++i)) do
- list[$i]=$(bit2option $1 $2 $i)
- done
- join , "${list[@]}"
-}
-
-function probe {
- echo "=============================================== $(date)"
- echo "${caption}: $*"
- rm -f ${TESTDB_PREFIX}* \
- && ./mdbx_test --repeat=12 --pathname=${TESTDB_PREFIX}db "$@" | lz4 > ${TESTDB_PREFIX}log.lz4 \
- && ./mdbx_chk -nvvv ${TESTDB_PREFIX}db | tee ${TESTDB_PREFIX}chk \
- && ([ ! -e ${TESTDB_PREFIX}db-copy ] || ./mdbx_chk -nvvv ${TESTDB_PREFIX}db-copy | tee ${TESTDB_PREFIX}chk-copy) \
- || (echo "FAILED"; exit 1)
-}
-
-###############################################################################
-
-count=0
-for nops in {2..7}; do
- for ((wbatch=nops-1; wbatch > 0; --wbatch)); do
- loops=$(((333 >> nops) / nops + 3))
- for ((rep=0; rep++ < loops; )); do
- for ((bits=2**${#options[@]}; --bits >= 0; )); do
- seed=$(date +%N)
- caption="Probe #$((++count)) int-key,w/o-dups, repeat ${rep} of ${loops}" probe \
- --pagesize=min --size=6G --table=+key.integer,-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \
- --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
- --keygen.seed=${seed} basic
- caption="Probe #$((++count)) int-key,with-dups, repeat ${rep} of ${loops}" probe \
- --pagesize=min --size=6G --table=+key.integer,+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \
- --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
- --keygen.seed=${seed} basic
- caption="Probe #$((++count)) int-key,int-data, repeat ${rep} of ${loops}" probe \
- --pagesize=min --size=6G --table=+key.integer,+data.integer --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \
- --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
- --keygen.seed=${seed} basic
- caption="Probe #$((++count)) w/o-dups, repeat ${rep} of ${loops}" probe \
- --pagesize=min --size=6G --table=-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \
- --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
- --keygen.seed=${seed} basic
- caption="Probe #$((++count)) with-dups, repeat ${rep} of ${loops}" probe \
- --pagesize=min --size=6G --table=+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \
- --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
- --keygen.seed=${seed} basic
- done
- done
- done
-done
-
-echo "=== ALL DONE ====================== $(date)"
diff --git a/libs/libmdbx/src/test/hill.cc b/libs/libmdbx/src/test/hill.cc
index 5b083e1fcc..1b03ddf0fc 100644
--- a/libs/libmdbx/src/test/hill.cc
+++ b/libs/libmdbx/src/test/hill.cc
@@ -15,11 +15,12 @@
#include "test.h"
bool testcase_hill::run() {
- db_open();
-
- txn_begin(false);
- MDBX_dbi dbi = db_table_open(true);
- txn_end(false);
+ MDBX_dbi dbi;
+ int err = db_open__begin__table_create_open_clean(dbi);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("hill: bailout-prepare due '%s'", mdbx_strerror(err));
+ return true;
+ }
/* LY: тест "холмиком":
* - сначала наполняем таблицу циклическими CRUD-манипуляциями,
@@ -59,14 +60,15 @@ bool testcase_hill::run() {
: MDBX_NODUPDATA;
uint64_t serial_count = 0;
+ uint64_t commited_serial = serial_count;
unsigned txn_nops = 0;
- if (!txn_guard)
- txn_begin(false);
while (should_continue()) {
const keygen::serial_t a_serial = serial_count;
- if (unlikely(!keyvalue_maker.increment(serial_count, 1)))
- failure("uphill: unexpected key-space overflow");
+ if (unlikely(!keyvalue_maker.increment(serial_count, 1))) {
+ log_notice("uphill: unexpected key-space overflow");
+ break;
+ }
const keygen::serial_t b_serial = serial_count;
assert(b_serial > a_serial);
@@ -76,26 +78,52 @@ bool testcase_hill::run() {
log_trace("uphill: insert-a (age %" PRIu64 ") %" PRIu64, age_shift,
a_serial);
generate_pair(a_serial, a_key, a_data_1, age_shift);
- int rc = mdbx_put(txn_guard.get(), dbi, &a_key->value, &a_data_1->value,
- insert_flags);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_put(insert-a.1)", rc);
+ err = mdbx_put(txn_guard.get(), dbi, &a_key->value, &a_data_1->value,
+ insert_flags);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("uphill: bailout at insert-a due '%s'", mdbx_strerror(err));
+ txn_restart(true, false);
+ serial_count = commited_serial;
+ break;
+ }
+ failure_perror("mdbx_put(insert-a.1)", err);
+ }
if (++txn_nops >= config.params.batch_write) {
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err));
+ serial_count = commited_serial;
+ break;
+ }
+ commited_serial = a_serial;
txn_nops = 0;
}
// создаем вторую запись из пары
log_trace("uphill: insert-b %" PRIu64, b_serial);
generate_pair(b_serial, b_key, b_data, 0);
- rc = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value,
- insert_flags);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_put(insert-b)", rc);
+ err = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value,
+ insert_flags);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("uphill: bailout at insert-b due '%s'", mdbx_strerror(err));
+ txn_restart(true, false);
+ serial_count = commited_serial;
+ break;
+ }
+ failure_perror("mdbx_put(insert-b)", err);
+ }
if (++txn_nops >= config.params.batch_write) {
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err));
+ serial_count = commited_serial;
+ break;
+ }
+ commited_serial = a_serial;
txn_nops = 0;
}
@@ -104,25 +132,51 @@ bool testcase_hill::run() {
a_serial);
generate_pair(a_serial, a_key, a_data_0, 0);
checkdata("uphill: update-a", dbi, a_key->value, a_data_1->value);
- rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_0->value,
- &a_data_1->value, update_flags);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_replace(update-a: 1->0)", rc);
+ err = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_0->value,
+ &a_data_1->value, update_flags);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("uphill: bailout at update-a due '%s'", mdbx_strerror(err));
+ txn_restart(true, false);
+ serial_count = commited_serial;
+ break;
+ }
+ failure_perror("mdbx_replace(update-a: 1->0)", err);
+ }
if (++txn_nops >= config.params.batch_write) {
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err));
+ serial_count = commited_serial;
+ break;
+ }
+ commited_serial = a_serial;
txn_nops = 0;
}
// удаляем вторую запись
log_trace("uphill: delete-b %" PRIu64, b_serial);
checkdata("uphill: delete-b", dbi, b_key->value, b_data->value);
- rc = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_del(b)", rc);
+ err = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("uphill: bailout at delete-b due '%s'", mdbx_strerror(err));
+ txn_restart(true, false);
+ serial_count = commited_serial;
+ break;
+ }
+ failure_perror("mdbx_del(b)", err);
+ }
if (++txn_nops >= config.params.batch_write) {
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("uphill: bailout at commit due '%s'", mdbx_strerror(err));
+ serial_count = commited_serial;
+ break;
+ }
+ commited_serial = a_serial;
txn_nops = 0;
}
@@ -134,7 +188,7 @@ bool testcase_hill::run() {
}
}
- while (serial_count > 0) {
+ while (serial_count > 1) {
if (unlikely(!keyvalue_maker.increment(serial_count, -2)))
failure("downhill: unexpected key-space underflow");
@@ -150,26 +204,48 @@ bool testcase_hill::run() {
generate_pair(a_serial, a_key, a_data_0, 0);
generate_pair(a_serial, a_key, a_data_1, age_shift);
checkdata("downhill: update-a", dbi, a_key->value, a_data_0->value);
- int rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_1->value,
- &a_data_0->value, update_flags);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_put(update-a: 0->1)", rc);
+ err = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_1->value,
+ &a_data_0->value, update_flags);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("downhill: bailout at update-a due '%s'",
+ mdbx_strerror(err));
+ txn_end(true);
+ break;
+ }
+ failure_perror("mdbx_put(update-a: 0->1)", err);
+ }
if (++txn_nops >= config.params.batch_write) {
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err));
+ break;
+ }
txn_nops = 0;
}
// создаем вторую запись из пары
log_trace("downhill: insert-b %" PRIu64, b_serial);
generate_pair(b_serial, b_key, b_data, 0);
- rc = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value,
- insert_flags);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_put(insert-b)", rc);
+ err = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value,
+ insert_flags);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("downhill: bailout at insert-a due '%s'",
+ mdbx_strerror(err));
+ txn_end(true);
+ break;
+ }
+ failure_perror("mdbx_put(insert-b)", err);
+ }
if (++txn_nops >= config.params.batch_write) {
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err));
+ break;
+ }
txn_nops = 0;
}
@@ -177,38 +253,67 @@ bool testcase_hill::run() {
log_trace("downhill: delete-a (age %" PRIu64 ") %" PRIu64, age_shift,
a_serial);
checkdata("downhill: delete-a", dbi, a_key->value, a_data_1->value);
- rc = mdbx_del(txn_guard.get(), dbi, &a_key->value, &a_data_1->value);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_del(a)", rc);
+ err = mdbx_del(txn_guard.get(), dbi, &a_key->value, &a_data_1->value);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("downhill: bailout at delete-a due '%s'",
+ mdbx_strerror(err));
+ txn_end(true);
+ break;
+ }
+ failure_perror("mdbx_del(a)", err);
+ }
if (++txn_nops >= config.params.batch_write) {
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err));
+ break;
+ }
txn_nops = 0;
}
// удаляем вторую запись
log_trace("downhill: delete-b %" PRIu64, b_serial);
checkdata("downhill: delete-b", dbi, b_key->value, b_data->value);
- rc = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_del(b)", rc);
+ err = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("downhill: bailout at delete-b due '%s'",
+ mdbx_strerror(err));
+ txn_end(true);
+ break;
+ }
+ failure_perror("mdbx_del(b)", err);
+ }
if (++txn_nops >= config.params.batch_write) {
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err));
+ break;
+ }
txn_nops = 0;
}
report(1);
}
- if (txn_guard)
- txn_end(false);
+ if (txn_guard) {
+ err = breakable_commit();
+ if (unlikely(err != MDBX_SUCCESS))
+ log_notice("downhill: bailout at commit due '%s'", mdbx_strerror(err));
+ }
if (dbi) {
if (config.params.drop_table && !mode_readonly()) {
txn_begin(false);
db_table_drop(dbi);
- txn_end(false);
+ err = breakable_commit();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("hill: bailout-clean due '%s'", mdbx_strerror(err));
+ return true;
+ }
} else
db_table_close(dbi);
}
diff --git a/libs/libmdbx/src/test/keygen.cc b/libs/libmdbx/src/test/keygen.cc
index 30cdf7a571..0110b049bf 100644
--- a/libs/libmdbx/src/test/keygen.cc
+++ b/libs/libmdbx/src/test/keygen.cc
@@ -184,7 +184,8 @@ bool maker::increment(serial_t &serial, int delta) const {
}
serial_t target = serial + (int64_t)delta;
- if (target > mask(mapping.width)) {
+ if (target > mask(mapping.width) ||
+ ((delta > 0) ? target < serial : target > serial)) {
log_extra("keygen-increment: %" PRIu64 "%-d => %" PRIu64 ", overflow",
serial, delta, target);
return false;
diff --git a/libs/libmdbx/src/test/long_stochastic.sh b/libs/libmdbx/src/test/long_stochastic.sh
new file mode 100644
index 0000000000..954d18268a
--- /dev/null
+++ b/libs/libmdbx/src/test/long_stochastic.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+if ! which make cc c++ tee lz4 >/dev/null; then
+ echo "Please install the following prerequisites: make cc c++ tee lz4" >&2
+ exit 1
+fi
+
+set -euo pipefail
+
+UNAME="$(uname -s 2>/dev/null || echo Unknown)"
+case ${UNAME} in
+ Linux)
+ MAKE=make
+ if [[ ! -v TESTDB_DIR || -z "$TESTDB_DIR" ]]; then
+ TESTDB_DIR="/dev/shm/mdbx-test.$$"
+ fi
+ mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/*
+ if LC_ALL=C free | grep -q -i available; then
+ ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s [:blank:] ' ' | cut -d ' ' -f 7) / 1024))
+ else
+ ram_avail_mb=$(($(LC_ALL=C free | grep -i Mem: | tr -s [:blank:] ' ' | cut -d ' ' -f 4) / 1024))
+ fi
+ ;;
+ FreeBSD)
+ MAKE=gmake
+ if [[ ! -v TESTDB_DIR || -z "$TESTDB_DIR" ]]; then
+ for old_test_dir in $(ls -d /tmp/mdbx-test.[0-9]*); do
+ umount $old_test_dir && rm -r $old_test_dir
+ done
+ TESTDB_DIR="/tmp/mdbx-test.$$"
+ rm -rf $TESTDB_DIR && mkdir -p $TESTDB_DIR && mount -t tmpfs tmpfs $TESTDB_DIR
+ else
+ mkdir -p $TESTDB_DIR && rm -f $TESTDB_DIR/*
+ fi
+ ram_avail_mb=$(($(LC_ALL=C vmstat -s | grep -ie '[0-9] pages free$' | cut -d p -f 1) * ($(LC_ALL=C vmstat -s | grep -ie '[0-9] bytes per page$' | cut -d b -f 1) / 1024) / 1024))
+ ;;
+ *)
+ echo "FIXME: ${UNAME} not supported by this script"
+ exit 2
+ ;;
+esac
+
+echo "=== ${ram_avail_mb}M RAM available"
+ram_reserve4logs_mb=1234
+if [ $ram_avail_mb -lt $ram_reserve4logs_mb ]; then
+ echo "=== At least ${ram_reserve4logs_mb}Mb RAM required"
+ exit 3
+fi
+
+#
+# В режимах отличных от MDBX_WRITEMAP изменения до записи в файл
+# будут накапливаться в памяти, что может потребовать свободной
+# памяти размером с БД. Кроме этого, в тест входит сценарий
+# создания копия БД на ходу. Поэтому БД не может быть больше 1/3
+# от доступной памяти. Однако, следует учесть что malloc() будет
+# не сразу возвращать выделенную память системе, а также
+# предусмотреть места для логов.
+#
+# In non-MDBX_WRITEMAP modes, updates (dirty pages) accumulate in
+# memory before being written to disk, which may require free
+# memory up to the size of the whole database. In addition, the
+# test includes a scenario that creates a copy of the database on
+# the fly. Therefore, the database cannot be larger than 1/3 of the
+# available memory. Moreover, it should be taken into account that
+# malloc() will not return allocated memory to the system
+# immediately, and some space is also required for logs.
+#
+db_size_mb=$(((ram_avail_mb - ram_reserve4logs_mb) / 4))
+if [ $db_size_mb -gt 3072 ]; then
+ db_size_mb=3072
+fi
+echo "=== use ${db_size_mb}M for DB"
+
+${MAKE} TESTDB=${TESTDB_DIR}/smoke.db TESTLOG=${TESTDB_DIR}/smoke.log check
+rm -f ${TESTDB_DIR}/*
+
+###############################################################################
+
+# rep9 N: print the digit '9' repeated N times, without a trailing newline.
+function rep9 {
+  local width=$1
+  # Pad an empty string to N blanks, then turn every blank into '9'.
+  printf '%*s' "$width" '' | tr ' ' '9'
+}
+# join SEP ITEM...: print all ITEMs on one line, separated by the first
+# character of SEP.
+function join {
+  local separator=$1
+  shift
+  # "$*" glues the positional parameters with the first character of IFS.
+  local IFS="$separator"
+  echo "$*"
+}
+function bit2option { local -n arr=$1; (( ($2&(1<<$3)) != 0 )) && echo -n '+' || echo -n '-'; echo "${arr[$3]}"; }
+
+options=(writemap coalesce lifo)
+
+# bits2list ARRAY_NAME BITS: print a comma-separated list with one
+# bit2option result for every element of the array named ARRAY_NAME.
+function bits2list {
+  local -n arr=$1
+  local list=()
+  local idx=0
+  while (( idx < ${#arr[@]} )); do
+    list+=("$(bit2option "$1" "$2" "$idx")")
+    # Plain assignment: a `(( idx++ ))` statement returns non-zero when the
+    # old value is 0, which would trip `set -e`.
+    idx=$((idx + 1))
+  done
+  join , "${list[@]}"
+}
+
+# probe ARGS...: run one pass of ./mdbx_test with ARGS and check the result.
+#
+# Reads the variable `caption` for the log header and TESTDB_DIR for all
+# file locations. Steps, each run only if the previous one succeeded:
+#   1. empty TESTDB_DIR;
+#   2. run ./mdbx_test, storing its lz4-compressed output in long.log.lz4;
+#   3. run ./mdbx_chk on long.db;
+#   4. run ./mdbx_chk on long.db-copy when that file exists.
+# On any failure "FAILED" is printed and the script exits with status 1.
+function probe {
+  echo "=============================================== $(date)"
+  echo "${caption}: $*"
+  # The failure handler is a `{ ...; }` group rather than a subshell so that
+  # `exit 1` ends the script itself instead of relying on `set -e` to react
+  # to the subshell's status.
+  rm -f "${TESTDB_DIR}"/* \
+    && ./mdbx_test --ignore-dbfull --repeat=42 --pathname="${TESTDB_DIR}/long.db" "$@" | lz4 > "${TESTDB_DIR}/long.log.lz4" \
+    && ./mdbx_chk -nvvv "${TESTDB_DIR}/long.db" | tee "${TESTDB_DIR}/long-chk.log" \
+    && ([ ! -e "${TESTDB_DIR}/long.db-copy" ] || ./mdbx_chk -nvvv "${TESTDB_DIR}/long.db-copy" | tee "${TESTDB_DIR}/long-chk-copy.log") \
+    || { echo "FAILED"; exit 1; }
+}
+
+###############################################################################
+
+# Main test matrix. Every iteration of the innermost loop runs five probes
+# that differ only in the --table flags and the --datalen.max limit.
+# `count` numbers the probes consecutively across the whole run.
+count=0
+# nops: number of '9' digits passed as --nops (2 to 6 digits).
+for nops in $(seq 2 6); do
+# wbatch: number of '9' digits passed as --batch.write, from nops-1 down to 1.
+ for ((wbatch=nops-1; wbatch > 0; --wbatch)); do
+# Number of repetitions for this nops value; it shrinks as nops grows.
+ loops=$(((111 >> nops) / nops + 3))
+# rep runs from 1 to loops (it is incremented inside the loop condition).
+ for ((rep=0; rep++ < loops; )); do
+# bits counts down from 2^N-1 to 0, N being the size of the `options` array,
+# so every on/off combination of the options is passed as --mode.
+ for ((bits=2**${#options[@]}; --bits >= 0; )); do
+# One seed is drawn per iteration and shared by all five probes below.
+ seed=$(($(date +%s) + RANDOM))
+ caption="Probe #$((++count)) int-key,w/o-dups, repeat ${rep} of ${loops}" probe \
+ --pagesize=min --size-upper=${db_size_mb}M --table=+key.integer,-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \
+ --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
+ --keygen.seed=${seed} basic
+ caption="Probe #$((++count)) int-key,with-dups, repeat ${rep} of ${loops}" probe \
+ --pagesize=min --size-upper=${db_size_mb}M --table=+key.integer,+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \
+ --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
+ --keygen.seed=${seed} basic
+ caption="Probe #$((++count)) int-key,int-data, repeat ${rep} of ${loops}" probe \
+ --pagesize=min --size-upper=${db_size_mb}M --table=+key.integer,+data.integer --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \
+ --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
+ --keygen.seed=${seed} basic
+ caption="Probe #$((++count)) w/o-dups, repeat ${rep} of ${loops}" probe \
+ --pagesize=min --size-upper=${db_size_mb}M --table=-data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \
+ --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
+ --keygen.seed=${seed} basic
+ caption="Probe #$((++count)) with-dups, repeat ${rep} of ${loops}" probe \
+ --pagesize=min --size-upper=${db_size_mb}M --table=+data.dups --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \
+ --nops=$( rep9 $nops ) --batch.write=$( rep9 $wbatch ) --mode=$(bits2list options $bits) \
+ --keygen.seed=${seed} basic
+ done
+ done
+ done
+done
+
+echo "=== ALL DONE ====================== $(date)"
diff --git a/libs/libmdbx/src/test/loop.bat b/libs/libmdbx/src/test/loop.bat
deleted file mode 100644
index 0e3b6271c9..0000000000
--- a/libs/libmdbx/src/test/loop.bat
+++ /dev/null
@@ -1,15 +0,0 @@
-@echo off
-
-del test.db test.db-lck
-
-:loop
-
-mdbx_test.exe --pathname=test.db --dont-cleanup-after basic > test.log
-if errorlevel 1 goto fail
-
-mdbx_chk.exe -nvvv test.db > chk.log
-if errorlevel 1 goto fail
-goto loop
-
-:fail
-echo FAILED
diff --git a/libs/libmdbx/src/test/main.cc b/libs/libmdbx/src/test/main.cc
index 9dc3eccbae..959359a515 100644
--- a/libs/libmdbx/src/test/main.cc
+++ b/libs/libmdbx/src/test/main.cc
@@ -27,6 +27,8 @@ void actor_params::set_defaults(const std::string &tmpdir) {
loglevel =
#ifdef NDEBUG
logging::info;
+#elif defined(_WIN32) || defined(_WIN64)
+ logging::verbose;
#else
logging::trace;
#endif
@@ -70,6 +72,7 @@ void actor_params::set_defaults(const std::string &tmpdir) {
inject_writefaultn = 0;
drop_table = false;
+ ignore_dbfull = false;
max_readers = 42;
max_tables = 42;
@@ -179,17 +182,19 @@ int main(int argc, char *const argv[]) {
params.datalen_max = datalen_max;
continue;
}
- if (config::parse_option(argc, argv, narg, "size-lower", params.size_lower,
- mdbx_limits_dbsize_min(params.pagesize),
- mdbx_limits_dbsize_max(params.pagesize)))
+ if (config::parse_option_intptr(argc, argv, narg, "size-lower",
+ params.size_lower,
+ mdbx_limits_dbsize_min(params.pagesize),
+ mdbx_limits_dbsize_max(params.pagesize)))
continue;
- if (config::parse_option(argc, argv, narg, "size", params.size_now,
- mdbx_limits_dbsize_min(params.pagesize),
- mdbx_limits_dbsize_max(params.pagesize)))
+ if (config::parse_option_intptr(argc, argv, narg, "size-upper",
+ params.size_upper,
+ mdbx_limits_dbsize_min(params.pagesize),
+ mdbx_limits_dbsize_max(params.pagesize)))
continue;
- if (config::parse_option(argc, argv, narg, "size-upper", params.size_upper,
- mdbx_limits_dbsize_min(params.pagesize),
- mdbx_limits_dbsize_max(params.pagesize)))
+ if (config::parse_option_intptr(argc, argv, narg, "size", params.size_now,
+ mdbx_limits_dbsize_min(params.pagesize),
+ mdbx_limits_dbsize_max(params.pagesize)))
continue;
if (config::parse_option(
argc, argv, narg, "shrink-threshold", params.shrink_threshold, 0,
@@ -288,6 +293,9 @@ int main(int argc, char *const argv[]) {
continue;
if (config::parse_option(argc, argv, narg, "drop", params.drop_table))
continue;
+ if (config::parse_option(argc, argv, narg, "ignore-dbfull",
+ params.ignore_dbfull))
+ continue;
if (config::parse_option(argc, argv, narg, "dump-config",
global::config::dump_config))
continue;
diff --git a/libs/libmdbx/src/test/osal-unix.cc b/libs/libmdbx/src/test/osal-unix.cc
index fd691e354f..0157bace23 100644
--- a/libs/libmdbx/src/test/osal-unix.cc
+++ b/libs/libmdbx/src/test/osal-unix.cc
@@ -21,6 +21,10 @@
#include <sys/wait.h>
#include <unistd.h>
+#ifdef __APPLE__
+#include "darwin/pthread_barrier.c"
+#endif
+
struct shared_t {
pthread_barrier_t barrier;
pthread_mutex_t mutex;
@@ -199,7 +203,9 @@ retry:
if (WIFEXITED(status))
childs[pid] =
(WEXITSTATUS(status) == EXIT_SUCCESS) ? as_successful : as_failed;
- else if (WIFSIGNALED(status) || WCOREDUMP(status))
+ else if (WCOREDUMP(status))
+ childs[pid] = as_coredump;
+ else if (WIFSIGNALED(status))
childs[pid] = as_killed;
else if (WIFSTOPPED(status))
childs[pid] = as_debuging;
@@ -216,7 +222,7 @@ retry:
if (ts.tv_sec == 0 && ts.tv_nsec == 0)
ts.tv_nsec = 1;
if (nanosleep(&ts, &ts) == 0) {
- /* timeout and no signal fomr child */
+ /* timeout and no signal from child */
pid = 0;
return 0;
}
diff --git a/libs/libmdbx/src/test/osal-windows.cc b/libs/libmdbx/src/test/osal-windows.cc
index 5858e89530..975d8268f5 100644
--- a/libs/libmdbx/src/test/osal-windows.cc
+++ b/libs/libmdbx/src/test/osal-windows.cc
@@ -312,14 +312,22 @@ actor_status osal_actor_info(const mdbx_pid_t pid) {
case EXIT_SUCCESS:
status = as_successful;
break;
- // case EXCEPTION_BREAKPOINT:
+ case EXCEPTION_BREAKPOINT:
case EXCEPTION_SINGLE_STEP:
status = as_debuging;
break;
case STATUS_CONTROL_C_EXIT:
- case EXCEPTION_NONCONTINUABLE_EXCEPTION:
status = as_killed;
break;
+ case EXCEPTION_ACCESS_VIOLATION:
+ case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
+ case EXCEPTION_DATATYPE_MISALIGNMENT:
+ case EXCEPTION_STACK_OVERFLOW:
+ case EXCEPTION_INVALID_DISPOSITION:
+ case EXCEPTION_ILLEGAL_INSTRUCTION:
+ case EXCEPTION_NONCONTINUABLE_EXCEPTION:
+ status = as_coredump;
+ break;
default:
status = as_failed;
break;
diff --git a/libs/libmdbx/src/test/test.cc b/libs/libmdbx/src/test/test.cc
index cf61f1eeee..e9e925e9a9 100644
--- a/libs/libmdbx/src/test/test.cc
+++ b/libs/libmdbx/src/test/test.cc
@@ -55,6 +55,8 @@ const char *status2str(actor_status status) {
return "killed";
case as_failed:
return "failed";
+ case as_coredump:
+ return "coredump";
}
}
@@ -137,6 +139,8 @@ void testcase::db_open() {
if (!db_guard)
db_prepare();
+
+ jitter_delay(true);
int rc = mdbx_env_open(db_guard.get(), config.params.pathname_db.c_str(),
(unsigned)config.params.mode_flags, 0640);
if (unlikely(rc != MDBX_SUCCESS))
@@ -170,20 +174,42 @@ void testcase::txn_begin(bool readonly, unsigned flags) {
flags);
}
+/* Commits the current transaction, tolerating a full database.
+ *
+ * The handle is released from txn_guard before the commit. When the commit
+ * fails with MDBX_MAP_FULL and the ignore_dbfull parameter is set, the
+ * transaction is aborted and MDBX_MAP_FULL is returned so that the caller
+ * can bail out. Any other commit error is passed to failure_perror().
+ *
+ * Returns MDBX_MAP_FULL in the tolerated case, MDBX_SUCCESS otherwise. */
+int testcase::breakable_commit() {
+  log_trace(">> txn_commit");
+  assert(txn_guard);
+
+  MDBX_txn *const handle = txn_guard.release();
+  txn_inject_writefault(handle);
+
+  int result = MDBX_SUCCESS;
+  const int commit_err = mdbx_txn_commit(handle);
+  if (unlikely(commit_err != MDBX_SUCCESS)) {
+    if (commit_err != MDBX_MAP_FULL || !config.params.ignore_dbfull) {
+      failure_perror("mdbx_txn_commit()", commit_err);
+    } else {
+      result = commit_err;
+      /* MDBX_THREAD_MISMATCH from the abort is not treated as a failure. */
+      const int abort_err = mdbx_txn_abort(handle);
+      if (unlikely(abort_err != MDBX_SUCCESS &&
+                   abort_err != MDBX_THREAD_MISMATCH))
+        failure_perror("mdbx_txn_abort()", abort_err);
+    }
+  }
+
+  log_trace("<< txn_commit: %s", result ? "failed" : "Ok");
+  return result;
+}
+
void testcase::txn_end(bool abort) {
log_trace(">> txn_end(%s)", abort ? "abort" : "commit");
assert(txn_guard);
MDBX_txn *txn = txn_guard.release();
if (abort) {
- int rc = mdbx_txn_abort(txn);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_txn_abort()", rc);
+ int err = mdbx_txn_abort(txn);
+ if (unlikely(err != MDBX_SUCCESS && err != MDBX_THREAD_MISMATCH))
+ failure_perror("mdbx_txn_abort()", err);
} else {
txn_inject_writefault(txn);
- int rc = mdbx_txn_commit(txn);
- if (unlikely(rc != MDBX_SUCCESS))
- failure_perror("mdbx_txn_commit()", rc);
+ int err = mdbx_txn_commit(txn);
+ if (unlikely(err != MDBX_SUCCESS))
+ failure_perror("mdbx_txn_commit()", err);
}
log_trace("<< txn_end(%s)", abort ? "abort" : "commit");
@@ -211,6 +237,16 @@ void testcase::cursor_close() {
log_trace("<< cursor_close()");
}
+/* Ends the current write batch and starts the next one.
+ *
+ * Commits the open transaction (if any) through breakable_commit(), closes
+ * the cursor if one is open, and then begins a new non-readonly transaction
+ * regardless of the commit outcome.
+ *
+ * Returns the result of breakable_commit(), or MDBX_SUCCESS when there was
+ * no transaction to commit. */
+int testcase::breakable_restart() {
+  const int commit_status = txn_guard ? breakable_commit() : MDBX_SUCCESS;
+  if (cursor_guard)
+    cursor_close();
+  txn_begin(false, 0);
+  return commit_status;
+}
+
void testcase::txn_restart(bool abort, bool readonly, unsigned flags) {
if (txn_guard)
txn_end(abort);
@@ -394,6 +430,28 @@ void testcase::update_canary(uint64_t increment) {
log_trace("<< update_canary: sequence = %" PRIu64, canary_now.y);
}
+/* Opens the database and prepares an empty table for a test run.
+ *
+ * Each attempt begins a transaction, opens the table with create=true,
+ * clears it and commits through breakable_commit(). On success a new
+ * transaction is begun and MDBX_SUCCESS is returned with `dbi` holding the
+ * table handle. A failed commit is retried after jitter_delay(true), up to
+ * 42 attempts in total; after the last failure the error is logged and
+ * returned. */
+int testcase::db_open__begin__table_create_open_clean(MDBX_dbi &dbi) {
+  db_open();
+
+  int err;
+  int attempt = 0;
+  do {
+    /* Pause between attempts, but not before the first one. */
+    if (attempt > 0)
+      jitter_delay(true);
+
+    txn_begin(false);
+    dbi = db_table_open(true);
+    db_table_clear(dbi);
+    err = breakable_commit();
+    if (likely(err == MDBX_SUCCESS)) {
+      txn_begin(false);
+      return MDBX_SUCCESS;
+    }
+  } while (++attempt < 42);
+
+  log_notice("db_begin_table_create_open_clean: bailout due '%s'",
+             mdbx_strerror(err));
+  return err;
+}
+
MDBX_dbi testcase::db_table_open(bool create) {
log_trace(">> testcase::db_table_create");
@@ -513,22 +571,27 @@ bool test_execute(const actor_config &config_const) {
if (!test->setup()) {
log_notice("test setup failed");
return false;
- } else if (!test->run()) {
+ }
+ if (!test->run()) {
log_notice("test failed");
return false;
- } else if (!test->teardown()) {
+ }
+ if (!test->teardown()) {
log_notice("test teardown failed");
return false;
- } else {
- if (config.params.nrepeat == 1)
- log_info("test successed");
- else if (config.params.nrepeat == 1)
+ }
+
+ if (config.params.nrepeat == 1)
+ log_info("test successed");
+ else {
+ if (config.params.nrepeat)
log_info("test successed (iteration %zi of %zi)", iter,
size_t(config.params.nrepeat));
else
log_info("test successed (iteration %zi)", iter);
config.params.keygen.seed += INT32_C(0xA4F4D37B);
}
+
} while (config.params.nrepeat == 0 || iter < config.params.nrepeat);
return true;
} catch (const std::exception &pipets) {
diff --git a/libs/libmdbx/src/test/test.h b/libs/libmdbx/src/test/test.h
index 117a66f93c..fb5ad4ee83 100644
--- a/libs/libmdbx/src/test/test.h
+++ b/libs/libmdbx/src/test/test.h
@@ -105,7 +105,9 @@ protected:
void db_open();
void db_close();
void txn_begin(bool readonly, unsigned flags = 0);
+ int breakable_commit();
void txn_end(bool abort);
+ int breakable_restart();
void txn_restart(bool abort, bool readonly, unsigned flags = 0);
void cursor_open(unsigned dbi);
void cursor_close();
@@ -121,6 +123,7 @@ protected:
void db_table_drop(MDBX_dbi handle);
void db_table_clear(MDBX_dbi handle);
void db_table_close(MDBX_dbi handle);
+ int db_open__begin__table_create_open_clean(MDBX_dbi &dbi);
bool wait4start();
void report(size_t nops_done);
diff --git a/libs/libmdbx/src/test/ttl.cc b/libs/libmdbx/src/test/ttl.cc
index 1ecfd0c7e5..b2650f0c13 100644
--- a/libs/libmdbx/src/test/ttl.cc
+++ b/libs/libmdbx/src/test/ttl.cc
@@ -29,12 +29,12 @@ static unsigned edge2count(uint64_t edge, unsigned count_max) {
}
bool testcase_ttl::run() {
- db_open();
-
- txn_begin(false);
- MDBX_dbi dbi = db_table_open(true);
- db_table_clear(dbi);
- txn_end(false);
+ MDBX_dbi dbi;
+ int err = db_open__begin__table_create_open_clean(dbi);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("ttl: bailout-prepare due '%s'", mdbx_strerror(err));
+ return true;
+ }
/* LY: тест "эмуляцией time-to-live":
* - организуется "скользящее окно", которое двигается вперед вдоль
@@ -53,11 +53,26 @@ bool testcase_ttl::run() {
*/
/* LY: для параметризации используем подходящие параметры, которые не имеют
- * здесь смысла в первоначальном значении */
- const unsigned window_max =
- (config.params.batch_read > 999) ? config.params.batch_read : 1000;
- const unsigned count_max =
- (config.params.batch_write > 999) ? config.params.batch_write : 1000;
+ * здесь смысла в первоначальном значении. */
+ const unsigned window_max_lower =
+#ifdef __APPLE__
+ 333;
+#else
+ 999;
+#endif
+ const unsigned count_max_lower =
+#ifdef __APPLE__
+ 333;
+#else
+ 999;
+#endif
+
+ const unsigned window_max = (config.params.batch_read > window_max_lower)
+ ? config.params.batch_read
+ : window_max_lower;
+ const unsigned count_max = (config.params.batch_write > count_max_lower)
+ ? config.params.batch_write
+ : count_max_lower;
log_info("ttl: using `batch_read` value %u for window_max", window_max);
log_info("ttl: using `batch_write` value %u for count_max", count_max);
@@ -73,15 +88,13 @@ bool testcase_ttl::run() {
std::deque<std::pair<uint64_t, unsigned>> fifo;
uint64_t serial = 0;
while (should_continue()) {
- if (!txn_guard)
- txn_begin(false);
const uint64_t salt = prng64_white(seed) /* mdbx_txn_id(txn_guard.get()) */;
const unsigned window_width = edge2window(salt, window_max);
- const unsigned head_count = edge2count(salt, count_max);
- log_info("ttl: step #%zu (serial %" PRIu64
- ", window %u, count %u) salt %" PRIu64,
- nops_completed, serial, window_width, head_count, salt);
+ unsigned head_count = edge2count(salt, count_max);
+ log_verbose("ttl: step #%zu (serial %" PRIu64
+ ", window %u, count %u) salt %" PRIu64,
+ nops_completed, serial, window_width, head_count, salt);
if (window_width) {
while (fifo.size() > window_width) {
@@ -93,9 +106,14 @@ bool testcase_ttl::run() {
for (unsigned n = 0; n < tail_count; ++n) {
log_trace("ttl: remove-tail %" PRIu64, serial);
generate_pair(tail_serial);
- int err = mdbx_del(txn_guard.get(), dbi, &key->value, &data->value);
- if (unlikely(err != MDBX_SUCCESS))
+ err = mdbx_del(txn_guard.get(), dbi, &key->value, &data->value);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("ttl: tail-bailout due '%s'", mdbx_strerror(err));
+ goto bailout;
+ }
failure_perror("mdbx_del(tail)", err);
+ }
if (unlikely(!keyvalue_maker.increment(tail_serial, 1)))
failure("ttl: unexpected key-space overflow on the tail");
}
@@ -106,30 +124,54 @@ bool testcase_ttl::run() {
fifo.clear();
}
- txn_restart(false, false);
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("ttl: bailout at commit due '%s'", mdbx_strerror(err));
+ break;
+ }
fifo.push_front(std::make_pair(serial, head_count));
-
+ retry:
for (unsigned n = 0; n < head_count; ++n) {
log_trace("ttl: insert-head %" PRIu64, serial);
generate_pair(serial);
- int err = mdbx_put(txn_guard.get(), dbi, &key->value, &data->value,
- insert_flags);
- if (unlikely(err != MDBX_SUCCESS))
+ err = mdbx_put(txn_guard.get(), dbi, &key->value, &data->value,
+ insert_flags);
+ if (unlikely(err != MDBX_SUCCESS)) {
+ if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) {
+ log_notice("ttl: head-insert skip due '%s'", mdbx_strerror(err));
+ txn_restart(true, false);
+ serial = fifo.front().first;
+ fifo.front().second = head_count = n;
+ goto retry;
+ }
failure_perror("mdbx_put(head)", err);
+ }
- if (unlikely(!keyvalue_maker.increment(serial, 1)))
- failure("uphill: unexpected key-space overflow");
+ if (unlikely(!keyvalue_maker.increment(serial, 1))) {
+ log_notice("ttl: unexpected key-space overflow");
+ goto bailout;
+ }
+ }
+ err = breakable_restart();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("ttl: head-commit skip due '%s'", mdbx_strerror(err));
+ serial = fifo.front().first;
+ fifo.pop_front();
}
-
- txn_end(false);
report(1);
}
+bailout:
+ txn_end(true);
if (dbi) {
if (config.params.drop_table && !mode_readonly()) {
txn_begin(false);
db_table_drop(dbi);
- txn_end(false);
+ err = breakable_commit();
+ if (unlikely(err != MDBX_SUCCESS)) {
+ log_notice("ttl: bailout-clean due '%s'", mdbx_strerror(err));
+ return true;
+ }
} else
db_table_close(dbi);
}
diff --git a/libs/libmdbx/src/test/utils.cc b/libs/libmdbx/src/test/utils.cc
index ddf47a4cd9..d9b3538b99 100644
--- a/libs/libmdbx/src/test/utils.cc
+++ b/libs/libmdbx/src/test/utils.cc
@@ -17,6 +17,9 @@
#if defined(HAVE_IEEE754_H) || __has_include(<ieee754.h>)
#include <ieee754.h>
#endif
+#if defined(__APPLE__) || defined(__MACH__)
+#include <mach/mach_time.h>
+#endif /* defined(__APPLE__) || defined(__MACH__) */
std::string format(const char *fmt, ...) {
va_list ap, ones;
@@ -353,7 +356,7 @@ void jitter_delay(bool extra) {
cpu_relax();
if (dice > 2) {
unsigned us = entropy_white() &
- (extra ? 0xfffff /* 1.05 s */ : 0x3ff /* 1 ms */);
+ (extra ? 0xffff /* 656 ms */ : 0x3ff /* 1 ms */);
log_trace("== jitter.delay: %0.6f", us / 1000000.0);
osal_udelay(us);
}
diff --git a/libs/libmdbx/src/test/utils.h b/libs/libmdbx/src/test/utils.h
index efda8394fc..d1b859acd4 100644
--- a/libs/libmdbx/src/test/utils.h
+++ b/libs/libmdbx/src/test/utils.h
@@ -26,9 +26,14 @@
#endif
#if __GNUC_PREREQ(4, 4) || defined(__clang__)
+#ifndef bswap64
#define bswap64(v) __builtin_bswap64(v)
+#endif
+#ifndef bswap32
#define bswap32(v) __builtin_bswap32(v)
-#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#endif
+#if (__GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)) && \
+ !defined(bswap16)
#define bswap16(v) __builtin_bswap16(v)
#endif
@@ -184,52 +189,9 @@ static __inline uint64_t rot64(uint64_t v, unsigned s) {
}
#endif /* rot64 */
-#ifndef mul_32x32_64
-static __inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) {
- return a * (uint64_t)b;
-}
-#endif /* mul_32x32_64 */
-
-#ifndef mul_64x64_128
-
-static __inline unsigned add_with_carry(uint64_t *sum, uint64_t addend) {
- *sum += addend;
- return (*sum < addend) ? 1u : 0u;
-}
-
-static __inline uint64_t mul_64x64_128(uint64_t a, uint64_t b, uint64_t *h) {
-#if defined(__SIZEOF_INT128__) || \
- (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
- __uint128_t r = (__uint128_t)a * (__uint128_t)b;
- /* modern GCC could nicely optimize this */
- *h = r >> 64;
- return r;
-#elif defined(mul_64x64_high)
- *h = mul_64x64_high(a, b);
- return a * b;
-#else
- /* performs 64x64 to 128 bit multiplication */
- uint64_t ll = mul_32x32_64((uint32_t)a, (uint32_t)b);
- uint64_t lh = mul_32x32_64(a >> 32, (uint32_t)b);
- uint64_t hl = mul_32x32_64((uint32_t)a, b >> 32);
- *h = mul_32x32_64(a >> 32, b >> 32) + (lh >> 32) + (hl >> 32) +
- add_with_carry(&ll, lh << 32) + add_with_carry(&ll, hl << 32);
- return ll;
-#endif
-}
-
-#endif /* mul_64x64_128() */
-
-#ifndef mul_64x64_high
-static __inline uint64_t mul_64x64_high(uint64_t a, uint64_t b) {
- uint64_t h;
- mul_64x64_128(a, b, &h);
- return h;
-}
-#endif /* mul_64x64_high */
-
static __inline bool is_power2(size_t x) { return (x & (x - 1)) == 0; }
+#undef roundup2
static __inline size_t roundup2(size_t value, size_t granularity) {
assert(is_power2(granularity));
return (value + granularity - 1) & ~(granularity - 1);
@@ -285,18 +247,20 @@ struct simple_checksum {
simple_checksum() : value(0) {}
- void push(uint32_t data) {
+ void push(const uint32_t &data) {
value += data * UINT64_C(9386433910765580089) + 1;
value ^= value >> 41;
value *= UINT64_C(0xBD9CACC22C6E9571);
}
- void push(uint64_t data) {
+ void push(const uint64_t &data) {
push((uint32_t)data);
push((uint32_t)(data >> 32));
}
- void push(bool data) { push(data ? UINT32_C(0x780E) : UINT32_C(0xFA18E)); }
+ void push(const bool data) {
+ push(data ? UINT32_C(0x780E) : UINT32_C(0xFA18E));
+ }
void push(const void *ptr, size_t bytes) {
const uint8_t *data = (const uint8_t *)ptr;
@@ -309,7 +273,7 @@ struct simple_checksum {
void push(const std::string &str) { push(str.data(), str.size()); }
void push(unsigned salt, const MDBX_val &val) {
- push(val.iov_len);
+ push(unsigned(val.iov_len));
push(salt);
push(val.iov_base, val.iov_len);
}