diff options
69 files changed, 15704 insertions, 10048 deletions
diff --git a/libs/libmdbx/libmdbx.vcxproj b/libs/libmdbx/libmdbx.vcxproj index 6c9a5d07d3..e7d489ca88 100644 --- a/libs/libmdbx/libmdbx.vcxproj +++ b/libs/libmdbx/libmdbx.vcxproj @@ -28,18 +28,21 @@ <ClCompile> <PreprocessorDefinitions>MDBX_CONFIG_MANUAL_TLS_CALLBACK=1;MDBX_DEBUG=1;LIBMDBX_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions> </ClCompile> + <Link> + <AdditionalDependencies>ntdll.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> </ItemDefinitionGroup> <ItemGroup> - <ClCompile Include="src\src\lck-windows.c"> + <ClCompile Include="src\src\elements\lck-windows.c"> <PrecompiledHeader>NotUsing</PrecompiledHeader> </ClCompile> - <ClCompile Include="src\src\mdbx.c"> + <ClCompile Include="src\src\elements\core.c"> <PrecompiledHeader>NotUsing</PrecompiledHeader> </ClCompile> - <ClCompile Include="src\src\osal.c"> + <ClCompile Include="src\src\elements\osal.c"> <PrecompiledHeader>NotUsing</PrecompiledHeader> </ClCompile> - <ClCompile Include="src\src\version.c"> + <ClCompile Include="src\src\elements\version.c"> <PrecompiledHeader>NotUsing</PrecompiledHeader> </ClCompile> </ItemGroup> diff --git a/libs/libmdbx/src/.gitignore b/libs/libmdbx/src/.gitignore index 2ea34c8009..0496b62164 100644 --- a/libs/libmdbx/src/.gitignore +++ b/libs/libmdbx/src/.gitignore @@ -12,15 +12,11 @@ .idea .le.ini .vs/ -Win32/ -build-* cmake-build-* +@* core -example +mdbx_example libmdbx.creator.user -mdbx-dll.VC.VC.opendb -mdbx-dll.VC.db -mdbx-dll.vcxproj.filters mdbx_chk mdbx_copy mdbx_dump @@ -28,11 +24,12 @@ mdbx_load mdbx_stat mdbx_test test.log -test/test.vcxproj.user test/tmp.db test/tmp.db-lck tmp.db tmp.db-lck valgrind.* -x64/ -x86/ +src/elements/version.c +src/elements/config.h +dist/ +*.tar* diff --git a/libs/libmdbx/src/.travis.yml b/libs/libmdbx/src/.travis.yml index e46991f597..c1bd6da5e9 100644 --- a/libs/libmdbx/src/.travis.yml +++ b/libs/libmdbx/src/.travis.yml @@ -1,15 +1,44 @@ -language: c -dist: xenial +language: c cpp -compiler: -- gcc -- 
clang +matrix: + include: + - os: linux + dist: trusty + env: CC=cc CXX=c++ + - os: linux + dist: trusty + compiler: clang + env: CC=clang CXX=clang++ + - os: linux + dist: xenial + compiler: gcc + env: CC=gcc CXX=g++ + - os: linux + dist: bionic + compiler: clang + env: CC=clang CXX=clang++ + - os: osx + osx_image: xcode11 + env: CC=cc CXX=c++ + - os: osx + osx_image: xcode9.4 + env: CC=cc CXX=c++ -os: -- linux -- osx - -script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all check; fi +script: > + if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then + git fetch --unshallow --tags --prune && + git submodule foreach --recursive git fetch --unshallow --tags --prune && + (if which clang-format-6.0 > /dev/null && make reformat && [[ -n $(git diff) ]]; + then + echo "You must run 'make reformat' before submitting a pull request"; + echo ""; + git diff; + exit -1; + fi) && + make --keep-going all && MALLOC_CHECK_=7 MALLOC_PERTURB_=42 make --keep-going check + else + [ ! -s cov-int/scm_log.txt ] || cat cov-int/scm_log.txt; + fi && sleep 3 env: global: @@ -17,6 +46,8 @@ env: before_install: - echo -n | openssl s_client -connect scan.coverity.com:443 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | sudo tee -a /etc/ssl/certs/ca- + - ${CC} --version + - ${CXX} --version addons: coverity_scan: @@ -25,6 +56,6 @@ addons: version: 0.1 description: "Build submitted via Travis CI" notification_email: leo@yuriev.ru - build_command_prepend: "make clean" - build_command: "make all -j 2" + build_command_prepend: "git fetch --unshallow --tags --prune && make dist" + build_command: "make MDBX_OPTIONS=-DMDBX_DEBUG=2 -C dist all" branch_pattern: coverity_scan diff --git a/libs/libmdbx/src/AUTHORS b/libs/libmdbx/src/AUTHORS index 0a1a199682..10910e5744 100644 --- a/libs/libmdbx/src/AUTHORS +++ b/libs/libmdbx/src/AUTHORS @@ -29,3 +29,4 @@ Quanah Gibson-Mount <quanah@openldap.org> Salvador Ortiz <sog@msg.com.mx> Sebastien Launay <sebastien@slaunay.fr> Vladimir Romanov 
<vromanov@gmail.com> +Zano Foundation <crypto.sowle@gmail.com> diff --git a/libs/libmdbx/src/CMakeLists.dist-minimal b/libs/libmdbx/src/CMakeLists.dist-minimal new file mode 100644 index 0000000000..48592297a6 --- /dev/null +++ b/libs/libmdbx/src/CMakeLists.dist-minimal @@ -0,0 +1,308 @@ +## +## This is the minimal template for CMakeLists.txt which could be used +## to build libmdbx from the "amalgamated form" of libmdbx's source code. +## +## The amalgamated form is intended for embedding libmdbx in other projects +## in cases when using it as a git submodule is not acceptable or is inconvenient. +## +## The amalgamated form can be generated from the full git repository +## on Linux just by `make dist`. +## + +## +## Copyright 2019 Leonid Yuriev <leo@yuriev.ru> +## and other libmdbx authors: please see AUTHORS file. +## All rights reserved. +## +## Redistribution and use in source and binary forms, with or without +## modification, are permitted only as authorized by the OpenLDAP +## Public License. +## +## A copy of this license is available in the file LICENSE in the +## top-level directory of the distribution or, alternatively, at +## <http://www.OpenLDAP.org/license.html>. +## + +## +## libmdbx = { Revised and extended descendant of Symas LMDB. } +## Please see README.md at https://github.com/leo-yuriev/libmdbx +## +## Libmdbx is superior to LMDB in terms of features and reliability, +## not inferior in performance. libmdbx works on Linux, FreeBSD, MacOS X +## and other systems compliant with POSIX.1-2008, but also support Windows +## as a complementary platform. +## +## The next version is under active non-public development and will be +## released as MithrilDB and libmithrildb for libraries & packages. +## Admittedly mythical Mithril is resembling silver but being stronger and +## lighter than steel. Therefore MithrilDB is rightly relevant name. 
+## +## MithrilDB will be radically different from libmdbx by the new database +## format and API based on C++17, as well as the Apache 2.0 License. +## The goal of this revolution is to provide a clearer and robust API, +## add more features and new valuable properties of database. +## +## The Future will (be) Positive. Всё будет хорошо. +## + +cmake_minimum_required(VERSION 3.8.2) +cmake_policy(PUSH) +cmake_policy(VERSION 3.8.2) +if(NOT CMAKE_VERSION VERSION_LESS 3.9) + cmake_policy(SET CMP0069 NEW) +endif() +if(NOT CMAKE_VERSION VERSION_LESS 3.12) + cmake_policy(SET CMP0075 NEW) +endif() +if(NOT CMAKE_VERSION VERSION_LESS 3.13) + cmake_policy(SET CMP0077 NEW) +endif() + +if(DEFINED PROJECT_NAME) + set(SUBPROJECT ON) + set(NOT_SUBPROJECT OFF) +else() + set(SUBPROJECT OFF) + set(NOT_SUBPROJECT ON) + project(libmdbx C CXX) +endif() + +find_package(Threads REQUIRED) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE MinSizeRel CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." 
+ FORCE) +endif() + +list(FIND CMAKE_C_COMPILE_FEATURES c_std_11 HAS_C11) +if(NOT HAS_C11 LESS 0) + set(MDBX_C_STANDARD 11) +else() + set(MDBX_C_STANDARD 99) +endif() +message(STATUS "Use C${MDBX_C_STANDARD} for libmdbx") + +# not supported by this (minimal) script +add_definitions(-DMDBX_AVOID_CRT=0) + +# provide build timestamp +string(TIMESTAMP MDBX_BUILD_TIMESTAMP UTC) +add_definitions(-DMDBX_BUILD_TIMESTAMP="${MDBX_BUILD_TIMESTAMP}") + +# provide compiler info +execute_process(COMMAND sh -c "${CMAKE_C_COMPILER} --version | head -1" + OUTPUT_VARIABLE MDBX_BUILD_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE rc) +if(rc OR NOT MDBX_BUILD_COMPILER) + string(STRIP "${CMAKE_C_COMPILER_ID}-${CMAKE_C_COMPILER_VERSION}" MDBX_BUILD_COMPILER) +endif() +add_definitions(-DMDBX_BUILD_COMPILER="${MDBX_BUILD_COMPILER}") + +# provide cpu/arch-system pair +if(CMAKE_C_COMPILER_TARGET) + set(MDBX_BUILD_TARGET "${CMAKE_C_COMPILER_TARGET}") +elseif(CMAKE_C_PLATFORM_ID AND NOT CMAKE_C_PLATFORM_ID STREQUAL CMAKE_SYSTEM_NAME) + string(STRIP "${CMAKE_C_PLATFORM_ID}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +elseif(CMAKE_LIBRARY_ARCHITECTURE) + string(STRIP "${CMAKE_LIBRARY_ARCHITECTURE}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +elseif(CMAKE_GENERATOR_PLATFORM AND NOT CMAKE_C_PLATFORM_ID STREQUAL CMAKE_SYSTEM_NAME) + string(STRIP "${CMAKE_GENERATOR_PLATFORM}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +elseif(CMAKE_SYSTEM_ARCH) + string(STRIP "${CMAKE_SYSTEM_ARCH}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +else() + string(STRIP "${CMAKE_SYSTEM_PROCESSOR}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +endif() +add_definitions(-DMDBX_BUILD_TARGET="${MDBX_BUILD_TARGET}") + +# provide build target-config +if(CMAKE_CONFIGURATION_TYPES) + add_definitions(-DMDBX_BUILD_CONFIG="$<CONFIG>") +else() + add_definitions(-DMDBX_BUILD_CONFIG="${CMAKE_BUILD_TYPE}") +endif() + +# provide build cflags +set(MDBX_BUILD_FLAGS "") +list(APPEND MDBX_BUILD_FLAGS ${CMAKE_C_FLAGS}) +list(APPEND 
MDBX_BUILD_FLAGS ${CMAKE_C_DEFINES}) +if(CMAKE_CONFIGURATION_TYPES) + add_definitions(-DMDBX_BUILD_FLAGS_CONFIG="$<$<CONFIG:Debug>:${CMAKE_C_FLAGS_DEBUG} ${CMAKE_C_DEFINES_DEBUG}>$<$<CONFIG:Release>:${CMAKE_C_FLAGS_RELEASE} ${CMAKE_C_DEFINES_RELEASE}>$<$<CONFIG:RelWithDebInfo>:${CMAKE_C_FLAGS_RELWITHDEBINFO} ${CMAKE_C_DEFINES_RELWITHDEBINFO}>$<$<CONFIG:MinSizeRel>:${CMAKE_C_FLAGS_MINSIZEREL} ${CMAKE_C_DEFINES_MINSIZEREL}>") +else() + string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPERCASE) + list(APPEND MDBX_BUILD_FLAGS ${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPERCASE}}) + list(APPEND MDBX_BUILD_FLAGS ${CMAKE_C_DEFINES_${CMAKE_BUILD_TYPE_UPPERCASE}}) +endif() +list(REMOVE_DUPLICATES MDBX_BUILD_FLAGS) +string(REPLACE ";" " " MDBX_BUILD_FLAGS "${MDBX_BUILD_FLAGS}") +add_definitions(-DMDBX_BUILD_FLAGS="${MDBX_BUILD_FLAGS}") + +# shared library +if(NOT DEFINED MDBX_BUILD_SHARED_LIBRARY) + if(DEFINED BUILD_SHARED_LIBS) + option(MDBX_BUILD_SHARED_LIBRARY "Build libmdbx as shared library (DLL)" ${BUILD_SHARED_LIBS}) + else() + option(MDBX_BUILD_SHARED_LIBRARY "Build libmdbx as shared library (DLL)" ON) + endif() +endif() +if(MDBX_BUILD_SHARED_LIBRARY) + add_library(mdbx SHARED mdbx.c mdbx.h) + set_target_properties(mdbx PROPERTIES + C_STANDARD ${MDBX_C_STANDARD} C_STANDARD_REQUIRED ON + PUBLIC_HEADER mdbx.h) + target_compile_definitions(mdbx PRIVATE LIBMDBX_EXPORTS INTERFACE LIBMDBX_IMPORTS) + if(DEFINED INTERPROCEDURAL_OPTIMIZATION) + set_target_properties(mdbx PROPERTIES + INTERPROCEDURAL_OPTIMIZATION $<BOOL:${INTERPROCEDURAL_OPTIMIZATION}>) + endif() + target_link_libraries(mdbx PRIVATE ${CMAKE_THREAD_LIBS_INIT}) + if(WIN32) + target_link_libraries(mdbx PRIVATE ntdll.lib) + endif() + if(${CMAKE_SYSTEM_NAME} STREQUAL "SunOS" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Solaris") + target_link_libraries(mdbx PRIVATE kstat) + endif() +endif() + +# static library used for tools, to avoid rpath/dll-path troubles +add_library(mdbx-static STATIC mdbx.c mdbx.h) 
+set_target_properties(mdbx-static PROPERTIES + C_STANDARD ${MDBX_C_STANDARD} C_STANDARD_REQUIRED ON + PUBLIC_HEADER mdbx.h) +target_link_libraries(mdbx-static INTERFACE ${CMAKE_THREAD_LIBS_INIT}) +if(DEFINED INTERPROCEDURAL_OPTIMIZATION) + set_target_properties(mdbx-static PROPERTIES + INTERPROCEDURAL_OPTIMIZATION $<BOOL:${INTERPROCEDURAL_OPTIMIZATION}>) +endif() +if(WIN32) + target_link_libraries(mdbx-static INTERFACE ntdll.lib) +endif() +if(${CMAKE_SYSTEM_NAME} STREQUAL "SunOS" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Solaris") + target_link_libraries(mdbx-static INTERFACE kstat) +endif() + +# mdbx-tools +foreach(TOOL mdbx_chk mdbx_copy mdbx_stat mdbx_dump mdbx_load) + add_executable(${TOOL} ${TOOL}.c) + set_target_properties(${TOOL} PROPERTIES + C_STANDARD ${MDBX_C_STANDARD} C_STANDARD_REQUIRED ON) + if(DEFINED INTERPROCEDURAL_OPTIMIZATION) + set_target_properties(${TOOL} PROPERTIES + INTERPROCEDURAL_OPTIMIZATION $<BOOL:${INTERPROCEDURAL_OPTIMIZATION}>) + endif() + target_link_libraries(${TOOL} mdbx-static) +endforeach() + +# installation +if(MDBX_BUILD_SHARED_LIBRARY) + if(CMAKE_VERSION VERSION_LESS 3.12) + install(TARGETS mdbx EXPORT libmdbx + RUNTIME + DESTINATION bin + COMPONENT runtime + LIBRARY + DESTINATION lib + COMPONENT runtime + OBJECTS + DESTINATION lib + COMPONENT devel + ARCHIVE + DESTINATION lib + COMPONENT devel + PUBLIC_HEADER + DESTINATION include + COMPONENT devel + INCLUDES + DESTINATION include + COMPONENT devel) + else() + install(TARGETS mdbx EXPORT libmdbx + RUNTIME + DESTINATION bin + COMPONENT runtime + LIBRARY + DESTINATION lib + COMPONENT runtime + NAMELINK_COMPONENT devel + OBJECTS + DESTINATION lib + COMPONENT devel + ARCHIVE + DESTINATION lib + COMPONENT devel + PUBLIC_HEADER + DESTINATION include + COMPONENT devel + INCLUDES + DESTINATION include + COMPONENT devel) + endif() +endif() + +if(CMAKE_VERSION VERSION_LESS 3.12) + install(TARGETS mdbx-static EXPORT libmdbx EXCLUDE_FROM_ALL + RUNTIME + DESTINATION bin + COMPONENT runtime + 
LIBRARY + DESTINATION lib + COMPONENT runtime + OBJECTS + DESTINATION lib + COMPONENT devel + ARCHIVE + DESTINATION lib + COMPONENT devel + PUBLIC_HEADER + DESTINATION include + COMPONENT devel + INCLUDES + DESTINATION include + COMPONENT devel) +else() + install(TARGETS mdbx-static EXPORT libmdbx EXCLUDE_FROM_ALL + RUNTIME + DESTINATION bin + COMPONENT runtime + LIBRARY + DESTINATION lib + COMPONENT runtime + NAMELINK_COMPONENT devel + OBJECTS + DESTINATION lib + COMPONENT devel + ARCHIVE + DESTINATION lib + COMPONENT devel + PUBLIC_HEADER + DESTINATION include + COMPONENT devel + INCLUDES + DESTINATION include + COMPONENT devel) +endif() + +install( + TARGETS + mdbx_chk + mdbx_stat + mdbx_copy + mdbx_dump + mdbx_load + RUNTIME + DESTINATION bin + COMPONENT runtime) + +install( + FILES + man1/mdbx_chk.1 + man1/mdbx_stat.1 + man1/mdbx_copy.1 + man1/mdbx_dump.1 + man1/mdbx_load.1 + DESTINATION man/man1 + COMPONENT doc) + +cmake_policy(POP) diff --git a/libs/libmdbx/src/CMakeLists.txt b/libs/libmdbx/src/CMakeLists.txt new file mode 100644 index 0000000000..46f827ebc2 --- /dev/null +++ b/libs/libmdbx/src/CMakeLists.txt @@ -0,0 +1,346 @@ +## +## Copyright 2019 Leonid Yuriev <leo@yuriev.ru> +## and other libmdbx authors: please see AUTHORS file. +## All rights reserved. +## +## Redistribution and use in source and binary forms, with or without +## modification, are permitted only as authorized by the OpenLDAP +## Public License. +## +## A copy of this license is available in the file LICENSE in the +## top-level directory of the distribution or, alternatively, at +## <http://www.OpenLDAP.org/license.html>. +## + +## +## libmdbx = { Revised and extended descendant of Symas LMDB. } +## Please see README.md at https://github.com/leo-yuriev/libmdbx +## +## Libmdbx is superior to LMDB in terms of features and reliability, +## not inferior in performance. 
libmdbx works on Linux, FreeBSD, MacOS X +## and other systems compliant with POSIX.1-2008, but also support Windows +## as a complementary platform. +## +## The next version is under active non-public development and will be +## released as MithrilDB and libmithrildb for libraries & packages. +## Admittedly mythical Mithril is resembling silver but being stronger and +## lighter than steel. Therefore MithrilDB is rightly relevant name. +## +## MithrilDB will be radically different from libmdbx by the new database +## format and API based on C++17, as well as the Apache 2.0 License. +## The goal of this revolution is to provide a clearer and robust API, +## add more features and new valuable properties of database. +## +## The Future will (be) Positive. Всё будет хорошо. +## + +cmake_minimum_required(VERSION 3.8.2) +cmake_policy(PUSH) +cmake_policy(VERSION 3.8.2) +if(NOT CMAKE_VERSION VERSION_LESS 3.13) + cmake_policy(SET CMP0077 NEW) +endif() +if(NOT CMAKE_VERSION VERSION_LESS 3.12) + cmake_policy(SET CMP0075 NEW) +endif() +if(NOT CMAKE_VERSION VERSION_LESS 3.9) + cmake_policy(SET CMP0069 NEW) + include(CheckIPOSupported) + check_ipo_supported(RESULT CMAKE_INTERPROCEDURAL_OPTIMIZATION_AVAILABLE) +else() + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_AVAILABLE FALSE) +endif() + +if(DEFINED PROJECT_NAME) + set(SUBPROJECT ON) + set(NOT_SUBPROJECT OFF) + if(NOT DEFINED BUILD_TESTING) + set(BUILD_TESTING OFF) + endif() +else() + set(SUBPROJECT OFF) + set(NOT_SUBPROJECT ON) + project(libmdbx C CXX) + if(NOT DEFINED BUILD_TESTING) + set(BUILD_TESTING ON) + endif() +endif() + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE MinSizeRel CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." 
+ FORCE) +endif() + +macro(add_mdbx_option NAME DESCRIPTION DEFAULT) + list(APPEND MDBX_BUILD_OPTIONS ${NAME}) + if(NOT ${DEFAULT} STREQUAL "AUTO") + option(${NAME} "${DESCRIPTION}" ${DEFAULT}) + elseif(NOT DEFINED ${NAME}) + set(${NAME}_AUTO ON) + endif() +endmacro() + +# only for compatibility testing +# set(CMAKE_CXX_STANDARD 14) + +if(NOT "$ENV{TEAMCITY_PROCESS_FLOW_ID}" STREQUAL "") + set(CI TEAMCITY) + message(STATUS "TeamCity CI") +elseif(NOT "$ENV{TRAVIS}" STREQUAL "") + set(CI TRAVIS) + message(STATUS "Travis CI") +elseif(NOT "$ENV{CIRCLECI}" STREQUAL "") + set(CI CIRCLE) + message(STATUS "Circle CI") +elseif(NOT "$ENV{APPVEYOR}" STREQUAL "") + set(CI APPVEYOR) + message(STATUS "AppVeyor CI") +elseif(NOT "$ENV{CI}" STREQUAL "") + set(CI "$ENV{CI}") + message(STATUS "Other CI (${CI})") +else() + message(STATUS "Assume No any CI environment") + unset(CI) +endif() + +# output all mdbx-related targets in single directory +if(NOT DEFINED MDBX_OUTPUT_DIR) + set(MDBX_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) +endif() +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${MDBX_OUTPUT_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${MDBX_OUTPUT_DIR}) +set(CMAKE_PDB_OUTPUT_DIRECTORY ${MDBX_OUTPUT_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${MDBX_OUTPUT_DIR}) + +include(CheckLibraryExists) +include(CheckIncludeFiles) +include(CheckCCompilerFlag) +include(CheckSymbolExists) +include(CheckCSourceRuns) +include(CheckCXXSourceRuns) +include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) +include(TestBigEndian) +include(CheckFunctionExists) +include(FindPackageMessage) +include(CheckStructHasMember) +include(CMakeDependentOption) +include(GNUInstallDirs) + +if(CMAKE_C_COMPILER_ID STREQUAL "MSVC" AND MSVC_VERSION LESS 1900) + message(SEND_ERROR "MSVC compiler ${MSVC_VERSION} is too old for building MDBX." + " At least 'Microsoft Visual Studio 2015' is required.") +endif() + +# Set default build type to Release. This is to ease a User's life. 
+if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." + FORCE) +endif() +string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPERCASE) + +include(cmake/utils.cmake) +include(cmake/compiler.cmake) +include(cmake/profile.cmake) + +find_program(ECHO echo) +find_program(CAT cat) +find_program(GIT git) +find_program(LD ld) + +# CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) +# CHECK_INCLUDE_FILES(sys/uio.h HAVE_SYS_UIO_H) +# CHECK_INCLUDE_FILES(sys/stat.h HAVE_SYS_STAT_H) + +CHECK_FUNCTION_EXISTS(pow NOT_NEED_LIBM) +if(NOT_NEED_LIBM) + set(LIB_MATH "") +else() + set(CMAKE_REQUIRED_LIBRARIES m) + CHECK_FUNCTION_EXISTS(pow HAVE_LIBM) + if(HAVE_LIBM) + set(LIB_MATH m) + else() + message(FATAL_ERROR "No libm found for math support") + endif() +endif() + +find_package(Threads REQUIRED) + +if(SUBPROJECT) + if(NOT DEFINED BUILD_SHARED_LIBS) + option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)" OFF) + endif() + if(NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE) + option(CMAKE_POSITION_INDEPENDENT_CODE "Generate position independed (PIC)" ON) + endif() +else() + option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)" ON) + option(CMAKE_POSITION_INDEPENDENT_CODE "Generate position independed (PIC)" ON) + if (CC_HAS_ARCH_NATIVE) + option(BUILD_FOR_NATIVE_CPU "Generate code for the compiling machine CPU" OFF) + endif() + + if(CMAKE_CONFIGURATION_TYPES OR NOT CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") + set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT ON) + else() + set(INTERPROCEDURAL_OPTIMIZATION_DEFAULT OFF) + endif() + + if(CMAKE_INTERPROCEDURAL_OPTIMIZATION_AVAILABLE + OR GCC_LTO_AVAILABLE OR MSVC_LTO_AVAILABLE OR CLANG_LTO_AVAILABLE) + option(INTERPROCEDURAL_OPTIMIZATION "Enable interprocedural/LTO optimization" ${INTERPROCEDURAL_OPTIMIZATION_DEFAULT}) + endif() + + if(INTERPROCEDURAL_OPTIMIZATION) + if(GCC_LTO_AVAILABLE) + set(LTO_ENABLED TRUE) + set(CMAKE_AR ${CMAKE_GCC_AR} CACHE 
PATH "Path to ar program with LTO-plugin" FORCE) + set(CMAKE_NM ${CMAKE_GCC_NM} CACHE PATH "Path to nm program with LTO-plugin" FORCE) + set(CMAKE_RANLIB ${CMAKE_GCC_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE) + message(STATUS "MDBX indulge Link-Time Optimization by GCC") + elseif(CLANG_LTO_AVAILABLE) + set(LTO_ENABLED TRUE) + set(CMAKE_AR ${CMAKE_CLANG_AR} CACHE PATH "Path to ar program with LTO-plugin" FORCE) + set(CMAKE_NM ${CMAKE_CLANG_NM} CACHE PATH "Path to nm program with LTO-plugin" FORCE) + set(CMAKE_RANLIB ${CMAKE_CLANG_RANLIB} CACHE PATH "Path to ranlib program with LTO-plugin" FORCE) + message(STATUS "MDBX indulge Link-Time Optimization by CLANG") + elseif(MSVC_LTO_AVAILABLE) + set(LTO_ENABLED TRUE) + message(STATUS "MDBX indulge Link-Time Optimization by MSVC") + elseif(CMAKE_INTERPROCEDURAL_OPTIMIZATION_AVAILABLE) + message(STATUS "MDBX indulge Interprocedural Optimization by CMake") + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + set(LTO_ENABLED TRUE) + else() + message(WARNING "Unable to engage interprocedural/LTO optimization.") + endif() + else() + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE) + set(LTO_ENABLED FALSE) + endif() + + find_program(VALGRIND valgrind) + if(VALGRIND) + # LY: cmake is ugly and nasty. + # - therefore memcheck-options should be defined before including ctest; + # - otherwise ctest may ignore it. + set(MEMORYCHECK_SUPPRESSIONS_FILE + "${PROJECT_SOURCE_DIR}/test/valgrind_suppress.txt" + CACHE FILEPATH "Suppressions file for Valgrind" FORCE) + set(MEMORYCHECK_COMMAND_OPTIONS + "--trace-children=yes --leak-check=full --track-origins=yes --error-exitcode=42 --error-markers=@ --errors-for-leak-kinds=definite --fair-sched=yes --suppressions=${MEMORYCHECK_SUPPRESSIONS_FILE}" + CACHE STRING "Valgrind options" FORCE) + set(VALGRIND_COMMAND_OPTIONS "${MEMORYCHECK_COMMAND_OPTIONS}" CACHE STRING "Valgrind options" FORCE) + endif() + + # + # Enable 'make tags' target. 
+ find_program(CTAGS ctags) + if(CTAGS) + add_custom_target(tags COMMAND ${CTAGS} -R -f tags + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) + add_custom_target(ctags DEPENDS tags) + endif(CTAGS) + + # + # Enable 'make reformat' target. + find_program(CLANG_FORMAT + NAMES clang-format-6.0 clang-format-5.0 clang-format-4.0 + clang-format-3.9 clang-format-3.8 clang-format-3.7 clang-format) + if(CLANG_FORMAT AND UNIX) + add_custom_target(reformat + VERBATIM + COMMAND + git ls-files | + grep -E \\.\(c|cxx|cc|cpp|h|hxx|hpp\)\(\\.in\)?\$ | + xargs ${CLANG_FORMAT} -i --style=file + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) + endif() + + if(NOT "${PROJECT_BINARY_DIR}" STREQUAL "${PROJECT_SOURCE_DIR}") + add_custom_target(distclean) + add_custom_command(TARGET distclean + COMMAND ${CMAKE_COMMAND} -E remove_directory "${PROJECT_BINARY_DIR}" + COMMENT "Removing the build directory and its content") + elseif(IS_DIRECTORY .git AND GIT) + add_custom_target(distclean) + add_custom_command(TARGET distclean + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + COMMAND ${GIT} submodule foreach --recursive git clean -f -X -d + COMMAND ${GIT} clean -f -X -d + COMMENT "Removing all build files from the source directory") + endif() + + setup_compile_flags() +endif(SUBPROJECT) + +list(FIND CMAKE_C_COMPILE_FEATURES c_std_11 HAS_C11) +if(NOT HAS_C11 LESS 0) + set(MDBX_C_STANDARD 11) +else() + set(MDBX_C_STANDARD 99) +endif() +message(STATUS "Use C${MDBX_C_STANDARD} for libmdbx") + +################################################################################ +################################################################################ +# +# #### ##### ##### # #### # # #### +# # # # # # # # # ## # # +# # # # # # # # # # # # #### +# # # ##### # # # # # # # # +# # # # # # # # # ## # # +# #### # # # #### # # #### +# + +set(MDBX_BUILD_OPTIONS ENABLE_ASAN MDBX_USE_VALGRIND ENABLE_GPROF ENABLE_GCOV) +add_mdbx_option(MDBX_BUILD_SHARED_LIBRARY "Build libmdbx as shared library (DLL)" ${BUILD_SHARED_LIBS}) 
+add_mdbx_option(MDBX_ALLOY_BUILD "Build MDBX library as single object file" ON) +add_mdbx_option(MDBX_TXN_CHECKOWNER "Checking transaction matches the calling thread inside libmdbx's API" ON) +add_mdbx_option(MDBX_TXN_CHECKPID "Paranoid checking PID inside libmdbx's API" AUTO) +mark_as_advanced(MDBX_TXN_CHECKPID) +if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + add_mdbx_option(MDBX_DISABLE_GNU_SOURCE "Don't use nonstandard GNU/Linux extension functions" OFF) + mark_as_advanced(MDBX_DISABLE_GNU_SOURCE) +endif() +if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") + add_mdbx_option(MDBX_OSX_SPEED_INSTEADOF_DURABILITY "Disable use fcntl(F_FULLFSYNC) in favor of speed" OFF) + mark_as_advanced(MDBX_OSX_SPEED_INSTEADOF_DURABILITY) +endif() +if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + add_mdbx_option(MDBX_AVOID_CRT "Avoid dependence from MSVC CRT" ${NOT_SUBPROJECT}) + if(NOT MDBX_BUILD_SHARED_LIBRARY) + add_mdbx_option(MDBX_CONFIG_MANUAL_TLS_CALLBACK + "Provide mdbx_dll_handler() for manual initialization" OFF) + mark_as_advanced(MDBX_CONFIG_MANUAL_TLS_CALLBACK) + endif() +else() + add_mdbx_option(MDBX_USE_OFDLOCKS "Use Open file description locks (aka OFD locks, non-POSIX)" AUTO) + mark_as_advanced(MDBX_USE_OFDLOCKS) +endif() +add_mdbx_option(MDBX_LOCKING "Locking method (Win32=-1, SysV=5, POSIX=1988, POSIX=2001, POSIX=2008, Futexes=1995)" AUTO) +mark_as_advanced(MDBX_LOCKING) +add_mdbx_option(MDBX_TRUST_RTC "Does a system have battery-backed Real-Time Clock or just a fake." AUTO) +mark_as_advanced(MDBX_TRUST_RTC) +option(MDBX_ENABLE_TESTS "Build MDBX tests." 
${BUILD_TESTING}) + +################################################################################ +################################################################################ + +add_subdirectory(src) +if(MDBX_ENABLE_TESTS) + add_subdirectory(test) +endif() + +set(PACKAGE "libmdbx") +set(CPACK_PACKAGE_VERSION_MAJOR ${MDBX_VERSION_MAJOR}) +set(CPACK_PACKAGE_VERSION_MINOR ${MDBX_VERSION_MINOR}) +set(CPACK_PACKAGE_VERSION_PATCH ${MDBX_VERSION_RELEASE}) +set(CPACK_PACKAGE_VERSION_COMMIT ${MDBX_VERSION_REVISION}) +set(PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${CPACK_PACKAGE_VERSION_COMMIT}") +message(STATUS "libmdbx package version is ${PACKAGE_VERSION}") + +cmake_policy(POP) diff --git a/libs/libmdbx/src/GNUmakefile b/libs/libmdbx/src/GNUmakefile new file mode 100644 index 0000000000..6dff934609 --- /dev/null +++ b/libs/libmdbx/src/GNUmakefile @@ -0,0 +1,371 @@ +# This makefile is for GNU Make, and nowadays provided +# just for compatibility and preservation of traditions. +# Please use CMake in case of any difficulties or problems. +# +# Preprocessor macros (for MDBX_OPTIONS) of interest... +# Note that the defaults should already be correct for most platforms; +# you should not need to change any of these. Read their descriptions +# in README and source code if you do. There may be other macros of interest. +SHELL := env bash + +# install sandbox +SANDBOX ?= + +# install prefixes (inside sandbox) +prefix ?= /usr/local +mandir ?= $(prefix)/man + +# lib/bin suffix for multiarch/biarch, e.g. '.x86_64' +suffix ?= + +CC ?= gcc +LD ?= ld +MDBX_OPTIONS ?= -DNDEBUG=1 +CFLAGS ?= -O2 -g -Wall -Werror -Wextra -Wpedantic -ffunction-sections -fPIC -fvisibility=hidden -std=gnu11 -pthread -Wno-error=attributes +# -Wno-tautological-compare + +# HINT: Try append '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old. 
+LIBS ?= $(shell uname | grep -qi SunOS && echo "-lkstat") $(shell uname | grep -qi -e Darwin -e OpenBSD || echo "-lrt") + +LDFLAGS ?= $(shell $(LD) --help 2>/dev/null | grep -q -- --gc-sections && echo '-Wl,--gc-sections,-z,relro,-O1')$(shell $(LD) --help 2>/dev/null | grep -q -- -dead_strip && echo '-Wl,-dead_strip') +EXE_LDFLAGS ?= -pthread + +################################################################################ + +UNAME := $(shell uname -s 2>/dev/null || echo Unknown) +define uname2sosuffix + case "$(UNAME)" in + Darwin*|Mach*) echo dylib;; + CYGWIN*|MINGW*|MSYS*|Windows*) echo dll;; + *) echo so;; + esac +endef +SO_SUFFIX := $(shell $(uname2sosuffix)) + +HEADERS := mdbx.h +LIBRARIES := libmdbx.a libmdbx.$(SO_SUFFIX) +TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk +MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 mdbx_chk.1 + +.PHONY: mdbx all install clean test dist check + +all: $(LIBRARIES) $(TOOLS) + +mdbx: libmdbx.a libmdbx.$(SO_SUFFIX) + +tools: $(TOOLS) + +strip: all + strip libmdbx.$(SO_SUFFIX) $(TOOLS) + +clean: + rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *~ tmp.db/* \ + *.gcov *.log *.err src/*.o test/*.o mdbx_example dist \ + config.h src/elements/config.h src/elements/version.c *.tar* + +libmdbx.a: mdbx-static.o + $(AR) rs $@ $? + +libmdbx.$(SO_SUFFIX): mdbx-dylib.o + $(CC) $(CFLAGS) $^ -pthread -shared $(LDFLAGS) $(LIBS) -o $@ + +#> dist-cutoff-begin +ifeq ($(wildcard mdbx.c),mdbx.c) +#< dist-cutoff-end + +################################################################################ +# Amalgamated source code, i.e. 
distributed after `make dists` +MAN_SRCDIR := man1/ + +config.h: mdbx.c $(lastword $(MAKEFILE_LIST)) + (echo '#define MDBX_BUILD_TIMESTAMP "$(shell date +%Y-%m-%dT%H:%M:%S%z)"' \ + && echo '#define MDBX_BUILD_FLAGS "$(CFLAGS) $(LDFLAGS) $(LIBS)"' \ + && echo '#define MDBX_BUILD_COMPILER "$(shell set -o pipefail; $(CC) --version | head -1 || echo 'Please use GCC or CLANG compatible compiler')"' \ + && echo '#define MDBX_BUILD_TARGET "$(shell set -o pipefail; LC_ALL=C $(CC) -v 2>&1 | grep -i '^Target:' | cut -d ' ' -f 2- || echo 'Please use GCC or CLANG compatible compiler')"' \ + ) > $@ + +mdbx-dylib.o: config.h mdbx.c $(lastword $(MAKEFILE_LIST)) + $(CC) $(CFLAGS) $(MDBX_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -DLIBMDBX_EXPORTS=1 -c mdbx.c -o $@ + +mdbx-static.o: config.h mdbx.c $(lastword $(MAKEFILE_LIST)) + $(CC) $(CFLAGS) $(MDBX_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -ULIBMDBX_EXPORTS -c mdbx.c -o $@ + +mdbx_%: mdbx_%.c libmdbx.a + $(CC) $(CFLAGS) $(MDBX_OPTIONS) '-DMDBX_CONFIG_H="config.h"' $^ $(EXE_LDFLAGS) $(LIBS) -o $@ + +#> dist-cutoff-begin +else +################################################################################ +# Plain (non-amalgamated) sources with test + +define uname2osal + case "$(UNAME)" in + CYGWIN*|MINGW*|MSYS*|Windows*) echo windows;; + *) echo unix;; + esac +endef + +define uname2titer + case "$(UNAME)" in + Darwin*|Mach*) echo 2;; + *) echo 12;; + esac +endef + +DIST_EXTRA := LICENSE README.md CMakeLists.txt GNUmakefile Makefile $(addprefix man1/, $(MANPAGES)) +DIST_SRC := mdbx.h mdbx.c $(addsuffix .c, $(TOOLS)) + +TEST_DB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.db +TEST_LOG ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.log +TEST_OSAL := $(shell $(uname2osal)) +TEST_ITER := $(shell $(uname2titer)) +TEST_SRC := test/osal-$(TEST_OSAL).cc $(filter-out $(wildcard test/osal-*.cc), $(wildcard test/*.cc)) +TEST_INC := $(wildcard test/*.h) +TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC)) +CXX 
?= g++ +# Probe the compiler with the same standard that is selected on success: use C++17 when supported, otherwise fall back to C++11. +CXXSTD ?= $(shell $(CXX) -std=c++17 -c test/test.cc -o /dev/null 2>/dev/null && echo -std=c++17 || echo -std=c++11) +CXXFLAGS := $(CXXSTD) $(filter-out -std=gnu11,$(CFLAGS)) + +MAN_SRCDIR := src/man1/ +ALLOY_DEPS := $(wildcard src/elements/*) +MDBX_VERSION_GIT = ${shell set -o pipefail; git describe --tags | sed -n 's|^v*\([0-9]\{1,\}\.[0-9]\{1,\}\.[0-9]\{1,\}\)\(.*\)|\1|p' || echo 'Please fetch tags and/or install latest git version'} +MDBX_GIT_TIMESTAMP = $(shell git show --no-patch --format=%cI HEAD || echo 'Please install latest git version') +MDBX_GIT_DESCRIBE = $(shell git describe --tags --long --dirty=-dirty || echo 'Please fetch tags and/or install latest git version') +MDBX_VERSION_SUFFIX = $(shell set -o pipefail; echo -n '$(MDBX_GIT_DESCRIBE)' | tr -c -s '[a-zA-Z0-9]' _) +MDBX_BUILD_SOURCERY = $(shell set -o pipefail; $(MAKE) -s src/elements/version.c && (openssl dgst -r -sha256 src/elements/version.c || sha256sum src/elements/version.c || shasum -a 256 src/elements/version.c) 2>/dev/null | cut -d ' ' -f 1 || echo 'Please install openssl or sha256sum or shasum')_$(MDBX_VERSION_SUFFIX) + +check: test mdbx_example dist + +test: all mdbx_example mdbx_test + rm -f $(TEST_DB) $(TEST_LOG) && (set -o pipefail; \ + (./mdbx_test --progress --console=no --repeat=$(TEST_ITER) --pathname=$(TEST_DB) --dont-cleanup-after basic && \ + ./mdbx_test --mode=-writemap,-mapasync,-lifo --progress --console=no --repeat=12 --pathname=$(TEST_DB) --dont-cleanup-after basic) \ + | tee -a $(TEST_LOG) | tail -n 42) \ + && ./mdbx_chk -vvn $(TEST_DB) && ./mdbx_chk -vvn $(TEST_DB)-copy + +mdbx_example: mdbx.h example/example-mdbx.c libmdbx.$(SO_SUFFIX) + $(CC) $(CFLAGS) -I. 
example/example-mdbx.c ./libmdbx.$(SO_SUFFIX) -o $@ + +test-singleprocess: all mdbx_test + rm -f $(TEST_DB) $(TEST_LOG) && (set -o pipefail; \ + (./mdbx_test --progress --console=no --repeat=42 --pathname=$(TEST_DB) --dont-cleanup-after --hill && \ + ./mdbx_test --progress --console=no --repeat=2 --pathname=$(TEST_DB) --dont-cleanup-before --dont-cleanup-after --copy && \ + ./mdbx_test --mode=-writemap,-mapasync,-lifo --progress --console=no --repeat=42 --pathname=$(TEST_DB) --dont-cleanup-after --nested) \ + | tee -a $(TEST_LOG) | tail -n 42) \ + && ./mdbx_chk -vvn $(TEST_DB) && ./mdbx_chk -vvn $(TEST_DB)-copy + +test-fault: all mdbx_test + rm -f $(TEST_DB) $(TEST_LOG) && (set -o pipefail; ./mdbx_test --progress --console=no --pathname=$(TEST_DB) --inject-writefault=42 --dump-config --dont-cleanup-after basic | tee -a $(TEST_LOG) | tail -n 42) \ + ; ./mdbx_chk -vvnw $(TEST_DB) && ([ ! -e $(TEST_DB)-copy ] || ./mdbx_chk -vvn $(TEST_DB)-copy) + +VALGRIND=valgrind --trace-children=yes --log-file=valgrind-%p.log --leak-check=full --track-origins=yes --error-exitcode=42 --suppressions=test/valgrind_suppress.txt +memcheck test-valgrind: all mdbx_test + @echo "$(MDBX_OPTIONS)" | grep -q MDBX_USE_VALGRIND || echo "WARNING: Please build libmdbx with -DMDBX_USE_VALGRIND to avoid false-positives from Valgrind !!!" 
>&2 + rm -f valgrind-*.log $(TEST_DB) $(TEST_LOG) && (set -o pipefail; \ + ($(VALGRIND) ./mdbx_test --mode=-writemap,-mapasync,-lifo --progress --console=no --repeat=4 --pathname=$(TEST_DB) --dont-cleanup-after basic && \ + $(VALGRIND) ./mdbx_test --progress --console=no --pathname=$(TEST_DB) --dont-cleanup-before --dont-cleanup-after --copy && \ + $(VALGRIND) ./mdbx_test --progress --console=no --repeat=2 --pathname=$(TEST_DB) --dont-cleanup-after basic) \ + | tee -a $(TEST_LOG) | tail -n 42) \ + && $(VALGRIND) ./mdbx_chk -vvn $(TEST_DB) && ./mdbx_chk -vvn $(TEST_DB)-copy + +define test-rule +$(patsubst %.cc,%.o,$(1)): $(1) $(TEST_INC) mdbx.h $(lastword $(MAKEFILE_LIST)) + $(CXX) $(CXXFLAGS) $(MDBX_OPTIONS) -c $(1) -o $$@ + +endef +$(foreach file,$(TEST_SRC),$(eval $(call test-rule,$(file)))) + +mdbx_%: src/tools/mdbx_%.c libmdbx.a + $(CC) $(CFLAGS) $(MDBX_OPTIONS) '-DMDBX_CONFIG_H="config.h"' $^ $(EXE_LDFLAGS) $(LIBS) -o $@ + +mdbx_test: $(TEST_OBJ) libmdbx.$(SO_SUFFIX) + $(CXX) $(CXXFLAGS) $(TEST_OBJ) -Wl,-rpath . -L . -l mdbx $(EXE_LDFLAGS) $(LIBS) -o $@ + +git_DIR := $(shell if [ -d .git ]; then echo .git; elif [ -s .git -a -f .git ]; then grep '^gitdir: ' .git | cut -d ':' -f 2; else echo "Please use libmdbx as a git-submodule or the amalgamated source code" >&2 && echo git_directory; fi) + +src/elements/version.c: src/elements/version.c.in $(lastword $(MAKEFILE_LIST)) $(git_DIR)/HEAD $(git_DIR)/index $(git_DIR)/refs/tags + sed \ + -e "s|@MDBX_GIT_TIMESTAMP@|$(MDBX_GIT_TIMESTAMP)|" \ + -e "s|@MDBX_GIT_TREE@|$(shell git show --no-patch --format=%T HEAD || echo 'Please install latest git version')|" \ + -e "s|@MDBX_GIT_COMMIT@|$(shell git show --no-patch --format=%H HEAD || echo 'Please install latest git version')|" \ + -e "s|@MDBX_GIT_DESCRIBE@|$(MDBX_GIT_DESCRIBE)|" \ + -e "s|\$${MDBX_VERSION_MAJOR}|$(shell echo '$(MDBX_VERSION_GIT)' | cut -d . -f 1)|" \ + -e "s|\$${MDBX_VERSION_MINOR}|$(shell echo '$(MDBX_VERSION_GIT)' | cut -d . 
-f 2)|" \ + -e "s|\$${MDBX_VERSION_RELEASE}|$(shell echo '$(MDBX_VERSION_GIT)' | cut -d . -f 3)|" \ + -e "s|\$${MDBX_VERSION_REVISION}|$(shell git rev-list --count --no-merges HEAD || echo 'Please fetch tags and/or install latest git version')|" \ + src/elements/version.c.in > $@ + +src/elements/config.h: src/elements/version.c $(lastword $(MAKEFILE_LIST)) + (echo '#define MDBX_BUILD_TIMESTAMP "$(shell date +%Y-%m-%dT%H:%M:%S%z)"' \ + && echo '#define MDBX_BUILD_FLAGS "$(CFLAGS) $(LDFLAGS) $(LIBS)"' \ + && echo '#define MDBX_BUILD_COMPILER "$(shell set -o pipefail; $(CC) --version | head -1 || echo 'Please use GCC or CLANG compatible compiler')"' \ + && echo '#define MDBX_BUILD_TARGET "$(shell set -o pipefail; LC_ALL=C $(CC) -v 2>&1 | grep -i '^Target:' | cut -d ' ' -f 2- || echo 'Please use GCC or CLANG compatible compiler')"' \ + && echo '#define MDBX_BUILD_SOURCERY $(MDBX_BUILD_SOURCERY)' \ + ) > $@ + +mdbx-dylib.o: src/elements/config.h src/elements/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST)) + $(CC) $(CFLAGS) $(MDBX_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -DLIBMDBX_EXPORTS=1 -c src/alloy.c -o $@ + +mdbx-static.o: src/elements/config.h src/elements/version.c src/alloy.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST)) + $(CC) $(CFLAGS) $(MDBX_OPTIONS) '-DMDBX_CONFIG_H="config.h"' -ULIBMDBX_EXPORTS -c src/alloy.c -o $@ + +.PHONY: dist +dist: libmdbx-sources-$(MDBX_VERSION_SUFFIX).tar.gz $(lastword $(MAKEFILE_LIST)) + +libmdbx-sources-$(MDBX_VERSION_SUFFIX).tar.gz: $(addprefix dist/, $(DIST_SRC) $(DIST_EXTRA)) $(addprefix dist/man1/,$(MANPAGES)) + tar -c --owner=0 --group=0 -C dist $(DIST_SRC) $(DIST_EXTRA) -f - | gzip -c > $@ \ + && rm dist/@tmp-shared_internals.inc + +dist/mdbx.h: mdbx.h src/elements/version.c $(lastword $(MAKEFILE_LIST)) + mkdir -p dist && cp $< $@ + +dist/Makefile: Makefile + mkdir -p dist && cp $< $@ + +dist/GNUmakefile: GNUmakefile + mkdir -p dist && sed -e '/^#> dist-cutoff-begin/,/^#< dist-cutoff-end/d' $< > $@ + 
+dist/@tmp-shared_internals.inc: src/elements/version.c $(ALLOY_DEPS) $(lastword $(MAKEFILE_LIST)) + mkdir -p dist && sed \ + -e 's|#pragma once|#define MDBX_ALLOY 1\n#define MDBX_BUILD_SOURCERY $(MDBX_BUILD_SOURCERY)|' \ + -e 's|#include "../../mdbx.h"|@INCLUDE "mdbx.h"|' \ + -e '/#include "defs.h"/r src/elements/defs.h' \ + -e '/#include "osal.h"/r src/elements/osal.h' \ + -e '/#include "options.h"/r src/elements/options.h' \ + src/elements/internals.h > $@ + +dist/mdbx.c: dist/@tmp-shared_internals.inc $(lastword $(MAKEFILE_LIST)) + mkdir -p dist && (cat dist/@tmp-shared_internals.inc \ + && cat src/elements/core.c src/elements/osal.c src/elements/version.c \ + && echo '#if defined(_WIN32) || defined(_WIN64)' \ + && cat src/elements/lck-windows.c && echo '#else /* LCK-implementation */' \ + && cat src/elements/lck-posix.c && echo '#endif /* LCK-implementation */' \ + ) | grep -v -e '#include "' -e '#pragma once' | sed 's|@INCLUDE|#include|' > $@ + +define dist-tool-rule +dist/$(1).c: src/tools/$(1).c src/tools/wingetopt.h src/tools/wingetopt.c \ + dist/@tmp-shared_internals.inc $(lastword $(MAKEFILE_LIST)) + mkdir -p dist && sed \ + -e '/#include "..\/elements\/internals.h"/r dist/@tmp-shared_internals.inc' \ + -e '/#include "wingetopt.h"/r src/tools/wingetopt.c' \ + src/tools/$(1).c \ + | grep -v -e '#include "' -e '#pragma once' -e '#define MDBX_ALLOY' \ + | sed 's|@INCLUDE|#include|' > $$@ + +endef +$(foreach file,$(TOOLS),$(eval $(call dist-tool-rule,$(file)))) + +dist/man1/mdbx_%.1: src/man1/mdbx_%.1 + mkdir -p dist/man1/ && cp $< $@ +dist/LICENSE: LICENSE + mkdir -p dist/man1/ && cp $< $@ +dist/README.md: README.md + mkdir -p dist/man1/ && cp $< $@ +dist/CMakeLists.txt: CMakeLists.dist-minimal + mkdir -p dist/man1/ && cp $< $@ +endif + +################################################################################ +# Cross-compilation simple test + +CROSS_LIST = mips-linux-gnu-gcc \ + powerpc64-linux-gnu-gcc powerpc-linux-gnu-gcc \ + 
arm-linux-gnueabihf-gcc aarch64-linux-gnu-gcc \ + sh4-linux-gnu-gcc mips64-linux-gnuabi64-gcc + +# hppa-linux-gnu-gcc - don't supported by current qemu release +# s390x-linux-gnu-gcc - qemu troubles (hang/abort) +# sparc64-linux-gnu-gcc - qemu troubles (fcntl for F_SETLK/F_GETLK) +# alpha-linux-gnu-gcc - qemu (or gcc) troubles (coredump) + +CROSS_LIST_NOQEMU = hppa-linux-gnu-gcc s390x-linux-gnu-gcc \ + sparc64-linux-gnu-gcc alpha-linux-gnu-gcc + +cross-gcc: + @echo "CORRESPONDING CROSS-COMPILERs ARE REQUIRED." + @echo "FOR INSTANCE: apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu" + @for CC in $(CROSS_LIST_NOQEMU) $(CROSS_LIST); do \ + echo "===================== $$CC"; \ + $(MAKE) clean && CC=$$CC CXX=$$(echo $$CC | sed 's/-gcc/-g++/') EXE_LDFLAGS=-static $(MAKE) all || exit $$?; \ + done + +# Unfortunately qemu don't provide robust support for futexes. +# Therefore it is impossible to run full multi-process tests. +cross-qemu: + @echo "CORRESPONDING CROSS-COMPILERs AND QEMUs ARE REQUIRED." 
+ @echo "FOR INSTANCE: " + @echo " 1) apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu" + @echo " 2) apt install binfmt-support qemu-user-static qemu-user qemu-system-arm qemu-system-mips qemu-system-misc qemu-system-ppc qemu-system-sparc" + @for CC in $(CROSS_LIST); do \ + echo "===================== $$CC + qemu"; \ + $(MAKE) clean && \ + CC=$$CC CXX=$$(echo $$CC | sed 's/-gcc/-g++/') EXE_LDFLAGS=-static MDBX_OPTIONS="-DMDBX_SAFE4QEMU $(MDBX_OPTIONS)" \ + $(MAKE) test-singleprocess || exit $$?; \ + done + +#< dist-cutoff-end +install: $(LIBRARIES) $(TOOLS) $(HEADERS) + mkdir -p $(SANDBOX)$(prefix)/bin$(suffix) \ + && cp -t $(SANDBOX)$(prefix)/bin$(suffix) $(TOOLS) && \ + mkdir -p $(SANDBOX)$(prefix)/lib$(suffix) \ + && cp -t $(SANDBOX)$(prefix)/lib$(suffix) $(LIBRARIES) && \ + mkdir -p $(SANDBOX)$(prefix)/include \ + && cp -t $(SANDBOX)$(prefix)/include $(HEADERS) && \ + mkdir -p $(SANDBOX)$(mandir)/man1 \ + && cp -t $(SANDBOX)$(mandir)/man1 $(addprefix $(MAN_SRCDIR), $(MANPAGES)) + +################################################################################ +# Benchmarking by ioarena + +IOARENA ?= $(shell \ + (test -x ../ioarena/@BUILD/src/ioarena && echo ../ioarena/@BUILD/src/ioarena) || \ + (test -x ../../@BUILD/src/ioarena && echo ../../@BUILD/src/ioarena) || \ + (test -x ../../src/ioarena && echo ../../src/ioarena) || which ioarena) +NN ?= 25000000 + +ifneq ($(wildcard $(IOARENA)),) + +.PHONY: bench clean-bench re-bench + +clean-bench: + rm -rf bench-*.txt _ioarena/* + +re-bench: clean-bench bench + +define bench-rule +bench-$(1)_$(2).txt: $(3) $(IOARENA) $(lastword $(MAKEFILE_LIST)) + LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}" \ + $(IOARENA) -D $(1) -B crud -m nosync -n $(2) \ + | tee $$@ | grep throughput && \ + LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}" \ 
+ $(IOARENA) -D $(1) -B get,iterate -m sync -r 4 -n $(2) \ + | tee -a $$@ | grep throughput \ + || mv -f $$@ $$@.error + +endef + +$(eval $(call bench-rule,mdbx,$(NN),libmdbx.$(SO_SUFFIX))) + +$(eval $(call bench-rule,sophia,$(NN))) +$(eval $(call bench-rule,leveldb,$(NN))) +$(eval $(call bench-rule,rocksdb,$(NN))) +$(eval $(call bench-rule,wiredtiger,$(NN))) +$(eval $(call bench-rule,forestdb,$(NN))) +$(eval $(call bench-rule,lmdb,$(NN))) +$(eval $(call bench-rule,nessdb,$(NN))) +$(eval $(call bench-rule,sqlite3,$(NN))) +$(eval $(call bench-rule,ejdb,$(NN))) +$(eval $(call bench-rule,vedisdb,$(NN))) +$(eval $(call bench-rule,dummy,$(NN))) + +$(eval $(call bench-rule,debug,10)) + +bench: bench-mdbx_$(NN).txt + +.PHONY: bench-debug + +bench-debug: bench-debug_10.txt + +bench-quartet: bench-mdbx_$(NN).txt bench-lmdb_$(NN).txt bench-rocksdb_$(NN).txt bench-wiredtiger_$(NN).txt + +endif diff --git a/libs/libmdbx/src/Makefile b/libs/libmdbx/src/Makefile index 2d261bc7c3..0595d23524 100644 --- a/libs/libmdbx/src/Makefile +++ b/libs/libmdbx/src/Makefile @@ -1,270 +1,5 @@ -# GNU Makefile for libmdbx, https://abf.io/erthink/libmdbx - -######################################################################## -# Configuration. The compiler options must enable threaded compilation. -# -# Preprocessor macros (for XCFLAGS) of interest... -# Note that the defaults should already be correct for most -# platforms; you should not need to change any of these. -# Read their descriptions in mdb.c if you do. There may be -# other macros of interest. You should read mdb.c -# before changing any of them. -# - -# install sandbox -SANDBOX ?= - -# install prefixes (inside sandbox) -prefix ?= /usr/local -mandir ?= $(prefix)/man - -# lib/bin suffix for multiarch/biarch, e.g. 
'.x86_64' -suffix ?= - -CC ?= gcc -CXX ?= g++ -LD ?= ld -CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden - -XCFLAGS ?= -DNDEBUG=1 -DLIBMDBX_EXPORTS=1 -CFLAGS += -D_GNU_SOURCE=1 -std=gnu11 -pthread $(XCFLAGS) -CXXFLAGS = -std=c++11 $(filter-out -std=gnu11,$(CFLAGS)) -TESTDB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.db -TESTLOG ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.log - -# LY: '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old -LDFLAGS ?= $(shell $(LD) --help 2>/dev/null | grep -q -- --gc-sections && echo '-Wl,--gc-sections,-z,relro,-O1')$(shell $(LD) --help 2>/dev/null | grep -q -- -dead_strip && echo '-Wl,-dead_strip') -EXE_LDFLAGS ?= -pthread - -# LY: just for benchmarking -IOARENA ?= $(shell \ - (test -x ../ioarena/@BUILD/src/ioarena && echo ../ioarena/@BUILD/src/ioarena) || \ - (test -x ../../@BUILD/src/ioarena && echo ../../@BUILD/src/ioarena) || \ - (test -x ../../src/ioarena && echo ../../src/ioarena) || which ioarena) -NN ?= 25000000 - -######################################################################## - -ifdef MSVC - UNAME := Windows - LCK_IMPL := windows - TEST_OSAL := windows - TEST_ITER := 42 -else - UNAME := $(shell uname -s 2>/dev/null || echo Unknown) - define uname2lck - case "$(UNAME)" in - Linux) echo linux;; - CYGWIN*|MINGW*|MSYS*|Windows*) echo windows;; - *) echo posix;; - esac - endef - define uname2osal - case "$(UNAME)" in - CYGWIN*|MINGW*|MSYS*|Windows*) echo windows;; - *) echo unix;; - esac - endef - define uname2titer - case "$(UNAME)" in - Darwin*|Mach*) echo 3;; - *) echo 42;; - esac - endef - define uname2suffix - case "$(UNAME)" in - Darwin*|Mach*) echo dylib;; - CYGWIN*|MINGW*|MSYS*|Windows*) echo dll;; - *) echo so;; - esac - endef - LCK_IMPL := $(shell $(uname2lck)) - TEST_OSAL := $(shell $(uname2osal)) - TEST_ITER := $(shell $(uname2titer)) - SO_SUFFIX := $(shell $(uname2suffix)) -endif - 
-HEADERS := mdbx.h -LIBRARIES := libmdbx.a libmdbx.$(SO_SUFFIX) -TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk -MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 -SHELL := /bin/bash - -CORE_SRC := src/lck-$(LCK_IMPL).c $(filter-out $(wildcard src/lck-*.c), $(wildcard src/*.c)) -CORE_INC := $(wildcard src/*.h) -CORE_OBJ := $(patsubst %.c,%.o,$(CORE_SRC)) -TEST_SRC := test/osal-$(TEST_OSAL).cc $(filter-out $(wildcard test/osal-*.cc), $(wildcard test/*.cc)) -TEST_INC := $(wildcard test/*.h) -TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC)) - -.PHONY: mdbx all install clean check coverage - -all: $(LIBRARIES) $(TOOLS) mdbx_test example - -mdbx: libmdbx.a libmdbx.$(SO_SUFFIX) - -example: mdbx.h tutorial/sample-mdbx.c libmdbx.$(SO_SUFFIX) - $(CC) $(CFLAGS) -I. tutorial/sample-mdbx.c ./libmdbx.$(SO_SUFFIX) -o example - -tools: $(TOOLS) - -install: $(LIBRARIES) $(TOOLS) $(HEADERS) - mkdir -p $(SANDBOX)$(prefix)/bin$(suffix) \ - && cp -t $(SANDBOX)$(prefix)/bin$(suffix) $(TOOLS) && \ - mkdir -p $(SANDBOX)$(prefix)/lib$(suffix) \ - && cp -t $(SANDBOX)$(prefix)/lib$(suffix) $(LIBRARIES) && \ - mkdir -p $(SANDBOX)$(prefix)/include \ - && cp -t $(SANDBOX)$(prefix)/include $(HEADERS) && \ - mkdir -p $(SANDBOX)$(mandir)/man1 \ - && cp -t $(SANDBOX)$(mandir)/man1 $(MANPAGES) - -clean: - rm -rf $(TOOLS) mdbx_test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err src/*.o test/*.o - -check: all - rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --repeat=$(TEST_ITER) --pathname=$(TESTDB) --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \ - && ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn $(TESTDB)-copy - -check-singleprocess: all - rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; \ - ./mdbx_test --repeat=4 --pathname=$(TESTDB) --dont-cleanup-after --hill && \ - ./mdbx_test --repeat=2 --pathname=$(TESTDB) --dont-cleanup-before --dont-cleanup-after --copy \ - | tee -a $(TESTLOG) | tail -n 42) \ - && ./mdbx_chk -vvn $(TESTDB) && ./mdbx_chk -vvn 
$(TESTDB)-copy - -check-fault: all - rm -f $(TESTDB) $(TESTLOG) && (set -o pipefail; ./mdbx_test --pathname=$(TESTDB) --inject-writefault=42 --dump-config --dont-cleanup-after basic | tee -a $(TESTLOG) | tail -n 42) \ - ; ./mdbx_chk -vvnw $(TESTDB) && ([ ! -e $(TESTDB)-copy ] || ./mdbx_chk -vvn $(TESTDB)-copy) - -define core-rule -$(patsubst %.c,%.o,$(1)): $(1) $(CORE_INC) mdbx.h Makefile - $(CC) $(CFLAGS) -c $(1) -o $$@ - -endef -$(foreach file,$(CORE_SRC),$(eval $(call core-rule,$(file)))) - -define test-rule -$(patsubst %.cc,%.o,$(1)): $(1) $(TEST_INC) mdbx.h Makefile - $(CXX) $(CXXFLAGS) -c $(1) -o $$@ - -endef -$(foreach file,$(TEST_SRC),$(eval $(call test-rule,$(file)))) - -libmdbx.a: $(CORE_OBJ) - $(AR) rs $@ $? - -libmdbx.$(SO_SUFFIX): $(CORE_OBJ) - $(CC) $(CFLAGS) -save-temps $^ -pthread -shared $(LDFLAGS) -o $@ - -mdbx_%: src/tools/mdbx_%.c libmdbx.a - $(CC) $(CFLAGS) $^ $(EXE_LDFLAGS) -o $@ - -mdbx_test: $(TEST_OBJ) libmdbx.$(SO_SUFFIX) - $(CXX) $(CXXFLAGS) $(TEST_OBJ) -Wl,-rpath . -L . 
-l mdbx $(EXE_LDFLAGS) -o $@ - -############################################################################### - -ifneq ($(wildcard $(IOARENA)),) - -.PHONY: bench clean-bench re-bench - -clean-bench: - rm -rf bench-*.txt _ioarena/* - -re-bench: clean-bench bench - -define bench-rule -bench-$(1)_$(2).txt: $(3) $(IOARENA) Makefile - LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}" \ - $(IOARENA) -D $(1) -B crud -m nosync -n $(2) \ - | tee $$@ | grep throughput && \ - LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}" \ - $(IOARENA) -D $(1) -B get,iterate -m sync -r 4 -n $(2) \ - | tee -a $$@ | grep throughput \ - || mv -f $$@ $$@.error - -endef - -$(eval $(call bench-rule,mdbx,$(NN),libmdbx.$(SO_SUFFIX))) - -$(eval $(call bench-rule,sophia,$(NN))) -$(eval $(call bench-rule,leveldb,$(NN))) -$(eval $(call bench-rule,rocksdb,$(NN))) -$(eval $(call bench-rule,wiredtiger,$(NN))) -$(eval $(call bench-rule,forestdb,$(NN))) -$(eval $(call bench-rule,lmdb,$(NN))) -$(eval $(call bench-rule,nessdb,$(NN))) -$(eval $(call bench-rule,sqlite3,$(NN))) -$(eval $(call bench-rule,ejdb,$(NN))) -$(eval $(call bench-rule,vedisdb,$(NN))) -$(eval $(call bench-rule,dummy,$(NN))) - -$(eval $(call bench-rule,debug,10)) - -bench: bench-mdbx_$(NN).txt - -.PHONY: bench-debug - -bench-debug: bench-debug_10.txt - -bench-quartet: bench-mdbx_$(NN).txt bench-lmdb_$(NN).txt bench-rocksdb_$(NN).txt bench-wiredtiger_$(NN).txt - -endif - -############################################################################### - -ci-rule = ( CC=$$(which $1); if [ -n "$$CC" ]; then \ - echo -n "probe by $2 ($$(readlink -f $$(which $$CC))): " && \ - $(MAKE) clean >$1.log 2>$1.err && \ - $(MAKE) CC=$$(readlink -f $$CC) XCFLAGS="-UNDEBUG -DMDBX_DEBUG=2 -DLIBMDBX_EXPORTS=1" check 1>$1.log 2>$1.err && echo "OK" \ - || ( echo "FAILED"; cat $1.err >&2; exit 1 ); \ - else echo "no $2 ($1) for probe"; fi; ) -ci: - @if [ "$$(readlink -f $$(which $(CC)))" != "$$(readlink -f $$(which gcc || echo /bin/false))" -a \ - "$$(readlink -f $$(which 
$(CC)))" != "$$(readlink -f $$(which clang || echo /bin/false))" -a \ - "$$(readlink -f $$(which $(CC)))" != "$$(readlink -f $$(which icc || echo /bin/false))" ]; then \ - $(call ci-rule,$(CC),default C compiler); \ - fi - @$(call ci-rule,gcc,GCC) - @$(call ci-rule,clang,clang LLVM) - @$(call ci-rule,icc,Intel C) - -############################################################################### - -CROSS_LIST = mips-linux-gnu-gcc \ - powerpc64-linux-gnu-gcc powerpc-linux-gnu-gcc \ - arm-linux-gnueabihf-gcc aarch64-linux-gnu-gcc - -# hppa-linux-gnu-gcc - don't supported by current qemu release -# s390x-linux-gnu-gcc - qemu troubles (hang/abort) -# sh4-linux-gnu-gcc - qemu troubles (pread syscall, etc) -# mips64-linux-gnuabi64-gcc - qemu troubles (pread syscall, etc) -# sparc64-linux-gnu-gcc - qemu troubles (fcntl for F_SETLK/F_GETLK) -# alpha-linux-gnu-gcc - qemu (or gcc) troubles (coredump) - -CROSS_LIST_NOQEMU = hppa-linux-gnu-gcc s390x-linux-gnu-gcc \ - sh4-linux-gnu-gcc mips64-linux-gnuabi64-gcc \ - sparc64-linux-gnu-gcc alpha-linux-gnu-gcc - -cross-gcc: - @echo "CORRESPONDING CROSS-COMPILERs ARE REQUIRED." - @echo "FOR INSTANCE: apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu" - @for CC in $(CROSS_LIST_NOQEMU) $(CROSS_LIST); do \ - echo "===================== $$CC"; \ - $(MAKE) clean && CC=$$CC CXX=$$(echo $$CC | sed 's/-gcc/-g++/') EXE_LDFLAGS=-static $(MAKE) all || exit $$?; \ - done - -# -# Unfortunately qemu don't provide robust support for futexes. -# Therefore it is impossible to run full multi-process tests. -cross-qemu: - @echo "CORRESPONDING CROSS-COMPILERs AND QEMUs ARE REQUIRED." 
- @echo "FOR INSTANCE: " - @echo " 1) apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu" - @echo " 2) apt install binfmt-support qemu-user-static qemu-user qemu-system-arm qemu-system-mips qemu-system-misc qemu-system-ppc qemu-system-sparc" - @for CC in $(CROSS_LIST); do \ - echo "===================== $$CC + qemu"; \ - $(MAKE) clean && \ - CC=$$CC CXX=$$(echo $$CC | sed 's/-gcc/-g++/') EXE_LDFLAGS=-static XCFLAGS="-DMDBX_SAFE4QEMU $(XCFLAGS)" \ - $(MAKE) check-singleprocess || exit $$?; \ - done +all install mdbx tools strip clean test check dist test-singleprocess test-fault memcheck test-valgrind cross-gcc cross-qemu bench bench-quartet clean-bench: + @CC=$(CC) \ + CXX=`if test -n "$(CXX)" && which "$(CXX)" > /dev/null; then echo "$(CXX)"; elif test -n "$(CCC)" && which "$(CCC)" > /dev/null; then echo "$(CCC)"; else echo "c++"; fi` \ + `which gmake || which gnumake || echo 'echo "GNU Make is required"; exit 2;'` \ + $(MAKEFLAGS) -f GNUmakefile $@ diff --git a/libs/libmdbx/src/README-RU.md b/libs/libmdbx/src/README-RU.md deleted file mode 100644 index e040b5c4bb..0000000000 --- a/libs/libmdbx/src/README-RU.md +++ /dev/null @@ -1,830 +0,0 @@ -### The [repository now only mirrored on the Github](https://abf.io/erthink/libmdbx) due to illegal discriminatory restrictions for Russian Crimea and for sovereign crimeans. -<!-- Required extensions: pymdownx.betterem, pymdownx.tilde, pymdownx.emoji, pymdownx.tasklist, pymdownx.superfences --> ------ - -libmdbx -====================================== -Доработанный и расширенный потомок [Lightning Memory-Mapped Database](https://ru.bmstu.wiki/LMDB_(Lightning_Memory-Mapped_Database)) (aka _LMDB_). -English version is [here](README.md). - -_libmdbx_ превосходит LMDB по возможностям и надежности, не уступая в -производительности. 
_libmdbx_ работает на Linux, FreeBSD, MacOS X и -других ОС соответствующих POSIX.1-2008, а также поддерживает Windows в -качестве дополнительной платформы. - -Отдельно ведётся не-публичная разработка следующей версии, в которой -будет кардинальное изменение как API, так и формата базы данных. Цель -этой революции - обеспечение более четкого и надежного API, добавление -новых функций, а также наделение базы данных новыми свойствами. - -*Всё будет хорошо. The Future will (be) [Positive](https://www.ptsecurity.ru).* - -[![Build Status](https://travis-ci.org/leo-yuriev/libmdbx.svg?branch=master)](https://travis-ci.org/leo-yuriev/libmdbx) -[![Build status](https://ci.appveyor.com/api/projects/status/ue94mlopn50dqiqg/branch/master?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/master) -[![Coverity Scan Status](https://scan.coverity.com/projects/12915/badge.svg)](https://scan.coverity.com/projects/reopen-libmdbx) - -## Содержание -- [Обзор](#Обзор) - - [Сравнение с другими базами данных](#Сравнение-с-другими-базами-данных) - - [История & Выражение признательности](#История) -- [Описание](#Описание) - - [Ключевые свойства](#Ключевые-свойства) - - [Доработки и усовершенствования относительно LMDB](#Доработки-и-усовершенствования-относительно-lmdb) - - [Недостатки и Компромиссы](#Недостатки-и-Компромиссы) - - [Проблема долгих чтений](#Проблема-долгих-чтений) - - [Сохранность данных в режиме асинхронной фиксации](#Сохранность-данных-в-режиме-асинхронной-фиксации) -- [Использование](#Использование) - - [Сборка](#Сборка) - - [Привязки к другим языкам](#Привязки-к-другим-языкам) -- [Сравнение производительности](#Сравнение-производительности) - - [Интегральная производительность](#Интегральная-производительность) - - [Масштабируемость чтения](#Масштабируемость-чтения) - - [Синхронная фиксация](#Синхронная-фиксация) - - [Отложенная фиксация](#Отложенная-фиксация) - - [Асинхронная фиксация](#Асинхронная-фиксация) - - [Потребление 
ресурсов](#Потребление-ресурсов) - ------ - -## Обзор -_libmdbx_ - это встраиваемый key-value движок хранения со специфическим -набором свойств и возможностей, ориентированный на создание уникальных -легковесных решений с предельной производительностью. - -_libmdbx_ позволяет множеству процессов совместно читать и обновлять -несколько key-value таблиц с соблюдением -[ACID](https://ru.wikipedia.org/wiki/ACID), при минимальных накладных -расходах и амортизационной стоимости любых операций O(log N). - -_libmdbx_ обеспечивает -[serializability](https://en.wikipedia.org/wiki/Serializability) -изменений и согласованность данных после аварий. При этом транзакции, -изменяющие данные, никак не мешают операциям чтения и выполняются строго -последовательно с использованием единственного -[мьютекса](https://en.wikipedia.org/wiki/Mutual_exclusion). - -_libmdbx_ позволяет выполнять операции чтения с гарантиями -[wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom), -параллельно на каждом ядре CPU, без использования атомарных операций -и/или примитивов синхронизации. - -_libmdbx_ не использует -[LSM](https://en.wikipedia.org/wiki/Log-structured_merge-tree), а -основан на [B+Tree](https://en.wikipedia.org/wiki/B%2B_tree) с -[отображением](https://en.wikipedia.org/wiki/Memory-mapped_file) всех -данных в память, при этом текущая версия не использует -[WAL](https://en.wikipedia.org/wiki/Write-ahead_logging). Это -предопределяет многие свойства, в том числе удачные и противопоказанные -сценарии использования. - - -### Сравнение с другими базами данных - -На данный момент, пожалуйста, обратитесь к [главе "сравнение BoltDB с -другими базами -данных"](https://github.com/coreos/bbolt#comparison-with-other-databases), -которая также (в основном) применима к MDBX. 
- -### История -_libmdbx_ является результатом переработки и развития "Lightning -Memory-Mapped Database", известной под аббревиатурой -[LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). -Изначально доработка производилась в составе проекта -[ReOpenLDAP](https://github.com/leo-yuriev/ReOpenLDAP). Примерно за год -работы внесенные изменения приобрели самостоятельную ценность. Осенью -2015 доработанный движок был выделен в отдельный проект, который был -[представлен на конференции Highload++ -2015](http://www.highload.ru/2015/abstracts/1831.html). - -В начале 2017 года движок _libmdbx_ получил новый импульс развития, -благодаря использованию в [Fast Positive -Tables](https://github.com/leo-yuriev/libfpta), aka ["Позитивные -Таблицы"](https://github.com/leo-yuriev/libfpta) by [Positive -Technologies](https://www.ptsecurity.ru). - - -### Выражение признательности - -Говард Чу (Howard Chu) <hyc@openldap.org> является автором движка LMDB, от -которого в 2015 году произошел MDBX. - -Мартин Хеденфальк (Martin Hedenfalk) <martin@bzero.se> является автором кода -`btree.c`, который использовался для начала разработки LMDB. - ------ - -Описание -======== - -## Ключевые свойства - -_libmdbx_ наследует все ключевые возможности и особенности своего -прародителя -[LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database), -но с устранением ряда описываемых далее проблем и архитектурных -недочетов. - -1. Данные хранятся в упорядоченном отображении (ordered map), ключи -всегда отсортированы, поддерживается выборка диапазонов (range lookups). - -2. Данные отображаются в память каждого работающего с БД процесса. К -данным и ключам обеспечивается прямой доступ в памяти без необходимости -их копирования. - -3. 
Транзакции согласно [ACID](https://ru.wikipedia.org/wiki/ACID), -посредством [MVCC](https://ru.wikipedia.org/wiki/MVCC) и -[COW](https://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D0%BF%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%BF%D1%80%D0%B8_%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%B8). -Изменения строго последовательны и не блокируются чтением, конфликты -между транзакциями невозможны. При этом гарантируется чтение только -зафиксированных данных, см. [relaxing -serializability](https://en.wikipedia.org/wiki/Serializability). - -4. Чтение и поиск [без -блокировок](https://ru.wikipedia.org/wiki/%D0%9D%D0%B5%D0%B1%D0%BB%D0%BE%D0%BA%D0%B8%D1%80%D1%83%D1%8E%D1%89%D0%B0%D1%8F_%D1%81%D0%B8%D0%BD%D1%85%D1%80%D0%BE%D0%BD%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D1%8F), -без [атомарных -операций](https://ru.wikipedia.org/wiki/%D0%90%D1%82%D0%BE%D0%BC%D0%B0%D1%80%D0%BD%D0%B0%D1%8F_%D0%BE%D0%BF%D0%B5%D1%80%D0%B0%D1%86%D0%B8%D1%8F). -Читатели не блокируются операциями записи и не конкурируют между собой, -чтение масштабируется линейно по ядрам CPU. - > Для точности следует отметить, что "подключение к БД" (старт первой - > читающей транзакции в потоке) и "отключение от БД" (закрытие БД или - > завершение потока) требуют краткосрочного захвата блокировки для - > регистрации/дерегистрации текущего потока в "таблице читателей". - -5. Эффективное хранение дубликатов (ключей с несколькими значениями), -без дублирования ключей, с сортировкой значений, в том числе -целочисленных (для вторичных индексов). - -6. Эффективная поддержка коротких ключей фиксированной длины, в том -числе целочисленных. - -7. Амортизационная стоимость любой операции O(log N), -[WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write -Amplification Factor) и RAF (Read Amplification Factor) также O(log N). - -8. Нет [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) и -журнала транзакций, после сбоев не требуется восстановление. Не -требуется компактификация или какое-либо периодическое обслуживание. 
-Поддерживается резервное копирование "по горячему", на работающей БД без -приостановки изменения данных. - -9. Отсутствует какое-либо внутреннее управление памятью или -кэшированием. Всё необходимое штатно выполняет ядро ОС. - - -## Доработки и усовершенствования относительно LMDB - -1. Автоматическое динамическое управление размером БД согласно -параметрам, задаваемым функцией `mdbx_env_set_geometry()`, включая шаг -приращения и порог уменьшения размера БД, а также выбор размера -страницы. Соответственно, это позволяет снизить фрагментированность -файла БД на диске и освободить место, в том числе в **Windows**. - -2. Автоматическая без-затратная компактификация БД путем возврата -освобождающихся страниц в область нераспределенного резерва в конце -файла данных. При этом уменьшается количество страниц, находящихся в -памяти и участвующих в обмене с диском. - -3. Режим `LIFO RECLAIM`. - - Для повторного использования выбираются не самые старые, а - самые новые страницы из доступных. За счет этого цикл - использования страниц всегда имеет минимальную длину и не - зависит от общего числа выделенных страниц. - - В результате механизмы кэширования и обратной записи работают с - максимально возможной эффективностью. В случае использования - контроллера дисков или системы хранения с - [BBWC](https://en.wikipedia.org/wiki/BBWC) возможно - многократное увеличение производительности по записи - (обновлению данных). - -4. Быстрая оценка количества элементов, попадающих в запрашиваемый -диапазон значений ключа, посредством функций `mdbx_estimate_range()`, -`mdbx_estimate_move()` и `mdbx_estimate_distance()` для выбора -оптимального плана выполнения запроса. - -5. Утилита `mdbx_chk` для проверки целостности структуры БД. - -6. Поддержка ключей и значений нулевой длины, включая сортированные -дубликаты. - -7. 
Возможность связать с каждой завершаемой транзакцией до 3 -дополнительных маркеров посредством `mdbx_canary_put()`, и прочитать их -в транзакции чтения посредством `mdbx_canary_get()`. - -8. Возможность посредством `mdbx_replace()` обновить или удалить запись -с получением предыдущего значения данных, а также адресно изменить -конкретное multi-значение. - -9. Генерация последовательностей посредством `mdbx_dbi_sequence()`. - -10. Обработчик `OOM-KICK`. - - Посредством `mdbx_env_set_oomfunc()` может быть установлен - внешний обработчик (callback), который будет вызван при - исчерпании свободных страниц по причине долгой операцией чтения - на фоне интенсивного изменения данных. - Обработчику будет передан PID и pthread_id виновника. - В свою очередь обработчик может предпринять одно из действий: - - * нейтрализовать виновника (отправить сигнал kill #9), если - долгое чтение выполняется сторонним процессом; - - * отменить или перезапустить проблемную операцию чтения, если - операция выполняется одним из потоков текущего процесса; - - * подождать некоторое время, в расчете на то, что проблемная операция - чтения будет штатно завершена; - - * прервать текущую операцию изменения данных с возвратом кода - ошибки. - -11. Возможность открыть БД в эксклюзивном режиме посредством флага - `MDBX_EXCLUSIVE`, в том числе на сетевом носителе. - -12. Возможность получить отставание текущей транзакции чтения от -последней версии данных в БД посредством `mdbx_txn_straggler()`. - -13. Возможность явно запросить обновление существующей записи, без -создания новой посредством флажка `MDBX_CURRENT` для `mdbx_put()`. - -14. Исправленный вариант `mdbx_cursor_count()`, возвращающий корректное -количество дубликатов для всех типов таблиц и любого положения курсора. - -15. Возможность получить посредством `mdbx_env_info()` дополнительную -информацию, включая номер самой старой версии БД (снимка данных), -который используется одним из читателей. - -16. 
Функция `mdbx_del()` не игнорирует дополнительный (уточняющий) -аргумент `data` для таблиц без дубликатов (без флажка `MDBX_DUPSORT`), а -при его ненулевом значении всегда использует его для сверки с удаляемой -записью. - -17. Возможность открыть dbi-таблицу, одновременно с установкой -компараторов для ключей и данных, посредством `mdbx_dbi_open_ex()`. - -18. Возможность посредством `mdbx_is_dirty()` определить находятся ли -некоторый ключ или данные в "грязной" странице БД. Таким образом, -избегая лишнего копирования данных перед выполнением модифицирующих -операций (значения, размещенные в "грязных" страницах, могут быть -перезаписаны при изменениях, иначе они будут неизменны). - -19. Корректное обновление текущей записи, в том числе сортированного -дубликата, при использовании режима `MDBX_CURRENT` в -`mdbx_cursor_put()`. - -20. Возможность узнать есть ли за текущей позицией курсора строка данных -посредством `mdbx_cursor_eof()`. - -21. Дополнительный код ошибки `MDBX_EMULTIVAL`, который возвращается из -`mdbx_put()` и `mdbx_replace()` при попытке выполнить неоднозначное -обновление или удаления одного из нескольких значений с одним ключом. - -22. Возможность посредством `mdbx_get_ex()` получить значение по -заданному ключу, одновременно с количеством дубликатов. - -23. Наличие функций `mdbx_cursor_on_first()` и `mdbx_cursor_on_last()`, -которые позволяют быстро выяснить стоит ли курсор на первой/последней -позиции. - -24. Возможность автоматического формирования контрольных точек (сброса -данных на диск) при накоплении заданного объёма изменений, -устанавливаемого функцией `mdbx_env_set_syncbytes()`. - -25. Управление отладкой и получение отладочных сообщений посредством -`mdbx_setup_debug()`. - -26. Функция `mdbx_env_pgwalk()` для обхода всех страниц БД. - -27. Три мета-страницы вместо двух, что позволяет гарантированно -консистентно обновлять слабые контрольные точки фиксации без риска -повредить крайнюю сильную точку фиксации. - -28. 
Гарантия сохранности БД в режиме `WRITEMAP+MAPASYNC`. - > В текущей версии _libmdbx_ вам предоставляется выбор между безопасным - > режимом (по умолчанию) асинхронной фиксации, и режимом `UTTERLY_NOSYNC` - > когда при системной аварии есть шанс полного разрушения БД как в LMDB. - > Для подробностей смотрите раздел - > [Сохранность данных в режиме асинхронной фиксации](#Сохранность-данных-в-режиме-асинхронной-фиксации). - -29. Возможность закрыть БД в "грязном" состоянии (без сброса данных и -формирования сильной точки фиксации) посредством `mdbx_env_close_ex()`. - -30. При завершении читающих транзакций, открытые в них DBI-хендлы не -закрываются и не теряются при завершении таких транзакций посредством -`mdbx_txn_abort()` или `mdbx_txn_reset()`. Что позволяет избавиться от ряда -сложно обнаруживаемых ошибок. - -31. Все курсоры, как в транзакциях только для чтения, так и в пишущих, -могут быть переиспользованы посредством `mdbx_cursor_renew()` и ДОЛЖНЫ -ОСВОБОЖДАТЬСЯ ЯВНО. - > - > ## _ВАЖНО_, Обратите внимание! - > - > Это единственное изменение в API, которое значимо меняет - > семантику управления курсорами и может приводить к утечкам - > памяти. Следует отметить, что это изменение вынужденно. - > Так устраняется неоднозначность с массой тяжких последствий: - > - > - обращение к уже освобожденной памяти; - > - попытки повторного освобождения памяти; - > - повреждение памяти и ошибки сегментации. - -32. На **MacOS X** для синхронизации данных с диском _по-умолчанию_ -используется системная функция `fcntl(F_FULLFSYNC)`, так как [только -этим гарантируется сохранность -данных](https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fsync.2.html) -при сбое электропитания. К сожалению, в сценариях с высокой -интенсивностью пишущих транзакций, использование `F_FULLFSYNC` приводит -к существенной деградации производительности в сравнении с LMDB, где -используется системная функция `fsync()`. 
Поэтому _libmdbx_ позволяет -переопределить это поведение определением опции -`MDBX_OSX_SPEED_INSTEADOF_DURABILITY=1` при сборке библиотеки. - -33. На **Windows** _libmdbx_ использует файловые блокировки -`LockFileEx()`, так как это позволяет размещать БД на сетевых дисках, а -также обеспечивает защиту от некомпетентных действий пользователя -([защиту от -дурака](https://ru.wikipedia.org/wiki/%D0%97%D0%B0%D1%89%D0%B8%D1%82%D0%B0_%D0%BE%D1%82_%D0%B4%D1%83%D1%80%D0%B0%D0%BA%D0%B0)). -Поэтому _libmdbx_ может немного отставать в тестах производительности от -LMDB, где используются именованные мьютексы. - - -## Недостатки и Компромиссы - -1. Единовременно может выполняться не более одной транзакции изменения данных - (один писатель). Зато все изменения всегда последовательны, не может быть - конфликтов или логических ошибок при откате транзакций. - -2. Отсутствие [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) - обуславливает относительно большой - [WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write - Amplification Factor). Поэтому фиксация изменений на диске может быть - достаточно дорогой и являться главным ограничением производительности - при интенсивном изменении данных. - > В качестве компромисса _libmdbx_ предлагает несколько режимов ленивой - > и/или периодической фиксации. В том числе режим `MAPASYNC`, при котором - > изменения происходят только в памяти и асинхронно фиксируются на диске - > ядром ОС. - > - > Однако, следует воспринимать это свойство аккуратно и взвешенно. - > Например, полная фиксация транзакции в БД с журналом потребует минимум 2 - > IOPS (скорее всего 3-4) из-за накладных расходов в файловой системе. В - > _libmdbx_ фиксация транзакции также требует от 2 IOPS. Однако, в БД с - > журналом кол-во IOPS будет меняться в зависимости от файловой системы, - > но не от кол-ва записей или их объема. Тогда как в _libmdbx_ кол-во - > будет расти логарифмически от кол-ва записей/строк в БД (по высоте - > b+tree). - -3. 
[COW](https://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D0%BF%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%BF%D1%80%D0%B8_%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%B8) - для реализации [MVCC](https://ru.wikipedia.org/wiki/MVCC) выполняется на - уровне страниц в [B+ - дереве](https://ru.wikipedia.org/wiki/B-%D0%B4%D0%B5%D1%80%D0%B5%D0%B2%D0%BE). - Поэтому изменение данных амортизационно требует копирования Olog(N) - страниц, что расходует [пропускную способность оперативной - памяти](https://en.wikipedia.org/wiki/Memory_bandwidth) и является - основным ограничителем производительности в режиме `MAPASYNC`. - > Этот недостаток неустраним, тем не менее следует дать некоторые пояснения. - > Дело в том, что фиксация изменений на диске потребует гораздо более - > значительного копирования данных в памяти и массы других затратных операций. - > Поэтому обусловленное этим недостатком падение производительности становится - > заметным только при отказе от фиксации изменений на диске. - > Соответственно, корректнее сказать, что _libmdbx_ позволяет - > получить персистентность ценой минимального падения производительности. - > Если же нет необходимости оперативно сохранять данные, то логичнее - > использовать `std::map`. - -4. В _LMDB_ существует проблема долгих чтений (приостановленных читателей), - которая приводит к деградации производительности и переполнению БД. - > В _libmdbx_ предложены средства для предотвращения, быстрого выхода из - > некомфортной ситуации и устранения её последствий. Подробности ниже. - -5. В _LMDB_ есть вероятность разрушения БД в режиме `WRITEMAP+MAPASYNC`. - В _libmdbx_ для `WRITEMAP+MAPASYNC` гарантируется как сохранность базы, - так и согласованность данных. - > Дополнительно, в качестве альтернативы, предложен режим `UTTERLY_NOSYNC`. - > Подробности ниже. - - -### Проблема долгих чтений -*Следует отметить*, что проблема "сборки мусора" так или иначе -существует во всех СУБД (Vacuum в PostgreSQL). 
Однако в случае _libmdbx_ -и LMDB она проявляется более остро, прежде всего из-за высокой -производительности, а также из-за намеренного упрощения внутренних -механизмов ради производительности. - -Понимание проблемы требует некоторых пояснений, которые -изложены ниже, но могут быть сложны для быстрого восприятия. -Поэтому, тезисно: - -* Изменение данных на фоне долгой операции чтения может - приводить к исчерпанию места в БД. - -* После чего любая попытка обновить данные будет приводить к - ошибке `MAP_FULL` до завершения долгой операции чтения. - -* Характерными примерами долгих чтений являются горячее - резервное копирования и отладка клиентского приложения при - активной транзакции чтения. - -* В оригинальной _LMDB_ после этого будет наблюдаться - устойчивая деградация производительности всех механизмов - обратной записи на диск (в I/O контроллере, в гипервизоре, - в ядре ОС). - -* В _libmdbx_ предусмотрен механизм аварийного прерывания таких - операций, а также режим `LIFO RECLAIM` устраняющий последующую - деградацию производительности. - -Операции чтения выполняются в контексте снимка данных (версии -БД), который был актуальным на момент старта транзакции чтения. Такой -читаемый снимок поддерживается неизменным до завершения операции. В свою -очередь, это не позволяет повторно использовать страницы БД в -последующих версиях (снимках БД). - -Другими словами, если обновление данных выполняется на фоне долгой -операции чтения, то вместо повторного использования "старых" ненужных -страниц будут выделяться новые, так как "старые" страницы составляют -снимок БД, который еще используется долгой операцией чтения. - -В результате, при интенсивном изменении данных и достаточно длительной -операции чтения, в БД могут быть исчерпаны свободные страницы, что не -позволит создавать новые снимки/версии БД. Такая ситуация будет -сохраняться до завершения операции чтения, которая использует старый -снимок данных и препятствует повторному использованию страниц БД. 
- -Однако, на этом проблемы не заканчиваются. После описанной ситуации, все -дополнительные страницы, которые были выделены пока переработка старых -была невозможна, будут участвовать в цикле выделения/освобождения до -конца жизни экземпляра БД. В оригинальной _LMDB_ этот цикл использования -страниц работает по принципу [FIFO](https://ru.wikipedia.org/wiki/FIFO). -Поэтому увеличение количества циркулирующих страниц, с точки зрения -механизмов кэширования и/или обратной записи, выглядит как увеличение -рабочего набора данных. Проще говоря, однократное попадание в ситуацию -"уснувшего читателя" приводит к устойчивому эффекту вымывания I/O кэша -при всех последующих изменениях данных. - -Для устранения описанных проблем в _libmdbx_ сделаны существенные -доработки, подробности ниже. Иллюстрации к проблеме "долгих чтений" -можно найти в [слайдах презентации](http://www.slideshare.net/leoyuriev/lmdb). - -Там же приведен пример количественной оценки прироста производительности -за счет эффективной работы [BBWC](https://en.wikipedia.org/wiki/BBWC) -при включении `LIFO RECLAIM` в _libmdbx_. - -### Сохранность данных в режиме асинхронной фиксации -При работе в режиме `WRITEMAP+MAPASYNC` запись измененных страниц -выполняется ядром ОС, что имеет ряд преимуществ. Так например, при крахе -приложения, ядро ОС сохранит все изменения. - -Однако, при аварийном отключении питания или сбое в ядре ОС, на диске -может быть сохранена только часть измененных страниц БД. При этом с -большой вероятностью может оказаться, что будут сохранены мета-страницы -со ссылками на страницы с новыми версиями данных, но не сами новые -данные. В этом случае БД будет безвозвратно разрушена, даже если до -аварии производилась полная синхронизация данных (посредством -`mdbx_env_sync()`). 
- -В _libmdbx_ эта проблема устранена путем полной переработки -пути записи данных: - -* В режиме `WRITEMAP+MAPSYNC` _libmdbx_ не обновляет - мета-страницы непосредственно, а поддерживает их теневые копии - с переносом изменений после фиксации данных. - -* При завершении транзакций, в зависимости от состояния - синхронности данных между диском и оперативной памятью, - _libmdbx_ помечает точки фиксации либо как сильные (strong), - либо как слабые (weak). Так например, в режиме - `WRITEMAP+MAPSYNC` завершаемые транзакции помечаются как - слабые, а при явной синхронизации данных - как сильные. - -* В _libmdbx_ поддерживается не две, а три отдельные мета-страницы. - Это позволяет выполнять фиксацию транзакций с формированием как - сильной, так и слабой точки фиксации, без потери двух предыдущих - точек фиксации (из которых одна может быть сильной, а вторая слабой). - В результате, _libmdbx_ позволяет в произвольном порядке чередовать - сильные и слабые точки фиксации без нарушения соответствующих - гарантий в случае неожиданной системной аварии во время фиксации. - -* При открытии БД выполняется автоматический откат к последней - сильной фиксации. Этим обеспечивается гарантия сохранности БД. - -Такая гарантия надежности не дается бесплатно. Для сохранности данных, -страницы, формирующие крайний снимок с сильной фиксацией, не должны -повторно использоваться (перезаписываться) до формирования следующей -сильной точки фиксации. Таким образом, крайняя точка фиксации создает -описанный выше эффект "долгого чтения". Разница же здесь в том, что при -исчерпании свободных страниц ситуация будет автоматически исправлена, -посредством записи изменений на диск и формирования новой сильной точки -фиксации. - -Таким образом, в режиме безопасной асинхронной фиксации _libmdbx_ будет -всегда использовать новые страницы до исчерпания места в БД или до -явного формирования сильной точки фиксации посредством -`mdbx_env_sync()`. 
При этом суммарный трафик записи на диск будет -примерно такой же, как если бы отдельно фиксировалась каждая транзакция. - -В текущей версии _libmdbx_ вам предоставляется выбор между безопасным -режимом (по умолчанию) асинхронной фиксации, и режимом `UTTERLY_NOSYNC` -когда при системной аварии есть шанс полного разрушения БД как в LMDB. - -В последующих версиях _libmdbx_ будут предусмотрены средства для -асинхронной записи данных на диск с автоматическим формированием сильных -точек фиксации. - --------------------------------------------------------------------------------- - -Использование -============= - -## Сборка - -Для сборки на всех платформах кроме Windows вам потребуются не-дремучие -версии: GNU Make, [bash](https://ru.wikipedia.org/wiki/Bash), компиляторы C и C++ совместимые с GCC или CLANG. - -Исторически сборка _libmdbx_ основывается на одном -[Makefile](https://ru.wikipedia.org/wiki/Makefile), что предполагает -разные рецепты сборки в зависимости от целевой платформы. В следующих -версиях планируется переход на использование -[CMake](https://ru.wikipedia.org/wiki/CMake), с отказом от поддержки -других инструментов. - -#### Выгрузка DSO/DLL и деструкторы Thread-Local-Storage объектов -При сборке _libmdbx_ в виде разделяемой библиотеки, либо использовании -статической _libmdbx_ в составе другой динамической библиотеке, -желательно убедиться, что ваша система обеспечивает корректность вызова -деструкторов Thread-Local-Storage объектов при выгрузке динамических -библиотек. - -Если это не так, то при выгрузке динамической библиотеки с _libmdbx_ -внутри возможна либо утечка ресурсов, либо падения из-за вызова -деструкторов из уже выгруженного DSO/DLL объекта. Проблема может -проявляться только в многопоточном приложении, которое производит -выгрузку разделяемых динамических библиотек с кодом _libmdbx_ внутри, -после использования _libmdbx_. 
Заведомо известно, что TLS-деструкторы -корректно обслуживаются: - -- На всех актуальных версиях Windows (Windows 7 и последующих). - -- На системах с функцией -[`__cxa_thread_atexit_impl()`](https://sourceware.org/glibc/wiki/Destructor%20support%20for%20thread_local%20variables) -в стандартной библиотеке C. В том числе на системах с GNU libc версии -2.18 и выше. - -- На системах с libpthread/nptl из состава GNU libc с исправлением -ошибок [#21031](https://sourceware.org/bugzilla/show_bug.cgi?id=21031) и -[#21032](https://sourceware.org/bugzilla/show_bug.cgi?id=21032), либо -где нет подобных ошибок в реализации pthreads. - -### Linux и другие платформы с GNU Make -Для сборки библиотеки достаточно выполнить `make all` в директории с -исходными текстами, а для выполнения базовых тестов `make check`. - -Если установленный в системе `make` не является GNU Make, то при попытке -сборки будет масса ошибок от make. В этом случае, возможно, вместо -`make` вам следует использовать `gmake`, либо даже `gnu-make` и т.п. - -### FreeBSD и родственные платформы -Как правило, на таких системах по-умолчанию используется Berkeley Make. -А GNU Make вызывается командой `gmake` или может отсутствовать. Кроме -этого может отсутствовать [`bash`](https://ru.wikipedia.org/wiki/Bash). - -Вам необходимо установить требуемые компоненты: GNU Make, bash, -компиляторы C и C++ совместимые с GCC или CLANG. После этого для сборки -библиотеки достаточно выполнить `gmake all` (или `make all`) в -директории с исходными текстами, а для выполнения базовых тестов `gmake -check` (или `make check`). - -### Windows -Для сборки _libmdbx_ для ОС Windows рекомендуется использовать [Microsoft -Visual Studio](https://ru.wikipedia.org/wiki/Microsoft_Visual_Studio), -но не такие инструменты как MinGW, MSYS или Cygwin. Для этого в набор -исходных кодов _libmdbx_ входят соответствующие файлы проектов -совместимые с Visual Studio 2015, Windows SDK для Windows 8.1 и более -поздними версиями. 
Достаточно открыть `mdbx.sln` и выполнить сборку -библиотеки. - -Для сборки с более новыми версиями SDK или Visual Studio должно быть -достаточно выполнить "Retarget solution". Для сборки под старые версии -Windows (например Windows XP) или более старыми компиляторами вам -потребуется самостоятельно преобразовать или воссоздать файлы проектов. - -Сборка посредством MinGW, MSYS или Cygwin потенциально возможна. Однако, -эти сценарии не тестируются и вероятно потребуют от вас доработки -`Makefile`. Следует отметить, что в _libmdbx_ предприняты усилия для -устранения runtime зависимостей от CRT и других библиотек Visual Studio. -Для этого достаточно при сборке определить опцию `MDBX_AVOID_CRT`. - -Пример запуска базового сценария тестирования можно найти в -[CI-сценарии](appveyor.yml) для [AppVeyor](https://www.appveyor.com/). -Для выполнения [сценария длительного стохастического -тестирования](test/long_stochastic.sh) потребуется -[`bash`](https://ru.wikipedia.org/wiki/Bash), а само тестирование -рекомендуется выполнять с размещением тестовых данных на -[RAM-диске](https://ru.wikipedia.org/wiki/RAM-%D0%B4%D0%B8%D1%81%D0%BA). - -### MacOS X -Актуальные [нативные сборочные -инструменты](https://ru.wikipedia.org/wiki/Xcode) для MacOS X включают -GNU Make, CLANG и устаревшую версию bash. Поэтому для сборки библиотеки -достаточно выполнить `make all` в директории с исходными текстами, а для -выполнения базовых тестов `make check`. Если же что-то пойдет не так, то -рекомендуется установить [Homebrew](https://brew.sh/) и попробовать ещё -раз. - -Для выполнения [сценария длительного стохастического -тестирования](test/long_stochastic.sh) потребуется установка актуальной -(не устаревшей) версии [`bash`](https://ru.wikipedia.org/wiki/Bash). Для -этого рекомендуется установить [Homebrew](https://brew.sh/), а затем -выполнить `brew install bash`. 
- -## Привязки к другим языкам - - | Runtime | GitHub | Author | - | -------- | ------ | ------ | - | Java | [mdbxjni](https://github.com/castortech/mdbxjni) | [Castor Technologies](https://castortech.com/) | - | .NET | [mdbx.NET](https://github.com/wangjia184/mdbx.NET) | [Jerry Wang](https://github.com/wangjia184) | - --------------------------------------------------------------------------------- - -Сравнение производительности -============================ - -Все представленные ниже данные получены многократным прогоном тестов на -ноутбуке Lenovo Carbon-2, i7-4600U 2.1 ГГц, 8 Гб ОЗУ, с SSD-диском -SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Гб. - -Исходный код бенчмарка [_IOArena_](https://github.com/pmwkaa/ioarena) и -сценарии тестирования [доступны на -github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). - - -## Интегральная производительность - -Показана соотнесенная сумма ключевых показателей производительности в трёх -бенчмарках: - - - Чтение/Поиск на машине с 4-мя процессорами; - - - Транзакции с [CRUD](https://ru.wikipedia.org/wiki/CRUD)-операциями - (вставка, чтение, обновление, удаление) в режиме **синхронной фиксации** - данных (fdatasync при завершении каждой транзакции или аналог); - - - Транзакции с [CRUD](https://ru.wikipedia.org/wiki/CRUD)-операциями - (вставка, чтение, обновление, удаление) в режиме **отложенной фиксации** - данных (отложенная запись посредством файловой систем или аналог); - -*Бенчмарк в режиме асинхронной записи не включен по двум причинам:* - - 1. Такое сравнение не совсем правомочно, его следует делать с движками - ориентированными на хранение данных в памяти ([Tarantool](https://tarantool.io/), [Redis](https://redis.io/)). - - 2. Превосходство libmdbx становится еще более подавляющим, что мешает - восприятию информации. 
- -![Comparison #1: Integral Performance](https://raw.githubusercontent.com/wiki/leo-yuriev/libmdbx/img/perf-slide-1.png) - --------------------------------------------------------------------------------- - -## Масштабируемость чтения - -Для каждого движка показана суммарная производительность при -одновременном выполнении запросов чтения/поиска в 1-2-4-8 потоков на -машине с 4-мя физическими процессорами. - -![Comparison #2: Read Scalability](https://raw.githubusercontent.com/wiki/leo-yuriev/libmdbx/img/perf-slide-2.png) - --------------------------------------------------------------------------------- - -## Синхронная фиксация - - - Линейная шкала слева и темные прямоугольники соответствуют количеству - транзакций в секунду, усредненному за всё время теста. - - - Логарифмическая шкала справа и желтые интервальные отрезки - соответствуют времени выполнения транзакций. При этом каждый отрезок - показывает минимальное и максимальное время, затраченное на выполнение - транзакций, а крестиком отмечено среднеквадратичное значение. - -Выполняется **10.000 транзакций в режиме синхронной фиксации данных** на -диске. При этом требуется гарантия, что при аварийном выключении питания -(или другом подобном сбое) все данные будут консистентны и полностью -соответствовать последней завершенной транзакции. В _libmdbx_ в этом -режиме при фиксации каждой транзакции выполняется системный вызов -[fdatasync](https://linux.die.net/man/2/fdatasync). - -В каждой транзакции выполняется комбинированная CRUD-операция (две -вставки, одно чтение, одно обновление, одно удаление). Бенчмарк стартует -на пустой базе, а при завершении, в результате выполняемых действий, в -базе насчитывается 10.000 небольших key-value записей. 
- -![Comparison #3: Sync-write mode](https://raw.githubusercontent.com/wiki/leo-yuriev/libmdbx/img/perf-slide-3.png) - --------------------------------------------------------------------------------- - -## Отложенная фиксация - - - Линейная шкала слева и темные прямоугольники соответствуют количеству - транзакций в секунду, усредненному за всё время теста. - - - Логарифмическая шкала справа и желтые интервальные отрезки - соответствуют времени выполнения транзакций. При этом каждый отрезок - показывает минимальное и максимальное время, затраченное на выполнение - транзакций, а крестиком отмечено среднеквадратичное значение. - -Выполняется **100.000 транзакций в режиме отложенной фиксации данных** -на диске. При этом требуется гарантия, что при аварийном выключении -питания (или другом подобном сбое) все данные будут консистентны на -момент завершения одной из транзакций, но допускается потеря изменений -из некоторого количества последних транзакций, что для многих движков -предполагает включение -[WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) (write-ahead -logging) либо журнала транзакций, который в свою очередь опирается на -гарантию упорядоченности данных в журналируемой файловой системе. -_libmdbx_ при этом не ведет WAL, а передает весь контроль файловой -системе и ядру ОС. - -В каждой транзакции выполняется комбинированная CRUD-операция (две -вставки, одно чтение, одно обновление, одно удаление). Бенчмарк стартует -на пустой базе, а при завершении, в результате выполняемых действий, в -базе насчитывается 100.000 небольших key-value записей. - -![Comparison #4: Lazy-write mode](https://raw.githubusercontent.com/wiki/leo-yuriev/libmdbx/img/perf-slide-4.png) - --------------------------------------------------------------------------------- - -## Асинхронная фиксация - - - Линейная шкала слева и темные прямоугольники соответствуют количеству - транзакций в секунду, усредненному за всё время теста. 
- - - Логарифмическая шкала справа и желтые интервальные отрезки - соответствуют времени выполнения транзакций. При этом каждый отрезок - показывает минимальное и максимальное время, затраченное на выполнение - транзакций, а крестиком отмечено среднеквадратичное значение. - -Выполняется **1.000.000 транзакций в режиме асинхронной фиксации -данных** на диске. При этом требуется гарантия, что при аварийном -выключении питания (или другом подобном сбое) все данные будут -консистентны на момент завершения одной из транзакций, но допускается -потеря изменений из значительного количества последних транзакций. Во -всех движках при этом включался режим предполагающий минимальную -нагрузку на диск по записи, и соответственно минимальную гарантию -сохранности данных. В _libmdbx_ при этом используется режим асинхронной -записи измененных страниц на диск посредством ядра ОС и системного -вызова [msync(MS_ASYNC)](https://linux.die.net/man/2/msync). - -В каждой транзакции выполняется комбинированная CRUD-операция (две -вставки, одно чтение, одно обновление, одно удаление). Бенчмарк стартует -на пустой базе, а при завершении, в результате выполняемых действий, в -базе насчитывается 10.000 небольших key-value записей. - -![Comparison #5: Async-write mode](https://raw.githubusercontent.com/wiki/leo-yuriev/libmdbx/img/perf-slide-5.png) - --------------------------------------------------------------------------------- - -## Потребление ресурсов - -Показана соотнесенная сумма использованных ресурсов в ходе бенчмарка в -режиме отложенной фиксации: - - - суммарное количество операций ввода-вывода (IOPS), как записи, так и - чтения. - - - суммарное затраченное время процессора, как в режиме пользовательских - процессов, так и в режиме ядра ОС. - - - использованное место на диске при завершении теста, после закрытия БД - из тестирующего процесса, но без ожидания всех внутренних операций - обслуживания (компактификации LSM и т.п.). 
- -Движок _ForestDB_ был исключен при оформлении результатов, так как -относительно конкурентов многократно превысил потребление каждого из -ресурсов (потратил процессорное время на генерацию IOPS для заполнения -диска), что не позволяло наглядно сравнить показатели остальных движков -на одной диаграмме. - -Все данные собирались посредством системного вызова -[getrusage()](http://man7.org/linux/man-pages/man2/getrusage.2.html) и -сканированием директорий с данными. - -![Comparison #6: Cost comparison](https://raw.githubusercontent.com/wiki/leo-yuriev/libmdbx/img/perf-slide-6.png) - --------------------------------------------------------------------------------- - -``` -$ objdump -f -h -j .text libmdbx.so - -libmdbx.so: file format elf64-x86-64 -architecture: i386:x86-64, flags 0x00000150: -HAS_SYMS, DYNAMIC, D_PAGED -start address 0x0000000000003870 - -Sections: -Idx Name Size VMA LMA File off Algn - 11 .text 000173d4 0000000000003870 0000000000003870 00003870 2**4 - CONTENTS, ALLOC, LOAD, READONLY, CODE - -``` diff --git a/libs/libmdbx/src/README.md b/libs/libmdbx/src/README.md index a5b420f35b..5c32049f79 100644 --- a/libs/libmdbx/src/README.md +++ b/libs/libmdbx/src/README.md @@ -4,18 +4,25 @@ libmdbx ====================================== -Revised and extended descendant of [Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) (aka _LMDB_). -Русскоязычная версия [здесь](README-RU.md). -_libmdbx_ is superior to LMDB in terms of features and reliability, not -inferior in performance. _libmdbx_ works on Linux, FreeBSD, MacOS X and -other systems compliant with POSIX.1-2008, but also support Windows as a -complementary platform. - -The next version is under active non-public development, which will -radically change both the API and the database format. The goal of this -revolution is to provide a clearer and more reliable API, add more -features and new database properties. 
+_libmdbx_ is an extremely fast, compact, powerful, embedded +transactional [key-value +store](https://en.wikipedia.org/wiki/Key-value_database) +database, with permissive [OpenLDAP Public License](LICENSE). +_libmdbx_ has a specific set of properties and capabilities, +focused on creating unique lightweight solutions with +extraordinary performance. + +The next version is under active non-public development and will be +released as **_MithrilDB_** and `libmithrildb` for libraries & packages. +Admittedly mythical [Mithril](https://en.wikipedia.org/wiki/Mithril) is +resembling silver but being stronger and lighter than steel. Therefore +_MithrilDB_ is rightly relevant name. +> _MithrilDB_ will be radically different from _libmdbx_ by the new +> database format and API based on C++17, as well as the [Apache 2.0 +> License](https://www.apache.org/licenses/LICENSE-2.0). The goal of this +> revolution is to provide a clearer and robust API, add more features and +> new valuable properties of database. *The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо.* @@ -31,8 +38,6 @@ features and new database properties. - [Key features](#key-features) - [Improvements over LMDB](#improvements-over-lmdb) - [Gotchas](#gotchas) - - [Problem of long-time reading](#problem-of-long-time-reading) - - [Durability in asynchronous writing mode](#durability-in-asynchronous-writing-mode) - [Usage](#usage) - [Building](#building) - [Bindings](#bindings) @@ -47,15 +52,23 @@ features and new database properties. ----- ## Overview -_libmdbx_ is an embedded lightweight key-value database engine oriented -for performance. -_libmdbx_ allows multiple processes to read and update several key-value +_libmdbx_ is revised and extended descendant of amazing [Lightning +Memory-Mapped +Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). 
+_libmdbx_ inherits all features and characteristics from +[LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database), +but resolves some issues and adds several features. + + - _libmdbx_ guarantee data integrity after crash unless this was explicitly +neglected in favour of write performance. + + - _libmdbx_ allows multiple processes to read and update several key-value tables concurrently, while being [ACID](https://en.wikipedia.org/wiki/ACID)-compliant, with minimal overhead and Olog(N) operation cost. -_libmdbx_ enforce + - _libmdbx_ enforce [serializability](https://en.wikipedia.org/wiki/Serializability) for writers by single [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) and affords @@ -63,29 +76,32 @@ writers by single for parallel readers without atomic/interlocked operations, while writing and reading transactions do not block each other. -_libmdbx_ can guarantee consistency after crash depending of operation -mode. - -_libmdbx_ uses [B+Trees](https://en.wikipedia.org/wiki/B%2B_tree) and + - _libmdbx_ uses [B+Trees](https://en.wikipedia.org/wiki/B%2B_tree) and [Memory-Mapping](https://en.wikipedia.org/wiki/Memory-mapped_file), doesn't use [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) which might be a caveat for some workloads. + - _libmdbx_ implements a simplified variant of the [Berkeley +DB](https://en.wikipedia.org/wiki/Berkeley_DB) and/or +[dbm](https://en.wikipedia.org/wiki/DBM_(computing)) API. + + - _libmdbx_ supports Linux, Windows, MacOS, FreeBSD, DragonFly, Solaris, +OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other systems compliant with +POSIX.1-2008. + ### Comparison with other databases For now please refer to [chapter of "BoltDB comparison with other databases"](https://github.com/coreos/bbolt#comparison-with-other-databases) -which is also (mostly) applicable to MDBX. +which is also (mostly) applicable to _libmdbx_. 
### History -The _libmdbx_ design is based on [Lightning Memory-Mapped -Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). -Initial development was going in +At first the development was carried out within the [ReOpenLDAP](https://github.com/leo-yuriev/ReOpenLDAP) project. About a -year later libmdbx was isolated to separate project, which was +year later _libmdbx_ was separated into standalone project, which was [presented at Highload++ 2015 conference](http://www.highload.ru/2015/abstracts/1831.html). -Since early 2017 _libmdbx_ is used in [Fast Positive Tables](https://github.com/leo-yuriev/libfpta), +Since 2017 _libmdbx_ is used in [Fast Positive Tables](https://github.com/leo-yuriev/libfpta), and development is funded by [Positive Technologies](https://www.ptsecurity.com). ### Acknowledgments @@ -102,9 +118,6 @@ Description ## Key features -_libmdbx_ inherits all features and characteristics from -[LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database): - 1. Key-value pairs are stored in ordered map(s), keys are always sorted, range lookups are supported. @@ -148,146 +161,90 @@ without freezing writers. ## Improvements over LMDB -1. Automatic dynamic DB size management according to the parameters -specified by `mdbx_env_set_geometry()` function. Including -growth step and truncation threshold, as well as the choice of page -size. - -2. Automatic returning of freed pages into unallocated space at the end -of database file, with optionally automatic shrinking it. This reduces -amount of pages resides in RAM and circulated in disk I/O. In fact -_libmdbx_ constantly performs DB compactification, without spending -additional resources for that. - -3. `LIFO RECLAIM` mode: - - The newest pages are picked for reuse instead of the oldest. This allows - to minimize reclaim loop and make it execution time independent of total - page count. - - This results in OS kernel cache mechanisms working with maximum - efficiency. 
In case of using disk controllers or storages with - [BBWC](https://en.wikipedia.org/wiki/Disk_buffer#Write_acceleration) - this may greatly improve write performance. - -4. Fast estimation of range query result size via functions -`mdbx_estimate_range()`, `mdbx_estimate_move()` and -`mdbx_estimate_distance()`. E.g. for selection the optimal query -execution plan. - -5. `mdbx_chk` tool for DB integrity check. - -6. Support for keys and values of zero length, including multi-values +_libmdbx_ is superior to _legendary [LMDB](https://symas.com/lmdb/)_ in +terms of features and reliability, not inferior in performance. In +comparison to LMDB, _libmdbx_ makes things "just work" perfectly and +out-of-the-box, instead of silently and catastrophically breaking down. The list +below is pruned down to the improvements most notable and obvious from +the user's point of view. + +1. Automatic on-the-fly database size control by preset parameters, both +reduction and increment. + > _libmdbx_ manages the database size according to parameters specified + > by the `mdbx_env_set_geometry()` function, + > which include the growth step and the truncation threshold. + +2. Automatic continuous zero-overhead database compactification. + > _libmdbx_ logically moves as many freed pages as possible + > from the end of the allocation area into unallocated space, + > and then releases that space once enough of it has accumulated. + +3. LIFO policy for recycling Garbage Collection items. On systems with a disk +write-back cache, this can significantly increase write performance, up to +several times in a best case scenario. + > LIFO means that the pages which became unused most recently are taken for reuse first. + > Therefore the loop of database pages circulation becomes as short as possible. + > In other words, the number of pages, that are overwritten in memory + > and on disk during a series of write transactions, will be as small as possible. + > This creates ideal conditions for the efficient operation of the disk write-back cache. + +4.
Fast estimation of range query result volume, i.e. how many items can +be found between a `KEY1` and a `KEY2`. This is a prerequisite for building +and/or optimizing query execution plans. + > _libmdbx_ performs a rough estimate based only on b-tree pages that + > are common to both stacks of cursors that were set to the corresponding + > keys. + +5. `mdbx_chk` tool for database integrity check. + +6. Guarantee of database integrity even in asynchronous unordered write-to-disk mode. + > _libmdbx_ offers an additional trade-off by applying updates in an append-like manner + > in `NOSYNC` and `MAPASYNC` modes, which avoids database corruption after a system crash, + > contrary to LMDB. Nevertheless, the `MDBX_UTTERLY_NOSYNC` mode is available to match LMDB behaviour, + > and for special use cases. + +7. Automated steady flush to disk upon volume of changes and/or by +timeout via cheap polling. + +8. Sequence generation and three cheap persistent 64-bit markers with ACID. + +9. Support for keys and values of zero length, including multi-values (aka sorted duplicates). -7. Ability to assign up to 3 persistent 64-bit markers to commiting -transaction with `mdbx_canary_put()` and then get them in read -transaction by `mdbx_canary_get()`. - -8. Ability to update or delete record and get previous value via -`mdbx_replace()`. Also allows update the specific item from multi-value -with the same key. - -9. Sequence generation via `mdbx_dbi_sequence()`. - -10. `OOM-KICK` callback. - - `mdbx_env_set_oomfunc()` allows to set a callback, which will be called - in the event of DB space exhausting during long-time read transaction in - parallel with extensive updating. Callback will be invoked with PID and - pthread_id of offending thread as parameters. 
Callback can do any of - these things to remedy the problem: - - * wait for read transaction to finish normally; - - * kill the offending process (signal 9), if separate process is doing - long-time read; - - * abort or restart offending read transaction if it's running in sibling - thread; - - * abort current write transaction with returning error code. - -11. Ability to open DB in exclusive mode by `MDBX_EXCLUSIVE` flag. - -12. Ability to get how far current read-transaction snapshot lags -from the latest version of the DB by `mdbx_txn_straggler()`. - -13. Ability to explicitly update the existing record, not insertion -a new one. Implemented as `MDBX_CURRENT` flag for `mdbx_put()`. - -14. Fixed `mdbx_cursor_count()`, which returns correct count of -duplicated (aka multi-value) for all cases and any cursor position. +10. The handler of lack-of-space condition with a callback, +that allow you to control and resolve such situations. -15. `mdbx_env_info()` to getting additional info, including number of -the oldest snapshot of DB, which is used by someone of the readers. +11. Support for opening a database in the exclusive mode, including on a network share. -16. `mdbx_del()` doesn't ignore additional argument (specifier) `data` -for tables without duplicates (without flag `MDBX_DUPSORT`), if `data` -is not null then always uses it to verify record, which is being -deleted. +12. Extended transaction info, including dirty and leftover space info +for a write transaction, reading lag and hold over space for read +transactions. -17. Ability to open dbi-table with simultaneous with race-free setup -of comparators for keys and values, via `mdbx_dbi_open_ex()`. +13. Extended whole-database info (aka environment) and reader enumeration. -18. `mdbx_is_dirty()`to find out if given key or value is on dirty page, that -useful to avoid copy-out before updates. +14. 
Extended update or delete, _at once_ with getting previous value +and addressing the particular item from multi-value with the same key. -19. Correct update of current record in `MDBX_CURRENT` mode of -`mdbx_cursor_put()`, including sorted duplicated. +15. Support for explicitly updating the existing record, not insertion a new one. -20. Check if there is a row with data after current cursor position via -`mdbx_cursor_eof()`. +16. All cursors are uniformly, can be reused and should be closed explicitly, +regardless ones were opened within write or read transaction. -21. Additional error code `MDBX_EMULTIVAL`, which is returned by -`mdbx_put()` and `mdbx_replace()` in case is ambiguous update or delete. +17. Correct update of current record with `MDBX_CURRENT` flag when size +of key or data was changed, including sorted duplicated. -22. Ability to get value by key and duplicates count by `mdbx_get_ex()`. +18. Opening database handles is spared from race conditions and +pre-opening is not needed. -23. Functions `mdbx_cursor_on_first()` and `mdbx_cursor_on_last()`, -which allows to check cursor is currently on first or last position -respectively. +19. Ability to determine whether the particular data is on a dirty page +or not, that allows to avoid copy-out before updates. -24. Automatic creation of steady commit-points (flushing data to the -disk) when the volume of changes reaches a threshold, which can be -set by `mdbx_env_set_syncbytes()`. +20. Ability to determine whether the cursor is pointed to a key-value +pair, to the first, to the last, or not set to anything. -25. Control over debugging and receiving of debugging messages via -`mdbx_setup_debug()`. +21. Returning `MDBX_EMULTIVAL` error in case of ambiguous update or delete. -26. Function `mdbx_env_pgwalk()` for page-walking the DB. - -27. Three meta-pages instead of two, that allows to guarantee -consistency of data when updating weak commit-points without the -risk of damaging the last steady commit-point. 
- -28. Guarantee of DB integrity in `WRITEMAP+MAPSYNC` mode: - > Current _libmdbx_ gives a choice of safe async-write mode (default) - > and `UTTERLY_NOSYNC` mode which may result in full - > DB corruption during system crash as with LMDB. For details see - > [Data safety in async-write mode](#data-safety-in-async-write-mode). - -29. Ability to close DB in "dirty" state (without data flush and -creation of steady synchronization point) via `mdbx_env_close_ex()`. - -30. If read transaction is aborted via `mdbx_txn_abort()` or -`mdbx_txn_reset()` then DBI-handles, which were opened during it, -will not be closed or deleted. In several cases this allows -to avoid hard-to-debug errors. - -31. All cursors in all read and write transactions can be reused by -`mdbx_cursor_renew()` and MUST be freed explicitly. - > ## Caution, please pay attention! - > - > This is the only change of API, which changes semantics of cursor management - > and can lead to memory leaks on misuse. This is a needed change as it eliminates ambiguity - > which helps to avoid such errors as: - > - use-after-free; - > - double-free; - > - memory corruption and segfaults. - - -32. On **Mac OS X** the `fcntl(F_FULLFSYNC)` syscall is used _by +22. On **MacOS** the `fcntl(F_FULLFSYNC)` syscall is used _by default_ to synchronize data with the disk, as this is [the only way to guarantee data durability](https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fsync.2.html) @@ -297,7 +254,7 @@ compared to LMDB, where the `fsync()` syscall is used. Therefore, _libmdbx_ allows you to override this behavior by defining the `MDBX_OSX_SPEED_INSTEADOF_DURABILITY=1` option while build the library. -33. On **Windows** the `LockFileEx()` syscall is used for locking, since +23. 
On **Windows** the `LockFileEx()` syscall is used for locking, since it allows place the database on network drives, and provides protection against incompetent user actions (aka [poka-yoke](https://en.wikipedia.org/wiki/Poka-yoke)). Therefore @@ -307,8 +264,9 @@ named mutexes are used. ## Gotchas -1. There cannot be more than one writer at a time. This allows serialize an -updates and eliminate any possibility of conflicts, deadlocks or logical errors. +1. There cannot be more than one writer at a time. + > On the other hand, this allows serialize an updates and eliminate any + > possibility of conflicts, deadlocks or logical errors. 2. No [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) means relatively big [WAF](https://en.wikipedia.org/wiki/Write_amplification) @@ -332,7 +290,7 @@ is done on memory page level with [B+trees](https://ru.wikipedia.org/wiki/B-%D0%B4%D0%B5%D1%80%D0%B5%D0%B2%D0%BE). Therefore altering data requires to copy about Olog(N) memory pages, which uses [memory bandwidth](https://en.wikipedia.org/wiki/Memory_bandwidth) and is main -performance bottleneck in `MAPASYNC` mode. +performance bottleneck in `MDBX_MAPASYNC` mode. > This is unavoidable, but isn't that bad. Syncing data to disk requires > much more similar operations which will be done by OS, therefore this is > noticeable only if data sync to persistent storage is fully disabled. @@ -340,120 +298,61 @@ performance bottleneck in `MAPASYNC` mode. > performance overhead. If there is no need to save data to persistent > storage then it's much more preferable to use `std::map`. +4. Massive altering of data during a parallel long read operation will +increase the process work set, may exhaust entire free database space and +result in subsequent write performance degradation. + > _libmdbx_ mostly solve this issue by lack-of-space callback and `MDBX_LIFORECLAIM` mode. + > See [`mdbx.h`](mdbx.h) with API description for details. 
+ > The "next" version of libmdbx (MithrilDB) will completely solve this. -4. _LMDB_ has a problem of long-time readers which degrades performance -and bloats DB. - > _libmdbx_ addresses that, details below. - -5. _LMDB_ is susceptible to DB corruption in `WRITEMAP+MAPASYNC` mode. -_libmdbx_ in `WRITEMAP+MAPASYNC` guarantees DB integrity and consistency -of data. - > Additionally there is an alternative: `UTTERLY_NOSYNC` mode. - > Details below. - - -### Problem of long-time reading -Garbage collection problem exists in all databases one way or another -(e.g. VACUUM in PostgreSQL). But in _libmdbx_ and LMDB it's even more -discernible because of high transaction rate and intentional internals -simplification in favor of performance. - -Understanding the problem requires some explanation, but can be -difficult for quick perception. So is is reasonable -to simplify this as follows: - -* Massive altering of data during a parallel long read operation may -exhaust the free DB space. - -* If the available space is exhausted, any attempt to update the data -will cause a "MAP_FULL" error until a long read transaction is -completed. - -* A good example of long readers is a hot backup or debugging of -a client application while retaining an active read transaction. +5. There are no built-in checksums or digests to verify database integrity. + > The "next" version of _libmdbx_ (MithrilDB) will solve this issue employing [Merkle Tree](https://en.wikipedia.org/wiki/Merkle_tree). -* In _LMDB_ this results in degraded performance of all operations of -writing data to persistent storage. - -* _libmdbx_ has the `OOM-KICK` mechanism which allow to abort such -operations and the `LIFO RECLAIM` mode which addresses performance -degradation. - -### Durability in asynchronous writing mode -In `WRITEMAP+MAPSYNC` mode updated (aka dirty) pages are written to -persistent storage by the OS kernel. 
This means that if the application -fails, the OS kernel will finish writing all updated data to disk and -nothing will be lost. However, in the case of hardware malfunction or OS -kernel fatal error, only some updated data can be written to disk and -the database structure is likely to be destroyed. In such situation, DB -is completely corrupted and can't be repaired. - -_libmdbx_ addresses this by fully reimplementing write path of data: - -* In `WRITEMAP+MAPSYNC` mode meta-data pages aren't updated in place, -instead their shadow copies are used and their updates are synced after -data is flushed to disk. - -* During transaction commit _libmdbx_ marks it as a steady or weak -depending on synchronization status between RAM and persistent storage. -For instance, in the `WRITEMAP+MAPSYNC` mode committed transactions -are marked as weak by default, but as steady after explicit data flushes. - -* _libmdbx_ maintains three separate meta-pages instead of two. This -allows to commit transaction as steady or weak without losing two -previous commit points (one of them can be steady, and another -weak). Thus, after a fatal system failure, it will be possible to -rollback to the last steady commit point. +-------------------------------------------------------------------------------- -* During DB open _libmdbx_ rollbacks to the last steady commit point, -this guarantees database integrity after a crash. However, if the -database opening in read-only mode, such rollback cannot be performed -which will cause returning the MDBX_WANNA_RECOVERY error. +Usage +===== -For data integrity a pages which form database snapshot with steady -commit point, must not be updated until next steady commit point. -Therefore the last steady commit point creates an effect analogues to -"long-time read". The only difference that now in case of space -exhaustion the problem will be immediately addressed by writing changes -to disk and forming the new steady commit point. 
+## Source code embedding -So in async-write mode _libmdbx_ will always use new pages until the -free DB space will be exhausted or `mdbx_env_sync()` will be invoked, -and the total write traffic to the disk will be the same as in -sync-write mode. +_libmdbx_ provides two official ways for integration in source code form: -Currently libmdbx gives a choice between a safe async-write mode -(default) and `UTTERLY_NOSYNC` mode which may lead to DB corruption -after a system crash, i.e. like the LMDB. +1. Using the amalgamated source code. + > The amalgamated source code includes all files requires to build and + > use _libmdbx_, but not for testing _libmdbx_ itself. -Next version of _libmdbx_ will be automatically create steady commit -points in async-write mode upon completion transfer data to the disk. +2. Adding the complete original source code as a `git submodule`. + > This allows you to build as _libmdbx_ and testing tool. + > On the other hand, this way requires you to pull git tags, and use C++11 compiler for test tool. --------------------------------------------------------------------------------- +**_Please, avoid using any other techniques._** Otherwise, at least +don't ask for support and don't name such chimeras `libmdbx`. -Usage -===== +The amalgamated source code could be created from original clone of git +repository on Linux by executing `make dist`. As a result, the desired +set of files will be formed in the `dist` subdirectory. ## Building -To build on all platforms except Windows the prerequirements are the -same: non-obsolete versions of GNU Make, -[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)), C and C++ -compilers compatible with GCC or CLANG. On Windows you will need only : -Microsoft Visual Studio 2015 or later, Windows SDK for Windows 8 or -later. 
+Both amalgamated and original source code provides build through the use +[CMake](https://cmake.org/) or [GNU +Make](https://www.gnu.org/software/make/) with +[bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)). All build ways +are completely traditional and have minimal prerequirements like +`build-essential`, i.e. the non-obsolete C/C++ compiler and a +[SDK](https://en.wikipedia.org/wiki/Software_development_kit) for the +target platform. Obviously you need building tools itself, i.e. `git`, +`cmake` or GNU `make` with `bash`. -Historically, the libmdbx builing is based on single -[Makefile](https://en.wikipedia.org/wiki/Makefile) which assumes -different recipes depending on target platform. In the next versions, it -is planned to switch to [CMake](https://en.wikipedia.org/wiki/CMake), -with the refusal to support other tools. +So just use CMake or GNU Make in your habitual manner and feel free to +fill an issue or make pull request in the case something will be +unexpected or broken down. #### DSO/DLL unloading and destructors of Thread-Local-Storage objects When building _libmdbx_ as a shared library or use static _libmdbx_ as a part of another dynamic library, it is advisable to make sure that your system ensures the correctness of the call destructors of -Thread-Local-Storage objects when unloading dynamic libraries'. +Thread-Local-Storage objects when unloading dynamic libraries. If this is not the case, then unloading a dynamic-link library with _libmdbx_ code inside, can result in either a resource leak or a crash @@ -495,22 +394,15 @@ directory with source code, and `gmake check` (or `make check`) to run the basic tests. ### Windows -For building _libmdbx_ on Windows the [Microsoft Visual -Studio](https://en.wikipedia.org/wiki/Microsoft_Visual_Studio) is -recommended, but not tools such as MinGW, MSYS, or Cygwin. 
To do this, -the libmdbx source code includes the set of appropriate project files -that are compatible with Visual Studio 2015, the Windows SDK for Windows -8.1, and later. Just open `mdbx.sln` in Visual Studio and build the -library. - -To build with newer versions of the SDK or Visual Studio, it should be -sufficient to execute "Retarget solution". To build for older versions -of Windows (such as Windows XP) or by older compilers, you will need to -convert or recreate the corresponding project files yourself. +For build _libmdbx_ on Windows the _original_ CMake and [Microsoft Visual +Studio](https://en.wikipedia.org/wiki/Microsoft_Visual_Studio) are +recommended. Building by MinGW, MSYS or Cygwin is potentially possible. However, these scripts are not tested and will probably require you to modify the -Makefile. It should be noted that in _libmdbx_ was efforts to resolve +CMakeLists.txt or Makefile respectively. + +It should be noted that in _libmdbx_ was efforts to resolve runtime dependencies from CRT and other libraries Visual Studio. For this is enough define the `MDBX_AVOID_CRT` during build. @@ -521,9 +413,9 @@ run the [long stochastic test scenario](test/long_stochastic.sh), the such testing is recommended with place the test data on the [RAM-disk](https://en.wikipedia.org/wiki/RAM_drive). -### MacOS X +### MacOS Current [native build tools](https://en.wikipedia.org/wiki/Xcode) for -MacOS X include GNU Make, CLANG and an outdated version of bash. +MacOS include GNU Make, CLANG and an outdated version of bash. Therefore, to build the library, it is enough to run `make all` in the directory with source code, and run `make check` to execute the base tests. 
If something goes wrong, it is recommended to install @@ -691,11 +583,10 @@ $ objdump -f -h -j .text libmdbx.so libmdbx.so: file format elf64-x86-64 architecture: i386:x86-64, flags 0x00000150: HAS_SYMS, DYNAMIC, D_PAGED -start address 0x0000000000003870 +start address 0x0000000000003710 Sections: Idx Name Size VMA LMA File off Algn - 11 .text 000173d4 0000000000003870 0000000000003870 00003870 2**4 + 11 .text 00015eff 0000000000003710 0000000000003710 00003710 2**4 CONTENTS, ALLOC, LOAD, READONLY, CODE - ``` diff --git a/libs/libmdbx/src/appveyor.yml b/libs/libmdbx/src/appveyor.yml index 0c0dd8a6a6..f42224d582 100644 --- a/libs/libmdbx/src/appveyor.yml +++ b/libs/libmdbx/src/appveyor.yml @@ -1,13 +1,33 @@ -version: 0.3.2.{build} +version: 0.4.0.{build} environment: matrix: + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019 + CMAKE_GENERATOR: Visual Studio 16 2019 + TOOLSET: 142 + MDBX_BUILD_SHARED_LIBRARY: OFF + MDBX_AVOID_CRT: OFF + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019 + CMAKE_GENERATOR: Visual Studio 16 2019 + TOOLSET: 142 + MDBX_BUILD_SHARED_LIBRARY: ON + MDBX_AVOID_CRT: ON + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019 + CMAKE_GENERATOR: Visual Studio 16 2019 + TOOLSET: 142 + MDBX_BUILD_SHARED_LIBRARY: OFF + MDBX_AVOID_CRT: ON + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019 + CMAKE_GENERATOR: Visual Studio 16 2019 + TOOLSET: 142 + MDBX_BUILD_SHARED_LIBRARY: ON + MDBX_AVOID_CRT: OFF - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - TOOLSET: v141 -# - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 -# TOOLSET: v140 -# - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2013 -# TOOLSET: v120 + CMAKE_GENERATOR: Visual Studio 15 2017 + TOOLSET: 141 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + CMAKE_GENERATOR: Visual Studio 14 2015 + TOOLSET: 140 branches: except: @@ -18,37 +38,62 @@ configuration: - Release platform: -- x86 +- Win32 - x64 -#- ARM -build_script: -- ps: > - msbuild "C:\projects\libmdbx\mdbx.sln" /verbosity:minimal - 
/logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" - /property:PlatformToolset=$env:TOOLSET - /property:Configuration=$env:CONFIGURATION - /property:Platform=$env:PLATFORM +before_build: +- git clean -x -f -d +- git submodule sync +- git fetch --tags --prune +- git submodule update --init --recursive +- git submodule foreach --recursive git fetch --tags --prune +- cmake --version -test_script: +build_script: - ps: | - if (($env:PLATFORM -eq "x86") -and (Test-Path "C:\projects\libmdbx\Win32\$env:CONFIGURATION\mdbx_test.exe" -PathType Leaf)) { - $mdbx_test = "C:\projects\libmdbx\Win32\$env:CONFIGURATION\mdbx_test.exe" - $mdbx_chk = "C:\projects\libmdbx\Win32\$env:CONFIGURATION\mdbx_chk.exe" - } elseif (($env:PLATFORM -ne "ARM") -and ($env:PLATFORM -ne "ARM64")) { - $mdbx_test = "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\mdbx_test.exe" - $mdbx_chk = "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\mdbx_chk.exe" + Write-Output "*******************************************************************************" + Write-Output "Configuration: $env:CONFIGURATION" + Write-Output "Platform: $env:PLATFORM" + Write-Output "Toolchain: $env:CMAKE_GENERATOR v$env:TOOLSET" + Write-Output "Options: MDBX_AVOID_CRT=$env:MDBX_AVOID_CRT MDBX_BUILD_SHARED_LIBRARY=$env:MDBX_BUILD_SHARED_LIBRARY" + Write-Output "*******************************************************************************" + + md _build -Force | Out-Null + cd _build + + $generator = $env:CMAKE_GENERATOR + if ($env:TOOLSET -lt 142) { + if ($env:PLATFORM -eq "x64") { + $generator = "$generator Win64" + } + & cmake -G "$generator" -D CMAKE_CONFIGURATION_TYPES="Debug;Release" -D MDBX_AVOID_CRT:BOOL=$env:MDBX_AVOID_CRT -D MDBX_BUILD_SHARED_LIBRARY:BOOL=$env:MDBX_BUILD_SHARED_LIBRARY .. 
} else { - $mdbx_test = "" - $mdbx_chk = "" + & cmake -G "$generator" -A $env:PLATFORM -D CMAKE_CONFIGURATION_TYPES="Debug;Release" -DMDBX_AVOID_CRT:BOOL=$env:MDBX_AVOID_CRT -D MDBX_BUILD_SHARED_LIBRARY:BOOL=$env:MDBX_BUILD_SHARED_LIBRARY .. } + if ($LastExitCode -ne 0) { + throw "Exec: $ErrorMessage" + } + Write-Output "*******************************************************************************" + + & cmake --build . --config $env:CONFIGURATION + if ($LastExitCode -ne 0) { + throw "Exec: $ErrorMessage" + } + Write-Output "*******************************************************************************" - if ($mdbx_test -ne "") { - & "$mdbx_test" --pathname=test.db --dont-cleanup-after basic | Tee-Object -file test.log | Select-Object -last 42 - & "$mdbx_chk" -nvv test.db | Tee-Object -file chk.log | Select-Object -last 42 +test_script: +- ps: | + if (($env:PLATFORM -ne "ARM") -and ($env:PLATFORM -ne "ARM64")) { + & ./$env:CONFIGURATION/mdbx_test.exe --progress --console=no --pathname=test.db --dont-cleanup-after basic > test.log + Get-Content test.log | Select-Object -last 42 + if ($LastExitCode -ne 0) { + throw "Exec: $ErrorMessage" + } else { + & ./$env:CONFIGURATION/mdbx_chk.exe -nvv test.db | Tee-Object -file chk.log | Select-Object -last 42 + } } on_failure: -- ps: Push-AppveyorArtifact test.log -- ps: Push-AppveyorArtifact test.db -- ps: Push-AppveyorArtifact chk.log +- ps: Push-AppveyorArtifact \projects\libmdbx\_build\test.log +- ps: Push-AppveyorArtifact \projects\libmdbx\_build\test.db +- ps: Push-AppveyorArtifact \projects\libmdbx\_build\chk.log diff --git a/libs/libmdbx/src/cmake/compiler.cmake b/libs/libmdbx/src/cmake/compiler.cmake new file mode 100644 index 0000000000..03b0805bec --- /dev/null +++ b/libs/libmdbx/src/cmake/compiler.cmake @@ -0,0 +1,666 @@ +## Copyright (c) 2012-2019 Leonid Yuriev <leo@yuriev.ru>. 
+## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +## + +cmake_minimum_required(VERSION 3.8.2) +cmake_policy(PUSH) +cmake_policy(VERSION 3.8.2) + +if (CMAKE_VERSION MATCHES ".*MSVC.*") + message(FATAL_ERROR "CMake from MSVC kit is unfit! " + "Please use the original CMake from https://cmake.org/download/") +endif() + +if (NOT (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED)) + message(FATAL_ERROR "This module required C or C++ to be enabled") +endif() + +include(CMakeDependentOption) + +if(CMAKE_CXX_COMPILER_LOADED) + include(CheckCXXSourceRuns) + include(CheckCXXSourceCompiles) + include(CheckCXXCompilerFlag) +endif() +if(CMAKE_C_COMPILER_LOADED) + include(CheckCSourceRuns) + include(CheckCSourceCompiles) + include(CheckCCompilerFlag) +endif() + +# Check if the same compile family is used for both C and CXX +if(CMAKE_C_COMPILER_LOADED AND CMAKE_CXX_COMPILER_LOADED AND + NOT (CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID)) + message(WARNING "CMAKE_C_COMPILER_ID (${CMAKE_C_COMPILER_ID}) is different " + "from CMAKE_CXX_COMPILER_ID (${CMAKE_CXX_COMPILER_ID}).
" + "The final binary may be unusable.") +endif() + +if(CMAKE_CXX_COMPILER_LOADED) + set(CMAKE_PRIMARY_LANG "CXX") +else() + set(CMAKE_PRIMARY_LANG "C") +endif() + +macro(check_compiler_flag flag variable) + if(CMAKE_CXX_COMPILER_LOADED) + check_cxx_compiler_flag(${flag} ${variable}) + else() + check_c_compiler_flag(${flag} ${variable}) + endif() +endmacro(check_compiler_flag) + +# We support building with Clang and gcc. First check +# what we're using for build. +if(CMAKE_C_COMPILER_LOADED AND CMAKE_C_COMPILER_ID STREQUAL "Clang") + set(CMAKE_COMPILER_IS_CLANG ON) + set(CMAKE_COMPILER_IS_GNUCC OFF) +endif() +if(CMAKE_CXX_COMPILER_LOADED AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(CMAKE_COMPILER_IS_CLANG ON) + set(CMAKE_COMPILER_IS_GNUCXX OFF) +endif() + +# Hard coding the compiler version is ugly from cmake POV, but +# at least gives user a friendly error message. The most critical +# demand for C++ compiler is support of C++11 lambdas, added +# only in version 4.5 https://gcc.gnu.org/projects/cxx0x.html +if(CMAKE_COMPILER_IS_GNUCC) + if(CMAKE_C_COMPILER_VERSION VERSION_LESS 4.5) + message(FATAL_ERROR " + Your GCC version is ${CMAKE_C_COMPILER_VERSION}, please update") + endif() +endif() +if(CMAKE_COMPILER_IS_GNUCXX) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.5) + message(FATAL_ERROR " + Your G++ version is ${CMAKE_CXX_COMPILER_VERSION}, please update") + endif() +endif() + +if(CMAKE_C_COMPILER_LOADED) + # Check for Elbrus lcc + execute_process(COMMAND ${CMAKE_C_COMPILER} --version + OUTPUT_VARIABLE tmp_lcc_probe_version + RESULT_VARIABLE tmp_lcc_probe_result ERROR_QUIET) + if(tmp_lcc_probe_result EQUAL 0) + string(FIND "${tmp_lcc_probe_version}" "lcc:" tmp_lcc_marker) + string(FIND "${tmp_lcc_probe_version}" ":e2k-" tmp_e2k_marker) + if(tmp_lcc_marker GREATER -1 AND tmp_e2k_marker GREATER tmp_lcc_marker) + execute_process(COMMAND ${CMAKE_C_COMPILER} -print-version + OUTPUT_VARIABLE CMAKE_C_COMPILER_VERSION + RESULT_VARIABLE tmp_lcc_probe_result) +
set(CMAKE_COMPILER_IS_ELBRUSC ON) + set(CMAKE_C_COMPILER_ID "Elbrus") + else() + set(CMAKE_COMPILER_IS_ELBRUSC OFF) + endif() + unset(tmp_lcc_marker) + unset(tmp_e2k_marker) + endif() + unset(tmp_lcc_probe_version) + unset(tmp_lcc_probe_result) +endif() + +if(CMAKE_CXX_COMPILER_LOADED) + # Check for Elbrus l++ + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version + OUTPUT_VARIABLE tmp_lxx_probe_version + RESULT_VARIABLE tmp_lxx_probe_result ERROR_QUIET) + if(tmp_lxx_probe_result EQUAL 0) + string(FIND "${tmp_lxx_probe_version}" "lcc:" tmp_lcc_marker) + string(FIND "${tmp_lxx_probe_version}" ":e2k-" tmp_e2k_marker) + if(tmp_lcc_marker GREATER -1 AND tmp_e2k_marker GREATER tmp_lcc_marker) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -print-version + OUTPUT_VARIABLE CMAKE_CXX_COMPILER_VERSION + RESULT_VARIABLE tmp_lxx_probe_result) + set(CMAKE_COMPILER_IS_ELBRUSCXX ON) + set(CMAKE_CXX_COMPILER_ID "Elbrus") + else() + set(CMAKE_COMPILER_IS_ELBRUSCXX OFF) + endif() + unset(tmp_lcc_marker) + unset(tmp_e2k_marker) + endif() + unset(tmp_lxx_probe_version) + unset(tmp_lxx_probe_result) +endif() + +if(CMAKE_CL_64) + set(MSVC64 1) +endif() +if(WIN32 AND CMAKE_COMPILER_IS_GNU${CMAKE_PRIMARY_LANG}) + execute_process(COMMAND ${CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER} -dumpmachine + OUTPUT_VARIABLE __GCC_TARGET_MACHINE + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(__GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") + set(MINGW64 1) + endif() + unset(__GCC_TARGET_MACHINE) +endif() + +if(CMAKE_COMPILER_IS_ELBRUSC OR CMAKE_SYSTEM_PROCESSOR MATCHES "e2k.*|E2K.*|elbrus.*|ELBRUS.*") + set(E2K TRUE) + set(CMAKE_SYSTEM_ARCH "Elbrus") +elseif((MSVC64 OR MINGW64) AND CMAKE_SIZEOF_VOID_P EQUAL 8) + set(X86_64 TRUE) + set(CMAKE_SYSTEM_ARCH "x86_64") +elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) + set(X86_32 TRUE) + set(CMAKE_SYSTEM_ARCH "x86") +elseif(CMAKE_COMPILER_IS_ELBRUSC OR CMAKE_SYSTEM_PROCESSOR MATCHES "e2k.*|E2K.*|elbrus.*|ELBRUS.*") + set(E2K TRUE) + set(CMAKE_SYSTEM_ARCH 
"Elbrus") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" AND CMAKE_SIZEOF_VOID_P EQUAL 8) + set(X86_64 TRUE) + set(CMAKE_SYSTEM_ARCH "x86_64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*") + set(X86_32 TRUE) + set(CMAKE_SYSTEM_ARCH "x86") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|ARM64.*)" AND CMAKE_SIZEOF_VOID_P EQUAL 8) + set(AARCH64 TRUE) + set(CMAKE_SYSTEM_ARCH "ARM64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(ARM32 TRUE) + set(CMAKE_SYSTEM_ARCH "ARM") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le.*" AND CMAKE_SIZEOF_VOID_P EQUAL 8) + set(PPC64LE TRUE) + set(CMAKE_SYSTEM_ARCH "PPC64LE") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64.*" AND CMAKE_SIZEOF_VOID_P EQUAL 8) + set(PPC64 TRUE) + set(CMAKE_SYSTEM_ARCH "PPC64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc).*") + set(PPC32 TRUE) + set(CMAKE_SYSTEM_ARCH "PPC") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips|MIPS)64.*" AND CMAKE_SIZEOF_VOID_P EQUAL 8) + set(MIPS64 TRUE) + set(CMAKE_SYSTEM_ARCH "MIPS64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips|MIPS).*") + set(MIPS32 TRUE) + set(CMAKE_SYSTEM_ARCH "MIPS") +endif() + +if(MSVC) + check_compiler_flag("/WX" CC_HAS_WERROR) +else() + # + # GCC started to warn for unused result starting from 4.2, and + # this is when it introduced -Wno-unused-result + # GCC can also be built on top of llvm runtime (on mac). 
+ check_compiler_flag("-Wno-unknown-pragmas" CC_HAS_WNO_UNKNOWN_PRAGMAS) + check_compiler_flag("-Wextra" CC_HAS_WEXTRA) + check_compiler_flag("-Werror" CC_HAS_WERROR) + check_compiler_flag("-fexceptions" CC_HAS_FEXCEPTIONS) + # check_cxx_* commands are only available when the CXX language is enabled + if(CMAKE_CXX_COMPILER_LOADED) + check_cxx_compiler_flag("-fcxx-exceptions" CC_HAS_FCXX_EXCEPTIONS) + endif() + check_compiler_flag("-funwind-tables" CC_HAS_FUNWIND_TABLES) + check_compiler_flag("-fno-omit-frame-pointer" CC_HAS_FNO_OMIT_FRAME_POINTER) + check_compiler_flag("-fno-common" CC_HAS_FNO_COMMON) + check_compiler_flag("-ggdb" CC_HAS_GGDB) + check_compiler_flag("-fvisibility=hidden" CC_HAS_VISIBILITY) + check_compiler_flag("-march=native" CC_HAS_ARCH_NATIVE) + check_compiler_flag("-Og" CC_HAS_DEBUG_FRENDLY_OPTIMIZATION) + check_compiler_flag("-Wall" CC_HAS_WALL) + check_compiler_flag("-Ominimal" CC_HAS_OMINIMAL) + check_compiler_flag("-ffunction-sections -fdata-sections" CC_HAS_SECTIONS) + check_compiler_flag("-ffast-math" CC_HAS_FASTMATH) + + # Check for an omp support (probe with whichever language is enabled) + set(CMAKE_REQUIRED_FLAGS "-fopenmp -Werror") + if(CMAKE_CXX_COMPILER_LOADED) + check_cxx_source_compiles("int main(void) { + #pragma omp parallel + return 0; + }" HAVE_OPENMP) + else() + check_c_source_compiles("int main(void) { + #pragma omp parallel + return 0; + }" HAVE_OPENMP) + endif() + set(CMAKE_REQUIRED_FLAGS "") +endif() + +# Check for LTO support by GCC +if(CMAKE_COMPILER_IS_GNU${CMAKE_PRIMARY_LANG}) + unset(gcc_collect) + unset(gcc_lto_wrapper) + + if(NOT CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_VERSION VERSION_LESS 4.7) + execute_process(COMMAND ${CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER} -v + OUTPUT_VARIABLE gcc_info_v ERROR_VARIABLE gcc_info_v) + + string(REGEX MATCH "^(.+\nCOLLECT_GCC=)([^ \n]+)(\n.+)$" gcc_collect_valid ${gcc_info_v}) + if(gcc_collect_valid) + string(REGEX REPLACE "^(.+\nCOLLECT_GCC=)([^ \n]+)(\n.+)$" "\\2" gcc_collect ${gcc_info_v}) + endif() + + string(REGEX MATCH "^(.+\nCOLLECT_LTO_WRAPPER=)([^ \n]+/lto-wrapper)(\n.+)$" gcc_lto_wrapper_valid ${gcc_info_v}) + if(gcc_lto_wrapper_valid) + string(REGEX REPLACE "^(.+\nCOLLECT_LTO_WRAPPER=)([^ \n]+/lto-wrapper)(\n.+)$" "\\2" gcc_lto_wrapper ${gcc_info_v}) + endif() + + set(gcc_suffix 
"") + if(gcc_collect_valid AND gcc_collect) + string(REGEX MATCH "^(.*cc)(-.+)$" gcc_suffix_valid ${gcc_collect}) + if(gcc_suffix_valid) + # Extract the trailing "-<suffix>" of the compiler name; it is used + # below to locate the matching gcc-ar/gcc-nm/gcc-ranlib. + string(REGEX REPLACE "^(.*cc)(-.+)$" "\\2" gcc_suffix ${gcc_collect}) + endif() + endif() + + get_filename_component(gcc_dir ${CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER} DIRECTORY) + if(NOT CMAKE_GCC_AR) + find_program(CMAKE_GCC_AR NAMES gcc${gcc_suffix}-ar gcc-ar${gcc_suffix} PATHS ${gcc_dir} NO_DEFAULT_PATH) + endif() + if(NOT CMAKE_GCC_NM) + find_program(CMAKE_GCC_NM NAMES gcc${gcc_suffix}-nm gcc-nm${gcc_suffix} PATHS ${gcc_dir} NO_DEFAULT_PATH) + endif() + if(NOT CMAKE_GCC_RANLIB) + find_program(CMAKE_GCC_RANLIB NAMES gcc${gcc_suffix}-ranlib gcc-ranlib${gcc_suffix} PATHS ${gcc_dir} NO_DEFAULT_PATH) + endif() + + unset(gcc_dir) + unset(gcc_suffix_valid) + unset(gcc_suffix) + unset(gcc_lto_wrapper_valid) + unset(gcc_collect_valid) + unset(gcc_collect) + unset(gcc_info_v) + endif() + + if(CMAKE_GCC_AR AND CMAKE_GCC_NM AND CMAKE_GCC_RANLIB AND gcc_lto_wrapper) + message(STATUS "Found GCC's LTO toolset: ${gcc_lto_wrapper}, ${CMAKE_GCC_AR}, ${CMAKE_GCC_RANLIB}") + set(GCC_LTO_CFLAGS "-flto -fno-fat-lto-objects -fuse-linker-plugin") + set(GCC_LTO_AVAILABLE TRUE) + message(STATUS "Link-Time Optimization by GCC is available") + else() + set(GCC_LTO_AVAILABLE FALSE) + message(STATUS "Link-Time Optimization by GCC is NOT available") + endif() + unset(gcc_lto_wrapper) +endif() + +# check for LTO by MSVC +if(MSVC) + if(NOT MSVC_VERSION LESS 1600) + set(MSVC_LTO_AVAILABLE TRUE) + message(STATUS "Link-Time Optimization by MSVC is available") + else() + set(MSVC_LTO_AVAILABLE FALSE) + message(STATUS "Link-Time Optimization by MSVC is NOT available") + endif() +endif() + +# Check for LTO support by CLANG +if(CMAKE_COMPILER_IS_CLANG) + if(NOT CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_VERSION VERSION_LESS 3.5) + execute_process(COMMAND ${CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER} -print-search-dirs + OUTPUT_VARIABLE clang_search_dirs) + + unset(clang_bindir) + 
unset(clang_libdir) + string(REGEX MATCH "^(.*programs: =)([^:]*:)*([^:]+/llvm[-.0-9]+/bin[^:]*)(:[^:]*)*(\n.+)$" clang_bindir_valid ${clang_search_dirs}) + if(clang_bindir_valid) + string(REGEX REPLACE "^(.*programs: =)([^:]*:)*([^:]+/llvm[-.0-9]+/bin[^:]*)(:[^:]*)*(\n.+)$" "\\3" clang_bindir ${clang_search_dirs}) + get_filename_component(clang_libdir "${clang_bindir}/../lib" REALPATH) + if(clang_libdir) + message(STATUS "Found CLANG/LLVM directories: ${clang_bindir}, ${clang_libdir}") + endif() + endif() + + if(NOT (clang_bindir AND clang_libdir)) + message(STATUS "Could NOT find CLANG/LLVM directories (bin and/or lib).") + endif() + + if(NOT CMAKE_CLANG_LD AND clang_bindir) + find_program(CMAKE_CLANG_LD NAMES llvm-link link llvm-ld ld PATHS ${clang_bindir} NO_DEFAULT_PATH) + endif() + if(NOT CMAKE_CLANG_AR AND clang_bindir) + find_program(CMAKE_CLANG_AR NAMES llvm-ar ar PATHS ${clang_bindir} NO_DEFAULT_PATH) + endif() + if(NOT CMAKE_CLANG_NM AND clang_bindir) + find_program(CMAKE_CLANG_NM NAMES llvm-nm nm PATHS ${clang_bindir} NO_DEFAULT_PATH) + endif() + if(NOT CMAKE_CLANG_RANLIB AND clang_bindir) + find_program(CMAKE_CLANG_RANLIB NAMES llvm-ranlib ranlib PATHS ${clang_bindir} NO_DEFAULT_PATH) + endif() + + set(clang_lto_plugin_name "LLVMgold${CMAKE_SHARED_LIBRARY_SUFFIX}") + if(NOT CMAKE_LD_GOLD AND clang_bindir) + find_program(CMAKE_LD_GOLD NAMES ld.gold PATHS) + endif() + if(NOT CLANG_LTO_PLUGIN AND clang_libdir) + # PATHS is the documented keyword for the list of directories to search; + # together with NO_DEFAULT_PATH only the LLVM lib directory is probed. + find_file(CLANG_LTO_PLUGIN ${clang_lto_plugin_name} PATHS ${clang_libdir} NO_DEFAULT_PATH) + endif() + if(CLANG_LTO_PLUGIN) + message(STATUS "Found CLANG/LLVM's plugin for LTO: ${CLANG_LTO_PLUGIN}") + else() + message(STATUS "Could NOT find CLANG/LLVM's plugin (${clang_lto_plugin_name}) for LTO.") + endif() + + if(CMAKE_CLANG_LD AND CMAKE_CLANG_AR AND CMAKE_CLANG_NM AND CMAKE_CLANG_RANLIB) + message(STATUS "Found CLANG/LLVM's binutils for LTO: ${CMAKE_CLANG_AR}, ${CMAKE_CLANG_RANLIB}") + else() + message(STATUS "Could NOT find CLANG/LLVM's binutils 
(ar, ranlib, nm) for LTO.") + endif() + + unset(clang_lto_plugin_name) + unset(clang_libdir) + unset(clang_bindir_valid) + unset(clang_bindir) + unset(clang_search_dirs) + endif() + + if((CLANG_LTO_PLUGIN AND CMAKE_LD_GOLD) AND + (CMAKE_CLANG_LD AND CMAKE_CLANG_AR AND CMAKE_CLANG_NM AND CMAKE_CLANG_RANLIB)) + set(CLANG_LTO_AVAILABLE TRUE) + message(STATUS "Link-Time Optimization by CLANG/LLVM is available") + elseif(CMAKE_TOOLCHAIN_FILE AND NOT CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_VERSION VERSION_LESS 7.0) + set(CLANG_LTO_AVAILABLE TRUE) + # Fall back to the generic CMAKE_AR/CMAKE_NM/CMAKE_RANLIB for any + # binutils that were not located above. + if (NOT CMAKE_CLANG_AR) + set(CMAKE_CLANG_AR ${CMAKE_AR}) + endif() + if (NOT CMAKE_CLANG_NM) + set(CMAKE_CLANG_NM ${CMAKE_NM}) + endif() + if (NOT CMAKE_CLANG_RANLIB) + set(CMAKE_CLANG_RANLIB ${CMAKE_RANLIB}) + endif() + message(STATUS "Assume Link-Time Optimization by CLANG/LLVM is available via ${CMAKE_TOOLCHAIN_FILE}") + else() + set(CLANG_LTO_AVAILABLE FALSE) + message(STATUS "Link-Time Optimization by CLANG/LLVM is NOT available") + endif() +endif() + +# Perform build type specific configuration. +option(ENABLE_BACKTRACE "Enable output of fiber backtrace information in 'show + fiber' administrative command. Only works on x86 architectures, if compiled + with gcc. If GNU binutils and binutils-dev libraries are installed, backtrace + is output with resolved function (symbol) names. Otherwise only frame + addresses are printed." OFF) + +set(HAVE_BFD False) +if(ENABLE_BACKTRACE) + if(NOT (X86_32 OR X86_64) OR NOT CMAKE_COMPILER_IS_GNU${CMAKE_PRIMARY_LANG}) + # We only know this option to work with gcc + message(FATAL_ERROR "ENABLE_BACKTRACE option is set but the system + is not x86 based (${CMAKE_SYSTEM_PROCESSOR}) or the compiler + is not GNU GCC (${CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER}).") + endif() + # Use GNU bfd if present. 
+ find_library(BFD_LIBRARY NAMES libbfd.a) + if(BFD_LIBRARY) + check_library_exists(${BFD_LIBRARY} bfd_init "" HAVE_BFD_LIB) + endif() + find_library(IBERTY_LIBRARY NAMES libiberty.a) + if(IBERTY_LIBRARY) + check_library_exists(${IBERTY_LIBRARY} cplus_demangle "" HAVE_IBERTY_LIB) + endif() + set(CMAKE_REQUIRED_DEFINITIONS -DPACKAGE=${PACKAGE} -DPACKAGE_VERSION=${PACKAGE_VERSION}) + check_include_files(bfd.h HAVE_BFD_H) + set(CMAKE_REQUIRED_DEFINITIONS) + find_package(ZLIB) + if(HAVE_BFD_LIB AND HAVE_BFD_H AND HAVE_IBERTY_LIB AND ZLIB_FOUND) + set(HAVE_BFD ON) + set(BFD_LIBRARIES ${BFD_LIBRARY} ${IBERTY_LIBRARY} ${ZLIB_LIBRARIES}) + find_package_message(BFD_LIBRARIES "Found libbfd and dependencies" + ${BFD_LIBRARIES}) + if(TARGET_OS_FREEBSD AND NOT TARGET_OS_DEBIAN_FREEBSD) + set(BFD_LIBRARIES ${BFD_LIBRARIES} iconv) + endif() + endif() +endif() + +macro(setup_compile_flags) + # LY: save initial C/CXX flags + if(NOT INITIAL_CMAKE_FLAGS_SAVED) + if(MSVC) + string(REGEX REPLACE "^(.*)(/EHsc)( *)(.*)$" "\\1/EHs\\3\\4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + endif() + set(INITIAL_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "Initial CMake's flags" FORCE) + set(INITIAL_CMAKE_C_FLAGS ${CMAKE_C_FLAGS} CACHE STRING "Initial CMake's flags" FORCE) + set(INITIAL_CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} CACHE STRING "Initial CMake's flags" FORCE) + set(INITIAL_CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} CACHE STRING "Initial CMake's flags" FORCE) + set(INITIAL_CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS} CACHE STRING "Initial CMake's flags" FORCE) + set(INITIAL_CMAKE_MODULE_LINKER_FLAGS ${CMAKE_MODULE_LINKER_FLAGS} CACHE STRING "Initial CMake's flags" FORCE) + set(INITIAL_CMAKE_FLAGS_SAVED TRUE CACHE INTERNAL "State of initial CMake's flags" FORCE) + endif() + + # LY: reset C/CXX flags + set(CXX_FLAGS ${INITIAL_CMAKE_CXX_FLAGS}) + set(C_FLAGS ${INITIAL_CMAKE_C_FLAGS}) + set(EXE_LINKER_FLAGS ${INITIAL_CMAKE_EXE_LINKER_FLAGS}) + 
set(SHARED_LINKER_FLAGS ${INITIAL_CMAKE_SHARED_LINKER_FLAGS}) + set(STATIC_LINKER_FLAGS ${INITIAL_CMAKE_STATIC_LINKER_FLAGS}) + set(MODULE_LINKER_FLAGS ${INITIAL_CMAKE_MODULE_LINKER_FLAGS}) + + if(CC_HAS_FEXCEPTIONS) + add_compile_flags("C;CXX" "-fexceptions") + endif() + if(CC_HAS_FCXX_EXCEPTIONS) + add_compile_flags("CXX" "-fcxx-exceptions -frtti") + endif() + + # In C a global variable without a storage specifier (static/extern) and + # without an initialiser is called a ’tentative definition’. The + # language permits multiple tentative definitions in the single + # translation unit; i.e. int foo; int foo; is perfectly ok. GNU + # toolchain goes even further, allowing multiple tentative definitions + # in *different* translation units. Internally, variables introduced via + # tentative definitions are implemented as ‘common’ symbols. Linker + # permits multiple definitions if they are common symbols, and it picks + # one arbitrarily for inclusion in the binary being linked. + # + # -fno-common forces GNU toolchain to behave in a more + # standard-conformant way in respect to tentative definitions and it + # prevents common symbols generation. Since we are a cross-platform + # project it really makes sense. There are toolchains that don’t + # implement GNU style handling of the tentative definitions and there + # are platforms lacking proper support for common symbols (osx). + if(CC_HAS_FNO_COMMON) + add_compile_flags("C;CXX" "-fno-common") + endif() + + if(CC_HAS_GGDB) + add_compile_flags("C;CXX" "-ggdb") + endif() + + if(CC_HAS_WNO_UNKNOWN_PRAGMAS AND NOT HAVE_OPENMP) + add_compile_flags("C;CXX" -Wno-unknown-pragmas) + endif() + + if(CC_HAS_SECTIONS) + add_compile_flags("C;CXX" -ffunction-sections -fdata-sections) + elseif(MSVC) + add_compile_flags("C;CXX" /Gy) + endif() + + # We must set -fno-omit-frame-pointer here, since we rely + # on frame pointer when getting a backtrace, and it must + # be used consistently across all object files. 
+ # The same reasoning applies to -fno-stack-protector switch. + if(ENABLE_BACKTRACE) + if(CC_HAS_FNO_OMIT_FRAME_POINTER) + add_compile_flags("C;CXX" "-fno-omit-frame-pointer") + endif() + endif() + + if(MSVC) + if (MSVC_VERSION LESS 1900) + message(FATAL_ERROR "At least \"Microsoft C/C++ Compiler\" version 19.0.24234.1 (Visual Studio 2015 Update 3) is required.") + endif() + add_compile_flags("CXX" "/Zc:__cplusplus") + add_compile_flags("C;CXX" "/W4") + add_compile_flags("C;CXX" "/utf-8") + else() + if(CC_HAS_WALL) + add_compile_flags("C;CXX" "-Wall") + endif() + if(CC_HAS_WEXTRA) + add_compile_flags("C;CXX" "-Wextra") + endif() + endif() + + if(CMAKE_COMPILER_IS_GNU${CMAKE_PRIMARY_LANG} + AND CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_VERSION VERSION_LESS 5) + # G++ bug. http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31488 + add_compile_flags("CXX" "-Wno-invalid-offsetof") + endif() + + add_definitions("-D__STDC_FORMAT_MACROS=1") + add_definitions("-D__STDC_LIMIT_MACROS=1") + add_definitions("-D__STDC_CONSTANT_MACROS=1") + add_definitions("-D_HAS_EXCEPTIONS=1") + + # Only add -Werror if it's a debug build, done by developers. + # Release builds should not cause extra trouble. 
+ if(CC_HAS_WERROR AND (CI OR CMAKE_CONFIGURATION_TYPES OR CMAKE_BUILD_TYPE STREQUAL "Debug")) + if(MSVC) + add_compile_flags("C;CXX" "/WX") + elseif(CMAKE_COMPILER_IS_CLANG) + if (NOT CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_VERSION VERSION_LESS 6) + add_compile_flags("C;CXX" "-Werror") + endif() + elseif(CMAKE_COMPILER_IS_GNUCC) + if (NOT CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_VERSION VERSION_LESS 6) + add_compile_flags("C;CXX" "-Werror") + endif() + else() + add_compile_flags("C;CXX" "-Werror") + endif() + endif() + + if(HAVE_OPENMP) + add_compile_flags("C;CXX" "-fopenmp") + endif() + + if (ENABLE_ASAN) + add_compile_flags("C;CXX" -fsanitize=address) + endif() + + if(ENABLE_GCOV) + if(NOT HAVE_GCOV) + message(FATAL_ERROR + "ENABLE_GCOV option requested but gcov library is not found") + endif() + + add_compile_flags("C;CXX" "-fprofile-arcs" "-ftest-coverage") + set(EXE_LINKER_FLAGS "${EXE_LINKER_FLAGS} -fprofile-arcs -ftest-coverage") + set(SHARED_LINKER_FLAGS "${SHARED_LINKER_FLAGS} -fprofile-arcs -ftest-coverage") + set(MODULE_LINKER_FLAGS "${MODULE_LINKER_FLAGS} -fprofile-arcs -ftest-coverage") + # add_library(gcov SHARED IMPORTED) + endif() + + if(ENABLE_GPROF) + add_compile_flags("C;CXX" "-pg") + endif() + + if(CMAKE_COMPILER_IS_GNUCC AND LTO_ENABLED) + add_compile_flags("C;CXX" ${GCC_LTO_CFLAGS}) + set(EXE_LINKER_FLAGS "${EXE_LINKER_FLAGS} ${GCC_LTO_CFLAGS} -fverbose-asm -fwhole-program") + set(SHARED_LINKER_FLAGS "${SHARED_LINKER_FLAGS} ${GCC_LTO_CFLAGS} -fverbose-asm") + set(MODULE_LINKER_FLAGS "${MODULE_LINKER_FLAGS} ${GCC_LTO_CFLAGS} -fverbose-asm") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5) + # Pass the same optimization flags to the linker + set(compile_flags "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPERCASE}}") + set(EXE_LINKER_FLAGS "${EXE_LINKER_FLAGS} ${compile_flags}") + set(SHARED_LINKER_FLAGS "${SHARED_LINKER_FLAGS} ${compile_flags}") + set(MODULE_LINKER_FLAGS "${MODULE_LINKER_FLAGS} ${compile_flags}") + unset(compile_flags) + 
else() + add_compile_flags("CXX" "-flto-odr-type-merging") + endif() + endif() + + if(MSVC AND LTO_ENABLED) + add_compile_flags("C;CXX" "/GL") + foreach(linkmode IN ITEMS EXE SHARED STATIC MODULE) + set(${linkmode}_LINKER_FLAGS "${${linkmode}_LINKER_FLAGS} /LTCG") + string(REGEX REPLACE "^(.*)(/INCREMENTAL:NO *)(.*)$" "\\1\\3" ${linkmode}_LINKER_FLAGS "${${linkmode}_LINKER_FLAGS}") + string(REGEX REPLACE "^(.*)(/INCREMENTAL:YES *)(.*)$" "\\1\\3" ${linkmode}_LINKER_FLAGS "${${linkmode}_LINKER_FLAGS}") + string(REGEX REPLACE "^(.*)(/INCREMENTAL *)(.*)$" "\\1\\3" ${linkmode}_LINKER_FLAGS "${${linkmode}_LINKER_FLAGS}") + string(STRIP "${${linkmode}_LINKER_FLAGS}" ${linkmode}_LINKER_FLAGS) + foreach(config IN LISTS CMAKE_CONFIGURATION_TYPES ITEMS Release MinSizeRel RelWithDebInfo Debug) + string(TOUPPER "${config}" config_uppercase) + if(DEFINED "CMAKE_${linkmode}_LINKER_FLAGS_${config_uppercase}") + string(REGEX REPLACE "^(.*)(/INCREMENTAL:NO *)(.*)$" "\\1\\3" altered_flags "${CMAKE_${linkmode}_LINKER_FLAGS_${config_uppercase}}") + string(REGEX REPLACE "^(.*)(/INCREMENTAL:YES *)(.*)$" "\\1\\3" altered_flags "${altered_flags}") + string(REGEX REPLACE "^(.*)(/INCREMENTAL *)(.*)$" "\\1\\3" altered_flags "${altered_flags}") + string(STRIP "${altered_flags}" altered_flags) + if(NOT "${altered_flags}" STREQUAL "${CMAKE_${linkmode}_LINKER_FLAGS_${config_uppercase}}") + set(CMAKE_${linkmode}_LINKER_FLAGS_${config_uppercase} "${altered_flags}" CACHE STRING "Altered: '/INCREMENTAL' removed for LTO" FORCE) + endif() + endif() + endforeach(config) + endforeach(linkmode) + unset(linkmode) + + foreach(config IN LISTS CMAKE_CONFIGURATION_TYPES ITEMS Release MinSizeRel RelWithDebInfo) + foreach(lang IN ITEMS C CXX) + string(TOUPPER "${config}" config_uppercase) + if(DEFINED "CMAKE_${lang}_FLAGS_${config_uppercase}") + string(REPLACE "/O2" "/Ox" altered_flags "${CMAKE_${lang}_FLAGS_${config_uppercase}}") + if(NOT "${altered_flags}" STREQUAL "${CMAKE_${lang}_FLAGS_${config_uppercase}}") 
+ set(CMAKE_${lang}_FLAGS_${config_uppercase} "${altered_flags}" CACHE STRING "Altered: '/O2' replaced by '/Ox' for LTO" FORCE) + endif() + endif() + unset(config_uppercase) + endforeach(lang) + endforeach(config) + unset(altered_flags) + unset(lang) + unset(config) + endif() + + if(CMAKE_COMPILER_IS_CLANG AND OSX_ARCHITECTURES) + set(EXE_LINKER_FLAGS "${EXE_LINKER_FLAGS} -Wl,-keep_dwarf_unwind") + set(SHARED_LINKER_FLAGS "${SHARED_LINKER_FLAGS} -Wl,-keep_dwarf_unwind") + set(MODULE_LINKER_FLAGS "${MODULE_LINKER_FLAGS} -Wl,-keep_dwarf_unwind") + endif() + + if(CMAKE_COMPILER_IS_CLANG AND LTO_ENABLED) + if(CMAKE_${CMAKE_PRIMARY_LANG}_COMPILER_VERSION VERSION_LESS 3.9) + set(CLANG_LTO_FLAG "-flto") + else() + set(CLANG_LTO_FLAG "-flto=thin") + endif() + add_compile_flags("C;CXX" ${CLANG_LTO_FLAG}) + set(EXE_LINKER_FLAGS "${EXE_LINKER_FLAGS} ${CLANG_LTO_FLAG} -fverbose-asm -fwhole-program") + set(SHARED_LINKER_FLAGS "${SHARED_LINKER_FLAGS} ${CLANG_LTO_FLAG} -fverbose-asm") + set(MODULE_LINKER_FLAGS "${MODULE_LINKER_FLAGS} ${CLANG_LTO_FLAG} -fverbose-asm") + endif() + + # LY: push C/CXX flags into the cache + set(CMAKE_CXX_FLAGS ${CXX_FLAGS} CACHE STRING "Flags used by the C++ compiler during all build types" FORCE) + set(CMAKE_C_FLAGS ${C_FLAGS} CACHE STRING "Flags used by the C compiler during all build types" FORCE) + set(CMAKE_EXE_LINKER_FLAGS ${EXE_LINKER_FLAGS} CACHE STRING "Flags used by the linker" FORCE) + set(CMAKE_SHARED_LINKER_FLAGS ${SHARED_LINKER_FLAGS} CACHE STRING "Flags used by the linker during the creation of dll's" FORCE) + set(CMAKE_STATIC_LINKER_FLAGS ${STATIC_LINKER_FLAGS} CACHE STRING "Flags used by the linker during the creation of static libraries" FORCE) + set(CMAKE_MODULE_LINKER_FLAGS ${MODULE_LINKER_FLAGS} CACHE STRING "Flags used by the linker during the creation of modules" FORCE) + unset(CXX_FLAGS) + unset(C_FLAGS) + unset(EXE_LINKER_FLAGS) + unset(SHARED_LINKER_FLAGS) + unset(STATIC_LINKER_FLAGS) + unset(MODULE_LINKER_FLAGS) 
+endmacro(setup_compile_flags) + +# determine library for std::filesystem +set(LIBCXX_FILESYSTEM "") +if(CMAKE_COMPILER_IS_GNUCXX) + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) + set(LIBCXX_FILESYSTEM "stdc++fs") + endif() +elseif (CMAKE_COMPILER_IS_CLANG) + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(LIBCXX_FILESYSTEM "c++experimental") + else() + set(LIBCXX_FILESYSTEM "stdc++fs") + endif() +endif() + +cmake_policy(POP) diff --git a/libs/libmdbx/src/cmake/profile.cmake b/libs/libmdbx/src/cmake/profile.cmake new file mode 100644 index 0000000000..6507e8dd83 --- /dev/null +++ b/libs/libmdbx/src/cmake/profile.cmake @@ -0,0 +1,45 @@ +## Copyright (c) 2012-2019 Leonid Yuriev <leo@yuriev.ru>. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## + +cmake_minimum_required(VERSION 3.8.2) +cmake_policy(PUSH) +cmake_policy(VERSION 3.8.2) + +include(CheckLibraryExists) +check_library_exists(gcov __gcov_flush "" HAVE_GCOV) + +option(ENABLE_GCOV + "Enable integration with gcov, a code coverage program" OFF) + +option(ENABLE_GPROF + "Enable integration with gprof, a performance analyzing tool" OFF) + +if(CMAKE_CXX_COMPILER_LOADED) + include(CheckIncludeFileCXX) + check_include_file_cxx(valgrind/memcheck.h HAVE_VALGRIND_MEMCHECK_H) +else() + include(CheckIncludeFile) + check_include_file(valgrind/memcheck.h HAVE_VALGRIND_MEMCHECK_H) +endif() + +option(MDBX_USE_VALGRIND "Enable integration with valgrind, a memory analyzing tool" OFF) +if(MDBX_USE_VALGRIND AND NOT HAVE_VALGRIND_MEMCHECK_H) + message(FATAL_ERROR "MDBX_USE_VALGRIND option is set but valgrind/memcheck.h is not found") +endif() + +option(ENABLE_ASAN + "Enable AddressSanitizer, a fast memory error detector based on compiler instrumentation" OFF) + +cmake_policy(POP) diff --git a/libs/libmdbx/src/cmake/utils.cmake b/libs/libmdbx/src/cmake/utils.cmake new file mode 100644 index 0000000000..c31f53cd3e --- /dev/null +++ b/libs/libmdbx/src/cmake/utils.cmake @@ -0,0 +1,183 @@ +## Copyright (c) 2012-2019 Leonid Yuriev <leo@yuriev.ru>. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. 
+## + +cmake_minimum_required(VERSION 3.8.2) +cmake_policy(PUSH) +cmake_policy(VERSION 3.8.2) + +macro(add_compile_flags langs) + foreach(_lang ${langs}) + string(REPLACE ";" " " _flags "${ARGN}") + if(CMAKE_CXX_COMPILER_LOADED AND _lang STREQUAL "CXX") + set("${_lang}_FLAGS" "${${_lang}_FLAGS} ${_flags}") + endif() + if(CMAKE_C_COMPILER_LOADED AND _lang STREQUAL "C") + set("${_lang}_FLAGS" "${${_lang}_FLAGS} ${_flags}") + endif() + endforeach() + unset(_lang) + unset(_flags) +endmacro(add_compile_flags) + +macro(set_source_files_compile_flags) + foreach(file ${ARGN}) + get_filename_component(_file_ext ${file} EXT) + set(_lang "") + if("${_file_ext}" STREQUAL ".m") + set(_lang OBJC) + # CMake believes that Objective C is a flavor of C++, not C, + # and uses g++ compiler for .m files. + # LANGUAGE property forces CMake to use CC for ${file} + set_source_files_properties(${file} PROPERTIES LANGUAGE C) + elseif("${_file_ext}" STREQUAL ".mm") + set(_lang OBJCXX) + endif() + + if(_lang) + get_source_file_property(_flags ${file} COMPILE_FLAGS) + if("${_flags}" STREQUAL "NOTFOUND") + set(_flags "${CMAKE_${_lang}_FLAGS}") + else() + set(_flags "${_flags} ${CMAKE_${_lang}_FLAGS}") + endif() + # message(STATUS "Set (${file} ${_flags}") + set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS + "${_flags}") + endif() + endforeach() + unset(_file_ext) + unset(_lang) +endmacro(set_source_files_compile_flags) + +macro(fetch_version name version_file) + set(${name}_VERSION "") + set(${name}_GIT_DESCRIBE "") + set(${name}_GIT_TIMESTAMP "") + set(${name}_GIT_TREE "") + set(${name}_GIT_COMMIT "") + set(${name}_GIT_REVISION 0) + set(${name}_GIT_VERSION "") + if(GIT) + # NOTE: the captured outputs are dereferenced explicitly below + # ("${${name}_GIT_...}"), because if() does not implicitly expand + # quoted arguments under policy CMP0054. + execute_process(COMMAND ${GIT} describe --tags --long --dirty=-dirty + OUTPUT_VARIABLE ${name}_GIT_DESCRIBE + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE rc) + if(rc OR "${${name}_GIT_DESCRIBE}" STREQUAL "") + message(FATAL_ERROR "Please fetch tags and/or install 
latest version of git ('describe --tags --long --dirty' failed)") + endif() + + execute_process(COMMAND ${GIT} show --no-patch --format=%cI HEAD + OUTPUT_VARIABLE ${name}_GIT_TIMESTAMP + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE rc) + if(rc OR "${${name}_GIT_TIMESTAMP}" STREQUAL "%cI") + execute_process(COMMAND ${GIT} show --no-patch --format=%ci HEAD + OUTPUT_VARIABLE ${name}_GIT_TIMESTAMP + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE rc) + if(rc OR "${${name}_GIT_TIMESTAMP}" STREQUAL "%ci") + message(FATAL_ERROR "Please install latest version of git ('show --no-patch --format=%cI HEAD' failed)") + endif() + endif() + + execute_process(COMMAND ${GIT} show --no-patch --format=%T HEAD + OUTPUT_VARIABLE ${name}_GIT_TREE + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE rc) + if(rc OR "${${name}_GIT_TREE}" STREQUAL "") + message(FATAL_ERROR "Please install latest version of git ('show --no-patch --format=%T HEAD' failed)") + endif() + + execute_process(COMMAND ${GIT} show --no-patch --format=%H HEAD + OUTPUT_VARIABLE ${name}_GIT_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE rc) + if(rc OR "${${name}_GIT_COMMIT}" STREQUAL "") + message(FATAL_ERROR "Please install latest version of git ('show --no-patch --format=%H HEAD' failed)") + endif() + + execute_process(COMMAND ${GIT} rev-list --count --no-merges HEAD + OUTPUT_VARIABLE ${name}_GIT_REVISION + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE rc) + if(rc OR "${${name}_GIT_REVISION}" STREQUAL "") + message(FATAL_ERROR "Please install latest version of git ('rev-list --count --no-merges HEAD' failed)") + endif() + + string(REGEX MATCH "^(v)?([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)?" 
git_version_valid "${${name}_GIT_DESCRIBE}") + if(git_version_valid) + string(REGEX REPLACE "^(v)?([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)?" "\\2;\\3;\\4" ${name}_GIT_VERSION ${${name}_GIT_DESCRIBE}) + else() + string(REGEX MATCH "^(v)?([0-9]+)\\.([0-9]+)(.*)?" git_version_valid "${${name}_GIT_DESCRIBE}") + if(git_version_valid) + string(REGEX REPLACE "^(v)?([0-9]+)\\.([0-9]+)(.*)?" "\\2;\\3;0" ${name}_GIT_VERSION ${${name}_GIT_DESCRIBE}) + else() + message(AUTHOR_WARNING "Bad ${name} version \"${${name}_GIT_DESCRIBE}\"; falling back to 0.0.0 (have you made an initial release?)") + set(${name}_GIT_VERSION "0;0;0") + endif() + endif() + endif() + + if(NOT ${name}_GIT_VERSION OR NOT ${name}_GIT_TIMESTAMP OR NOT ${name}_GIT_REVISION) + message(WARNING "Unable to retrive ${name} version from git.") + set(${name}_GIT_VERSION "0;0;0;0") + set(${name}_GIT_TIMESTAMP "") + set(${name}_GIT_REVISION 0) + + # Try to get version from VERSION file + if(EXISTS "${version_file}") + file(STRINGS "${version_file}" ${name}_VERSION) + endif() + + if(NOT ${name}_VERSION) + message(WARNING "Unable to retrive ${name} version from \"${version_file}\" file.") + set(${name}_VERSION_LIST ${${name}_GIT_VERSION}) + string(REPLACE ";" "." ${name}_VERSION "${${name}_GIT_VERSION}") + else() + string(REPLACE "." ";" ${name}_VERSION_LIST ${${name}_VERSION}) + endif() + + else() + list(APPEND ${name}_GIT_VERSION ${${name}_GIT_REVISION}) + set(${name}_VERSION_LIST ${${name}_GIT_VERSION}) + string(REPLACE ";" "." 
${name}_VERSION "${${name}_GIT_VERSION}") + endif() + + list(GET ${name}_VERSION_LIST 0 "${name}_VERSION_MAJOR") + list(GET ${name}_VERSION_LIST 1 "${name}_VERSION_MINOR") + list(GET ${name}_VERSION_LIST 2 "${name}_VERSION_RELEASE") + list(GET ${name}_VERSION_LIST 3 "${name}_VERSION_REVISION") + + set(${name}_VERSION_MAJOR ${${name}_VERSION_MAJOR} PARENT_SCOPE) + set(${name}_VERSION_MINOR ${${name}_VERSION_MINOR} PARENT_SCOPE) + set(${name}_VERSION_RELEASE ${${name}_VERSION_RELEASE} PARENT_SCOPE) + set(${name}_VERSION_REVISION ${${name}_VERSION_REVISION} PARENT_SCOPE) + set(${name}_VERSION ${${name}_VERSION} PARENT_SCOPE) + + set(${name}_GIT_DESCRIBE ${${name}_GIT_DESCRIBE} PARENT_SCOPE) + set(${name}_GIT_TIMESTAMP ${${name}_GIT_TIMESTAMP} PARENT_SCOPE) + set(${name}_GIT_TREE ${${name}_GIT_TREE} PARENT_SCOPE) + set(${name}_GIT_COMMIT ${${name}_GIT_COMMIT} PARENT_SCOPE) + set(${name}_GIT_REVISION ${${name}_GIT_REVISION} PARENT_SCOPE) + set(${name}_GIT_VERSION ${${name}_GIT_VERSION} PARENT_SCOPE) +endmacro(fetch_version) + +cmake_policy(POP) diff --git a/libs/libmdbx/src/dll.vcxproj b/libs/libmdbx/src/dll.vcxproj deleted file mode 100644 index 5464c15875..0000000000 --- a/libs/libmdbx/src/dll.vcxproj +++ /dev/null @@ -1,223 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <ItemGroup Label="ProjectConfigurations"> - <ProjectConfiguration Include="Debug|Win32"> - <Configuration>Debug</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|Win32"> - <Configuration>Release</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Debug|x64"> - <Configuration>Debug</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|x64"> - <Configuration>Release</Configuration> - <Platform>x64</Platform> - 
</ProjectConfiguration> - </ItemGroup> - <PropertyGroup Label="Globals"> - <ProjectGuid>{6D19209B-ECE7-4B9C-941C-0AA2B484F199}</ProjectGuid> - <Keyword>Win32Proj</Keyword> - <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> - <ConfigurationType>DynamicLibrary</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> - <ConfigurationType>DynamicLibrary</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> - <ConfigurationType>DynamicLibrary</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> - <ConfigurationType>DynamicLibrary</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> - <ImportGroup Label="ExtensionSettings"> - </ImportGroup> - <ImportGroup Label="Shared"> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" 
Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <PropertyGroup Label="UserMacros" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <LinkIncremental>true</LinkIncremental> - <TargetName>mdbx</TargetName> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - <CustomBuildBeforeTargets>PreLinkEvent</CustomBuildBeforeTargets> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <TargetName>mdbx</TargetName> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - <CustomBuildBeforeTargets>PreLinkEvent</CustomBuildBeforeTargets> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <LinkIncremental>true</LinkIncremental> - 
<TargetName>mdbx</TargetName> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - <CustomBuildBeforeTargets>PreLinkEvent</CustomBuildBeforeTargets> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <LinkIncremental>false</LinkIncremental> - <TargetName>mdbx</TargetName> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - <CustomBuildBeforeTargets>PreLinkEvent</CustomBuildBeforeTargets> - </PropertyGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <ClCompile> - <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> - <WarningLevel>EnableAllWarnings</WarningLevel> - <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> - <Optimization>Disabled</Optimization> - <StringPooling>true</StringPooling> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <TargetMachine>MachineX86</TargetMachine> - <GenerateDebugInformation>true</GenerateDebugInformation> - <SubSystem>Windows</SubSystem> - <AdditionalDependencies>ntdll.lib;$(IntermediateOutputPath)mdbx_ntdll_extra.lib;kernel32.lib;advapi32.lib;%(AdditionalDependencies)</AdditionalDependencies> - </Link> - <CustomBuildStep> - <Message>Generate fake-library mdbx_ntdll_extra.lib for $(PlatformTarget)</Message> - <Outputs>$(IntermediateOutputPath)mdbx_ntdll_extra.lib</Outputs> - <Inputs>$(ProjectDir)src/ntdll.def</Inputs> - <Command>lib.exe /def:%(Inputs) /out:%(Outputs) /machine:$(PlatformTarget)</Command> - </CustomBuildStep> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <ClCompile> - 
<PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> - <WarningLevel>EnableAllWarnings</WarningLevel> - <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> - <StringPooling>true</StringPooling> - <Optimization>Full</Optimization> - <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion> - <IntrinsicFunctions>true</IntrinsicFunctions> - <FavorSizeOrSpeed>Size</FavorSizeOrSpeed> - <OmitFramePointers>true</OmitFramePointers> - <WholeProgramOptimization>true</WholeProgramOptimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <BufferSecurityCheck>false</BufferSecurityCheck> - <AssemblerOutput>All</AssemblerOutput> - </ClCompile> - <Link> - <TargetMachine>MachineX86</TargetMachine> - <GenerateDebugInformation>true</GenerateDebugInformation> - <SubSystem>Windows</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration> - <AdditionalDependencies>ntdll.lib;$(IntermediateOutputPath)mdbx_ntdll_extra.lib;kernel32.lib;advapi32.lib;%(AdditionalDependencies)</AdditionalDependencies> - <IgnoreAllDefaultLibraries>true</IgnoreAllDefaultLibraries> - </Link> - <CustomBuildStep> - <Message>Generate fake-library mdbx_ntdll_extra.lib for $(PlatformTarget)</Message> - <Outputs>$(IntermediateOutputPath)mdbx_ntdll_extra.lib</Outputs> - <Inputs>$(ProjectDir)src/ntdll.def</Inputs> - <Command>lib.exe /def:%(Inputs) /out:%(Outputs) /machine:$(PlatformTarget)</Command> - </CustomBuildStep> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <ClCompile> - <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> - </ClCompile> - </ItemDefinitionGroup> - <ItemDefinitionGroup 
Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> - <StringPooling>true</StringPooling> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <AdditionalDependencies>ntdll.lib;$(IntermediateOutputPath)mdbx_ntdll_extra.lib;kernel32.lib;advapi32.lib;%(AdditionalDependencies)</AdditionalDependencies> - </Link> - <CustomBuildStep> - <Message>Generate fake-library mdbx_ntdll_extra.lib for $(PlatformTarget)</Message> - <Outputs>$(IntermediateOutputPath)mdbx_ntdll_extra.lib</Outputs> - <Inputs>$(ProjectDir)src/ntdll.def</Inputs> - <Command>lib.exe /def:%(Inputs) /out:%(Outputs) /machine:$(PlatformTarget)</Command> - </CustomBuildStep> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <ClCompile> - <PreprocessorDefinitions>WIN64;NDEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;MDBX_BUILD_DLL;MDBX_AVOID_CRT;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> - <StringPooling>true</StringPooling> - <Optimization>Full</Optimization> - <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion> - <IntrinsicFunctions>true</IntrinsicFunctions> - <FavorSizeOrSpeed>Size</FavorSizeOrSpeed> - <OmitFramePointers>true</OmitFramePointers> - <WholeProgramOptimization>true</WholeProgramOptimization> - <WarningLevel>EnableAllWarnings</WarningLevel> - <FunctionLevelLinking>true</FunctionLevelLinking> - <BufferSecurityCheck>false</BufferSecurityCheck> - <AssemblerOutput>All</AssemblerOutput> - </ClCompile> - <Link> - <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration> - 
<AdditionalDependencies>ntdll.lib;$(IntermediateOutputPath)mdbx_ntdll_extra.lib;kernel32.lib;advapi32.lib;%(AdditionalDependencies)</AdditionalDependencies> - <IgnoreAllDefaultLibraries>true</IgnoreAllDefaultLibraries> - </Link> - <CustomBuildStep> - <Message>Generate fake-library mdbx_ntdll_extra.lib for $(PlatformTarget)</Message> - <Outputs>$(IntermediateOutputPath)mdbx_ntdll_extra.lib</Outputs> - <Inputs>$(ProjectDir)src/ntdll.def</Inputs> - <Command>lib.exe /def:%(Inputs) /out:%(Outputs) /machine:$(PlatformTarget)</Command> - </CustomBuildStep> - </ItemDefinitionGroup> - <ItemGroup> - <ClCompile Include="src\lck-windows.c" /> - <ClCompile Include="src\mdbx.c" /> - <ClCompile Include="src\osal.c" /> - <ClCompile Include="src\version.c" /> - </ItemGroup> - <ItemGroup> - <ClInclude Include="mdbx.h" /> - <ClInclude Include="src\bits.h" /> - <ClInclude Include="src\defs.h" /> - <ClInclude Include="src\osal.h" /> - </ItemGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> - <ImportGroup Label="ExtensionTargets"> - </ImportGroup> -</Project> diff --git a/libs/libmdbx/src/example/CMakeLists.txt b/libs/libmdbx/src/example/CMakeLists.txt new file mode 100644 index 0000000000..d3e56e82e9 --- /dev/null +++ b/libs/libmdbx/src/example/CMakeLists.txt @@ -0,0 +1,6 @@ +set(TARGET mdbx_example) +project(${TARGET}) + +add_executable(${TARGET} example-mdbx.c) + +target_link_libraries(${TARGET} mdbx) diff --git a/libs/libmdbx/src/example/README.md b/libs/libmdbx/src/example/README.md new file mode 100644 index 0000000000..b819cf4a40 --- /dev/null +++ b/libs/libmdbx/src/example/README.md @@ -0,0 +1 @@ +See [example-mdbx.c](example-mdbx.c) as an example of using _libmdbx_, and do a line-by-line comparison of it with the [sample-bdb.txt](sample-bdb.txt) file. 
diff --git a/libs/libmdbx/src/tutorial/sample-mdbx.c b/libs/libmdbx/src/example/example-mdbx.c index 991ab69806..1d25ef6fc8 100644 --- a/libs/libmdbx/src/tutorial/sample-mdbx.c +++ b/libs/libmdbx/src/example/example-mdbx.c @@ -1,11 +1,11 @@ -/* sample-mdb.txt - MDB toy/sample +/* MDBX usage examle * * Do a line-by-line comparison of this and sample-bdb.txt */ /* - * Copyright 2017 Ilya Shipitsin <chipitsine@gmail.com>. * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>. + * Copyright 2017 Ilya Shipitsin <chipitsine@gmail.com>. * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. * diff --git a/libs/libmdbx/src/tutorial/sample-bdb.txt b/libs/libmdbx/src/example/sample-bdb.txt index 440efddb57..5c89540afa 100644 --- a/libs/libmdbx/src/tutorial/sample-bdb.txt +++ b/libs/libmdbx/src/example/sample-bdb.txt @@ -1,6 +1,6 @@ -/* sample-bdb.txt - BerkeleyDB toy/sample +/* BerkeleyDB toy/sample * - * Do a line-by-line comparison of this and sample-mdb.txt + * Do a line-by-line comparison of this and example-mdbx.c */ /* diff --git a/libs/libmdbx/src/libmdbx.cflags b/libs/libmdbx/src/libmdbx.cflags deleted file mode 100644 index bb8c4c3337..0000000000 --- a/libs/libmdbx/src/libmdbx.cflags +++ /dev/null @@ -1 +0,0 @@ --std=c11
\ No newline at end of file diff --git a/libs/libmdbx/src/libmdbx.config b/libs/libmdbx/src/libmdbx.config deleted file mode 100644 index e0284f4257..0000000000 --- a/libs/libmdbx/src/libmdbx.config +++ /dev/null @@ -1,2 +0,0 @@ -// Add predefined macros for your project here. For example: -// #define THE_ANSWER 42 diff --git a/libs/libmdbx/src/libmdbx.creator b/libs/libmdbx/src/libmdbx.creator deleted file mode 100644 index e94cbbd302..0000000000 --- a/libs/libmdbx/src/libmdbx.creator +++ /dev/null @@ -1 +0,0 @@ -[General] diff --git a/libs/libmdbx/src/libmdbx.cxxflags b/libs/libmdbx/src/libmdbx.cxxflags deleted file mode 100644 index 6435dfce2f..0000000000 --- a/libs/libmdbx/src/libmdbx.cxxflags +++ /dev/null @@ -1 +0,0 @@ --std=c++17
\ No newline at end of file diff --git a/libs/libmdbx/src/libmdbx.files b/libs/libmdbx/src/libmdbx.files deleted file mode 100644 index cdea405b5d..0000000000 --- a/libs/libmdbx/src/libmdbx.files +++ /dev/null @@ -1,62 +0,0 @@ -AUTHORS -CMakeLists.txt -LICENSE -Makefile -README-RU.md -README.md -TODO.md -mdbx.h -src/bits.h -src/defs.h -src/lck-linux.c -src/lck-posix.c -src/lck-windows.c -src/mdbx.c -src/osal.c -src/osal.h -src/tools/CMakeLists.txt -src/tools/mdbx_chk.c -src/tools/mdbx_copy.1 -src/tools/mdbx_copy.c -src/tools/mdbx_dump.1 -src/tools/mdbx_dump.c -src/tools/mdbx_load.1 -src/tools/mdbx_load.c -src/tools/mdbx_stat.1 -src/tools/mdbx_stat.c -src/tools/wingetopt.c -src/tools/wingetopt.h -src/version.c -test/CMakeLists.txt -test/actor.cc -test/append.cc -test/base.h -test/cases.cc -test/chrono.cc -test/chrono.h -test/config.cc -test/config.h -test/copy.cc -test/dead.cc -test/hill.cc -test/jitter.cc -test/keygen.cc -test/keygen.h -test/log.cc -test/log.h -test/main.cc -test/osal-unix.cc -test/osal-windows.cc -test/osal.h -test/pcrf/CMakeLists.txt -test/test.cc -test/test.h -test/try.cc -test/ttl.cc -test/utils.cc -test/utils.h -tutorial/CMakeLists.txt -tutorial/README.md -tutorial/sample-bdb.txt -tutorial/sample-mdb.txt -tutorial/sample-mdbx.c diff --git a/libs/libmdbx/src/libmdbx.includes b/libs/libmdbx/src/libmdbx.includes deleted file mode 100644 index eb512a01ce..0000000000 --- a/libs/libmdbx/src/libmdbx.includes +++ /dev/null @@ -1,4 +0,0 @@ -. -src -src/tools -test diff --git a/libs/libmdbx/src/mdbx.h b/libs/libmdbx/src/mdbx.h index 67b725139a..dcbe608b29 100644 --- a/libs/libmdbx/src/mdbx.h +++ b/libs/libmdbx/src/mdbx.h @@ -1,4 +1,465 @@ -/* LICENSE AND COPYRUSTING ***************************************************** +/**** BRIEFLY ****************************************************************** + * + * libmdbx is superior to LMDB (https://bit.ly/26ts7tL) in terms of features + * and reliability, not inferior in performance. 
In comparison to LMDB, libmdbx + * makes many things just work perfectly, not silently and catastrophically + * break down. libmdbx supports Linux, Windows, MacOS, FreeBSD, DragonFly, + * Solaris, OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other systems + * compliant with POSIX.1-2008. + * + * Look below for API description, for other information (build, embedding and + * amalgamation, improvements over LMDB, benchmarking, etc) please refer to + * README.md at https://abf.io/erthink/libmdbx. + * + * --- + * + * The next version is under active non-public development and will be released + * as MithrilDB and libmithrildb for libraries & packages. Admittedly mythical + * Mithril is resembling silver but being stronger and lighter than steel. + * Therefore MithrilDB is rightly relevant name. + * + * MithrilDB will be radically different from libmdbx by the new database format + * and API based on C++17, as well as the Apache 2.0 License. The goal of this + * revolution is to provide a clearer and robust API, add more features and new + * valuable properties of database. + * + * The Future will (be) Positive. Всё будет хорошо. + * + * + **** INTRODUCTION ************************************************************* + * + * // For the most part, this section is a copy of the corresponding text + * // from LMDB description, but with some edits reflecting the improvements + * // and enhancements were made in MDBX. + * + * MDBX is a Btree-based database management library modeled loosely on the + * BerkeleyDB API, but much simplified. The entire database (aka "environment") + * is exposed in a memory map, and all data fetches return data directly from + * the mapped memory, so no malloc's or memcpy's occur during data fetches. + * As such, the library is extremely simple because it requires no page caching + * layer of its own, and it is extremely high performance and memory-efficient. 
+ * It is also fully transactional with full ACID semantics, and when the memory + * map is read-only, the database integrity cannot be corrupted by stray pointer + * writes from application code. + * + * The library is fully thread-aware and supports concurrent read/write access + * from multiple processes and threads. Data pages use a copy-on-write strategy + * so no active data pages are ever overwritten, which also provides resistance + * to corruption and eliminates the need of any special recovery procedures + * after a system crash. Writes are fully serialized; only one write transaction + * may be active at a time, which guarantees that writers can never deadlock. + * The database structure is multi-versioned so readers run with no locks; + * writers cannot block readers, and readers don't block writers. + * + * Unlike other well-known database mechanisms which use either write-ahead + * transaction logs or append-only data writes, MDBX requires no maintenance + * during operation. Both write-ahead loggers and append-only databases require + * periodic checkpointing and/or compaction of their log or database files + * otherwise they grow without bound. MDBX tracks free pages within the database + * and re-uses them for new write operations, so the database size does not grow + * without bound in normal use. It is worth noting that the "next" version + * libmdbx (MithrilDB) will solve this problem. + * + * The memory map can be used as a read-only or read-write map. It is read-only + * by default as this provides total immunity to corruption. Using read-write + * mode offers much higher write performance, but adds the possibility for stray + * application writes thru pointers to silently corrupt the database. + * Of course if your application code is known to be bug-free (...) then this is + * not an issue. + * + * If this is your first time using a transactional embedded key-value store, + * you may find the "GETTING STARTED" section below to be helpful. 
+ * + * + **** GETTING STARTED ********************************************************** + * + * // This section is based on Bert Hubert's intro "LMDB Semantics", with + * // edits reflecting the improvements and enhancements were made in MDBX. + * // See https://bit.ly/2maejGY for Bert Hubert's original. + * + * Everything starts with an environment, created by mdbx_env_create(). + * Once created, this environment must also be opened with mdbx_env_open(), + * and after use be closed by mdbx_env_close(). At that a non-zero value of the + * last argument "mode" supposes MDBX will create database and directory if ones + * does not exist. In this case the non-zero "mode" argument specifies the file + * mode bits be applied when a new files are created by open() function. + * + * Within that directory, a lock file (aka LCK-file) and a storage file (aka + * DXB-file) will be generated. If you don't want to use a directory, you can + * pass the MDBX_NOSUBDIR option, in which case the path you provided is used + * directly as the DXB-file, and another file with a "-lck" suffix added + * will be used for the LCK-file. + * + * Once the environment is open, a transaction can be created within it using + * mdbx_txn_begin(). Transactions may be read-write or read-only, and read-write + * transactions may be nested. A transaction must only be used by one thread at + * a time. Transactions are always required, even for read-only access. The + * transaction provides a consistent view of the data. + * + * Once a transaction has been created, a database (i.e. key-value space inside + * the environment) can be opened within it using mdbx_dbi_open(). If only one + * database will ever be used in the environment, a NULL can be passed as the + * database name. For named databases, the MDBX_CREATE flag must be used to + * create the database if it doesn't already exist. 
Also, mdbx_env_set_maxdbs() + * must be called after mdbx_env_create() and before mdbx_env_open() to set the + * maximum number of named databases you want to support. + * + * NOTE: a single transaction can open multiple databases. Generally databases + * should only be opened once, by the first transaction in the process. + * + * Within a transaction, mdbx_get() and mdbx_put() can store single key-value + * pairs if that is all you need to do (but see CURSORS below if you want to do + * more). + * + * A key-value pair is expressed as two MDBX_val structures. This struct that is + * exactly similar to POSIX's struct iovec and has two fields, iov_len and + * iov_base. The data is a void pointer to an array of iov_len bytes. + * (!) The notable difference between MDBX and LMDB is that MDBX support zero + * length keys. + * + * Because MDBX is very efficient (and usually zero-copy), the data returned in + * an MDBX_val structure may be memory-mapped straight from disk. In other words + * look but do not touch (or free() for that matter). Once a transaction is + * closed, the values can no longer be used, so make a copy if you need to keep + * them after that. + * + * + * CURSORS -- To do more powerful things, we must use a cursor. + * + * Within the transaction, a cursor can be created with mdbx_cursor_open(). + * With this cursor we can store/retrieve/delete (multiple) values using + * mdbx_cursor_get(), mdbx_cursor_put(), and mdbx_cursor_del(). + * + * mdbx_cursor_get() positions itself depending on the cursor operation + * requested, and for some operations, on the supplied key. For example, to list + * all key-value pairs in a database, use operation MDBX_FIRST for the first + * call to mdbx_cursor_get(), and MDBX_NEXT on subsequent calls, until the end + * is hit. + * + * To retrieve all keys starting from a specified key value, use MDBX_SET. For + * more cursor operations, see the API description below. 
+ * + * When using mdbx_cursor_put(), either the function will position the cursor + * for you based on the key, or you can use operation MDBX_CURRENT to use the + * current position of the cursor. NOTE that key must then match the current + * position's key. + * + * + * SUMMARIZING THE OPENING + * + * So we have a cursor in a transaction which opened a database in an + * environment which is opened from a filesystem after it was separately + * created. + * + * Or, we create an environment, open it from a filesystem, create a transaction + * within it, open a database within that transaction, and create a cursor + * within all of the above. + * + * Got it? + * + * + * THREADS AND PROCESSES + * + * Do not open a database twice in the same process at the same time, MDBX + * will track and prevent this. Instead, share the MDBX environment that has + * opened the file across all threads. The reason for this is: + * - When the "Open file description" locks (aka OFD-locks) are not available, + * MDBX uses POSIX locks on files, and these locks have issues if one process + * opens a file multiple times. + * - If a single process opens the same environment multiple times, closing it + * once will remove all the locks held on it, and the other instances will be + * vulnerable to corruption from other processes. + * + For compatibility with LMDB which allows multi-opening, MDBX can be + * configured at runtime by mdbx_setup_debug(MDBX_DBG_LEGACY_MULTIOPEN, ...) + * prior to calling other MDBX functions. In this way MDBX will track + * databases opening, detect multi-opening cases and then recover POSIX file + * locks as necessary. However, lock recovery can cause unexpected pauses, + * such as when another process opened the database in exclusive mode before + * the lock was restored - we have to wait until such a process releases the + * database, and so on.
+ * + * Do not use opened MDBX environment(s) after fork() in a child process(es), + * MDBX will check and prevent this at critical points. Instead, ensure there is + * no open MDBX-instance(s) during fork(), or at least close it immediately after + * fork() in the child process and reopen if required - for instance by using + * pthread_atfork(). The reason for this is: + * - For competitive consistent reading, MDBX assigns a slot in the shared + * table for each process that interacts with the database. This slot is + * populated with process attributes, including the PID. + * - After fork(), in order to remain connected to a database, the child + * process must have its own such "slot", which can't be assigned in any + * simple and robust way other than the regular one. + * - A write transaction from a parent process cannot continue in a child + * process for obvious reasons. + * - Moreover, in a multithreaded process at the fork() moment any number of + * threads could run in critical and/or intermediate sections of MDBX code + * with interaction and/or racing conditions with threads from other + * process(es). For instance: shrinking a database or copying it to a pipe, + * opening or closing environment, beginning or finishing a transaction, + * and so on. + * = Therefore, any solution other than simply closing the database (and reopening if + * necessary) in a child process would be both extremely complicated and + * fragile. + * + * Also note that a transaction is tied to one thread by default using Thread + * Local Storage. If you want to pass read-only transactions across threads, + * you can use the MDBX_NOTLS option on the environment. Nevertheless, a write + * transaction entirely should only be used in one thread from start to finish. + * MDBX checks this in a reasonable manner and returns the MDBX_THREAD_MISMATCH + * error if this rule is violated. + * + * + * TRANSACTIONS, ROLLBACKS, etc.
+ * + * To actually get anything done, a transaction must be committed using + * mdbx_txn_commit(). Alternatively, all of a transaction's operations + * can be discarded using mdbx_txn_abort(). + * + * (!) An important difference between MDBX and LMDB is that MDBX required that + * any opened cursors can be reused and must be freed explicitly, regardless + * ones was opened in a read-only or write transaction. The REASON for this is + * eliminates ambiguity which helps to avoid errors such as: use-after-free, + * double-free, i.e. memory corruption and segfaults. + * + * For read-only transactions, obviously there is nothing to commit to storage. + * (!) Another notable difference between MDBX and LMDB is that MDBX makes + * handles opened for existing databases immediately available for other + * transactions, regardless this transaction will be aborted or reset. The + * REASON for this is to avoid the requirement for multiple opening a same + * handles in concurrent read transactions, and tracking of such open but hidden + * handles until the completion of read transactions which opened them. + * + * In addition, as long as a transaction is open, a consistent view of the + * database is kept alive, which requires storage. A read-only transaction that + * no longer requires this consistent view should be terminated (committed or + * aborted) when the view is no longer needed (but see below for an + * optimization). + * + * There can be multiple simultaneously active read-only transactions but only + * one that can write. Once a single read-write transaction is opened, all + * further attempts to begin one will block until the first one is committed or + * aborted. This has no effect on read-only transactions, however, and they may + * continue to be opened at any time. + * + * + * DUPLICATE KEYS + * + * mdbx_get() and mdbx_put() respectively have no and only some support for + * multiple key-value pairs with identical keys.
If there are multiple values + * for a key, mdbx_get() will only return the first value. + * + * When multiple values for one key are required, pass the MDBX_DUPSORT flag to + * mdbx_dbi_open(). In an MDBX_DUPSORT database, by default mdbx_put() will not + * replace the value for a key if the key existed already. Instead it will add + * the new value to the key. In addition, mdbx_del() will pay attention to the + * value field too, allowing for specific values of a key to be deleted. + * + * Finally, additional cursor operations become available for traversing through + * and retrieving duplicate values. + * + * + * SOME OPTIMIZATION + * + * If you frequently begin and abort read-only transactions, as an optimization, + * it is possible to only reset and renew a transaction. + * + * mdbx_txn_reset() releases any old copies of data kept around for a read-only + * transaction. To reuse this reset transaction, call mdbx_txn_renew() on it. + * Any cursors in this transaction can also be renewed using mdbx_cursor_renew() + * or freed by mdbx_cursor_close(). + * + * To permanently free a transaction, reset or not, use mdbx_txn_abort(). + * + * + * CLEANING UP + * + * Any created cursors must be closed using mdbx_cursor_close(). It is advisable + * to repeat: + * (!) An important difference between MDBX and LMDB is that MDBX required that + * any opened cursors can be reused and must be freed explicitly, regardless + * ones was opened in a read-only or write transaction. The REASON for this is + * eliminates ambiguity which helps to avoid errors such as: use-after-free, + * double-free, i.e. memory corruption and segfaults. + * + * It is very rarely necessary to close a database handle, and in general they + * should just be left open. When you close a handle, it immediately becomes + * unavailable for all transactions in the environment. Therefore, you should + * avoid closing the handle while at least one transaction is using it. 
+ * + * + * THE FULL API + * + * The full MDBX documentation lists further details below, + * like how to: + * + * - configure database size and automatic size management + * - drop and clean a database + * - detect and report errors + * - optimize (bulk) loading speed + * - (temporarily) reduce robustness to gain even more speed + * - gather statistics about the database + * - define custom sort orders + * - estimate size of range query result + * - double performance by LIFO reclaiming on storages with write-back + * - use sequences and canary markers + * - use lack-of-space callback (aka OOM-KICK) + * - use exclusive mode + * + * + **** RESTRICTIONS & CAVEATS *************************************************** + * in addition to those listed for some functions. + * + * - Troubleshooting the LCK-file. + * 1. A broken LCK-file can cause sync issues, including appearance of + * wrong/inconsistent data for readers. When a database is opened in the + * cooperative read-write mode the LCK-file must be mapped to + * memory in read-write access. In this case it is always possible for + * a stray/malfunctioning application to write through pointers and + * silently corrupt the LCK-file. + * + * Unfortunately, there is no portable way to prevent such + * corruption, since the LCK-file is updated concurrently by + * multiple processes in a lock-free manner and any locking is + * unwise due to a large overhead. + * + * The "next" version of libmdbx (MithrilDB) will solve this issue. + * + * Workaround: Just make all programs using the database close it; + * the LCK-file is always reset on first open. + * + * 2. Stale reader transactions left behind by an aborted program cause + * further writes to grow the database quickly, and stale locks can + * block further operation. + * MDBX checks for stale readers while opening environment and before + * growing the database. But in some cases, this may not be enough.
+ * + * Workaround: Check for stale readers periodically, using the + * mdbx_reader_check() function or the mdbx_stat tool. + * + * 3. Stale writers will be cleared automatically by MDBX on supported + * platforms. But this is platform-specific, especially with regard to + * the implementation of shared POSIX-mutexes and support for robust + * mutexes. For instance there are no known issues on Linux, OSX, + * Windows and FreeBSD. + * + * Workaround: Otherwise just make all programs using the database + * close it; the LCK-file is always reset on first open + * of the environment. + * + * - Do not use MDBX databases on remote filesystems, even between processes + * on the same host. This breaks file locks on some platforms, possibly + * memory map sync, and certainly sync between programs on different hosts. + * + * On the other hand, MDBX supports the exclusive database operation over + * a network, and cooperative read-only access to the database placed on + * a read-only network share. + * + * - Do not use opened MDBX_env instance(s) in a child process after fork(). + * It would be insane to call fork() and any MDBX-functions simultaneously + * from multiple threads. The best way is to prevent the presence of open + * MDBX-instances during fork(). + * + * The MDBX_TXN_CHECKPID build-time option, which is ON by default on + * non-Windows platforms (i.e. where fork() is available), enables PID + * checking at a few critical points. But this does not give any guarantees, + * but only allows you to detect such errors a little sooner. Depending on + * the platform, you should expect an application crash and/or database + * corruption in such cases. + * + * On the other hand, MDBX allows calling mdbx_env_close() in such cases to + * release resources, but no more and in general this is a wrong way. + * + * - There is no pure read-only mode in the normal, explicit sense, since + * readers need write access to the LCK-file to be visible to the writer. 
+ * MDBX always tries to open/create the LCK-file for read-write, but switches + * to without-LCK mode on appropriate errors (EROFS, EACCES, EPERM) + * if the read-only mode was requested by the MDBX_RDONLY flag which is + * described below. + * + * The "next" version of libmdbx (MithrilDB) will solve this issue. + * + * - A thread can only use one transaction at a time, plus any nested + * read-write transactions in the non-writemap mode. Each transaction + * belongs to one thread. The MDBX_NOTLS flag changes this for read-only + * transactions. See below. + * + * - Do not open an MDBX database twice in the same process at the same + * time. By default MDBX prevents this in most cases by tracking database + * openings and returns MDBX_BUSY if the LCK-file is already open. + * + * The reason for this is that when the "Open file description" locks (aka + * OFD-locks) are not available, MDBX uses POSIX locks on files, and these + * locks have issues if one process opens a file multiple times. If a single + * process opens the same environment multiple times, closing it once will + * remove all the locks held on it, and the other instances will be + * vulnerable to corruption from other processes. + * + * For compatibility with LMDB which allows multi-opening, MDBX can be + * configured at runtime by mdbx_setup_debug(MDBX_DBG_LEGACY_MULTIOPEN, ...) + * prior to calling other MDBX functions. In this way MDBX will track + * databases opening, detect multi-opening cases and then recover POSIX file + * locks as necessary. However, lock recovery can cause unexpected pauses, + * such as when another process opened the database in exclusive mode before + * the lock was restored - we have to wait until such a process releases the + * database, and so on. + * + * - Avoid long-lived transactions, especially in the scenarios with a high + * rate of write transactions. 
Read transactions prevent reuse of pages + * freed by newer write transactions, thus the database can grow quickly. + * Write transactions prevent other write transactions, since writes are + * serialized. + * + * Understanding the problem of long-lived read transactions requires some + * explanation, but can be difficult for quick perception. So it is + * reasonable to simplify this as follows: + * 1. Garbage collection problem exists in all databases one way or + * another, e.g. VACUUM in PostgreSQL. But in _libmdbx_ it's even more + * discernible because of high transaction rate and intentional + * internals simplification in favor of performance. + * + * 2. MDBX employs Multiversion concurrency control on the Copy-on-Write + * basis, which allows multiple readers to run in parallel with a write + * transaction without blocking. Each write transaction needs free + * pages to put the changed data; these pages will be placed in the new + * b-tree snapshot at commit. MDBX efficiently recycles pages from + * previously created unused snapshots, BUT this is impossible if any + * read transaction uses such a snapshot. + * + * 3. Thus massive altering of data during a parallel long read operation + * will increase the process's work set and may exhaust entire free + * database space. + * + * A good example of long readers is a hot backup to the slow destination + * or debugging of a client application while retaining an active read + * transaction. In LMDB this results in MDB_MAP_FULL error and subsequent + * write performance degradation. + * + * MDBX mostly solves the "long-lived" readers issue by the lack-of-space + * callback, which allows aborting long readers, and by the MDBX_LIFORECLAIM + * mode, which addresses subsequent performance degradation. + * The "next" version of libmdbx (MithrilDB) will completely solve this. + * + * - Avoid suspending a process with active transactions. These would then be + * "long-lived" as above. 
+ * + * The "next" version of libmdbx (MithrilDB) will solve this issue. + * + * - Avoid aborting a process with an active read-only transaction in scenarios + * with a high rate of write transactions. The transaction becomes "long-lived" + * as above until a check for stale readers is performed or the LCK-file is + * reset, since the process may not remove it from the lockfile. This does + * not apply to write transactions if the system clears stale writers, see + * above. + * + * - An MDBX database configuration will often reserve considerable unused + * memory address space and maybe file size for future growth. This does + * not use actual memory or disk space, but users may need to understand + * the difference so they won't be scared off. + * + * - The Write Amplification Factor. + * TBD. + * + **** LICENSE AND COPYRUSTING ************************************************** * * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. @@ -12,13 +473,13 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. * - * --- + * --- * * This code is derived from "LMDB engine" written by * Howard Chu (Symas Corporation), which itself derived from btree.c * written by Martin Hedenfalk. * - * --- + * --- * * Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved. * @@ -30,7 +491,7 @@ * top-level directory of the distribution or, alternatively, at * <http://www.OpenLDAP.org/license.html>. * - * --- + * --- * * Portions Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> * @@ -44,28 +505,22 @@ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -/* ACKNOWLEDGEMENTS ************************************************************ + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + **** ACKNOWLEDGEMENTS ********************************************************* * * Howard Chu (Symas Corporation) - the author of LMDB, * from which originated the MDBX in 2015. * * Martin Hedenfalk <martin@bzero.se> - the author of `btree.c` code, - * which was used for begin development of LMDB. */ + * which was used for begin development of LMDB. + * + ******************************************************************************/ #pragma once #ifndef LIBMDBX_H #define LIBMDBX_H -/* IMPENDING CHANGES WARNING *************************************************** - * - * MDBX is under active non-public development, database format and API - * will be refined. New version won't be backwards compatible. Main focus - * of the rework is to provide clear and robust API and new features. - * - ******************************************************************************/ - #ifdef _MSC_VER #pragma warning(push, 1) #pragma warning(disable : 4548) /* expression before comma has no effect; \ @@ -106,6 +561,7 @@ typedef DWORD mdbx_tid_t; #define MDBX_EPERM ERROR_INVALID_FUNCTION #define MDBX_EINTR ERROR_CANCELLED #define MDBX_ENOFILE ERROR_FILE_NOT_FOUND +#define MDBX_EREMOTE ERROR_REMOTE_STORAGE_MEDIA_ERROR #else @@ -131,6 +587,7 @@ typedef pthread_t mdbx_tid_t; #define MDBX_EPERM EPERM #define MDBX_EINTR EINTR #define MDBX_ENOFILE ENOENT +#define MDBX_EREMOTE ENOTBLK #endif @@ -138,11 +595,21 @@ typedef pthread_t mdbx_tid_t; #pragma warning(pop) #endif -/*--------------------------------------------------------------------------*/ +/*----------------------------------------------------------------------------*/ #ifndef __has_attribute #define __has_attribute(x) (0) +#endif /* __has_attribute */ + +#ifndef __deprecated +#if defined(__GNUC__) || __has_attribute(__deprecated__) +#define __deprecated 
__attribute__((__deprecated__)) +#elif defined(_MSC_VER) +#define __deprecated __declspec(deprecated) +#else +#define __deprecated #endif +#endif /* __deprecated */ #ifndef __dll_export #if defined(_WIN32) || defined(__CYGWIN__) @@ -174,78 +641,113 @@ typedef pthread_t mdbx_tid_t; #endif #endif /* __dll_import */ -/*--------------------------------------------------------------------------*/ +/*----------------------------------------------------------------------------*/ #define MDBX_VERSION_MAJOR 0 -#define MDBX_VERSION_MINOR 3 +#define MDBX_VERSION_MINOR 4 +#ifndef LIBMDBX_API #if defined(LIBMDBX_EXPORTS) #define LIBMDBX_API __dll_export #elif defined(LIBMDBX_IMPORTS) #define LIBMDBX_API __dll_import #else #define LIBMDBX_API +#endif #endif /* LIBMDBX_API */ #ifdef __cplusplus extern "C" { #endif +/**** MDBX version information ************************************************/ + +#if defined(LIBMDBX_IMPORTS) +#define LIBMDBX_VERINFO_API __dll_import +#else +#define LIBMDBX_VERINFO_API __dll_export +#endif /* LIBMDBX_VERINFO_API */ + typedef struct mdbx_version_info { uint8_t major; uint8_t minor; uint16_t release; uint32_t revision; - struct { - const char *datetime; - const char *tree; - const char *commit; - const char *describe; + struct /* source info from git */ { + const char *datetime /* committer date, strict ISO-8601 format */; + const char *tree /* tree hash, i.e. digest of the source code */; + const char *commit /* commit hash (hexadecimal digits) */; + const char *describe /* git-describe string */; } git; + const char *sourcery /* sourcery anchor for pinning */; } mdbx_version_info; +extern LIBMDBX_VERINFO_API const mdbx_version_info mdbx_version; +/* MDBX build information. + * WARNING: Some strings could be NULL in case no corresponding information was + * provided at build time (i.e. flags). 
*/ typedef struct mdbx_build_info { - const char *datetime; - const char *target; - const char *options; - const char *compiler; - const char *flags; + const char *datetime /* build timestamp (ISO-8601 or __DATE__ __TIME__) */; + const char *target /* cpu/arch-system-config triplet */; + const char *options /* mdbx-related options */; + const char *compiler /* compiler */; + const char *flags /* CFLAGS */; } mdbx_build_info; - -extern LIBMDBX_API const mdbx_version_info mdbx_version; -extern LIBMDBX_API const mdbx_build_info mdbx_build; +extern LIBMDBX_VERINFO_API const mdbx_build_info mdbx_build; #if defined(_WIN32) || defined(_WIN64) -#ifndef MDBX_BUILD_DLL - -/* Dll initialization callback for ability to dynamically load MDBX DLL by - * LoadLibrary() on Windows versions before Windows Vista. This function MUST be - * called once from DllMain() for each reason (DLL_PROCESS_ATTACH, - * DLL_PROCESS_DETACH, DLL_THREAD_ATTACH and DLL_THREAD_DETACH). Do this - * carefully and ONLY when actual Windows version don't support initialization - * via "TLS Directory" (e.g .CRT$XL[A-Z] sections in executable or dll file). */ +#if !MDBX_BUILD_SHARED_LIBRARY + +/* MDBX internally uses global and thread local storage destructors to + * automatically (de)initialization, releasing reader lock table slots + * and so on. + * + * If MDBX builded as a DLL this is done out-of-the-box by DllEntry() function, + * which called automatically by Windows core with passing corresponding reason + * argument. + * + * Otherwise, if MDBX was builded not as a DLL, some black magic + * may be required depending of Windows version: + * - Modern Windows versions, including Windows Vista and later, provides + * support for "TLS Directory" (e.g .CRT$XL[A-Z] sections in executable + * or dll file). In this case, MDBX capable of doing all automatically, + * and you do not need to call mdbx_dll_handler(). 
+ * - Obsolete versions of Windows, prior to Windows Vista, REQUIRE calling + * mdbx_dll_handler() manually from corresponding DllMain() or WinMain() + * of your DLL or application. + * - This behavior is under control of the MDBX_CONFIG_MANUAL_TLS_CALLBACK + * option, which is determined by default according to the target version + * of Windows at build time. + * But you may override MDBX_CONFIG_MANUAL_TLS_CALLBACK in special cases. + * + * Therefore, building MDBX as a DLL is recommended for all versions of Windows. + * So, if in doubt, just build MDBX as the separate DLL and don't worry. */ #ifndef MDBX_CONFIG_MANUAL_TLS_CALLBACK +#if defined(_WIN32_WINNT_VISTA) && WINVER >= _WIN32_WINNT_VISTA +/* As described above mdbx_dll_handler() is NOT needed for Windows Vista + * and later. */ #define MDBX_CONFIG_MANUAL_TLS_CALLBACK 0 +#else +/* As described above mdbx_dll_handler() IS REQUIRED for Windows versions + * prior to Windows Vista. */ +#define MDBX_CONFIG_MANUAL_TLS_CALLBACK 1 #endif +#endif /* MDBX_CONFIG_MANUAL_TLS_CALLBACK */ + #if MDBX_CONFIG_MANUAL_TLS_CALLBACK -void LIBMDBX_API NTAPI mdbx_dll_callback(PVOID module, DWORD reason, - PVOID reserved); +void LIBMDBX_API NTAPI mdbx_dll_handler(PVOID module, DWORD reason, + PVOID reserved); #endif /* MDBX_CONFIG_MANUAL_TLS_CALLBACK */ -#endif /* MDBX_BUILD_DLL */ +#endif /* !MDBX_BUILD_SHARED_LIBRARY */ #endif /* Windows */ -/* The name of the lock file in the DB environment */ -#define MDBX_LOCKNAME "/mdbx.lck" -/* The name of the data file in the DB environment */ -#define MDBX_DATANAME "/mdbx.dat" -/* The suffix of the lock file when no subdir is used */ -#define MDBX_LOCK_SUFFIX "-lck" +/**** OPACITY STRUCTURES ******************************************************/ /* Opaque structure for a database environment. * - * A DB environment supports multiple databases, all residing in the same - * shared-memory map. 
*/ + * An environment supports multiple key-value databases (aka key-value spaces + * or tables), all residing in the same shared-memory map. */ typedef struct MDBX_env MDBX_env; /* Opaque structure for a transaction handle. @@ -254,87 +756,552 @@ typedef struct MDBX_env MDBX_env; * read-only or read-write. */ typedef struct MDBX_txn MDBX_txn; -/* A handle for an individual database in the DB environment. */ +/* A handle for an individual database (key-value space) in the environment. + * Zero handle is used internally (hidden Garbage Collection DB). + * So, any valid DBI-handle is greater than 0 and less than or equal to + * MDBX_MAX_DBI. */ typedef uint32_t MDBX_dbi; +#define MDBX_MAX_DBI UINT32_C(32765) /* Opaque structure for navigating through a database */ typedef struct MDBX_cursor MDBX_cursor; -/* Generic structure used for passing keys and data in and out - * of the database. +/* Generic structure used for passing keys and data in and out of the database. * * Values returned from the database are valid only until a subsequent * update operation, or the end of the transaction. Do not modify or * free them, they commonly point into the database itself. * - * Key sizes must be between 1 and mdbx_env_get_maxkeysize() inclusive. + * Key sizes must be between 0 and mdbx_env_get_maxkeysize() inclusive. * The same applies to data sizes in databases with the MDBX_DUPSORT flag. - * Other data items can in theory be from 0 to 0xffffffff bytes long. */ + * Other data items can in theory be from 0 to 0x7fffffff bytes long. + * + * (!) The notable difference between MDBX and LMDB is that MDBX supports + * zero-length keys. */ #ifndef HAVE_STRUCT_IOVEC struct iovec { - void *iov_base; - size_t iov_len; + void *iov_base /* pointer to some data */; + size_t iov_len /* the length of data in bytes */; }; #define HAVE_STRUCT_IOVEC #endif /* HAVE_STRUCT_IOVEC */ +#if defined(__sun) || defined(__SVR4) || defined(__svr4__) +/* The `iov_len` is signed on Sun/Solaris. 
+ * So define custom MDBX_val to avoid a lot of warnings. */ +typedef struct MDBX_val { + void *iov_base /* pointer to some data */; + size_t iov_len /* the length of data in bytes */; +} MDBX_val; +#else typedef struct iovec MDBX_val; +#endif /* The maximum size of a data item. * MDBX only store a 32 bit value for node sizes. */ #define MDBX_MAXDATASIZE INT32_MAX -/* A callback function used to compare two keys in a database */ -typedef int(MDBX_cmp_func)(const MDBX_val *a, const MDBX_val *b); +/**** DEBUG & LOGGING ********************************************************** + * Logging and runtime debug flags. + * + * NOTE: Most debug features are enabled only when libmdbx is built with + * the MDBX_DEBUG option. + */ + +/* Log level (requires build libmdbx with MDBX_DEBUG) */ +#define MDBX_LOG_FATAL 0 /* critical conditions, i.e. assertion failures */ +#define MDBX_LOG_ERROR 1 /* error conditions */ +#define MDBX_LOG_WARN 2 /* warning conditions */ +#define MDBX_LOG_NOTICE 3 /* normal but significant condition */ +#define MDBX_LOG_VERBOSE 4 /* verbose informational */ +#define MDBX_LOG_DEBUG 5 /* debug-level messages */ +#define MDBX_LOG_TRACE 6 /* trace debug-level messages */ +#define MDBX_LOG_EXTRA 7 /* extra debug-level messages (dump pgno lists) */ + +/* Runtime debug flags. + * + * MDBX_DBG_DUMP and MDBX_DBG_LEGACY_MULTIOPEN always have an effect, + * but MDBX_DBG_ASSERT, MDBX_DBG_AUDIT and MDBX_DBG_JITTER only if libmdbx + * is built with MDBX_DEBUG. */ + +#define MDBX_DBG_ASSERT 1 /* Enable assertion checks */ +#define MDBX_DBG_AUDIT 2 /* Enable pages usage audit at commit transactions */ +#define MDBX_DBG_JITTER 4 /* Enable small random delays in critical points */ +#define MDBX_DBG_DUMP 8 /* Include or not database(s) in coredump files */ +#define MDBX_DBG_LEGACY_MULTIOPEN 16 /* Enable multi-opening environment(s) */ + +/* A debug-logger callback function, + * called before printing the message and aborting. 
+ * + * [in] loglevel The severity of the message, presumably one of MDBX_LOG_xxx. + * [in] function Presumably the name of the function that emits the message. + * [in] line Presumably the source line number within that function. + * [in] msg The message, presumably a printf-like format string for `args`. + * [in] args The variadic arguments accompanying `msg`. */ +typedef void MDBX_debug_func(int loglevel, const char *function, int line, + const char *msg, va_list args); + +/* FIXME: Complete description */ +LIBMDBX_API int mdbx_setup_debug(int loglevel, int flags, + MDBX_debug_func *logger); + +/* A callback function for most MDBX assert() failures, + * called before printing the message and aborting. + * + * [in] env An environment handle returned by mdbx_env_create(). + * [in] msg The assertion message, not including newline. */ +typedef void MDBX_assert_func(const MDBX_env *env, const char *msg, + const char *function, unsigned line); -/* Environment Flags */ -/* no environment directory */ +/* Set or reset the assert() callback of the environment. + * + * Does nothing if libmdbx was built with MDBX_DEBUG=0 or with NDEBUG, + * and will return MDBX_ENOSYS in such case. + * + * [in] env An environment handle returned by mdbx_env_create(). + * [in] func An MDBX_assert_func function, or 0. + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func); + +/* FIXME: Complete description */ +LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, + const size_t bufsize); + +/**** THE FILES **************************************************************** + * At the file system level, the environment corresponds to a pair of files. 
*/ + +/* The name of the lock file in the environment */ +#define MDBX_LOCKNAME "/mdbx.lck" +/* The name of the data file in the environment */ +#define MDBX_DATANAME "/mdbx.dat" + +/* The suffix of the lock file when MDBX_NOSUBDIR is used */ +#define MDBX_LOCK_SUFFIX "-lck" + +/**** ENVIRONMENT FLAGS *******************************************************/ + +/* MDBX_NOSUBDIR = no environment directory. 
+ * + * By default, MDBX creates its environment in a directory whose pathname is + * given in path, and creates its data and lock files under that directory. + * With this option, path is used as-is for the database main data file. + * The database lock file is the path with "-lck" appended. + * + * - with MDBX_NOSUBDIR = in a filesystem we have the pair of MDBX-files which + * names derived from given pathname by appending predefined suffixes. + * + * - without MDBX_NOSUBDIR = in a filesystem we have the MDBX-directory with + * given pathname, within that a pair of MDBX-files with predefined names. + * + * This flag affects only at environment opening and can't be changed after. */ #define MDBX_NOSUBDIR 0x4000u -/* don't fsync after commit */ -#define MDBX_NOSYNC 0x10000u -/* read only */ + +/* MDBX_RDONLY = read only mode. + * + * Open the environment in read-only mode. No write operations will be allowed. + * MDBX will still modify the lock file - except on read-only filesystems, + * where MDBX does not use locks. + * + * - with MDBX_RDONLY = open environment in read-only mode. + * MDBX supports pure read-only mode (i.e. without opening LCK-file) only + * when environment directory and/or both files are not writable (and the + * LCK-file may be missing). In such case allowing file(s) to be placed + * on a network read-only share. + * + * - without MDBX_RDONLY = open environment in read-write mode. + * + * This flag affects only at environment opening but can't be changed after. */ #define MDBX_RDONLY 0x20000u -/* don't fsync metapage after commit */ -#define MDBX_NOMETASYNC 0x40000u -/* use writable mmap */ + +/* MDBX_EXCLUSIVE = open environment in exclusive/monopolistic mode. + * + * MDBX_EXCLUSIVE flag can be used as a replacement for MDB_NOLOCK, which isn't + * supported by MDBX. In this way, you can get the minimal overhead, but with + * the correct multi-process and multi-thread locking. 
+ * + * - with MDBX_EXCLUSIVE = open environment in exclusive/monopolistic mode + * or return MDBX_BUSY if environment already used by other process. + * The main feature of the exclusive mode is the ability to open the + * environment placed on a network share. + * + * - without MDBX_EXCLUSIVE = open environment in cooperative mode, + * i.e. for multi-process access/interaction/cooperation. + * The main requirements of the cooperative mode are: + * 1. data files MUST be placed in the LOCAL file system, + * but NOT on a network share. + * 2. environment MUST be opened only by LOCAL processes, + * but NOT over a network. + * 3. OS kernel (i.e. file system and memory mapping implementation) and + * all processes that open the given environment MUST be running + * in the physically single RAM with cache-coherency. The only + * exception for cache-consistency requirement is Linux on MIPS + * architecture, but this case has not been tested for a long time. + * + * This flag affects only at environment opening but can't be changed after. */ +#define MDBX_EXCLUSIVE 0x400000u + +/* MDBX_ACCEDE = using a database which is already opened by another + * process(es). + * + * The MDBX_ACCEDE flag avoids the MDBX_INCOMPATIBLE error while opening if the + * database is already used by another process(es) and environment mode/flags + * isn't compatible. In such cases, when using the MDBX_ACCEDE flag, instead of + * the specified incompatible options, the mode in which the database is already + * opened by other processes will be used, including MDBX_LIFORECLAIM, + * MDBX_COALESCE and MDBX_NORDAHEAD. The MDBX_ACCEDE flag is useful to open a + * database that is already used by another process(es) when the mode/flags in + * use aren't known. + * + * MDBX_ACCEDE has no effect if the current process is the only one either + * opening the DB in read-only mode or other process(es) uses the DB in + * read-only mode. 
*/ +#define MDBX_ACCEDE 0x40000000u + +/* MDBX_WRITEMAP = map data into memory with write permission. + * + * Use a writeable memory map unless MDBX_RDONLY is set. This uses fewer mallocs + * and requires much less work for tracking database pages, but loses protection + * from application bugs like wild pointer writes and other bad updates into the + * database. This may be slightly faster for DBs that fit entirely in RAM, but + * is slower for DBs larger than RAM. Also adds the possibility for stray + * application writes thru pointers to silently corrupt the database. + * Incompatible with nested transactions. + * + * NOTE: The MDBX_WRITEMAP mode is incompatible with nested transactions, since + * this is unreasonable. I.e. nested transactions requires mallocation of + * database pages and more work for tracking ones, which neuters a + * performance boost caused by the MDBX_WRITEMAP mode. + * + * NOTE: MDBX don't allow to mix processes with and without MDBX_WRITEMAP on + * the same environment. In such case MDBX_INCOMPATIBLE will be generated. + * + * - with MDBX_WRITEMAP = all data will be mapped into memory in the read-write + * mode. This offers a significant performance benefit, since the data will + * be modified directly in mapped memory and then flushed to disk by + * single system call, without any memory management nor copying. + * (!) On the other hand, MDBX_WRITEMAP adds the possibility for stray + * application writes thru pointers to silently corrupt the database. + * Moreover, MDBX_WRITEMAP disallows nested write transactions. + * + * - without MDBX_WRITEMAP = data will be mapped into memory in the read-only + * mode. This requires stocking all modified database pages in memory and + * then writing them to disk through file operations. + * + * This flag affects only at environment opening but can't be changed after. 
*/ #define MDBX_WRITEMAP 0x80000u -/* use asynchronous msync when MDBX_WRITEMAP is used */ -#define MDBX_MAPASYNC 0x100000u -/* tie reader locktable slots to MDBX_txn objects instead of to threads */ + +/* MDBX_NOTLS = tie reader locktable slots to read-only transactions instead + * of to threads. + * + * Don't use Thread-Local Storage, instead tie reader locktable slots to + * MDBX_txn objects instead of to threads. So, mdbx_txn_reset() keeps the slot + * reserved for the MDBX_txn object. A thread may use parallel read-only + * transactions. And a read-only transaction may span threads if you + * synchronizes its use. + * + * Applications that multiplex many user threads over individual OS threads need + * this option. Such an application must also serialize the write transactions + * in an OS thread, since MDBX's write locking is unaware of the user threads. + * + * NOTE: Regardless to MDBX_NOTLS flag a write transaction entirely should + * always be used in one thread from start to finish. MDBX checks this in a + * reasonable manner and return the MDBX_THREAD_MISMATCH error in rules + * violation. + * + * This flag affects only at environment opening but can't be changed after. */ #define MDBX_NOTLS 0x200000u -/* open DB in exclusive/monopolistic mode. */ -#define MDBX_EXCLUSIVE 0x400000u -/* don't do readahead */ + +/* MDBX_NORDAHEAD = don't do readahead. + * + * Turn off readahead. Most operating systems perform readahead on read requests + * by default. This option turns it off if the OS supports it. Turning it off + * may help random read performance when the DB is larger than RAM and system + * RAM is full. + * + * By default libmdbx dynamically enables/disables readahead depending on the + * actual database size and currently available memory. On the other hand, such + * automation has some limitation, i.e. 
could be performed only when DB size + * changing but can't tracks and reacts changing a free RAM availability, since + * it changes independently and asynchronously. + * + * NOTE: The mdbx_is_readahead_reasonable() function allows to quickly find out + * whether to use readahead or not based on the size of the data and the + * amount of available memory. + * + * This flag affects only at environment opening and can't be changed after. */ #define MDBX_NORDAHEAD 0x800000u -/* don't initialize malloc'd memory before writing to datafile */ + +/* MDBX_NOMEMINIT = don't initialize malloc'd memory before writing to datafile. + * + * Don't initialize malloc'd memory before writing to unused spaces in the data + * file. By default, memory for pages written to the data file is obtained using + * malloc. While these pages may be reused in subsequent transactions, freshly + * malloc'd pages will be initialized to zeroes before use. This avoids + * persisting leftover data from other code (that used the heap and subsequently + * freed the memory) into the data file. + * + * Note that many other system libraries may allocate and free memory from the + * heap for arbitrary uses. E.g., stdio may use the heap for file I/O buffers. + * This initialization step has a modest performance cost so some applications + * may want to disable it using this flag. This option can be a problem for + * applications which handle sensitive data like passwords, and it makes memory + * checkers like Valgrind noisy. This flag is not needed with MDBX_WRITEMAP, + * which writes directly to the mmap instead of using malloc for pages. The + * initialization is also skipped if MDBX_RESERVE is used; the caller is + * expected to overwrite all of the memory that was reserved in that case. + * + * This flag may be changed at any time using mdbx_env_set_flags(). */ #define MDBX_NOMEMINIT 0x1000000u -/* aim to coalesce FreeDB records */ + +/* MDBX_COALESCE = aims to coalesce a Garbage Collection items. 
+ * + * With MDBX_COALESCE flag MDBX will aim to coalesce items while recycling + * a Garbage Collection. Technically, when possible short lists of pages will + * be combined into longer ones, but to fit on one database page. As a result, + * there will be fewer items in Garbage Collection and page lists are longer, + * which slightly increases the likelihood of returning pages to Unallocated + * space and reducing the database file. + * + * This flag may be changed at any time using mdbx_env_set_flags(). */ #define MDBX_COALESCE 0x2000000u -/* LIFO policy for reclaiming FreeDB records */ + +/* MDBX_LIFORECLAIM = LIFO policy for recycling Garbage Collection items. + * + * MDBX_LIFORECLAIM flag turns on LIFO policy for recycling Garbage + * Collection items, instead of FIFO by default. On systems with a disk + * write-back cache, this can significantly increase write performance, up to + * several times in a best case scenario. + * + * LIFO recycling policy means that for reuse pages will be taken which became + * unused the latest (i.e. just now or most recently). Therefore the loop of + * database pages circulation becomes as short as possible. In other words, the + * number of pages, that are overwritten in memory and on disk during a series + * of write transactions, will be as small as possible. This creates ideal + * conditions for the efficient operation of the disk write-back cache. + * + * MDBX_LIFORECLAIM is compatible with all no-sync flags (i.e. MDBX_NOMETASYNC, + * MDBX_NOSYNC, MDBX_UTTERLY_NOSYNC, MDBX_MAPASYNC), but gives no noticeable + * impact in combination with MDBX_NOSYNC and MDBX_MAPASYNC. Because MDBX will + * not reuse pages from the last "steady" MVCC-snapshot and later, i.e. the + * loop length of database pages circulation will be mostly defined by frequency + * of calling mdbx_env_sync() rather than LIFO and FIFO difference. + * + * This flag may be changed at any time using mdbx_env_set_flags(). 
*/ #define MDBX_LIFORECLAIM 0x4000000u -/* make a steady-sync only on close and explicit env-sync */ -#define MDBX_UTTERLY_NOSYNC (MDBX_NOSYNC | MDBX_MAPASYNC) -/* debuging option, fill/perturb released pages */ + +/* Debugging option, fill/perturb released pages. */ #define MDBX_PAGEPERTURB 0x8000000u -/* Database Flags */ -/* use reverse string keys */ +/**** SYNC MODES *************************************************************** + * (!!!) Using any combination of MDBX_NOSYNC, MDBX_NOMETASYNC, MDBX_MAPASYNC + * and especially MDBX_UTTERLY_NOSYNC is always a deal to reduce durability + * for gain write performance. You must know exactly what you are doing and + * what risks you are taking! + * + * NOTE for LMDB users: MDBX_NOSYNC is NOT similar to LMDB_NOSYNC, but + * MDBX_UTTERLY_NOSYNC is exactly match LMDB_NOSYNC. + * See details below. + * + * THE SCENE: + * - The DAT-file contains several MVCC-snapshots of B-tree at same time, + * each of those B-tree has its own root page. + * - Each of meta pages at the beginning of the DAT file contains a pointer + * to the root page of B-tree which is the result of the particular + * transaction, and a number of this transaction. + * - For data durability, MDBX must first write all MVCC-snapshot data pages + * and ensure that are written to the disk, then update a meta page with + * the new transaction number and a pointer to the corresponding new root + * page, and flush any buffers yet again. + * - Thus during commit a I/O buffers should be flushed to the disk twice; + * i.e. fdatasync(), FlushFileBuffers() or similar syscall should be called + * twice for each commit. This is very expensive for performance, but + * guaranteed durability even on unexpected system failure or power outage. + * Of course, provided that the operating system and the underlying hardware + * (e.g. disk) work correctly. 
+ * + * TRADE-OFF: By skipping some stages described above, you can significantly + * benefit in speed, while partially or completely losing in the guarantee of + * data durability and/or consistency in the event of system or power failure. + * Moreover, if for any reason disk write order is not preserved, then at moment + * of a system crash, a meta-page with a pointer to the new B-tree may be + * written to disk, while the itself B-tree not yet. In that case, the database + * will be corrupted! + * + * + * MDBX_NOMETASYNC = don't sync the meta-page after commit. + * + * Flush system buffers to disk only once per transaction, omit the + * metadata flush. Defer that until the system flushes files to disk, + * or next non-MDBX_RDONLY commit or mdbx_env_sync(). Depending on the + * platform and hardware, with MDBX_NOMETASYNC you may get a doubling of + * write performance. + * + * This trade-off maintains database integrity, but a system crash may + * undo the last committed transaction. I.e. it preserves the ACI + * (atomicity, consistency, isolation) but not D (durability) database + * property. + * + * MDBX_NOMETASYNC flag may be changed at any time using + * mdbx_env_set_flags() or by passing to mdbx_txn_begin() for particular + * write transaction. + * + * + * MDBX_UTTERLY_NOSYNC = don't sync anything and wipe previous steady commits. + * + * Don't flush system buffers to disk when committing a transaction. This + * optimization means a system crash can corrupt the database, if buffers + * are not yet flushed to disk. Depending on the platform and hardware, + * with MDBX_UTTERLY_NOSYNC you may get a multiple increase of write + * performance, even 100 times or more. 
+ * + * If the filesystem preserves write order (which is rare and never + * provided unless explicitly noted) and the MDBX_WRITEMAP and + * MDBX_LIFORECLAIM flags are not used, then a system crash can't corrupt + * the database, but you can lose the last transactions, if at least one + * buffer is not yet flushed to disk. The risk is governed by how often the + * system flushes dirty buffers to disk and how often mdbx_env_sync() is + * called. So, transactions exhibit ACI (atomicity, consistency, isolation) + * properties and only lose D (durability). I.e. database integrity is + * maintained, but a system crash may undo the final transactions. + * + * Otherwise, if the filesystem does not preserve write order (which is + * typical) or MDBX_WRITEMAP or MDBX_LIFORECLAIM flags are used, you + * should expect the corrupted database after a system crash. + * + * So, most important thing about MDBX_UTTERLY_NOSYNC: + * - a system crash immediately after committing the write transaction + * will highly likely lead to database corruption. + * - successful completion of mdbx_env_sync(force = true) after one or + * more committed transactions guarantees consistency and durability. + * - BUT by committing two or more transactions you put the database back into a + * weak state, in which a system crash may lead to database corruption! + * In case of a single transaction after mdbx_env_sync(), you may lose + * the transaction itself, but not a whole database. + * + * Nevertheless, MDBX_UTTERLY_NOSYNC provides ACID in case of an application + * crash, and therefore may be very useful in scenarios where data + * durability is not required over a system failure (e.g. for short-lived + * data), or if you can ignore such risk. + * + * MDBX_UTTERLY_NOSYNC flag may be changed at any time using + * mdbx_env_set_flags(), but has no effect if passed to mdbx_txn_begin() + * for particular write transaction. + * + * + * MDBX_NOSYNC = don't sync anything but keep previous steady commits.
+ * + * Like MDBX_UTTERLY_NOSYNC the MDBX_NOSYNC flag similarly disables flushing + * system buffers to disk when committing a transaction. But there is a + * huge difference in how the MVCC snapshots corresponding + * to previous "steady" transactions are recycled (see below). + * + * Depending on the platform and hardware, with MDBX_NOSYNC you may get + * a multiple increase of write performance, even 10 times or more. + * NOTE that (MDBX_NOSYNC | MDBX_WRITEMAP) leaves the system with no hint + * for when to write transactions to disk. Therefore the (MDBX_MAPASYNC | + * MDBX_WRITEMAP) may be preferable, but without MDBX_NOSYNC because + * the (MDBX_MAPASYNC | MDBX_NOSYNC) actually gives MDBX_UTTERLY_NOSYNC. + * + * In contrast to MDBX_UTTERLY_NOSYNC mode, with MDBX_NOSYNC flag MDBX will + * keep untouched pages within B-tree of the last transaction "steady" + * which was synced to disk completely. This has big implications for both + * data durability and (unfortunately) performance: + * - a system crash can't corrupt the database, but you will lose the + * last transactions; because MDBX will rollback to last steady commit + * since it is kept explicitly. + * - the last steady transaction makes an effect similar to "long-lived" + * read transaction (see above in the "RESTRICTIONS & CAVEATS" section) + * since it prevents reuse of pages freed by newer write transactions, + * thus any data changes will be placed in newly allocated pages. + * - to avoid rapid database growth, the system will sync data and issue + * a steady commit-point to resume reuse pages, each time there is + * insufficient space and before increasing the size of the file on + * disk. + * + * In other words, with MDBX_NOSYNC flag MDBX protects you from the whole + * database corruption, at the cost of increasing database size and/or number + * of disk IOPS. So, MDBX_NOSYNC flag could be used with mdbx_env_sync() + * as an alternative to batch committing or nested transaction (in some + * cases).
As well, auto-sync feature exposed by mdbx_env_set_syncbytes() + * and mdbx_env_set_syncperiod() functions could be very useful with + * MDBX_NOSYNC flag. + * + * The number and volume of disk IOPS with MDBX_NOSYNC flag will be + * exactly the same as without any no-sync flags. However, you should expect + * a larger process's work set (https://bit.ly/2kA2tFX) and significantly + * a worse locality of reference (https://bit.ly/2mbYq2J), due to the + * more intensive allocation of previously unused pages and increase the + * size of the database. + * + * MDBX_NOSYNC flag may be changed at any time using + * mdbx_env_set_flags() or by passing to mdbx_txn_begin() for particular + * write transaction. + * + * + * MDBX_MAPASYNC = use asynchronous msync when MDBX_WRITEMAP is used. + * + * MDBX_MAPASYNC is meaningful and takes effect only in conjunction + * with MDBX_WRITEMAP or MDBX_NOSYNC: + * - with MDBX_NOSYNC actually gives MDBX_UTTERLY_NOSYNC, which + * wipe previous steady commits for reuse pages as described above. + * - with MDBX_WRITEMAP but without MDBX_NOSYNC instructs MDBX to use + * asynchronous mmap-flushes to disk as described below. + * - with both MDBX_WRITEMAP and MDBX_NOSYNC you get the both effects. + * + * Asynchronous mmap-flushes means that actually all writes will be scheduled + * and performed by the operating system in its own manner, i.e. unordered. + * MDBX itself just notifies the operating system that it would be nice to write + * data to disk, but no more. + * + * With MDBX_MAPASYNC flag, but without MDBX_UTTERLY_NOSYNC (i.e. without + * OR'ing with MDBX_NOSYNC) MDBX will keep untouched pages within B-tree + * of the last transaction "steady" which was synced to disk completely. + * So, this makes exactly the same "long-lived" impact and the same + * consequences as described above for MDBX_NOSYNC flag.
+ * + * Depending on the platform and hardware, with combination of + * MDBX_WRITEMAP and MDBX_MAPASYNC you may get a multiple increase of write + * performance, even 25 times or more. MDBX_MAPASYNC flag may be changed at + * any time using mdbx_env_set_flags() or by passing to mdbx_txn_begin() + * for particular write transaction. + */ + +/* Don't sync meta-page after commit, + * see description in the "SYNC MODES" section above. */ +#define MDBX_NOMETASYNC 0x40000u + +/* Don't sync anything but keep previous steady commits, + * see description in the "SYNC MODES" section above. + * + * (!) don't combine this flag with MDBX_MAPASYNC + * since you will got MDBX_UTTERLY_NOSYNC in that way (see below) */ +#define MDBX_NOSYNC 0x10000u + +/* Use asynchronous msync when MDBX_WRITEMAP is used, + * see description in the "SYNC MODES" section above. + * + * (!) don't combine this flag with MDBX_NOSYNC + * since you will got MDBX_UTTERLY_NOSYNC in that way (see below) */ +#define MDBX_MAPASYNC 0x100000u + +/* Don't sync anything and wipe previous steady commits, + * see description in the "SYNC MODES" section above. */ +#define MDBX_UTTERLY_NOSYNC (MDBX_NOSYNC | MDBX_MAPASYNC) + +/**** DATABASE FLAGS **********************************************************/ +/* Use reverse string keys */ #define MDBX_REVERSEKEY 0x02u -/* use sorted duplicates */ +/* Use sorted duplicates */ #define MDBX_DUPSORT 0x04u -/* numeric keys in native byte order, either uint32_t or uint64_t. +/* Numeric keys in native byte order, either uint32_t or uint64_t. * The keys must all be of the same size. 
*/ #define MDBX_INTEGERKEY 0x08u -/* with MDBX_DUPSORT, sorted dup items have fixed size */ +/* With MDBX_DUPSORT, sorted dup items have fixed size */ #define MDBX_DUPFIXED 0x10u -/* with MDBX_DUPSORT, dups are MDBX_INTEGERKEY-style integers */ +/* With MDBX_DUPSORT, dups are MDBX_INTEGERKEY-style integers */ #define MDBX_INTEGERDUP 0x20u -/* with MDBX_DUPSORT, use reverse string dups */ +/* With MDBX_DUPSORT, use reverse string dups */ #define MDBX_REVERSEDUP 0x40u -/* create DB if not already existing */ +/* Create DB if not already existing */ #define MDBX_CREATE 0x40000u -/* Write Flags */ +/**** DATA UPDATE FLAGS *******************************************************/ /* For put: Don't write if the key already exists. */ #define MDBX_NOOVERWRITE 0x10u /* Only for MDBX_DUPSORT @@ -355,16 +1322,15 @@ typedef int(MDBX_cmp_func)(const MDBX_val *a, const MDBX_val *b); /* Store multiple data items in one call. Only for MDBX_DUPFIXED. */ #define MDBX_MULTIPLE 0x80000u -/* Transaction Flags */ +/**** TRANSACTION FLAGS *******************************************************/ /* Do not block when starting a write transaction */ #define MDBX_TRYTXN 0x10000000u -/* Copy Flags */ -/* Compacting copy: Omit free space from copy, and renumber all - * pages sequentially. */ +/**** ENVIRONMENT COPY FLAGS **************************************************/ +/* Compacting: Omit free space from copy, and renumber all pages sequentially */ #define MDBX_CP_COMPACT 1u -/* Cursor Get operations. +/**** CURSOR OPERATIONS ******************************************************** * * This is the set of all operations for retrieving data * using a cursor. */ @@ -384,8 +1350,8 @@ typedef enum MDBX_cursor_op { MDBX_NEXT, /* Position at next data item */ MDBX_NEXT_DUP, /* MDBX_DUPSORT-only: Position at next data item * of current key. */ - MDBX_NEXT_MULTIPLE, /* MDBX_DUPFIXED-only: Return up to a page of duplicate - * data items from next cursor position. 
+ MDBX_NEXT_MULTIPLE, /* MDBX_DUPFIXED-only: Return up to a page of + * duplicate data items from next cursor position. * Move cursor to prepare for MDBX_NEXT_MULTIPLE. */ MDBX_NEXT_NODUP, /* Position at first data item of next key */ MDBX_PREV, /* Position at previous data item */ @@ -400,12 +1366,13 @@ typedef enum MDBX_cursor_op { * return up to a page of duplicate data items. */ } MDBX_cursor_op; -/* Return Codes +/**** ERRORS & RETURN CODES **************************************************** * BerkeleyDB uses -30800 to -30999, we'll go under them */ /* Successful result */ #define MDBX_SUCCESS 0 #define MDBX_RESULT_FALSE MDBX_SUCCESS +/* Successful result with special meaning or a flag */ #define MDBX_RESULT_TRUE (-1) /* key/data pair already exists */ @@ -414,9 +1381,9 @@ typedef enum MDBX_cursor_op { #define MDBX_NOTFOUND (-30798) /* Requested page not found - this usually indicates corruption */ #define MDBX_PAGE_NOTFOUND (-30797) -/* Located page was wrong type */ +/* Database is corrupted (page was wrong type and so on) */ #define MDBX_CORRUPTED (-30796) -/* Update of meta page failed or environment had fatal error */ +/* Environment had fatal error (i.e. update of meta page failed and so on) */ #define MDBX_PANIC (-30795) /* DB file version mismatch with libmdbx */ #define MDBX_VERSION_MISMATCH (-30794) @@ -452,13 +1419,14 @@ typedef enum MDBX_cursor_op { #define MDBX_BAD_DBI (-30780) /* Unexpected problem - txn should abort */ #define MDBX_PROBLEM (-30779) -/* Another write transaction is running */ +/* Another write transaction is running or environment is already used while + * opening with MDBX_EXCLUSIVE flag */ #define MDBX_BUSY (-30778) /* The last defined error code */ #define MDBX_LAST_ERRCODE MDBX_BUSY /* The mdbx_put() or mdbx_replace() was called for key, - that has more that one associated value. */ + * that has more that one associated value. 
*/ #define MDBX_EMULTIVAL (-30421) /* Bad signature of a runtime object(s), this can mean: @@ -482,40 +1450,7 @@ typedef enum MDBX_cursor_op { * e.g. a transaction that started by another thread. */ #define MDBX_THREAD_MISMATCH (-30416) -/* Statistics for a database in the environment */ -typedef struct MDBX_stat { - uint32_t ms_psize; /* Size of a database page. - * This is currently the same for all databases. */ - uint32_t ms_depth; /* Depth (height) of the B-tree */ - uint64_t ms_branch_pages; /* Number of internal (non-leaf) pages */ - uint64_t ms_leaf_pages; /* Number of leaf pages */ - uint64_t ms_overflow_pages; /* Number of overflow pages */ - uint64_t ms_entries; /* Number of data items */ -} MDBX_stat; - -/* Information about the environment */ -typedef struct MDBX_envinfo { - struct { - uint64_t lower; /* lower limit for datafile size */ - uint64_t upper; /* upper limit for datafile size */ - uint64_t current; /* current datafile size */ - uint64_t shrink; /* shrink threshold for datafile */ - uint64_t grow; /* growth step for datafile */ - } mi_geo; - uint64_t mi_mapsize; /* Size of the data memory map */ - uint64_t mi_last_pgno; /* ID of the last used page */ - uint64_t mi_recent_txnid; /* ID of the last committed transaction */ - uint64_t mi_latter_reader_txnid; /* ID of the last reader transaction */ - uint64_t mi_self_latter_reader_txnid; /* ID of the last reader transaction of - caller process */ - uint64_t mi_meta0_txnid, mi_meta0_sign; - uint64_t mi_meta1_txnid, mi_meta1_sign; - uint64_t mi_meta2_txnid, mi_meta2_sign; - uint32_t mi_maxreaders; /* max reader slots in the environment */ - uint32_t mi_numreaders; /* max reader slots used in the environment */ - uint32_t mi_dxb_pagesize; /* database pagesize */ - uint32_t mi_sys_pagesize; /* system pagesize */ -} MDBX_envinfo; +/**** FUNCTIONS & RELATED STRUCTURES ******************************************/ /* Return a string describing a given error code. 
* @@ -525,164 +1460,111 @@ typedef struct MDBX_envinfo { * is less than 0, an error string corresponding to the MDBX library error is * returned. See errors for a list of MDBX-specific error codes. * - * [in] err The error code + * mdbx_strerror() - is NOT thread-safe because it may share a common internal + * buffer for system messages. The returned string must + * NOT be modified by the application, but MAY be modified + * by a subsequent call to mdbx_strerror(), strerror() and + * other related functions. + * + * mdbx_strerror_r() - is thread-safe since it uses the user-supplied buffer where + * appropriate. The returned string must NOT be modified + * by the application, since it may be a pointer to an internal + * constant string. However, there is no restriction if the + * returned string points to the supplied buffer. + * + * [in] err The error code. * - * Returns "error message" The description of the error */ + * Returns "error message" The description of the error. */ LIBMDBX_API const char *mdbx_strerror(int errnum); LIBMDBX_API const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); -/* Create an MDBX environment handle. +#if defined(_WIN32) || defined(_WIN64) +/* Bit of Windows' madness. The similar functions but returns Windows + * error-messages in the OEM-encoding for console utilities. */ +LIBMDBX_API const char *mdbx_strerror_ANSI2OEM(int errnum); +LIBMDBX_API const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, + size_t buflen); +#endif /* Bit of Windows' madness */ + +/* Create an MDBX environment instance. * * This function allocates memory for a MDBX_env structure. To release * the allocated memory and discard the handle, call mdbx_env_close(). * Before the handle may be used, it must be opened using mdbx_env_open(). + * * Various other options may also need to be set before opening the handle, - * e.g. mdbx_env_set_mapsize(), mdbx_env_set_maxreaders(), + * e.g.
mdbx_env_set_geometry(), mdbx_env_set_maxreaders(), * mdbx_env_set_maxdbs(), depending on usage requirements. * - * [out] env The address where the new handle will be stored + * [out] env The address where the new handle will be stored. * - * Returns A non-zero error value on failure and 0 on success. */ + * Returns a non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); -/* Open an environment handle. +/* Open an environment instance. * - * If this function fails, mdbx_env_close() must be called to discard - * the MDBX_env handle. + * Indifferently this function will fails or not, the mdbx_env_close() must be + * called later to discard the MDBX_env handle and release associated resources. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] path The directory in which the database files reside. - * This directory must already exist and be writable. - * [in] flags Special options for this environment. This parameter - * must be set to 0 or by bitwise OR'ing together one - * or more of the values described here. + * [in] env An environment handle returned by mdbx_env_create() + * [in] pathname The directory in which the database files reside. + * This directory must already exist and be writable. + * [in] flags Special options for this environment. This parameter + * must be set to 0 or by bitwise OR'ing together one + * or more of the values described above in the + * "ENVIRONMENT FLAGS" and "SYNC MODES" sections. * * Flags set by mdbx_env_set_flags() are also used: - * - MDBX_NOSUBDIR - * By default, MDBX creates its environment in a directory whose - * pathname is given in path, and creates its data and lock files - * under that directory. With this option, path is used as-is for - * the database main data file. The database lock file is the path - * with "-lock" appended. 
+ * - MDBX_NOSUBDIR, MDBX_RDONLY, MDBX_EXCLUSIVE, MDBX_WRITEMAP, MDBX_NOTLS, + * MDBX_NORDAHEAD, MDBX_NOMEMINIT, MDBX_COALESCE, MDBX_LIFORECLAIM. + * See "ENVIRONMENT FLAGS" section above. * - * - MDBX_RDONLY - * Open the environment in read-only mode. No write operations will - * be allowed. MDBX will still modify the lock file - except on - * read-only filesystems, where MDBX does not use locks. - * - * - MDBX_WRITEMAP - * Use a writeable memory map unless MDBX_RDONLY is set. This uses fewer - * mallocs but loses protection from application bugs like wild pointer - * writes and other bad updates into the database. - * This may be slightly faster for DBs that fit entirely in RAM, - * but is slower for DBs larger than RAM. - * Incompatible with nested transactions. - * Do not mix processes with and without MDBX_WRITEMAP on the same - * environment. This can defeat durability (mdbx_env_sync etc). - * - * - MDBX_NOMETASYNC - * Flush system buffers to disk only once per transaction, omit the - * metadata flush. Defer that until the system flushes files to disk, - * or next non-MDBX_RDONLY commit or mdbx_env_sync(). This optimization - * maintains database integrity, but a system crash may undo the last - * committed transaction. I.e. it preserves the ACI (atomicity, - * consistency, isolation) but not D (durability) database property. - * This flag may be changed at any time using mdbx_env_set_flags(). - * - * - MDBX_NOSYNC - * Don't flush system buffers to disk when committing a transaction. - * This optimization means a system crash can corrupt the database or - * lose the last transactions if buffers are not yet flushed to disk. - * The risk is governed by how often the system flushes dirty buffers - * to disk and how often mdbx_env_sync() is called. 
However, if the - * filesystem preserves write order and the MDBX_WRITEMAP and/or - * MDBX_LIFORECLAIM flags are not used, transactions exhibit ACI - * (atomicity, consistency, isolation) properties and only lose D - * (durability). I.e. database integrity is maintained, but a system - * crash may undo the final transactions. - * - * Note that (MDBX_NOSYNC | MDBX_WRITEMAP) leaves the system with no - * hint for when to write transactions to disk. - * Therefore the (MDBX_MAPASYNC | MDBX_WRITEMAP) may be preferable. - * This flag may be changed at any time using mdbx_env_set_flags(). - * - * - MDBX_UTTERLY_NOSYNC (internally MDBX_NOSYNC | MDBX_MAPASYNC) - * FIXME: TODO - * - * - MDBX_MAPASYNC - * When using MDBX_WRITEMAP, use asynchronous flushes to disk. As with - * MDBX_NOSYNC, a system crash can then corrupt the database or lose - * the last transactions. Calling mdbx_env_sync() ensures on-disk - * database integrity until next commit. This flag may be changed at - * any time using mdbx_env_set_flags(). - * - * - MDBX_NOTLS - * Don't use Thread-Local Storage. Tie reader locktable slots to - * MDBX_txn objects instead of to threads. I.e. mdbx_txn_reset() keeps - * the slot reserved for the MDBX_txn object. A thread may use parallel - * read-only transactions. A read-only transaction may span threads if - * the user synchronizes its use. Applications that multiplex many - * user threads over individual OS threads need this option. Such an - * application must also serialize the write transactions in an OS - * thread, since MDBX's write locking is unaware of the user threads. - * - * - MDBX_NOLOCK (don't supported by MDBX) - * Don't do any locking. If concurrent access is anticipated, the - * caller must manage all concurrency itself. For proper operation - * the caller must enforce single-writer semantics, and must ensure - * that no readers are using old transactions while a writer is - * active. 
The simplest approach is to use an exclusive lock so that - * no readers may be active at all when a writer begins. - * - * - MDBX_NORDAHEAD - * Turn off readahead. Most operating systems perform readahead on - * read requests by default. This option turns it off if the OS - * supports it. Turning it off may help random read performance - * when the DB is larger than RAM and system RAM is full. - * - * - MDBX_NOMEMINIT - * Don't initialize malloc'd memory before writing to unused spaces - * in the data file. By default, memory for pages written to the data - * file is obtained using malloc. While these pages may be reused in - * subsequent transactions, freshly malloc'd pages will be initialized - * to zeroes before use. This avoids persisting leftover data from other - * code (that used the heap and subsequently freed the memory) into the - * data file. Note that many other system libraries may allocate and free - * memory from the heap for arbitrary uses. E.g., stdio may use the heap - * for file I/O buffers. This initialization step has a modest performance - * cost so some applications may want to disable it using this flag. This - * option can be a problem for applications which handle sensitive data - * like passwords, and it makes memory checkers like Valgrind noisy. This - * flag is not needed with MDBX_WRITEMAP, which writes directly to the - * mmap instead of using malloc for pages. The initialization is also - * skipped if MDBX_RESERVE is used; the caller is expected to overwrite - * all of the memory that was reserved in that case. This flag may be - * changed at any time using mdbx_env_set_flags(). - * - * - MDBX_COALESCE - * Aim to coalesce records while reclaiming FreeDB. This flag may be - * changed at any time using mdbx_env_set_flags(). - * FIXME: TODO - * - * - MDBX_LIFORECLAIM - * LIFO policy for reclaiming FreeDB records. This significantly reduce - * write IPOs in case MDBX_NOSYNC with periodically checkpoints. 
- * FIXME: TODO - * - * [in] mode The UNIX permissions to set on created files. + * - MDBX_NOMETASYNC, MDBX_NOSYNC, MDBX_UTTERLY_NOSYNC, MDBX_MAPASYNC. + * See "SYNC MODES" section above. + * + * NOTE: MDB_NOLOCK flag is not supported by MDBX, + * try using MDBX_EXCLUSIVE as a replacement. + * + * NOTE: MDBX doesn't allow mixing processes with different MDBX_WRITEMAP, + * MDBX_NOSYNC, MDBX_NOMETASYNC, MDBX_MAPASYNC flags on the same + * environment. In such case MDBX_INCOMPATIBLE will be returned. + * + * If the database already exists and parameters specified earlier by + * mdbx_env_set_geometry() are incompatible (i.e. for instance, different page + * size) then mdbx_env_open() will return MDBX_INCOMPATIBLE error. + * + * [in] mode The UNIX permissions to set on created files. Zero value means + * to open existing, but do not create. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_VERSION_MISMATCH - the version of the MDBX library doesn't match the + * - MDBX_VERSION_MISMATCH = the version of the MDBX library doesn't match the * version that created the database environment. - * - MDBX_INVALID - the environment file headers are corrupted. - * - MDBX_ENOENT - the directory specified by the path parameter - * doesn't exist. - * - MDBX_EACCES - the user didn't have permission to access - * the environment files. - * - MDBX_EAGAIN - the environment was locked by another process. */ -LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, - mode_t mode); + * - MDBX_INVALID = the environment file headers are corrupted. + * - MDBX_ENOENT = the directory specified by the pathname parameter + * doesn't exist. + * - MDBX_EACCES = the user didn't have permission to access + * the environment files. + * - MDBX_EAGAIN = the environment was locked by another process.
+ * - MDBX_BUSY = MDBX_EXCLUSIVE flag was specified and the + * environment is in use by another process, + * or the current process tries to open environment + * more than once. + * - MDBX_INCOMPATIBLE = Environment is already opened by another process, + * but with different set of MDBX_WRITEMAP, + * MDBX_NOSYNC, MDBX_NOMETASYNC, MDBX_MAPASYNC + * flags. + * Or if the database is already exist and + * parameters specified early by + * mdbx_env_set_geometry() are incompatible (i.e. + * for instance, different page size). + * - MDBX_WANNA_RECOVERY = MDBX_RDONLY flag was specified but read-write + * access is required to rollback inconsistent state + * after a system crash. + * - MDBX_TOO_LARGE = Database is too large for this process, i.e. + * 32-bit process tries to open >4Gb database. */ +LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *pathname, + unsigned flags, mode_t mode); /* Copy an MDBX environment to the specified path, with options. * @@ -694,7 +1576,7 @@ LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, * * [in] env An environment handle returned by mdbx_env_create(). It must * have already been opened successfully. - * [in] path The directory in which the copy will reside. This directory + * [in] dest The directory in which the copy will reside. This directory * must already exist and be writable but must otherwise be empty. * [in] flags Special options for this operation. This parameter must be set * to 0 or by bitwise OR'ing together one or more of the values @@ -706,11 +1588,8 @@ LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, * CPU for processing, but may running quickly than the default, on * account skipping free pages. * - * NOTE: Currently it fails if the environment has suffered a page leak. - * * Returns A non-zero error value on failure and 0 on success. 
*/ -LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest_path, - unsigned flags); +LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest, unsigned flags); /* Copy an MDBX environment to the specified file descriptor, * with options. @@ -720,8 +1599,11 @@ LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest_path, * mdbx_env_copy() for further details. * * NOTE: This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under "Caveats" section. + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under "Caveats" section. + * + * NOTE: Fails if the environment has suffered a page leak and the destination + * file descriptor is associated with a pipe, socket, or FIFO. * * [in] env An environment handle returned by mdbx_env_create(). It must * have already been opened successfully. @@ -734,43 +1616,199 @@ LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest_path, LIBMDBX_API int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, unsigned flags); +/* Statistics for a database in the environment */ +typedef struct MDBX_stat { + uint32_t ms_psize; /* Size of a database page. + * This is the same for all databases. */ + uint32_t ms_depth; /* Depth (height) of the B-tree */ + uint64_t ms_branch_pages; /* Number of internal (non-leaf) pages */ + uint64_t ms_leaf_pages; /* Number of leaf pages */ + uint64_t ms_overflow_pages; /* Number of overflow pages */ + uint64_t ms_entries; /* Number of data items */ + uint64_t ms_mod_txnid; /* Transaction ID of commited last modification */ +} MDBX_stat; + /* Return statistics about the MDBX environment. * + * At least one of env or txn argument must be non-null. If txn is passed + * non-null then stat will be filled accordingly to the given transaction. 
+ * Otherwise, if txn is null, then stat will be populated by a snapshot from the + * last committed write transaction, and at next time, other information can be + * returned. + * + * Legacy mdbx_env_stat() correspond to calling mdbx_env_stat_ex() with the null + * txn argument. + * * [in] env An environment handle returned by mdbx_env_create() + * [in] txn A transaction handle returned by mdbx_txn_begin() * [out] stat The address of an MDBX_stat structure where the statistics - * will be copied */ -LIBMDBX_API int mdbx_env_stat(MDBX_env *env, MDBX_stat *stat, size_t bytes); -LIBMDBX_API int mdbx_env_stat2(const MDBX_env *env, const MDBX_txn *txn, - MDBX_stat *stat, size_t bytes); + * will be copied + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, + MDBX_stat *stat, size_t bytes); +__deprecated LIBMDBX_API int mdbx_env_stat(MDBX_env *env, MDBX_stat *stat, + size_t bytes); + +/* Information about the environment */ +typedef struct MDBX_envinfo { + struct { + uint64_t lower; /* lower limit for datafile size */ + uint64_t upper; /* upper limit for datafile size */ + uint64_t current; /* current datafile size */ + uint64_t shrink; /* shrink threshold for datafile */ + uint64_t grow; /* growth step for datafile */ + } mi_geo; + uint64_t mi_mapsize; /* Size of the data memory map */ + uint64_t mi_last_pgno; /* ID of the last used page */ + uint64_t mi_recent_txnid; /* ID of the last committed transaction */ + uint64_t mi_latter_reader_txnid; /* ID of the last reader transaction */ + uint64_t mi_self_latter_reader_txnid; /* ID of the last reader transaction of + caller process */ + uint64_t mi_meta0_txnid, mi_meta0_sign; + uint64_t mi_meta1_txnid, mi_meta1_sign; + uint64_t mi_meta2_txnid, mi_meta2_sign; + uint32_t mi_maxreaders; /* max reader slots in the environment */ + uint32_t mi_numreaders; /* max reader slots used in the environment */ + uint32_t mi_dxb_pagesize; /* 
database pagesize */ + uint32_t mi_sys_pagesize; /* system pagesize */ + + struct { + /* A mostly unique ID that is regenerated on each boot. As such it can be + used to identify the local machine's current boot. MDBX uses such when + open the database to determine whether rollback required to the last + steady sync point or not. I.e. if current bootid is differ from the value + within a database then the system was rebooted and all changes since last + steady sync must be reverted for data integrity. Zeros mean that no + relevant information is available from the system. */ + struct { + uint64_t l, h; + } current, meta0, meta1, meta2; + } mi_bootid; + + uint64_t mi_unsync_volume; /* bytes not explicitly synchronized to disk */ + uint64_t mi_autosync_threshold; /* current auto-sync threshold, see + mdbx_env_set_syncbytes(). */ + uint32_t mi_since_sync_seconds16dot16; /* time since the last steady sync in + 1/65536 of second */ + uint32_t mi_autosync_period_seconds16dot16 /* current auto-sync period in + 1/65536 of second, see + mdbx_env_set_syncperiod(). */ + ; + uint32_t mi_since_reader_check_seconds16dot16; /* time since the last readers + check in 1/65536 of second, + see mdbx_reader_check(). */ + uint32_t mi_mode; /* current environment mode, the same as + mdbx_env_get_flags() returns. */ +} MDBX_envinfo; /* Return information about the MDBX environment. * + * At least one of env or txn argument must be non-null. If txn is passed + * non-null then stat will be filled accordingly to the given transaction. + * Otherwise, if txn is null, then stat will be populated by a snapshot from the + * last committed write transaction, and at next time, other information can be + * returned. + * + * Legacy mdbx_env_info() correspond to calling mdbx_env_info_ex() with the null + * txn argument. 
+ * [in] env An environment handle returned by mdbx_env_create() + * [in] txn A transaction handle returned by mdbx_txn_begin() * [out] stat The address of an MDBX_envinfo structure - * where the information will be copied */ -LIBMDBX_API int mdbx_env_info(MDBX_env *env, MDBX_envinfo *info, size_t bytes); -LIBMDBX_API int mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, - MDBX_envinfo *info, size_t bytes); + * where the information will be copied + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *info, size_t bytes); +__deprecated LIBMDBX_API int mdbx_env_info(MDBX_env *env, MDBX_envinfo *info, + size_t bytes); -/* Flush the data buffers to disk. +/* Flush the environment data buffers to disk. * - * Data is always written to disk when mdbx_txn_commit() is called, - * but the operating system may keep it buffered. MDBX always flushes - * the OS buffers upon commit as well, unless the environment was - * opened with MDBX_NOSYNC or in part MDBX_NOMETASYNC. This call is - * not valid if the environment was opened with MDBX_RDONLY. + * Unless the environment was opened with no-sync flags (MDBX_NOMETASYNC, + * MDBX_NOSYNC, MDBX_UTTERLY_NOSYNC and MDBX_MAPASYNC), then data is always + * written an flushed to disk when mdbx_txn_commit() is called. Otherwise + * mdbx_env_sync() may be called to manually write and flush unsynced data to + * disk. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] force If non-zero, force a synchronous flush. Otherwise if the - * environment has the MDBX_NOSYNC flag set the flushes will be - * omitted, and with MDBX_MAPASYNC they will be asynchronous. + * Besides, mdbx_env_sync_ex() with argument force=false may be used to + * provide polling mode for lazy/asynchronous sync in conjunction with + * mdbx_env_set_syncbytes() and/or mdbx_env_set_syncperiod(). 
* - * Returns A non-zero error value on failure and 0 on success, some - * possible errors are: - * - MDBX_EACCES - the environment is read-only. - * - MDBX_EINVAL - an invalid parameter was specified. - * - MDBX_EIO - an error occurred during synchronization. */ -LIBMDBX_API int mdbx_env_sync(MDBX_env *env, int force); + * The mdbx_env_sync() is shortcut to calling mdbx_env_sync_ex() with + * try force=true and nonblock=false arguments. + * + * The mdbx_env_sync_poll() is shortcut to calling mdbx_env_sync_ex() with + * the force=false and nonblock=true arguments. + * + * NOTE: This call is not valid if the environment was opened with MDBX_RDONLY. + * + * [in] env An environment handle returned by mdbx_env_create(). + * [in] force If non-zero, force a flush. Otherwise, if force is zero, then + * will run in polling mode, i.e. it will check the thresholds + * that were set mdbx_env_set_syncbytes() and/or + * mdbx_env_set_syncperiod() and perform flush If at least one + * of the thresholds is reached. + * [in] nonblock Don't wait if write transaction is running by other thread. + * + * Returns A non-zero error value on failure and MDBX_RESULT_TRUE or 0 on + * success. The MDBX_RESULT_TRUE means no data pending for flush to disk, + * and 0 otherwise. Some possible errors are: + * - MDBX_EACCES = the environment is read-only. + * - MDBX_BUSY = the environment is used by other thread and nonblock=true. + * - MDBX_EINVAL = an invalid parameter was specified. + * - MDBX_EIO = an error occurred during synchronization. */ +LIBMDBX_API int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock); +LIBMDBX_API int mdbx_env_sync(MDBX_env *env); +LIBMDBX_API int mdbx_env_sync_poll(MDBX_env *env); + +/* Sets threshold to force flush the data buffers to disk, even of MDBX_NOSYNC, + * MDBX_NOMETASYNC and MDBX_MAPASYNC flags in the environment. 
The threshold + * value affects all processes which operates with given environment until the + * last process close environment or a new value will be settled. + * + * Data is always written to disk when mdbx_txn_commit() is called, but the + * operating system may keep it buffered. MDBX always flushes the OS buffers + * upon commit as well, unless the environment was opened with MDBX_NOSYNC, + * MDBX_MAPASYNC or in part MDBX_NOMETASYNC. + * + * The default is 0, than mean no any threshold checked, and no additional + * flush will be made. + * + * [in] env An environment handle returned by mdbx_env_create(). + * [in] threshold The size in bytes of summary changes when a synchronous + * flush would be made. + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold); + +/* Sets relative period since the last unsteay commit to force flush the data + * buffers to disk, even of MDBX_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC flags + * in the environment. The relative period value affects all processes which + * operates with given environment until the last process close environment or a + * new value will be settled. + * + * Data is always written to disk when mdbx_txn_commit() is called, but the + * operating system may keep it buffered. MDBX always flushes the OS buffers + * upon commit as well, unless the environment was opened with MDBX_NOSYNC, + * MDBX_MAPASYNC or in part MDBX_NOMETASYNC. + * + * Settled period don't checked asynchronously, but only by the + * mdbx_txn_commit() and mdbx_env_sync() functions. Therefore, in cases where + * transactions are committed infrequently and/or irregularly, polling by + * mdbx_env_sync() may be a reasonable solution to timeout enforcement. + * + * The default is 0, than mean no any timeout checked, and no additional + * flush will be made. + * + * [in] env An environment handle returned by mdbx_env_create(). 
+ * [in] seconds_16dot16 The period in 1/65536 of second when a synchronous + * flush would be made since the last unsteay commit. + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_env_set_syncperiod(MDBX_env *env, + unsigned seconds_16dot16); /* Close the environment and release the memory map. * @@ -780,107 +1818,310 @@ LIBMDBX_API int mdbx_env_sync(MDBX_env *env, int force); * The environment handle will be freed and must not be used again after this * call. * - * [in] env An environment handle returned by mdbx_env_create() + * Legacy mdbx_env_close() correspond to calling mdbx_env_close_ex() with the + * argument dont_sync=false. + * + * [in] env An environment handle returned by mdbx_env_create(). * [in] dont_sync A dont'sync flag, if non-zero the last checkpoint (meta-page * update) will be kept "as is" and may be still "weak" in the * NOSYNC/MAPASYNC modes. Such "weak" checkpoint will be * ignored on opening next time, and transactions since the * last non-weak checkpoint (meta-page update) will rolledback - * for consistency guarantee. */ + * for consistency guarantee. + * + * Returns A non-zero error value on failure and 0 on success. + * Some possible errors are: + * - MDBX_BUSY = The write transaction is running by other thread, in such + * case MDBX_env instance has NOT be destroyed not released! + * NOTE: if any OTHER error code was returned then given + * MDBX_env instance has been destroyed and released. + * - MDBX_PANIC = If mdbx_env_close_ex() was called in the child process + * after fork(). In this case MDBX_PANIC is a expecte, + * i.e. MDBX_env instance was freed in proper manner. + * - MDBX_EIO = an error occurred during synchronization. */ +LIBMDBX_API int mdbx_env_close_ex(MDBX_env *env, int dont_sync); LIBMDBX_API int mdbx_env_close(MDBX_env *env); /* Set environment flags. * * This may be used to set some flags in addition to those from - * mdbx_env_open(), or to unset these flags. 
If several threads - * change the flags at the same time, the result is undefined. + * mdbx_env_open(), or to unset these flags. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] flags The flags to change, bitwise OR'ed together + * NOTE: In contrast to LMDB, the MDBX serialize threads via mutex while + * changing the flags. Therefore this function will be blocked while a write + * transaction running by other thread, or MDBX_BUSY will be returned if + * function called within a write transaction. + * + * [in] env An environment handle returned by mdbx_env_create(). + * [in] flags The flags to change, bitwise OR'ed together. * [in] onoff A non-zero value sets the flags, zero clears them. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_set_flags(MDBX_env *env, unsigned flags, int onoff); /* Get environment flags. * - * [in] env An environment handle returned by mdbx_env_create() - * [out] flags The address of an integer to store the flags + * [in] env An environment handle returned by mdbx_env_create(). + * [out] flags The address of an integer to store the flags. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_flags(MDBX_env *env, unsigned *flags); /* Return the path that was used in mdbx_env_open(). * * [in] env An environment handle returned by mdbx_env_create() - * [out] path Address of a string pointer to contain the path. + * [out] dest Address of a string pointer to contain the path. * This is the actual string in the environment, not a copy. * It should not be altered in any way. 
* * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_env_get_path(MDBX_env *env, const char **path); + * - MDBX_EINVAL = an invalid parameter was specified. */ +LIBMDBX_API int mdbx_env_get_path(MDBX_env *env, const char **dest); /* Return the file descriptor for the given environment. * * NOTE: All MDBX file descriptors have FD_CLOEXEC and * could't be used after exec() and or fork(). * - * [in] env An environment handle returned by mdbx_env_create() + * [in] env An environment handle returned by mdbx_env_create(). * [out] fd Address of a int to contain the descriptor. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *fd); -/* Set the size of the memory map to use for this environment. - * - * The size should be a multiple of the OS page size. The default is - * 10485760 bytes. The size of the memory map is also the maximum size - * of the database. The value should be chosen as large as possible, - * to accommodate future growth of the database. - * This function should be called after mdbx_env_create() and before - * mdbx_env_open(). It may be called at later times if no transactions - * are active in this process. Note that the library does not check for - * this condition, the caller must ensure it explicitly. - * - * The new size takes effect immediately for the current process but - * will not be persisted to any others until a write transaction has been - * committed by the current process. Also, only mapsize increases are - * persisted into the environment. 
- * - * If the mapsize is increased by another process, and data has grown - * beyond the range of the current mapsize, mdbx_txn_begin() will - * return MDBX_MAP_RESIZED. This function may be called with a size - * of zero to adopt the new size. - * - * Any attempt to set a size smaller than the space already consumed by the - * environment will be silently changed to the current size of the used space. +/* Set all size-related parameters of environment, including page size and the + * min/max size of the memory map. + * + * In contrast to LMDB, the MDBX provide automatic size management of an + * database according the given parameters, including shrinking and resizing + * on the fly. From user point of view all of these just working. Nevertheless, + * it is reasonable to know some details in order to make optimal decisions when + * choosing parameters. + * + * Both mdbx_env_set_geometry() and legacy mdbx_env_set_mapsize() are inapplicable to + * read-only opened environment. + * + * Both mdbx_env_set_geometry() and legacy mdbx_env_set_mapsize() could be called either + * before or after mdbx_env_open(), either within the write transaction running + * by current thread or not: + * + * - In case mdbx_env_set_geometry() or legacy mdbx_env_set_mapsize() was called BEFORE + * mdbx_env_open(), i.e. for closed environment, then the specified + * parameters will be used for new database creation, or will be applied + * during opening if database exists and no other process using it. + * + * If the database is already exist, opened with MDBX_EXCLUSIVE or not used + * by any other process, and parameters specified by mdbx_env_set_geometry() + * are incompatible (i.e. for instance, different page size) then + * mdbx_env_open() will return MDBX_INCOMPATIBLE error. + * + * In another way, if database will opened read-only or will used by other + * process during calling mdbx_env_open() that specified parameters will + * silently discarded (open the database with MDBX_EXCLUSIVE flag to avoid + * this).
+ * + * - In case mdbx_env_set_geometry() or legacy mdbx_env_set_mapsize() was called after + * mdbx_env_open() WITHIN the write transaction running by current thread, + * then specified parameters will be applied as a part of write transaction, + * i.e. will not be visible to any others processes until the current write + * transaction has been committed by the current process. However, if + * transaction will be aborted, then the database file will be reverted to + * the previous size not immediately, but when a next transaction will be + * committed or when the database will be opened next time. + * + * - In case mdbx_env_set_geometry() or legacy mdbx_env_set_mapsize() was called after + * mdbx_env_open() but OUTSIDE a write transaction, then MDBX will execute + * internal pseudo-transaction to apply new parameters (but only if anything + * has been changed), and changes be visible to any others processes + * immediately after successful completion of function. + * + * Essentially a concept of "automatic size management" is simple and useful: + * - There are the lower and upper bound of the database file size; + * - There is the growth step by which the database file will be increased, + * in case of lack of space. + * - There is the threshold for unused space, beyond which the database file + * will be shrunk. + * - The size of the memory map is also the maximum size of the database. + * - MDBX will automatically manage both the size of the database and the size + * of memory map, according to the given parameters. + * + * So, there some considerations about choosing these parameters: + * - The lower bound allows you to prevent database shrinking below some + * rational size to avoid unnecessary resizing costs. + * - The upper bound allows you to prevent database growth above some rational + * size. Besides, the upper bound defines the linear address space + * reservation in each process that opens the database.
Therefore changing + * the upper bound is costly and may be required reopening environment in + * case of MDBX_MAP_RESIZED errors, and so on. Therefore, this value should + * be chosen reasonable as large as possible, to accommodate future growth + * of the database. + * - The growth step must be greater than zero to allow the database to grow, + * but also reasonable not too small, since increasing the size by little + * steps will result a large overhead. + * - The shrink threshold must be greater than zero to allow the database + * to shrink but also reasonable not too small (to avoid extra overhead) and + * not less than growth step to avoid up-and-down flouncing. + * - The current size (i.e. size_now argument) is an auxiliary parameter for + * simulation legacy mdbx_env_set_mapsize() and as workaround Windows issues + * (see below). + * + * Unfortunately, Windows has is a several issues + * with resizing of memory-mapped file: + * - Windows unable shrinking a memory-mapped file (i.e memory-mapped section) + * in any way except unmapping file entirely and then map again. Moreover, + * it is impossible in any way if a memory-mapped file is used more than + * one process. + * - Windows does not provide the usual API to augment a memory-mapped file + * (that is, a memory-mapped partition), but only by using "Native API" + * in an undocumented way. + * MDBX bypasses all Windows issues, but at a cost: + * - Ability to resize database on the fly requires an additional lock + * and release SlimReadWriteLock during each read-only transaction. + * - During resize all in-process threads should be paused and then resumed. + * - Shrinking of database file is performed only when it used by single + * process, i.e. when a database closes by the last process or opened + * by the first. + * = Therefore, the size_now argument may be useful to set database size + * by the first process which open a database, and thus avoid expensive + * remapping further. 
+ * + * For create a new database with particular parameters, including the page + * size, mdbx_env_set_geometry() should be called after mdbx_env_create() and + * before mdbx_env_open(). Once the database is created, the page size cannot be + * changed. If you do not specify all or some of the parameters, the + * corresponding default values will be used. For instance, the default for + * database size is 10485760 bytes. + * + * If the mapsize is increased by another process, MDBX silently and + * transparently adopt these changes at next transaction start. However, + * mdbx_txn_begin() will return MDBX_MAP_RESIZED if new mapping size could not + * be applied for current process (for instance if address space is busy). + * Therefore, in the case of MDBX_MAP_RESIZED error you need close and reopen + * the environment to resolve error. + * + * NOTE: Actual values may be different than you have specified because of + * rounding to specified database page size, the system page size and/or the + * size of the system virtual memory management unit. You can get actual values + * by mdbx_env_info_ex() or see by using the tool "mdbx_chk" with the "-v" + * option. + * + * Legacy mdbx_env_set_mapsize() correspond to calling mdbx_env_set_geometry() + * with the arguments size_lower, size_now, size_upper equal to the size + * and -1 (i.e. default) for all other parameters. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] size The size in bytes + * [in] env An environment handle returned by mdbx_env_create() * - * Returns A non-zero error value on failure and 0 on success, some - * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified, - * or the environment has an active write transaction. */ -LIBMDBX_API int mdbx_env_set_mapsize(MDBX_env *env, size_t size); + * [in] size_lower The lower bound of database size in bytes. + * Zero value means "minimal acceptable", + * and negative means "keep current or use default".
+ * + * [in] size_now The size in bytes to setup the database size for now. + * Zero value means "minimal acceptable", + * and negative means "keep current or use default". + * So, it is recommended always pass -1 in this argument + * except some special cases. + * + * [in] size_upper The upper bound of database sive in bytes. + * Zero value means "minimal acceptable", + * and negative means "keep current or use default". + * It is recommended to avoid change upper bound while + * database is used by other processes or threaded + * (i.e. just pass -1 in this argument except absolutely + * necessity). Otherwise you must be ready for + * MDBX_MAP_RESIZED error(s), unexpected pauses during + * remapping and/or system errors like "addtress busy", + * and so on. In other words, there is no way to handle + * a growth of the upper bound robustly because there may + * be a lack of appropriate system resources (which are + * extremely volatile in a multi-process multi-threaded + * environment). + * + * [in] growth_step The growth step in bytes, must be greater than zero + * to allow the database to grow. + * Negative value means "keep current or use default". + * + * [in] shrink_threshold The shrink threshold in bytes, must be greater than + * zero to allow the database to shrink. + * Negative value means "keep current or use default". + * + * [in] pagesize The database page size for new database creation + * or -1 otherwise. Must be power of 2 in the range + * between MDBX_MIN_PAGESIZE and MDBX_MAX_PAGESIZE. + * Zero value means "minimal acceptable", + * and negative means "keep current or use default". + * + * Returns A non-zero error value on failure and 0 on success, + * some possible errors are: + * - MDBX_EINVAL = An invalid parameter was specified, + * or the environment has an active write transaction. 
+ * - MDBX_EPERM = specific for Windows: Shrinking was disabled before and + * now it wanna be enabled, but there are reading threads + * that don't use the additional SRWL (that is required to + * avoid Windows issues). + * - MDBX_EACCESS = The environment opened in read-only. + * - MDBX_MAP_FULL = Specified size smaller than the space already + * consumed by the environment. + * - MDBX_TOO_LARGE = Specified size is too large, i.e. too many pages for + * given size, or a 32-bit process requests too much bytes + * for the 32-bit address space. */ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, intptr_t size_upper, intptr_t growth_step, intptr_t shrink_threshold, intptr_t pagesize); +__deprecated LIBMDBX_API int mdbx_env_set_mapsize(MDBX_env *env, size_t size); + +/* Find out whether to use readahead or not, based on the given database size + * and the amount of available memory. + * + * [in] volume The expected database size in bytes. + * [in] redundancy Additional reserve or overload in case of negative value. + * + * Returns: + * - MDBX_RESULT_TRUE = readahead is reasonable. + * - MDBX_RESULT_FALSE = readahead is NOT reasonable, i.e. MDBX_NORDAHEAD + * is useful to open environment by mdbx_env_open(). + * - Otherwise the error code. */ +LIBMDBX_API int mdbx_is_readahead_reasonable(size_t volume, + intptr_t redundancy); + +/* The minimal database page size in bytes. */ +#define MDBX_MIN_PAGESIZE 256 +__inline intptr_t mdbx_limits_pgsize_min(void) { return MDBX_MIN_PAGESIZE; } + +/* The maximal database page size in bytes. */ +#define MDBX_MAX_PAGESIZE 65536 +__inline intptr_t mdbx_limits_pgsize_max(void) { return MDBX_MAX_PAGESIZE; } + +/* Returns minimal database size in bytes for given page size, + * or -1 if pagesize is invalid. */ +LIBMDBX_API intptr_t mdbx_limits_dbsize_min(intptr_t pagesize); + +/* Returns maximal database size in bytes for given page size, + * or -1 if pagesize is invalid. 
*/ +LIBMDBX_API intptr_t mdbx_limits_dbsize_max(intptr_t pagesize); + +/* Returns maximal key and data size in bytes for given page size + * and database flags (see mdbx_dbi_open_ex() description), + * or -1 if pagesize is invalid. */ +LIBMDBX_API intptr_t mdbx_limits_keysize_max(intptr_t pagesize, unsigned flags); +LIBMDBX_API intptr_t mdbx_limits_valsize_max(intptr_t pagesize, unsigned flags); + +/* Returns maximal write transaction size (i.e. limit for summary volume of + * dirty pages) in bytes for given page size, or -1 if pagesize is invalid. */ +LIBMDBX_API intptr_t mdbx_limits_txnsize_max(intptr_t pagesize); /* Set the maximum number of threads/reader slots for the environment. * * This defines the number of slots in the lock table that is used to track - * readers in the the environment. The default is 61. + * readers in the the environment. The default is 119 for 4K system page size. * Starting a read-only transaction normally ties a lock table slot to the * current thread until the environment closes or the thread exits. If * MDBX_NOTLS is in use, mdbx_txn_begin() instead ties the slot to the @@ -888,23 +2129,23 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, * This function may only be called after mdbx_env_create() and before * mdbx_env_open(). * - * [in] env An environment handle returned by mdbx_env_create() - * [in] readers The maximum number of reader lock table slots + * [in] env An environment handle returned by mdbx_env_create(). + * [in] readers The maximum number of reader lock table slots. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified, - * or the environment is already open. */ + * - MDBX_EINVAL = an invalid parameter was specified. + * - MDBX_EPERM = the environment is already open. 
*/ LIBMDBX_API int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers); /* Get the maximum number of threads/reader slots for the environment. * - * [in] env An environment handle returned by mdbx_env_create() - * [out] readers Address of an integer to store the number of readers + * [in] env An environment handle returned by mdbx_env_create(). + * [out] readers Address of an integer to store the number of readers. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers); /* Set the maximum number of named databases for the environment. @@ -919,25 +2160,30 @@ LIBMDBX_API int mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers); * expensive: 7-120 words per transaction, and every mdbx_dbi_open() * does a linear search of the opened slots. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] dbs The maximum number of databases + * [in] env An environment handle returned by mdbx_env_create(). + * [in] dbs The maximum number of databases. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified, - * or the environment is already open. */ + * - MDBX_EINVAL = an invalid parameter was specified. + * - MDBX_EPERM = the environment is already open. */ LIBMDBX_API int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs); -/* Get the maximum size of keys and MDBX_DUPSORT data we can write. +/* Get the maximum size of keys and data we can write. * - * [in] env An environment handle returned by mdbx_env_create() + * [in] env An environment handle returned by mdbx_env_create(). + * [in] flags Database options (MDBX_DUPSORT, MDBX_INTEGERKEY ans so on), + * see mdbx_dbi_open_ex() description. 
* - * Returns The maximum size of a key we can write. */ -LIBMDBX_API int mdbx_env_get_maxkeysize(MDBX_env *env); + * Returns The maximum size of a key we can write, + * or -1 if something is wrong. */ +LIBMDBX_API int mdbx_env_get_maxkeysize_ex(MDBX_env *env, unsigned flags); +LIBMDBX_API int mdbx_env_get_maxvalsize_ex(MDBX_env *env, unsigned flags); +__deprecated LIBMDBX_API int mdbx_env_get_maxkeysize(MDBX_env *env); /* Set application information associated with the MDBX_env. * - * [in] env An environment handle returned by mdbx_env_create() + * [in] env An environment handle returned by mdbx_env_create(). * [in] ctx An arbitrary pointer for whatever the application needs. * * Returns A non-zero error value on failure and 0 on success. */ @@ -949,32 +2195,15 @@ LIBMDBX_API int mdbx_env_set_userctx(MDBX_env *env, void *ctx); * Returns The pointer set by mdbx_env_set_userctx(). */ LIBMDBX_API void *mdbx_env_get_userctx(MDBX_env *env); -/* A callback function for most MDBX assert() failures, - * called before printing the message and aborting. - * - * [in] env An environment handle returned by mdbx_env_create(). - * [in] msg The assertion message, not including newline. */ -typedef void MDBX_assert_func(const MDBX_env *env, const char *msg, - const char *function, unsigned line); - -/* Set or reset the assert() callback of the environment. - * - * Disabled if libmdbx is buillt with MDBX_DEBUG=0. - * NOTE: This hack should become obsolete as mdbx's error handling matures. - * - * [in] env An environment handle returned by mdbx_env_create(). - * [in] func An MDBX_assert_func function, or 0. - * - * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func); - /* Create a transaction for use with the environment. * * The transaction handle may be discarded using mdbx_txn_abort() * or mdbx_txn_commit(). 
- * NOTE: A transaction and its cursors must only be used by a single - * thread, and a thread may only have a single transaction at a time. - * If MDBX_NOTLS is in use, this does not apply to read-only transactions. + * + * NOTE: A transaction and its cursors must only be used by a single thread, + * and a thread may only have a single transaction at a time. If MDBX_NOTLS is + * in use, this does not apply to read-only transactions. + * * NOTE: Cursors may not span transactions. * * [in] env An environment handle returned by mdbx_env_create() @@ -992,25 +2221,93 @@ LIBMDBX_API int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func); * This transaction will not perform any write operations. * * - MDBX_TRYTXN - * Do not block when starting a write transaction + * Do not block when starting a write transaction. + * + * - MDBX_NOSYNC, MDBX_NOMETASYNC or MDBX_MAPASYNC + * Do not sync data to disk corresponding to MDBX_NOMETASYNC + * or MDBX_NOSYNC description (see abobe). * * [out] txn Address where the new MDBX_txn handle will be stored * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_PANIC - a fatal error occurred earlier and the environment + * - MDBX_PANIC = a fatal error occurred earlier and the environment * must be shut down. - * - MDBX_MAP_RESIZED - another process wrote data beyond this MDBX_env's + * - MDBX_MAP_RESIZED = another process wrote data beyond this MDBX_env's * mapsize and this environment's map must be resized * as well. See mdbx_env_set_mapsize(). - * - MDBX_READERS_FULL - a read-only transaction was requested and the reader + * - MDBX_READERS_FULL = a read-only transaction was requested and the reader * lock table is full. See mdbx_env_set_maxreaders(). - * - MDBX_ENOMEM - out of memory. - * - MDBX_BUSY - a write transaction is already started. */ + * - MDBX_ENOMEM = out of memory. + * - MDBX_BUSY = the write transaction is already started by the + * current thread. 
*/ LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, MDBX_txn **txn); -/* Returns the transaction's MDBX_env +/* Information about the transaction */ +typedef struct MDBX_txn_info { + uint64_t txn_id; /* The ID of the transaction. For a READ-ONLY transaction, + this corresponds to the snapshot being read. */ + + uint64_t + txn_reader_lag; /* For READ-ONLY transaction: the lag from a recent + MVCC-snapshot, i.e. the number of committed + transactions since read transaction started. + For WRITE transaction (provided if scan_rlt=true): the + lag of the oldest reader from current transaction (i.e. + at least 1 if any reader running). */ + + uint64_t txn_space_used; /* Used space by this transaction, i.e. corresponding + to the last used database page. */ + + uint64_t txn_space_limit_soft; /* Current size of database file. */ + + uint64_t + txn_space_limit_hard; /* Upper bound for size the database file, + i.e. the value "size_upper" argument of the + appropriate call of mdbx_env_set_geometry(). */ + + uint64_t txn_space_retired; /* For READ-ONLY transaction: The total size of + the database pages that were retired by + committed write transactions after the reader's + MVCC-snapshot, i.e. the space which would be + freed after the Reader releases the + MVCC-snapshot for reuse by completion read + transaction. + For WRITE transaction: The summarized size of + the database pages that were retired for now + due Copy-On-Write during this transaction. */ + + uint64_t + txn_space_leftover; /* For READ-ONLY transaction: the space available for + writer(s) and that must be exhausted for reason to + call the OOM-killer for this read transaction. + For WRITE transaction: the space inside transaction + that left to MDBX_TXN_FULL error. */ + + uint64_t txn_space_dirty; /* For READ-ONLY transaction (provided if + scan_rlt=true): The space that actually become + available for reuse when only this transaction + will be finished.
+ For WRITE transaction: The summarized size of the + dirty database pages that generated during this + transaction. */ +} MDBX_txn_info; + +/* Return information about the MDBX transaction. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [out] info The address of an MDBX_txn_info structure + * where the information will be copied. + * [in] scan_rlt The boolean flag controls the scan of the read lock table to + * provide complete information. Such scan is relatively + * expensive and you can avoid it if corresponding fields are + * not needed (see description of MDBX_txn_info above). + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_txn_info(MDBX_txn *txn, MDBX_txn_info *info, int scan_rlt); + +/* Returns the transaction's MDBX_env. * * [in] txn A transaction handle returned by mdbx_txn_begin() */ LIBMDBX_API MDBX_env *mdbx_txn_env(MDBX_txn *txn); @@ -1019,71 +2316,76 @@ LIBMDBX_API MDBX_env *mdbx_txn_env(MDBX_txn *txn); * * This returns the flags associated with this transaction. * - * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin(). * - * Returns A transaction flags, valid if input is an active transaction. */ + * Returns A transaction flags, valid if input is a valid transaction, + * otherwise -1. */ LIBMDBX_API int mdbx_txn_flags(MDBX_txn *txn); /* Return the transaction's ID. * - * This returns the identifier associated with this transaction. For a - * read-only transaction, this corresponds to the snapshot being read; - * concurrent readers will frequently have the same transaction ID. + * This returns the identifier associated with this transaction. For a read-only + * transaction, this corresponds to the snapshot being read; concurrent readers + * will frequently have the same transaction ID.
* - * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin(). * - * Returns A transaction ID, valid if input is an active transaction. */ + * Returns A transaction ID, valid if input is an active transaction, + * otherwise 0. */ LIBMDBX_API uint64_t mdbx_txn_id(MDBX_txn *txn); /* Commit all the operations of a transaction into the database. * - * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with mdbx_cursor_renew(). + * The transaction handle is freed. It and its cursors must not be used again + * after this call, except with mdbx_cursor_renew() and mdbx_cursor_close(). * - * A cursor must be closed explicitly always, before - * or after its transaction ends. It can be reused with - * mdbx_cursor_renew() before finally closing it. + * A cursor must be closed explicitly always, before or after its transaction + * ends. It can be reused with mdbx_cursor_renew() before finally closing it. * - * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin(). * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified. - * - MDBX_ENOSPC - no more disk space. - * - MDBX_EIO - a low-level I/O error occurred while writing. - * - MDBX_ENOMEM - out of memory. */ + * - MDBX_EINVAL = an invalid parameter was specified. + * - MDBX_ENOSPC = no more disk space. + * - MDBX_EIO = a low-level I/O error occurred while writing. + * - MDBX_ENOMEM = out of memory. */ LIBMDBX_API int mdbx_txn_commit(MDBX_txn *txn); /* Abandon all the operations of the transaction instead of saving them. * - * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with mdbx_cursor_renew(). + * The transaction handle is freed. 
It and its cursors must not be used again + * after this call, except with mdbx_cursor_renew() and mdbx_cursor_close(). * * A cursor must be closed explicitly always, before or after its transaction * ends. It can be reused with mdbx_cursor_renew() before finally closing it. * - * [in] txn A transaction handle returned by mdbx_txn_begin(). */ + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_txn_abort(MDBX_txn *txn); /* Reset a read-only transaction. * - * Abort the transaction like mdbx_txn_abort(), but keep the transaction - * handle. Therefore mdbx_txn_renew() may reuse the handle. This saves - * allocation overhead if the process will start a new read-only transaction - * soon, and also locking overhead if MDBX_NOTLS is in use. The reader table - * lock is released, but the table slot stays tied to its thread or - * MDBX_txn. Use mdbx_txn_abort() to discard a reset handle, and to free - * its lock table slot if MDBX_NOTLS is in use. + * Abort the read-only transaction like mdbx_txn_abort(), but keep the + * transaction handle. Therefore mdbx_txn_renew() may reuse the handle. This + * saves allocation overhead if the process will start a new read-only + * transaction soon, and also locking overhead if MDBX_NOTLS is in use. The + * reader table lock is released, but the table slot stays tied to its thread or + * MDBX_txn. Use mdbx_txn_abort() to discard a reset handle, and to free its + * lock table slot if MDBX_NOTLS is in use. * - * Cursors opened within the transaction must not be used - * again after this call, except with mdbx_cursor_renew(). + * Cursors opened within the transaction must not be used again after this call, + * except with mdbx_cursor_renew() and mdbx_cursor_close(). * * Reader locks generally don't interfere with writers, but they keep old - * versions of database pages allocated. 
Thus they prevent the old pages - * from being reused when writers commit new data, and so under heavy load - * the database size may grow much more rapidly than otherwise. + * versions of database pages allocated. Thus they prevent the old pages from + * being reused when writers commit new data, and so under heavy load the + * database size may grow much more rapidly than otherwise. * - * [in] txn A transaction handle returned by mdbx_txn_begin() */ + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_txn_reset(MDBX_txn *txn); /* Renew a read-only transaction. @@ -1092,41 +2394,90 @@ LIBMDBX_API int mdbx_txn_reset(MDBX_txn *txn); * released by mdbx_txn_reset(). It must be called before a reset transaction * may be used again. * - * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin(). * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_PANIC - a fatal error occurred earlier and the environment + * - MDBX_PANIC = a fatal error occurred earlier and the environment * must be shut down. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn); -/* Open a table in the environment. +/* The fours integers markers (aka "canary") associated with the environment. + * + * The `x`, `y` and `z` values could be set by mdbx_canary_put(), while the 'v' + * will be always set to the transaction number. Updated values becomes visible + * outside the current transaction only after it was committed. Current values + * could be retrieved by mdbx_canary_get(). */ +typedef struct mdbx_canary { + uint64_t x, y, z, v; +} mdbx_canary; + +/* Set integers markers (aka "canary") associated with the environment. 
+ * + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] canary An optional pointer to mdbx_canary structure for `x`, `y` + * and `z` values from. + * - If canary is NOT NULL then the `x`, `y` and `z` values will be + * updated from given canary argument, but the 'v' be always set + * to the current transaction number if at least one `x`, `y` or + * `z` values have changed (i.e. if `x`, `y` and `z` have the same + * values as currently present then nothing will be changed or + * updated). + * - if canary is NULL then the `v` value will be explicitly updated + * to the current transaction number without changes `x`, `y` nor + * `z`. + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary); + +/* Returns the four integer markers (aka "canary") associated with the + * environment. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [out] canary The address of an mdbx_canary structure where the information + * will be copied. + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_canary_get(MDBX_txn *txn, mdbx_canary *canary); + +/* A callback function used to compare two keys in a database */ +typedef int(MDBX_cmp_func)(const MDBX_val *a, const MDBX_val *b); + +/* Open a database in the environment. + * + * A database handle denotes the name and parameters of a database, + * independently of whether such a database exists. The database handle may be + * discarded by calling mdbx_dbi_close(). The old database handle is returned if + * the database was already open. The handle may only be closed once. + * + * (!) A notable difference between MDBX and LMDB is that MDBX makes handles + * opened for existing databases immediately available for other transactions, + * regardless this transaction will be aborted or reset.
The REASON for this is + * to avoiding the requirement for multiple opening a same handles in concurrent + * read transactions, and tracking of such open but hidden handles until the + * completion of read transactions which opened them. * - * A table handle denotes the name and parameters of a table, independently - * of whether such a table exists. The table handle may be discarded by - * calling mdbx_dbi_close(). The old table handle is returned if the table - * was already open. The handle may only be closed once. + * Nevertheless, the handle for the NEWLY CREATED database will be invisible for + * other transactions until the this write transaction is successfully + * committed. If the write transaction is aborted the handle will be closed + * automatically. After a successful commit the such handle will reside in the + * shared environment, and may be used by other transactions. * - * The table handle will be private to the current transaction until - * the transaction is successfully committed. If the transaction is - * aborted the handle will be closed automatically. - * After a successful commit the handle will reside in the shared - * environment, and may be used by other transactions. + * In contrast to LMDB, the MDBX allow this function to be called from multiple + * concurrent transactions or threads in the same process. * - * This function must not be called from multiple concurrent - * transactions in the same process. A transaction that uses - * this function must finish (either commit or abort) before - * any other transaction in the process may use this function. + * Legacy mdbx_dbi_open() correspond to calling mdbx_dbi_open_ex() with the null + * keycmp and datacmp arguments. * - * To use named table (with name != NULL), mdbx_env_set_maxdbs() + * To use named database (with name != NULL), mdbx_env_set_maxdbs() * must be called before opening the environment. Table names are - * keys in the internal unnamed table, and may be read but not written. 
+ * keys in the internal unnamed database, and may be read but not written. * - * [in] txn transaction handle returned by mdbx_txn_begin() - * [in] name The name of the table to open. If only a single - * table is needed in the environment, this value may be NULL. - * [in] flags Special options for this table. This parameter must be set + * [in] txn transaction handle returned by mdbx_txn_begin(). + * [in] name The name of the database to open. If only a single + * database is needed in the environment, this value may be NULL. + * [in] flags Special options for this database. This parameter must be set * to 0 or by bitwise OR'ing together one or more of the values * described here: * - MDBX_REVERSEKEY @@ -1134,7 +2485,7 @@ LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn); * of the strings to the beginning. By default, Keys are treated as * strings and compared from beginning to end. * - MDBX_DUPSORT - * Duplicate keys may be used in the table. Or, from another point of + * Duplicate keys may be used in the database. Or, from another point of * view, keys may have multiple data items, stored in sorted order. By * default keys must be unique and may have only a single data item. * - MDBX_INTEGERKEY @@ -1159,14 +2510,21 @@ LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn); * Create the named database if it doesn't exist. This option is not * allowed in a read-only transaction or a read-only environment. * - * [out] dbi Address where the new MDBX_dbi handle will be stored + * [in] keycmp Optional custom key comparison function for a database. + * [in] datacmp Optional custom data comparison function for a database, takes + * effect only if database was opened with the MDBX_DUPSORT flag. + * [out] dbi Address where the new MDBX_dbi handle will be stored. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_NOTFOUND - the specified database doesn't exist in the - * environment and MDBX_CREATE was not specified.
- * - MDBX_DBS_FULL - too many databases have been opened. - * See mdbx_env_set_maxdbs(). */ + * - MDBX_NOTFOUND = the specified database doesn't exist in the + * environment and MDBX_CREATE was not specified. + * - MDBX_DBS_FULL = too many databases have been opened. + * See mdbx_env_set_maxdbs(). + * - MDBX_INCOMPATIBLE = Database is incompatible with given flags, + * i.e. the passed flags is different with which the + * database was created, or the database was already + * opened with a different comparison function(s). */ LIBMDBX_API int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, unsigned flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); @@ -1175,24 +2533,27 @@ LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, unsigned flags, /* Retrieve statistics for a database. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). * [out] stat The address of an MDBX_stat structure where the statistics - * will be copied + * will be copied. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *stat, size_t bytes); -/* Retrieve the DB flags for a database handle. +/* Retrieve the DB flags and status for a database handle. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). * [out] flags Address where the flags will be returned. * [out] state Address where the state will be returned. 
* + * Legacy mdbx_dbi_flags() correspond to calling mdbx_dbi_flags_ex() with + * discarding result from the last argument. + * * Returns A non-zero error value on failure and 0 on success. */ #define MDBX_TBL_DIRTY 0x01 /* DB was written in this txn */ #define MDBX_TBL_STALE 0x02 /* Named-DB record is older than txnID */ @@ -1204,29 +2565,33 @@ LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags); /* Close a database handle. Normally unnecessary. * - * Use with care: - * FIXME: This call is not mutex protected. Handles should only be closed by - * a single thread, and only if no other threads are going to reference - * the database handle or one of its cursors any further. Do not close - * a handle if an existing transaction has modified its database. - * Doing so can cause misbehavior from database corruption to errors - * like MDBX_BAD_VALSIZE (since the DB name is gone). + * NOTE: Use with care. + * This call is synchronized via mutex with mdbx_dbi_close(), but NOT with + * other transactions running by other threads. The "next" version of libmdbx + * (MithrilDB) will solve this issue. * - * Closing a database handle is not necessary, but lets mdbx_dbi_open() - * reuse the handle value. Usually it's better to set a bigger - * mdbx_env_set_maxdbs(), unless that value would be large. + * Handles should only be closed if no other threads are going to reference + * the database handle or one of its cursors any further. Do not close a handle + * if an existing transaction has modified its database. Doing so can cause + * misbehavior from database corruption to errors like MDBX_BAD_VALSIZE (since + * the DB name is gone). * - * [in] env An environment handle returned by mdbx_env_create() - * [in] dbi A database handle returned by mdbx_dbi_open() - */ + * Closing a database handle is not necessary, but lets mdbx_dbi_open() reuse + * the handle value. Usually it's better to set a bigger mdbx_env_set_maxdbs(), + * unless that value would be large. 
+ * + * [in] env An environment handle returned by mdbx_env_create(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi); -/* Empty or delete+close a database. +/* Empty or delete and close a database. * * See mdbx_dbi_close() for restrictions about closing the DB handle. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). * [in] del 0 to empty the DB, 1 to delete it from the environment * and close the DB handle. * @@ -1250,19 +2615,68 @@ LIBMDBX_API int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del); * NOTE: Values returned from the database are valid only until a * subsequent update operation, or the end of the transaction. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] key The key to search for in the database - * [in,out] data The data corresponding to the key + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] key The key to search for in the database. + * [in,out] data The data corresponding to the key. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_NOTFOUND - the key was not in the database. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_NOTFOUND = the key was not in the database. + * - MDBX_EINVAL = an invalid parameter was specified. 
*/ LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data); -LIBMDBX_API int mdbx_get2(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data); + +/* Get items from a database and optionally number of data items for a given key. + * + * Briefly this function does the same as mdbx_get() with a few differences: + * 1. If values_count is NOT NULL, then returns the count + * of multi-values/duplicates for a given key. + * 2. Updates BOTH the key and the data for pointing to the actual key-value + * pair inside the database. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in,out] key The key to search for in the database. + * [in,out] data The data corresponding to the key. + * [out] values_count The optional address to return number of values + * associated with given key, i.e. + * = 0 - in case MDBX_NOTFOUND error; + * = 1 - exactly for databases WITHOUT MDBX_DUPSORT; + * >= 1 for databases WITH MDBX_DUPSORT. + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDBX_NOTFOUND = the key was not in the database. + * - MDBX_EINVAL = an invalid parameter was specified. */ +LIBMDBX_API int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *data, size_t *values_count); + +/* Get nearest items from a database. + * + * Briefly this function does the same as mdbx_get() with a few differences: + * 1. Return nearest (i.e. equal or greater per the comparison function) key-value + * pair, but not only exactly matching with the key. + * 2. On success return MDBX_SUCCESS if key found exactly, + * and MDBX_RESULT_TRUE otherwise. Moreover, for databases with MDBX_DUPSORT + * flag the data argument also will be used to match over + * multi-value/duplicates, and MDBX_SUCCESS will be returned only when BOTH + * the key and the data match exactly. + * 3.
Updates BOTH the key and the data for pointing to the actual key-value + * pair inside the database. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in,out] key The key to search for in the database. + * [in,out] data The data corresponding to the key. + * + * Returns A non-zero error value on failure and MDBX_SUCCESS (0) or + * MDBX_RESULT_TRUE on success (as described above). + * Some possible errors are: + * - MDBX_NOTFOUND = the key was not in the database. + * - MDBX_EINVAL = an invalid parameter was specified. */ +LIBMDBX_API int mdbx_get_nearest(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *data); /* Store items into a database. * @@ -1271,13 +2685,13 @@ LIBMDBX_API int mdbx_get2(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, * if duplicates are disallowed, or adding a duplicate data item if * duplicates are allowed (MDBX_DUPSORT). * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] key The key to store in the database - * [in,out] data The data to store - * [in] flags Special options for this operation. This parameter must be - * set to 0 or by bitwise OR'ing together one or more of the - * values described here. + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] key The key to store in the database. + * [in,out] data The data to store. + * [in] flags Special options for this operation. This parameter must be + * set to 0 or by bitwise OR'ing together one or more of the + * values described here.
* * - MDBX_NODUPDATA * Enter the new key/data pair only if it does not already appear @@ -1318,90 +2732,143 @@ LIBMDBX_API int mdbx_get2(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_KEYEXIST - * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). - * - MDBX_TXN_FULL - the transaction has too many dirty pages. - * - MDBX_EACCES - an attempt was made to write in a read-only transaction. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_MAP_FULL = the database is full, see mdbx_env_set_mapsize(). + * - MDBX_TXN_FULL = the transaction has too many dirty pages. + * - MDBX_EACCES = an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags); +/* Replace items in a database. + * + * This function allows to update or delete an existing value at the same time + * as the previous value is retrieved. If the new_data argument is NULL, + * the removal is performed, otherwise the update/insert. + * + * The current value may be in an already changed (aka dirty) page. In this + * case, the page will be overwritten during the update, and the old value will + * be lost. Therefore, an additional buffer must be passed via old_data argument + * initially to copy the old value. If the buffer passed in is too small, the + * function will return MDBX_RESULT_TRUE (-1) by setting iov_len field pointed + * by old_data argument to the appropriate value, without performing any + * changes. + * + * For databases with non-unique keys (i.e. with MDBX_DUPSORT flag), another use + * case is also possible, when by old_data argument selects a specific item from + * multi-value/duplicates with the same key for deletion or update.
To select + * this scenario in flags should simultaneously specify MDBX_CURRENT and + * MDBX_NOOVERWRITE. This combination is chosen because it makes no sense, and + * thus allows you to identify the request of such a scenario. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] key The key to store in the database. + * [in,out] new_data The data to store, if NULL then deletion will be + * performed. + * [in,out] old_data The buffer for retrieve previous value as describe + * above. + * [in] flags Special options for this operation. This parameter must + * be set to 0 or by bitwise OR'ing together one or more of + * the values described in mdbx_put() description above, + * and additionally (MDBX_CURRENT | MDBX_NOOVERWRITE) + * combination for selection particular item from + * multi-value/duplicates. + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *new_data, MDBX_val *old_data, + unsigned flags); + /* Delete items from a database. * * This function removes key/data pairs from the database. * - * The data parameter is NOT ignored regardless the database does + * NOTE: The data parameter is NOT ignored regardless the database does * support sorted duplicate data items or not. If the data parameter * is non-NULL only the matching data item will be deleted. * * This function will return MDBX_NOTFOUND if the specified key/data * pair is not in the database. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] key The key to delete from the database - * [in] data The data to delete + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] key The key to delete from the database. + * [in] data The data to delete. 
* * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EACCES - an attempt was made to write in a read-only transaction. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EACCES = an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data); /* Create a cursor handle. * - * A cursor is associated with a specific transaction and database. - * A cursor cannot be used when its database handle is closed. Nor - * when its transaction has ended, except with mdbx_cursor_renew(). - * It can be discarded with mdbx_cursor_close(). + * A cursor is associated with a specific transaction and database. A cursor + * cannot be used when its database handle is closed. Nor when its transaction + * has ended, except with mdbx_cursor_renew(). Also it can be discarded with + * mdbx_cursor_close(). * - * A cursor must be closed explicitly always, before - * or after its transaction ends. It can be reused with - * mdbx_cursor_renew() before finally closing it. + * A cursor must be closed explicitly always, before or after its transaction + * ends. It can be reused with mdbx_cursor_renew() before finally closing it. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [out] cursor Address where the new MDBX_cursor handle will be stored + * NOTE: In contrast to LMDB, the MDBX required that any opened cursors can be + * reused and must be freed explicitly, regardless ones was opened in a + * read-only or write transaction. The REASON for this is eliminates ambiguity + * which helps to avoid errors such as: use-after-free, double-free, i.e. memory + * corruption and segfaults. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). 
+ * [out] cursor Address where the new MDBX_cursor handle will be stored. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **cursor); /* Close a cursor handle. * - * The cursor handle will be freed and must not be used again after this call. - * Its transaction must still be live if it is a write-transaction. + * The cursor handle will be freed and must not be used again after this call, + * but its transaction may still be live. + * + * NOTE: In contrast to LMDB, the MDBX required that any opened cursors can be + * reused and must be freed explicitly, regardless ones was opened in a + * read-only or write transaction. The REASON for this is eliminates ambiguity + * which helps to avoid errors such as: use-after-free, double-free, i.e. memory + * corruption and segfaults. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() */ + * [in] cursor A cursor handle returned by mdbx_cursor_open(). */ LIBMDBX_API void mdbx_cursor_close(MDBX_cursor *cursor); /* Renew a cursor handle. * - * A cursor is associated with a specific transaction and database. - * Cursors that are only used in read-only transactions may be re-used, - * to avoid unnecessary malloc/free overhead. The cursor may be associated - * with a new read-only transaction, and referencing the same database handle - * as it was created with. + * A cursor is associated with a specific transaction and database. The cursor + * may be associated with a new transaction, and referencing the same database + * handle as it was created with. This may be done whether the previous + * transaction is live or dead. * - * This may be done whether the previous transaction is live or dead. 
- * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] cursor A cursor handle returned by mdbx_cursor_open() + * NOTE: In contrast to LMDB, the MDBX allow any cursor to be re-used by using + * mdbx_cursor_renew(), to avoid unnecessary malloc/free overhead until it freed + * by mdbx_cursor_close(). + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] cursor A cursor handle returned by mdbx_cursor_open(). * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *cursor); /* Return the cursor's transaction handle. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() */ + * [in] cursor A cursor handle returned by mdbx_cursor_open(). */ LIBMDBX_API MDBX_txn *mdbx_cursor_txn(MDBX_cursor *cursor); /* Return the cursor's database handle. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() */ + * [in] cursor A cursor handle returned by mdbx_cursor_open(). */ LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(MDBX_cursor *cursor); /* Retrieve by cursor. @@ -1412,15 +2879,15 @@ LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(MDBX_cursor *cursor); * and the address and length of the data are returned in the object to which * data refers. See mdbx_get() for restrictions on using the output values. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() - * [in,out] key The key for a retrieved item - * [in,out] data The data of a retrieved item - * [in] op A cursor operation MDBX_cursor_op + * [in] cursor A cursor handle returned by mdbx_cursor_open(). + * [in,out] key The key for a retrieved item. + * [in,out] data The data of a retrieved item. + * [in] op A cursor operation MDBX_cursor_op. 
* * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_NOTFOUND - no matching key found. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_NOTFOUND = no matching key found. + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op); @@ -1429,7 +2896,7 @@ LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, * This function stores key/data pairs into the database. The cursor is * positioned at the new item, or on failure usually near it. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in] cursor A cursor handle returned by mdbx_cursor_open(). * [in] key The key operated on. * [in] data The data operated on. * [in] flags Options for this operation. This parameter @@ -1486,21 +2953,21 @@ LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EKEYMISMATCH - * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). - * - MDBX_TXN_FULL - the transaction has too many dirty pages. - * - MDBX_EACCES - an attempt was made to write in a read-only transaction. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_MAP_FULL = the database is full, see mdbx_env_set_mapsize(). + * - MDBX_TXN_FULL = the transaction has too many dirty pages. + * - MDBX_EACCES = an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, unsigned flags); -/* Delete current key/data pair +/* Delete current key/data pair. * - * This function deletes the key/data pair to which the cursor refers. - * This does not invalidate the cursor, so operations such as MDBX_NEXT - * can still be used on it. 
Both MDBX_NEXT and MDBX_GET_CURRENT will return - * the same record after this operation. + * This function deletes the key/data pair to which the cursor refers. This does + * not invalidate the cursor, so operations such as MDBX_NEXT can still be used + * on it. Both MDBX_NEXT and MDBX_GET_CURRENT will return the same record after + * this operation. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in] cursor A cursor handle returned by mdbx_cursor_open(). * [in] flags Options for this operation. This parameter must be set to 0 * or one of the values described here. * @@ -1510,192 +2977,361 @@ LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, MDBX_val *key, * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EACCES - an attempt was made to write in a read-only transaction. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EACCES = an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_del(MDBX_cursor *cursor, unsigned flags); /* Return count of duplicates for current key. * - * This call is only valid on databases that support sorted duplicate data - * items MDBX_DUPSORT. + * This call is valid for all databases, but reasonable only for that support + * sorted duplicate data items MDBX_DUPSORT. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() - * [out] countp Address where the count will be stored + * [in] cursor A cursor handle returned by mdbx_cursor_open(). + * [out] countp Address where the count will be stored. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EINVAL - cursor is not initialized, or an invalid parameter + * - MDBX_EINVAL = cursor is not initialized, or an invalid parameter * was specified. 
*/ LIBMDBX_API int mdbx_cursor_count(MDBX_cursor *cursor, size_t *countp); -/* Compare two data items according to a particular database. +/* Determines whether the cursor is pointed to a key-value pair or not, + * i.e. was not positioned or points to the end of data. * - * This returns a comparison as if the two data items were keys in the - * specified database. + * [in] cursor A cursor handle returned by mdbx_cursor_open(). * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] a The first item to compare - * [in] b The second item to compare + * Returns: + * - MDBX_RESULT_TRUE = no more data available or cursor not positioned; + * - MDBX_RESULT_FALSE = data available; + * - Otherwise the error code. */ +LIBMDBX_API int mdbx_cursor_eof(MDBX_cursor *mc); + +/* Determines whether the cursor is pointed to the first key-value pair or not. * - * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -LIBMDBX_API int mdbx_cmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, - const MDBX_val *b); + * [in] cursor A cursor handle returned by mdbx_cursor_open(). + * + * Returns: + * - MDBX_RESULT_TRUE = cursor positioned to the first key-value pair. + * - MDBX_RESULT_FALSE = cursor NOT positioned to the first key-value pair. + * - Otherwise the error code. */ +LIBMDBX_API int mdbx_cursor_on_first(MDBX_cursor *mc); -/* Compare two data items according to a particular database. +/* Determines whether the cursor is pointed to the last key-value pair or not. * - * This returns a comparison as if the two items were data items of - * the specified database. The database must have the MDBX_DUPSORT flag. + * [in] cursor A cursor handle returned by mdbx_cursor_open(). 
* - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] a The first item to compare - * [in] b The second item to compare + * Returns: + * - MDBX_RESULT_TRUE = cursor positioned to the last key-value pair. + * - MDBX_RESULT_FALSE = cursor NOT positioned to the last key-value pair. + * - Otherwise the error code. */ +LIBMDBX_API int mdbx_cursor_on_last(MDBX_cursor *mc); + +/* Estimates the distance between cursors as a number of elements. The results + * of such estimation can be used to build and/or optimize query execution + * plans. + * + * This function performs a rough estimate based only on b-tree pages that are + * common for the both cursor's stacks. + * + * NOTE: The result varies greatly depending on the filling of specific pages + * and the overall balance of the b-tree: + * + * 1. The number of items is estimated by analyzing the height and fullness of + * the b-tree. The accuracy of the result directly depends on the balance of the + * b-tree, which in turn is determined by the history of previous insert/delete + * operations and the nature of the data (i.e. variability of keys length and so + * on). Therefore, the accuracy of the estimation can vary greatly in a + * particular situation. + * + * 2. To understand the potential spread of results, you should consider a + * possible situations basing on the general criteria for splitting and merging + * b-tree pages: + * - the page is split into two when there is no space for added data; + * - two pages merge if the result fits in half a page; + * - thus, the b-tree can consist of an arbitrary combination of pages filled + * both completely and only 1/4. Therefore, in the worst case, the result + * can diverge 4 times for each level of the b-tree excepting the first and + * the last. + * + * 3. 
In practice, the probability of extreme cases of the above situation is + * close to zero and in most cases the error does not exceed a few percent. On + * the other hand, it's just a chance you shouldn't overestimate. + * + * Both cursors must be initialized for the same database and the same + * transaction. + * + * [in] first The first cursor for estimation. + * [in] last The second cursor for estimation. + * [out] distance_items A pointer to store estimated distance value, + * i.e. *distance_items = distance(first, last). * - * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -LIBMDBX_API int mdbx_dcmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, - const MDBX_val *b); + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_estimate_distance(const MDBX_cursor *first, + const MDBX_cursor *last, + ptrdiff_t *distance_items); -/* A callback function used to print a message from the library. +/* Estimates the move distance, i.e. between the current cursor position and + * next position after the specified move-operation with given key and data. + * The results of such estimation can be used to build and/or optimize query + * execution plans. Current cursor position and state are preserved. * - * [in] msg The string to be printed. - * [in] ctx An arbitrary context pointer for the callback. + * Please see notes on accuracy of the result in mdbx_estimate_distance() + * description above. * - * Returns < 0 on failure, >= 0 on success. */ -typedef int(MDBX_msg_func)(const char *msg, void *ctx); + * [in] cursor Cursor for estimation. + * [in,out] key The key for a retrieved item. + * [in,out] data The data of a retrieved item. + * [in] op A cursor operation MDBX_cursor_op. + * [out] distance_items A pointer to store estimated move distance + * as the number of elements. + * + * Returns A non-zero error value on failure and 0 on success. 
*/ +LIBMDBX_API int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op move_op, + ptrdiff_t *distance_items); -/* Dump the entries in the reader lock table. +/* Estimates the size of a range as a number of elements. The results + * of such estimation can be used to build and/or optimize query execution + * plans. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] func A MDBX_msg_func function - * [in] ctx Anything the message function needs + * Please see notes on accuracy of the result in mdbx_estimate_distance() + * description above. * - * Returns < 0 on failure, >= 0 on success. */ -LIBMDBX_API int mdbx_reader_list(MDBX_env *env, MDBX_msg_func *func, void *ctx); + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] begin_key The key of range beginning or NULL for explicit FIRST. + * [in] begin_data Optional additional data for seeking among sorted + * duplicates. Only for MDBX_DUPSORT, NULL otherwise. + * [in] end_key The key of range ending or NULL for explicit LAST. + * [in] end_data Optional additional data for seeking among sorted + * duplicates. Only for MDBX_DUPSORT, NULL otherwise. + * [out] size_items A pointer to store range estimation result. + * + * Returns A non-zero error value on failure and 0 on success. */ +#define MDBX_EPSILON ((MDBX_val *)((ptrdiff_t)-1)) +LIBMDBX_API int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, + MDBX_val *begin_key, MDBX_val *begin_data, + MDBX_val *end_key, MDBX_val *end_data, + ptrdiff_t *size_items); -/* Check for stale entries in the reader lock table. +/* Determines whether the given address is on a dirty database page of the + * transaction or not. Ultimately, this allows to avoid copy data from non-dirty + * pages.
* - * [in] env An environment handle returned by mdbx_env_create() - * [out] dead Number of stale slots that were cleared + * "Dirty" pages are those that have already been changed during a write + * transaction. Accordingly, any further changes may result in such pages being + * overwritten. Therefore, all functions libmdbx performing changes inside the + * database as arguments should NOT get pointers to data in those pages. In + * turn, "not dirty" pages before modification will be copied. * - * Returns 0 on success, non-zero on failure. */ -LIBMDBX_API int mdbx_reader_check(MDBX_env *env, int *dead); - -LIBMDBX_API char *mdbx_dkey(const MDBX_val *key, char *const buf, - const size_t bufsize); + * In other words, data from dirty pages must either be copied before being + * passed as arguments for further processing or rejected at the argument + * validation stage. Thus, mdbx_is_dirty() allows you to get rid of unnecessary + * copying, and perform a more complete check of the arguments. + * + * NOTE: The address passed must point to the beginning of the data. This is the + * only way to ensure that the actual page header is physically located in the + * same memory page, including for multi-pages with long data. + * + * NOTE: In rare cases the function may return a false positive answer + * (MDBX_RESULT_TRUE when data is NOT on a dirty page), but never a false + * negative if the arguments are correct. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] ptr The address of data to check. + * + * Returns: + * - MDBX_RESULT_TRUE = given address is on the dirty page. + * - MDBX_RESULT_FALSE = given address is NOT on the dirty page. + * - Otherwise the error code. */ +LIBMDBX_API int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr); -LIBMDBX_API int mdbx_env_close_ex(MDBX_env *env, int dont_sync); +/* Sequence generation for a database. + * + * The function allows to create a linear sequence of unique positive integers + * for each database.
The function can be called for a read transaction to + * retrieve the current sequence value, and the increment must be zero. + * Sequence changes become visible outside the current write transaction after + * it is committed, and discarded on abort. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [out] result The optional address where the value of sequence before the + * change will be stored. + * [in] increment Value to increase the sequence, + * must be 0 for read-only transactions. + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDBX_RESULT_TRUE = Increasing the sequence has resulted in an overflow + * and therefore cannot be executed. */ +LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, + uint64_t increment); -/* Sets threshold to force flush the data buffers to disk, - * even of MDBX_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC flags - * in the environment. The value affects all processes which operates with given - * DB until the last process close DB or a new value will be settled. +/* Compare two data items according to a particular database. * - * Data is always written to disk when mdbx_txn_commit() is called, - * but the operating system may keep it buffered. MDBX always flushes - * the OS buffers upon commit as well, unless the environment was - * opened with MDBX_NOSYNC, MDBX_MAPASYNC or in part MDBX_NOMETASYNC. + * This returns a comparison as if the two data items were keys in the + * specified database. * - * The default is 0, than mean no any threshold checked, and no additional - * flush will be made. + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] a The first item to compare. + * [in] b The second item to compare. 
* - * [in] env An environment handle returned by mdbx_env_create() - * [in] bytes The size in bytes of summary changes when a synchronous - * flush would be made. + * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ +LIBMDBX_API int mdbx_cmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, + const MDBX_val *b); + +/* Compare two data items according to a particular database. * - * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes); + * This returns a comparison as if the two items were data items of the + * specified database. The database must have the MDBX_DUPSORT flag. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] a The first item to compare. + * [in] b The second item to compare. + * + * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ +LIBMDBX_API int mdbx_dcmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, + const MDBX_val *b); -/* Sets relative period since the last unsteay commit to force flush the data - * buffers to disk, even of MDBX_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC flags - * in the environment. The value affects all processes which operates with given - * DB until the last process close DB or a new value will be settled. +/* A callback function used to enumerate the reader lock table. + * + * [in] ctx An arbitrary context pointer for the callback. + * [in] num The serial number during enumeration, starting from 1. + * [in] slot The reader lock table slot number. + * [in] txnid The ID of the transaction being read, + * i.e. the MVCC-snapshot number. + * [in] lag The lag from a recent MVCC-snapshot, i.e. the number of + * committed transactions since read transaction started. + * [in] pid The reader process ID. + * [in] thread The reader thread ID. + * [in] bytes_used The number of last used page in the MVCC-snapshot which + * being read, i.e.
database file can't be shrunk beyond this. + * [in] bytes_retained The total size of the database pages that were retired by + * committed write transactions after the reader's + * MVCC-snapshot, i.e. the space which would be freed after + * the Reader releases the MVCC-snapshot for reuse by + * completion read transaction. * - * Data is always written to disk when mdbx_txn_commit() is called, - * but the operating system may keep it buffered. MDBX always flushes - * the OS buffers upon commit as well, unless the environment was - * opened with MDBX_NOSYNC, MDBX_MAPASYNC or in part MDBX_NOMETASYNC. + * Returns < 0 on failure, >= 0 on success. */ +typedef int(MDBX_reader_list_func)(void *ctx, int num, int slot, mdbx_pid_t pid, + mdbx_tid_t thread, uint64_t txnid, + uint64_t lag, size_t bytes_used, + size_t bytes_retained); + +/* Enumerate the entries in the reader lock table. * - * Settled period don't checked asynchronously, but only inside the functions. - * mdbx_txn_commit() and mdbx_env_sync(). Therefore, in cases where transactions - * are committed infrequently and/or irregularly, polling by mdbx_env_sync() may - * be a reasonable solution to timeout enforcement. + * [in] env An environment handle returned by mdbx_env_create(). + * [in] func A MDBX_reader_list_func function. + * [in] ctx An arbitrary context pointer for the enumeration function. * - * The default is 0, than mean no any timeout checked, and no additional - * flush will be made. + * Returns A non-zero error value on failure and 0 on success, + * or MDBX_RESULT_TRUE (-1) if the reader lock table is empty. */ +LIBMDBX_API int mdbx_reader_list(MDBX_env *env, MDBX_reader_list_func *func, + void *ctx); + +/* Check for stale entries in the reader lock table. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] seconds_16dot16 The period in 1/65536 of second when a synchronous - * flush would be made since the last unsteay commit.
+ * [in] env An environment handle returned by mdbx_env_create(). + * [out] dead Number of stale slots that were cleared. * - * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_set_syncperiod(MDBX_env *env, - unsigned seconds_16dot16); + * Returns A non-zero error value on failure and 0 on success, + * or MDBX_RESULT_TRUE (-1) if a dead reader(s) found or mutex was recovered. */ +LIBMDBX_API int mdbx_reader_check(MDBX_env *env, int *dead); /* Returns a lag of the reading for the given transaction. * * Returns an information for estimate how much given read-only * transaction is lagging relative the to actual head. + * This is deprecated function, use mdbx_txn_info() instead. * - * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin(). * [out] percent Percentage of page allocation in the database. * * Returns Number of transactions committed after the given was started for * read, or negative value on failure. */ -LIBMDBX_API int mdbx_txn_straggler(MDBX_txn *txn, int *percent); +__deprecated LIBMDBX_API int mdbx_txn_straggler(MDBX_txn *txn, int *percent); -/* A callback function for killing a laggard readers, - * but also could waiting ones. Called in case of MDBX_MAP_FULL error. +/* A lack-of-space callback function to resolve issues with a laggard readers. + * + * Read transactions prevent reuse of pages freed by newer write transactions, + * thus the database can grow quickly. This callback will be called when there + * is not enough space in the database (ie. before increasing the database size + * or before MDBX_MAP_FULL error) and thus can be used to resolve issues with + * a "long-lived" read transactions. + * + * Depending on the arguments and needs, your implementation may wait, terminate + * a process or thread that is performing a long read, or perform some other + * action. 
In doing so it is important that the returned code always corresponds + * to the performed action. * * [in] env An environment handle returned by mdbx_env_create(). - * [in] pid pid of the reader process. - * [in] tid thread_id of the reader thread. - * [in] txn Transaction number on which stalled. + * [in] pid A pid of the reader process. + * [in] tid A thread_id of the reader thread. + * [in] txn A transaction number on which stalled. * [in] gap A lag from the last commited txn. - * [in] retry A retry number, less that zero for notify end of OOM-loop. - * - * Returns -1 on failure (reader is not killed), - * 0 should wait or retry, - * 1 drop reader txn-lock (reading-txn was aborted), - * >1 drop reader registration (reader process was killed). */ -typedef int(MDBX_oom_func)(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, - unsigned gap, int retry); + * [in] space A space that actually become available for reuse after this + * reader finished. The callback function can take this value into + * account to evaluate the impact that a long-running transaction + * has. + * [in] retry A retry number starting from 0. if callback has returned 0 + * at least once, then at end of current OOM-handler loop callback + * will be called additionally with negative value to notify about + * the end of loop. The callback function can use this value to + * implement timeout logic while waiting for readers. + * + * The RETURN CODE determines the further actions libmdbx and must match the + * action which was executed by the callback: + * + * -2 or less = An error condition and the reader was not killed. + * + * -1 = The callback was unable to solve the problem and agreed + * on MDBX_MAP_FULL error, libmdbx should increase the + * database size or return MDBX_MAP_FULL error. + * + * 0 (zero) = The callback solved the problem or just waited for + * a while, libmdbx should rescan the reader lock table and + * retry. 
This also includes a situation when corresponding + * transaction terminated in normal way by mdbx_txn_abort() + * or mdbx_txn_reset(), and may be restarted. I.e. reader + * slot doesn't need to be cleaned from transaction. + * + * 1 = Transaction aborted asynchronously and reader slot should + * be cleared immediately, i.e. read transaction will not + * continue but mdbx_txn_abort() or mdbx_txn_reset() will + * be called later. + * + * 2 or greater = The reader process was terminated or killed, and libmdbx + * should entirely reset reader registration. */ +typedef int(MDBX_oom_func)(MDBX_env *env, mdbx_pid_t pid, mdbx_tid_t tid, + uint64_t txn, unsigned gap, size_t space, int retry); /* Set the OOM callback. * - * Callback will be called only on out-of-pages case for killing - * a laggard readers to allowing reclaiming of freeDB. + * The callback will only be triggered on lack of space to resolve issues with + * lagging reader(s) (i.e. to kill it) for resume reuse pages from the garbage + * collector. * - * [in] env An environment handle returned by mdbx_env_create(). - * [in] oomfunc A MDBX_oom_func function or NULL to disable. + * [in] env An environment handle returned by mdbx_env_create(). + * [in] oom_func A MDBX_oom_func function or NULL to disable. * * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_set_oomfunc(MDBX_env *env, MDBX_oom_func *oom_func); /* Get the current oom_func callback. * - * Callback will be called only on out-of-pages case for killing - * a laggard readers to allowing reclaiming of freeDB. - * * [in] env An environment handle returned by mdbx_env_create(). * * Returns A MDBX_oom_func function or NULL if disabled.
*/ LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDBX_env *env); -#define MDBX_DBG_ASSERT 1 -#define MDBX_DBG_PRINT 2 -#define MDBX_DBG_TRACE 4 -#define MDBX_DBG_EXTRA 8 -#define MDBX_DBG_AUDIT 16 -#define MDBX_DBG_JITTER 32 -#define MDBX_DBG_DUMP 64 -#define MDBX_DBG_LEGACY_MULTIOPEN 128 - -typedef void MDBX_debug_func(int type, const char *function, int line, - const char *msg, va_list args); - -LIBMDBX_API int mdbx_setup_debug(int flags, MDBX_debug_func *logger); +/**** B-tree Traversal ********************************************************* + * This is internal API for mdbx_chk tool. You should avoid to use it, except + * some extremal special cases. */ +/* Page types for traverse the b-tree. */ typedef enum { MDBX_page_void, MDBX_page_meta, @@ -1711,108 +3347,20 @@ typedef enum { #define MDBX_PGWALK_GC ((const char *)((ptrdiff_t)-1)) #define MDBX_PGWALK_META ((const char *)((ptrdiff_t)-2)) +/* Callback function for traverse the b-tree. */ typedef int MDBX_pgvisitor_func(const uint64_t pgno, const unsigned number, void *const ctx, const int deep, const char *const dbi, const size_t page_size, const MDBX_page_type_t type, const size_t nentries, const size_t payload_bytes, const size_t header_bytes, const size_t unused_bytes); + +/* B-tree traversal function. */ LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *ctx); -typedef struct mdbx_canary { - uint64_t x, y, z, v; -} mdbx_canary; - -LIBMDBX_API int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary); -LIBMDBX_API int mdbx_canary_get(MDBX_txn *txn, mdbx_canary *canary); - -/* Returns: - * - MDBX_RESULT_TRUE - * when no more data available or cursor not positioned; - * - MDBX_RESULT_FALSE - * when data available; - * - Otherwise the error code. */ -LIBMDBX_API int mdbx_cursor_eof(MDBX_cursor *mc); - -/* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. 
*/ -LIBMDBX_API int mdbx_cursor_on_first(MDBX_cursor *mc); - -/* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. */ -LIBMDBX_API int mdbx_cursor_on_last(MDBX_cursor *mc); - -/* Estimates the distance between cursors as the number of elements. - * Both cursors must be initialized for the same DBI. - * - * [in] cursor_a The first cursor for estimation. - * [in] cursor_b The second cursor for estimation. - * [out] distance_items A pointer to store estimated distance value, - * i.e. *distance_items = distance(a - b). - * - * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_estimate_distance(const MDBX_cursor *first, - const MDBX_cursor *last, - ptrdiff_t *distance_items); - -/* Estimates the move distance, i.e. between the current cursor position and - * next position after the specified move-operation with given key and data. - * Current cursor position and state are preserved. - * - * [in] cursor Cursor for estimation. - * [in,out] key The key for a retrieved item. - * [in,out] data The data of a retrieved item. - * [in] op A cursor operation MDBX_cursor_op. - * [out] distance_items A pointer to store estimated move distance - * as the number of elements. - * - * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, - MDBX_val *data, MDBX_cursor_op move_op, - ptrdiff_t *distance_items); - -/* Estimates the size of a range in the number of elements. - * - * [in] txn A transaction handle returned by mdbx_txn_begin(). - * [in] dbi A database handle returned by mdbx_dbi_open(). - * [in] begin_key The key of range beginning or NULL for explicit FIRST. - * [in] begin_data Optional additional data to seeking among sorted - * duplicates. Only for MDBX_DUPSORT, NULL otherwise. - * [in] end_key The key of range ending or NULL for explicit LAST. - * [in] end_data Optional additional data to seeking among sorted - * duplicates. 
Only for MDBX_DUPSORT, NULL otherwise. - * [out] distance_items A pointer to store range estimation result. - * - * Returns A non-zero error value on failure and 0 on success. */ -#define MDBX_EPSILON ((MDBX_val *)((ptrdiff_t)-1)) -LIBMDBX_API int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, - MDBX_val *begin_key, MDBX_val *begin_data, - MDBX_val *end_key, MDBX_val *end_data, - ptrdiff_t *size_items); - -LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *new_data, MDBX_val *old_data, - unsigned flags); -/* Same as mdbx_get(), but: - * 1) if values_count is not NULL, then returns the count - * of multi-values/duplicates for a given key. - * 2) updates the key for pointing to the actual key's data inside DB. */ -LIBMDBX_API int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, size_t *values_count); - -LIBMDBX_API int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr); - -LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, - uint64_t increment); - -LIBMDBX_API int mdbx_limits_pgsize_min(void); -LIBMDBX_API int mdbx_limits_pgsize_max(void); -LIBMDBX_API intptr_t mdbx_limits_dbsize_min(intptr_t pagesize); -LIBMDBX_API intptr_t mdbx_limits_dbsize_max(intptr_t pagesize); -LIBMDBX_API intptr_t mdbx_limits_keysize_max(intptr_t pagesize); -LIBMDBX_API intptr_t mdbx_limits_txnsize_max(intptr_t pagesize); - -/*----------------------------------------------------------------------------*/ -/* attribute support functions for Nexenta */ +/**** Attribute support functions for Nexenta *********************************/ +#ifdef MDBX_NEXENTA_ATTRS typedef uint_fast64_t mdbx_attr_t; /* Store by cursor with attribute. @@ -1844,10 +3392,10 @@ typedef uint_fast64_t mdbx_attr_t; * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EKEYMISMATCH - * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). 
- * - MDBX_TXN_FULL - the transaction has too many dirty pages. - * - MDBX_EACCES - an attempt was made to write in a read-only transaction. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_MAP_FULL = the database is full, see mdbx_env_set_mapsize(). + * - MDBX_TXN_FULL = the transaction has too many dirty pages. + * - MDBX_EACCES = an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, mdbx_attr_t attr, unsigned flags); @@ -1890,10 +3438,10 @@ LIBMDBX_API int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_KEYEXIST - * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). - * - MDBX_TXN_FULL - the transaction has too many dirty pages. - * - MDBX_EACCES - an attempt was made to write in a read-only transaction. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_MAP_FULL = the database is full, see mdbx_env_set_mapsize(). + * - MDBX_TXN_FULL = the transaction has too many dirty pages. + * - MDBX_EACCES = an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, mdbx_attr_t attr, unsigned flags); @@ -1912,8 +3460,8 @@ LIBMDBX_API int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_NOTFOUND - the key-value pair was not in the database. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_NOTFOUND = the key-value pair was not in the database. + * - MDBX_EINVAL = an invalid parameter was specified. 
*/ LIBMDBX_API int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, mdbx_attr_t attr); @@ -1925,15 +3473,15 @@ LIBMDBX_API int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, * and the address and length of the data are returned in the object to which * data refers. See mdbx_get() for restrictions on using the output values. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() - * [in,out] key The key for a retrieved item - * [in,out] data The data of a retrieved item - * [in] op A cursor operation MDBX_cursor_op + * [in] cursor A cursor handle returned by mdbx_cursor_open(). + * [in,out] key The key for a retrieved item. + * [in,out] data The data of a retrieved item. + * [in] op A cursor operation MDBX_cursor_op. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_NOTFOUND - no matching key found. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_NOTFOUND = no matching key found. + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_attr_t *attrptr, MDBX_cursor_op op); @@ -1955,20 +3503,21 @@ LIBMDBX_API int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, * NOTE: Values returned from the database are valid only until a * subsequent update operation, or the end of the transaction. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] key The key to search for in the database - * [in,out] data The data corresponding to the key + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] key The key to search for in the database. + * [in,out] data The data corresponding to the key. 
* * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_NOTFOUND - the key was not in the database. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_NOTFOUND = the key was not in the database. + * - MDBX_EINVAL = an invalid parameter was specified. */ LIBMDBX_API int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, mdbx_attr_t *attrptr); +#endif /* MDBX_NEXENTA_ATTRS */ -/*----------------------------------------------------------------------------*/ -/* LY: temporary workaround for Elbrus's memcmp() bug. */ +/******************************************************************************* + * LY: temporary workaround for Elbrus's memcmp() bug. */ #ifndef __GLIBC_PREREQ #if defined(__GLIBC__) && defined(__GLIBC_MINOR__) #define __GLIBC_PREREQ(maj, min) \ diff --git a/libs/libmdbx/src/src/CMakeLists.txt b/libs/libmdbx/src/src/CMakeLists.txt new file mode 100644 index 0000000000..a127eaee88 --- /dev/null +++ b/libs/libmdbx/src/src/CMakeLists.txt @@ -0,0 +1,268 @@ +## +## Copyright 2019 Leonid Yuriev <leo@yuriev.ru> +## and other libmdbx authors: please see AUTHORS file. +## All rights reserved. +## +## Redistribution and use in source and binary forms, with or without +## modification, are permitted only as authorized by the OpenLDAP +## Public License. +## +## A copy of this license is available in the file LICENSE in the +## top-level directory of the distribution or, alternatively, at +## <http://www.OpenLDAP.org/license.html>. 
+## + +# Get version +fetch_version(MDBX "${CMAKE_CURRENT_SOURCE_DIR}/../VERSION") +message(STATUS "libmdbx version is ${MDBX_VERSION}") + +if(MDBX_ALLOY_MODE) + set(LIBMDBX_SOURCES alloy.c) +else() + if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(LIBMDBX_OSAL windows) + else() + set(LIBMDBX_OSAL posix) + endif() + set(LIBMDBX_SOURCES + elements/options.h elements/defs.h elements/internals.h elements/osal.h + elements/core.c elements/osal.c elements/lck-${LIBMDBX_OSAL}.c) +endif() +list(APPEND LIBMDBX_SOURCES ../mdbx.h + "${CMAKE_CURRENT_SOURCE_DIR}/elements/version.c" + "${CMAKE_CURRENT_SOURCE_DIR}/elements/config.h") + +if(MDBX_BUILD_SHARED_LIBRARY) + add_library(mdbx SHARED ${LIBMDBX_SOURCES}) + target_compile_definitions(mdbx PRIVATE LIBMDBX_EXPORTS INTERFACE LIBMDBX_IMPORTS) + set(MDBX_LIBDEP_MODE PRIVATE) +else() + add_library(mdbx STATIC ${LIBMDBX_SOURCES}) + set(MDBX_LIBDEP_MODE PUBLIC) +endif() + +if(CC_HAS_VISIBILITY AND (LTO_ENABLED OR INTERPROCEDURAL_OPTIMIZATION)) + set_target_properties(mdbx PROPERTIES LINK_FLAGS "-fvisibility=hidden") +endif() + +if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + if(MSVC) + if(NOT MSVC_LIB_EXE) + # Find lib.exe + get_filename_component(CL_NAME ${CMAKE_C_COMPILER} NAME) + string(REPLACE cl.exe lib.exe MSVC_LIB_EXE ${CL_NAME}) + find_program(MSVC_LIB_EXE ${MSVC_LIB_EXE}) + endif() + if(MSVC_LIB_EXE) + message(STATUS "Found MSVC's lib tool: ${MSVC_LIB_EXE}") + set(MDBX_NTDLL_EXTRA_IMPLIB ${CMAKE_CURRENT_BINARY_DIR}/mdbx_ntdll_extra.lib) + add_custom_command(OUTPUT ${MDBX_NTDLL_EXTRA_IMPLIB} + COMMENT "Create extra-import-library for ntdll.dll" + MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/elements/ntdll.def" + COMMAND ${MSVC_LIB_EXE} /def:"${CMAKE_CURRENT_SOURCE_DIR}/elements/ntdll.def" /out:"${MDBX_NTDLL_EXTRA_IMPLIB}" ${INITIAL_CMAKE_STATIC_LINKER_FLAGS}) + else() + message(SEND_ERROR "MSVC's lib tool not found") + endif() + elseif(MINGW OR MINGW64) + if(NOT DLLTOOL) + # Find dlltool + get_filename_component(GCC_NAME 
${CMAKE_C_COMPILER} NAME) + string(REPLACE gcc dlltool DLLTOOL_NAME ${GCC_NAME}) + find_program(DLLTOOL NAMES ${DLLTOOL_NAME}) + endif() + if(DLLTOOL) + message(STATUS "Found dlltool: ${DLLTOOL}") + set(MDBX_NTDLL_EXTRA_IMPLIB "${CMAKE_CURRENT_BINARY_DIR}/mdbx_ntdll_extra.a") + add_custom_command(OUTPUT ${MDBX_NTDLL_EXTRA_IMPLIB} + COMMENT "Create extra-import-library for ntdll.dll" + MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/elements/ntdll.def" + COMMAND ${DLLTOOL} -d "${CMAKE_CURRENT_SOURCE_DIR}/elements/ntdll.def" -l "${MDBX_NTDLL_EXTRA_IMPLIB}") + else() + message(SEND_ERROR "dlltool not found") + endif() + endif() +endif() + +target_link_libraries(mdbx ${MDBX_LIBDEP_MODE} ${CMAKE_THREAD_LIBS_INIT}) +if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + target_link_libraries(mdbx ${MDBX_LIBDEP_MODE} ntdll.lib) + if(MDBX_NTDLL_EXTRA_IMPLIB) + # LY: Sometimes CMake requires a nightmarish magic for simple things. + # 1) create a target out of the library compilation result + add_custom_target(ntdll_extra_target DEPENDS ${MDBX_NTDLL_EXTRA_IMPLIB}) + # 2) create a library target out of the library compilation result + add_library(ntdll_extra STATIC IMPORTED GLOBAL) + add_dependencies(ntdll_extra ntdll_extra_target) + # 3) specify where the library is (and where to find the headers) + set_target_properties(ntdll_extra + PROPERTIES + IMPORTED_LOCATION ${MDBX_NTDLL_EXTRA_IMPLIB}) + target_link_libraries(mdbx ${MDBX_LIBDEP_MODE} ntdll_extra) + endif() +endif() + +if(${CMAKE_SYSTEM_NAME} STREQUAL "SunOS" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Solaris") + target_link_libraries(mdbx ${MDBX_LIBDEP_MODE} kstat) +endif() + +set_target_properties(mdbx PROPERTIES + INTERPROCEDURAL_OPTIMIZATION $<BOOL:${INTERPROCEDURAL_OPTIMIZATION}> + C_STANDARD ${MDBX_C_STANDARD} C_STANDARD_REQUIRED ON + PUBLIC_HEADER "../mdbx.h") + +if(CC_HAS_FASTMATH) + target_compile_options(mdbx PRIVATE "-ffast-math") +endif() +if(BUILD_FOR_NATIVE_CPU AND CC_HAS_ARCH_NATIVE) + target_compile_options(mdbx PUBLIC
"-march=native") +endif() +if(CC_HAS_VISIBILITY) + target_compile_options(mdbx PRIVATE "-fvisibility=hidden") +endif() + +################################################################################ +# +# library build info (used in library version output) +# +set(MDBX_BUILD_FLAGS "") + +# append cmake's build-type flags and defines +if(NOT CMAKE_CONFIGURATION_TYPES) + list(APPEND MDBX_BUILD_FLAGS ${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPERCASE}}) + list(APPEND MDBX_BUILD_FLAGS ${CMAKE_C_DEFINES_${CMAKE_BUILD_TYPE_UPPERCASE}}) +endif() + +# append linker dll's options +if(LIBMDBX_TYPE STREQUAL "SHARED") + list(APPEND MDBX_BUILD_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}) +endif() + +# get definitions +get_target_property(defs_list mdbx COMPILE_DEFINITIONS) +if(defs_list) + list(APPEND MDBX_BUILD_FLAGS ${defs_list}) +endif() + +# get target compile options +get_target_property(options_list mdbx COMPILE_OPTIONS) +if(options_list) + list(APPEND MDBX_BUILD_FLAGS ${options_list}) +endif() + +list(REMOVE_DUPLICATES MDBX_BUILD_FLAGS) +string(REPLACE ";" " " MDBX_BUILD_FLAGS "${MDBX_BUILD_FLAGS}") +if(CMAKE_CONFIGURATION_TYPES) + # add dynamic part via per-configuration define + message(STATUS "MDBX Compile Flags: ${MDBX_BUILD_FLAGS} <AND CONFIGURATION DEPENDENT>") + add_definitions(-DMDBX_BUILD_FLAGS_CONFIG="$<$<CONFIG:Debug>:${CMAKE_C_FLAGS_DEBUG} ${CMAKE_C_DEFINES_DEBUG}>$<$<CONFIG:Release>:${CMAKE_C_FLAGS_RELEASE} ${CMAKE_C_DEFINES_RELEASE}>$<$<CONFIG:RelWithDebInfo>:${CMAKE_C_FLAGS_RELWITHDEBINFO} ${CMAKE_C_DEFINES_RELWITHDEBINFO}>$<$<CONFIG:MinSizeRel>:${CMAKE_C_FLAGS_MINSIZEREL} ${CMAKE_C_DEFINES_MINSIZEREL}>") +else() + message(STATUS "MDBX Compile Flags: ${MDBX_BUILD_FLAGS}") +endif() + +# get compiler info +execute_process(COMMAND sh -c "${CMAKE_C_COMPILER} --version | head -1" + OUTPUT_VARIABLE MDBX_BUILD_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE rc) +if(rc OR NOT MDBX_BUILD_COMPILER) + string(STRIP 
"${CMAKE_C_COMPILER_ID}-${CMAKE_C_COMPILER_VERSION}" MDBX_BUILD_COMPILER) +endif() + +# make a build-target triplet +if(CMAKE_C_COMPILER_TARGET) + set(MDBX_BUILD_TARGET "${CMAKE_C_COMPILER_TARGET}") +elseif(CMAKE_C_PLATFORM_ID AND NOT CMAKE_C_PLATFORM_ID STREQUAL CMAKE_SYSTEM_NAME) + string(STRIP "${CMAKE_C_PLATFORM_ID}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +elseif(CMAKE_LIBRARY_ARCHITECTURE) + string(STRIP "${CMAKE_LIBRARY_ARCHITECTURE}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +elseif(CMAKE_GENERATOR_PLATFORM AND NOT CMAKE_C_PLATFORM_ID STREQUAL CMAKE_SYSTEM_NAME) + string(STRIP "${CMAKE_GENERATOR_PLATFORM}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +elseif(CMAKE_SYSTEM_ARCH) + string(STRIP "${CMAKE_SYSTEM_ARCH}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +else() + string(STRIP "${CMAKE_SYSTEM_PROCESSOR}-${CMAKE_SYSTEM_NAME}" MDBX_BUILD_TARGET) +endif() +if(CMAKE_CONFIGURATION_TYPES) + add_definitions(-DMDBX_BUILD_CONFIG="$<CONFIG>") +else() + set(MDBX_BUILD_CONFIG ${CMAKE_BUILD_TYPE}) +endif() + +# options +string(TIMESTAMP MDBX_BUILD_TIMESTAMP UTC) +set(options VERSION C_COMPILER CXX_COMPILER) +foreach(item IN LISTS options) + if(DEFINED ${item}) + set(value "${${item}}") + elseif(DEFINED MDBX_${item}) + set(item MDBX_${item}) + set(value "${${item}}") + elseif(DEFINED CMAKE_${item}) + set(item CMAKE_${item}) + set(value "${${item}}") + else() + set(value "undefined") + endif() + message(STATUS "${item}: ${value}") +endforeach(item) + +# generate version and config files +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/elements/version.c.in" + "${CMAKE_CURRENT_SOURCE_DIR}/elements/version.c" ESCAPE_QUOTES) + +file(SHA256 "${CMAKE_CURRENT_SOURCE_DIR}/elements/version.c" MDBX_SOURCERY_DIGEST) +string(MAKE_C_IDENTIFIER "${MDBX_GIT_DESCRIBE}" MDBX_SOURCERY_SUFFIX) +set(MDBX_BUILD_SOURCERY "${MDBX_SOURCERY_DIGEST}_${MDBX_SOURCERY_SUFFIX}") + +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/elements/config.h.in" + "${CMAKE_CURRENT_SOURCE_DIR}/elements/config.h" ESCAPE_QUOTES) 
+add_definitions(-DMDBX_CONFIG_H="config.h") + +# installation +if(MDBX_BUILD_SHARED_LIBRARY) + if(CMAKE_VERSION VERSION_LESS 3.12) + install(TARGETS mdbx EXPORT libmdbx + RUNTIME + DESTINATION bin + COMPONENT runtime + LIBRARY + DESTINATION lib + COMPONENT runtime + OBJECTS + DESTINATION lib + COMPONENT devel + ARCHIVE + DESTINATION lib + COMPONENT devel + PUBLIC_HEADER + DESTINATION include + COMPONENT devel + INCLUDES + DESTINATION include + COMPONENT devel) + else() + install(TARGETS mdbx EXPORT libmdbx + RUNTIME + DESTINATION bin + COMPONENT runtime + LIBRARY + DESTINATION lib + COMPONENT runtime + NAMELINK_COMPONENT devel + OBJECTS + DESTINATION lib + COMPONENT devel + ARCHIVE + DESTINATION lib + COMPONENT devel + PUBLIC_HEADER + DESTINATION include + COMPONENT devel + INCLUDES + DESTINATION include + COMPONENT devel) + endif() +endif() + +add_subdirectory(tools) diff --git a/libs/libmdbx/src/src/alloy.c b/libs/libmdbx/src/src/alloy.c new file mode 100644 index 0000000000..98f3aac3a9 --- /dev/null +++ b/libs/libmdbx/src/src/alloy.c @@ -0,0 +1,26 @@ +/* + * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. 
*/ + +/* Amalgamated build */ +#define MDBX_ALLOY 1 +#include "elements/internals.h" /* must be included first */ + +#include "elements/core.c" +#include "elements/osal.c" +#include "elements/version.c" + +#if defined(_WIN32) || defined(_WIN64) +#include "elements/lck-windows.c" +#else +#include "elements/lck-posix.c" +#endif diff --git a/libs/libmdbx/src/src/elements/config.h.in b/libs/libmdbx/src/src/elements/config.h.in new file mode 100644 index 0000000000..1079899d9a --- /dev/null +++ b/libs/libmdbx/src/src/elements/config.h.in @@ -0,0 +1,55 @@ +/* This is CMake-template for libmdbx's config.h + ******************************************************************************/ + +/* *INDENT-OFF* */ +/* clang-format off */ + +#cmakedefine HAVE_VALGRIND_MEMCHECK_H +#cmakedefine HAS_RELAXED_CONSTEXPR + +#cmakedefine LTO_ENABLED +#cmakedefine MDBX_USE_VALGRIND +#cmakedefine ENABLE_GPROF +#cmakedefine ENABLE_GCOV +#cmakedefine ENABLE_ASAN + +/* Common */ +#cmakedefine01 MDBX_TXN_CHECKOWNER +#cmakedefine MDBX_TXN_CHECKPID_AUTO +#ifndef MDBX_TXN_CHECKPID_AUTO +#cmakedefine01 MDBX_TXN_CHECKPID +#endif +#cmakedefine01 MDBX_BUILD_SHARED_LIBRARY +#cmakedefine MDBX_LOCKING_AUTO +#ifndef MDBX_LOCKING_AUTO +#cmakedefine MDBX_LOCKING @MDBX_LOCKING@ +#endif +#cmakedefine MDBX_TRUST_RTC_AUTO +#ifndef MDBX_TRUST_RTC_AUTO +#cmakedefine01 MDBX_TRUST_RTC +#endif + +/* Windows */ +#cmakedefine01 MDBX_CONFIG_MANUAL_TLS_CALLBACK +#cmakedefine01 MDBX_AVOID_CRT + +/* MacOS */ +#cmakedefine01 MDBX_OSX_SPEED_INSTEADOF_DURABILITY + +/* POSIX */ +#cmakedefine01 MDBX_DISABLE_GNU_SOURCE +#cmakedefine MDBX_USE_OFDLOCKS_AUTO +#ifndef MDBX_USE_OFDLOCKS_AUTO +#cmakedefine01 MDBX_USE_OFDLOCKS +#endif + +/* Build Info */ +#cmakedefine MDBX_BUILD_TIMESTAMP "@MDBX_BUILD_TIMESTAMP@" +#cmakedefine MDBX_BUILD_TARGET "@MDBX_BUILD_TARGET@" +#cmakedefine MDBX_BUILD_CONFIG "@MDBX_BUILD_CONFIG@" +#cmakedefine MDBX_BUILD_COMPILER "@MDBX_BUILD_COMPILER@" +#cmakedefine MDBX_BUILD_FLAGS "@MDBX_BUILD_FLAGS@"
+#cmakedefine MDBX_BUILD_SOURCERY @MDBX_BUILD_SOURCERY@ + +/* *INDENT-ON* */ +/* clang-format on */ diff --git a/libs/libmdbx/src/src/mdbx.c b/libs/libmdbx/src/src/elements/core.c index 35d8ae1b68..8cd92760ef 100644 --- a/libs/libmdbx/src/src/mdbx.c +++ b/libs/libmdbx/src/src/elements/core.c @@ -1,4 +1,4 @@ -/* +/* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -35,132 +35,550 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#include "./bits.h" +#include "internals.h" -/*----------------------------------------------------------------------------*/ -/* Internal inlines */ +/*------------------------------------------------------------------------------ + * Internal inlines */ -static __inline bool mdbx_is_power2(size_t x) { return (x & (x - 1)) == 0; } +static __pure_function __inline bool is_powerof2(size_t x) { + return (x & (x - 1)) == 0; +} -static __inline size_t mdbx_roundup2(size_t value, size_t granularity) { - assert(mdbx_is_power2(granularity)); +static __pure_function __inline size_t roundup_powerof2(size_t value, + size_t granularity) { + assert(is_powerof2(granularity)); return (value + granularity - 1) & ~(granularity - 1); } -static __inline unsigned mdbx_log2(size_t value) { - assert(mdbx_is_power2(value)); +static __pure_function unsigned log2n(size_t value) { + assert(value > 0 && value < INT32_MAX && is_powerof2(value)); + assert((value & -(int32_t)value) == value); +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) + return __builtin_ctzl(value); +#elif defined(_MSC_VER) + unsigned long index; + _BitScanForward(&index, (unsigned long)value); + return index; +#else + static const uint8_t debruijn_ctz32[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; + return 
debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27]; +#endif +} + +/*------------------------------------------------------------------------------ + * Unaligned access */ + +static __pure_function __maybe_unused __inline unsigned +field_alignment(unsigned alignment_baseline, size_t field_offset) { + unsigned merge = alignment_baseline | (unsigned)field_offset; + return merge & -(int)merge; +} + +/* read-thunk for UB-sanitizer */ +static __pure_function __inline uint8_t peek_u8(const uint8_t *ptr) { + return *ptr; +} + +/* write-thunk for UB-sanitizer */ +static __inline void poke_u8(uint8_t *ptr, const uint8_t v) { *ptr = v; } - unsigned log = 0; - while (value > 1) { - log += 1; - value >>= 1; +static __pure_function __inline uint16_t +unaligned_peek_u16(const unsigned expected_alignment, const void *ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint16_t)) == 0) + return *(const uint16_t *)ptr; + else { + uint16_t v; + memcpy(&v, ptr, sizeof(v)); + return v; } - return log; } -/* Address of node i in page p */ -static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { - assert(NUMKEYS(p) > (unsigned)(i)); - return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEHDRSZ); +static __inline void unaligned_poke_u16(const unsigned expected_alignment, + void *ptr, const uint16_t v) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0) + *(uint16_t *)ptr = v; + else + memcpy(ptr, &v, sizeof(v)); +} + +static __pure_function __inline uint32_t +unaligned_peek_u32(const unsigned expected_alignment, const void *ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint32_t)) == 0) + return *(const uint32_t *)ptr; + else if ((expected_alignment % sizeof(uint16_t)) == 0) { + const uint16_t lo = + ((const uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; + const 
uint16_t hi = + ((const uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; + return lo | (uint32_t)hi << 16; + } else { + uint32_t v; + memcpy(&v, ptr, sizeof(v)); + return v; + } } -/* Get the page number pointed to by a branch node */ -static __inline pgno_t NODEPGNO(const MDBX_node *node) { - pgno_t pgno; - if (UNALIGNED_OK) { - pgno = node->mn_ksize_and_pgno; - if (sizeof(pgno_t) > 4) - pgno &= MAX_PAGENO; +static __inline void unaligned_poke_u32(const unsigned expected_alignment, + void *ptr, const uint32_t v) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0) + *(uint32_t *)ptr = v; + else if ((expected_alignment % sizeof(uint16_t)) == 0) { + ((uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint16_t)v; + ((uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = + (uint16_t)(v >> 16); + } else + memcpy(ptr, &v, sizeof(v)); +} + +static __pure_function __inline uint64_t +unaligned_peek_u64(const unsigned expected_alignment, const void *ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint64_t)) == 0) + return *(const uint64_t *)ptr; + else if ((expected_alignment % sizeof(uint32_t)) == 0) { + const uint32_t lo = + ((const uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; + const uint32_t hi = + ((const uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; + return lo | (uint64_t)hi << 32; } else { - pgno = node->mn_lo | ((pgno_t)node->mn_hi << 16); - if (sizeof(pgno_t) > 4) - pgno |= ((uint64_t)node->mn_flags) << 32; + uint64_t v; + memcpy(&v, ptr, sizeof(v)); + return v; } +} + +static __inline void unaligned_poke_u64(const unsigned expected_alignment, + void *ptr, const uint64_t v) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0) + *(uint64_t *)ptr = v; + else if ((expected_alignment % sizeof(uint32_t)) == 0) { + 
((uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint32_t)v; + ((uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = + (uint32_t)(v >> 32); + } else + memcpy(ptr, &v, sizeof(v)); +} + +#define UNALIGNED_PEEK_8(ptr, struct, field) \ + peek_u8((const uint8_t *)(ptr) + offsetof(struct, field)) +#define UNALIGNED_POKE_8(ptr, struct, field, value) \ + poke_u8((uint8_t *)(ptr) + offsetof(struct, field), value) + +#define UNALIGNED_PEEK_16(ptr, struct, field) \ + unaligned_peek_u16(1, (const char *)(ptr) + offsetof(struct, field)) +#define UNALIGNED_POKE_16(ptr, struct, field, value) \ + unaligned_poke_u16(1, (char *)(ptr) + offsetof(struct, field), value) + +#define UNALIGNED_PEEK_32(ptr, struct, field) \ + unaligned_peek_u32(1, (const char *)(ptr) + offsetof(struct, field)) +#define UNALIGNED_POKE_32(ptr, struct, field, value) \ + unaligned_poke_u32(1, (char *)(ptr) + offsetof(struct, field), value) + +#define UNALIGNED_PEEK_64(ptr, struct, field) \ + unaligned_peek_u64(1, (const char *)(ptr) + offsetof(struct, field)) +#define UNALIGNED_POKE_64(ptr, struct, field, value) \ + unaligned_poke_u64(1, (char *)(ptr) + offsetof(struct, field), value) + +/* Get the page number pointed to by a branch node */ +static __pure_function __inline pgno_t node_pgno(const MDBX_node *node) { + pgno_t pgno = UNALIGNED_PEEK_32(node, MDBX_node, mn_pgno32); + if (sizeof(pgno) > 4) + pgno |= ((uint64_t)UNALIGNED_PEEK_8(node, MDBX_node, mn_extra)) << 32; return pgno; } /* Set the page number in a branch node */ -static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { - assert(pgno <= MAX_PAGENO); +static __inline void node_set_pgno(MDBX_node *node, pgno_t pgno) { + assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO); - if (UNALIGNED_OK) { - if (sizeof(pgno_t) > 4) - pgno |= ((uint64_t)node->mn_ksize) << 48; - node->mn_ksize_and_pgno = pgno; - } else { - node->mn_lo = (uint16_t)pgno; - node->mn_hi = (uint16_t)(pgno >> 16); - if (sizeof(pgno_t) > 4) - node->mn_flags = 
(uint16_t)((uint64_t)pgno >> 32); - } + UNALIGNED_POKE_32(node, MDBX_node, mn_pgno32, (uint32_t)pgno); + if (sizeof(pgno) > 4) + UNALIGNED_POKE_8(node, MDBX_node, mn_extra, + (uint8_t)((uint64_t)pgno >> 32)); } /* Get the size of the data in a leaf node */ -static __inline size_t NODEDSZ(const MDBX_node *node) { - size_t size; - if (UNALIGNED_OK) { - size = node->mn_dsize; - } else { - size = node->mn_lo | ((size_t)node->mn_hi << 16); - } - return size; +static __pure_function __inline size_t node_ds(const MDBX_node *node) { + return UNALIGNED_PEEK_32(node, MDBX_node, mn_dsize); } /* Set the size of the data for a leaf node */ -static __inline void SETDSZ(MDBX_node *node, size_t size) { +static __inline void node_set_ds(MDBX_node *node, size_t size) { assert(size < INT_MAX); - if (UNALIGNED_OK) { - node->mn_dsize = (uint32_t)size; - } else { - node->mn_lo = (uint16_t)size; - node->mn_hi = (uint16_t)(size >> 16); + UNALIGNED_POKE_32(node, MDBX_node, mn_dsize, (uint32_t)size); +} + +/* The size of a key in a node */ +static __pure_function __inline size_t node_ks(const MDBX_node *node) { + return UNALIGNED_PEEK_16(node, MDBX_node, mn_ksize); +} + +/* Set the size of the key for a leaf node */ +static __inline void node_set_ks(MDBX_node *node, size_t size) { + assert(size < INT16_MAX); + UNALIGNED_POKE_16(node, MDBX_node, mn_ksize, (uint16_t)size); +} + +static __pure_function __inline uint8_t node_flags(const MDBX_node *node) { + return UNALIGNED_PEEK_8(node, MDBX_node, mn_flags); +} + +static __inline void node_set_flags(MDBX_node *node, uint8_t flags) { + UNALIGNED_POKE_8(node, MDBX_node, mn_flags, flags); +} + +/* Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDBX_node, mn_data) + +/* Address of the key for the node */ +static __pure_function __inline void *node_key(const MDBX_node *node) { + return (char *)node + NODESIZE; +} + +/* Address of the data for a node */ +static __pure_function __inline void *node_data(const 
MDBX_node *node) { + return (char *)node_key(node) + node_ks(node); +} + +/* Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. */ +static __pure_function __inline size_t node_size_len(const size_t key_len, + const size_t value_len) { + return NODESIZE + EVEN(key_len + value_len); +} +static __pure_function __inline size_t node_size(const MDBX_val *key, + const MDBX_val *value) { + return node_size_len(key ? key->iov_len : 0, value ? value->iov_len : 0); +} + +static __pure_function __inline pgno_t peek_pgno(const void *ptr) { + if (sizeof(pgno_t) == sizeof(uint32_t)) + return (pgno_t)unaligned_peek_u32(1, ptr); + else if (sizeof(pgno_t) == sizeof(uint64_t)) + return (pgno_t)unaligned_peek_u64(1, ptr); + else { + pgno_t pgno; + memcpy(&pgno, ptr, sizeof(pgno)); + return pgno; } } -static __inline size_t pgno2bytes(const MDBX_env *env, pgno_t pgno) { +static __inline void poke_pgno(void *ptr, const pgno_t pgno) { + if (sizeof(pgno) == sizeof(uint32_t)) + unaligned_poke_u32(1, ptr, pgno); + else if (sizeof(pgno) == sizeof(uint64_t)) + unaligned_poke_u64(1, ptr, pgno); + else + memcpy(ptr, &pgno, sizeof(pgno)); +} + +static __pure_function __inline pgno_t +node_largedata_pgno(const MDBX_node *node) { + assert(node_flags(node) & F_BIGDATA); + return peek_pgno(node_data(node)); +} + +/*------------------------------------------------------------------------------ + * Key length limitation factors: + * + * - Branch-page must contain at least two (MDBX_MINKEYS) nodes, + * within each a key and a child page number. But we can't split a page if + * it contains less than 4 keys.
Therefore, at least 3 branch-nodes should + * fit in the single branch-page: + * pageroom = pagesize - page_hdr_len; + * branch.maxnode = even_floor(pageroom / 3 - sizeof(indx_t)); + * branch.maxkey = branch.maxnode - node_hdr_len; + * + * - Leaf-node of non-dupsort database must fit into one leaf-page, + * where a value could be placed on a large/overflow page: + * leaf.maxnode = even_floor(pageroom - sizeof(indx_t)); + * leaf.maxkey = leaf.maxnode - node_hdr_len - sizeof(pgno_t); + * + * - SubDatabase-node must fit into one leaf-page: + * subdb.maxname = leaf.maxnode - node_hdr_len - sizeof(MDBX_db); + * + * - Dupsort values themselves are keys in a dupsort-subdb and couldn't be + * longer than the branch.maxkey. But dupsort node must fit into one + * leaf-page, since dupsort value couldn't be placed on a large/overflow + * page. + * + * - So, the simplest solution is to use half of branch.maxkey as + * a common maxkey value. Nevertheless, the actual values of maxkey are: + * nondupsort.maxkey = even_floor(pageroom / 3) + * - sizeof(indx_t) - node_hdr_len; + * dupsort.maxkey(value) = min(nondupsort.maxkey, + * leaf.maxnode - even_ceil(length(value))); + */ + +#define PAGEROOM(pagesize) ((pagesize)-PAGEHDRSZ) +#define EVEN_FLOOR(n) ((n) & ~1ul) +#define BRANCH_NODEMAX(pagesize) \ + (EVEN_FLOOR(PAGEROOM(pagesize) / (MDBX_MINKEYS * 2 - 1)) - sizeof(indx_t)) +#define LEAF_NODEMAX(pagesize) (PAGEROOM(pagesize) - sizeof(indx_t)) +#define MAX_GC1OVPAGE(pagesize) (PAGEROOM(pagesize) / sizeof(pgno_t) - 1) + +__cold int mdbx_env_get_maxkeysize(MDBX_env *env) { + return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT); +} + +__cold int mdbx_env_get_maxkeysize_ex(MDBX_env *env, unsigned flags) { + if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) + return -1; + + return (int)mdbx_limits_keysize_max((intptr_t)env->me_psize, flags); +} + +__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize, unsigned flags) { + if (pagesize < 1) + pagesize =
(intptr_t)mdbx_syspagesize(); + if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; + + STATIC_ASSERT(BRANCH_NODEMAX(MIN_PAGESIZE) - NODESIZE - sizeof(pgno_t) >= 8); + STATIC_ASSERT(LEAF_NODEMAX(MIN_PAGESIZE) - NODESIZE - sizeof(pgno_t) >= 8); + STATIC_ASSERT(LEAF_NODEMAX(MIN_PAGESIZE) - NODESIZE >= sizeof(MDBX_db)); + if (flags & MDBX_INTEGERKEY) + return 8 /* sizeof(uint64_t) */; + + STATIC_ASSERT(BRANCH_NODEMAX(MAX_PAGESIZE) - NODESIZE - sizeof(pgno_t) < + LEAF_NODEMAX(MAX_PAGESIZE) - NODESIZE - sizeof(MDBX_db)); + STATIC_ASSERT(BRANCH_NODEMAX(MIN_PAGESIZE) - NODESIZE - sizeof(pgno_t) < + LEAF_NODEMAX(MIN_PAGESIZE) - NODESIZE - sizeof(MDBX_db)); + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) + return BRANCH_NODEMAX(pagesize) - NODESIZE - sizeof(MDBX_db); + + return BRANCH_NODEMAX(pagesize) - NODESIZE - sizeof(pgno_t); +} + +__cold int mdbx_env_get_maxvalsize_ex(MDBX_env *env, unsigned flags) { + if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) + return -1; + + return (int)mdbx_limits_valsize_max((intptr_t)env->me_psize, flags); +} + +__cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, unsigned flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_syspagesize(); + if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; + + if (flags & MDBX_INTEGERDUP) + return 8 /* sizeof(uint64_t) */; + + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) + return BRANCH_NODEMAX(pagesize) - NODESIZE; + + const unsigned page_ln2 = log2n(pagesize); + const size_t hard = 0x7FF00000ul; + const size_t hard_pages = hard >> page_ln2; + const size_t limit = (hard_pages < MDBX_DPL_TXNFULL) + ? hard + : ((size_t)MDBX_DPL_TXNFULL << page_ln2); + return (limit < MAX_MAPSIZE) ? 
limit / 2 : MAX_MAPSIZE / 2; +} + +/* Calculate the size of a leaf node. + * + * The size depends on the environment's page size; if a data item + * is too large it will be put onto an overflow page and the node + * size will only include the key and not the data. Sizes are always + * rounded up to an even number of bytes, to guarantee 2-byte alignment + * of the MDBX_node headers. */ +static __pure_function __inline size_t +leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { + size_t node_bytes = node_size(key, data); + /* NOTE: The actual limit is LEAF_NODEMAX(env->me_psize), but it reasonable to + * use env->me_branch_nodemax (which is 3 times less) as the treshold because: + * - Large threshold implies that any insertion/update could result split + * a single leaf page to THREE, which requires TWO insertion into parent + * branch page, then could leads to split parent page and so on up to + * the root. Such double-splitting is complex, ie costly (in case simple + * clear implementation) either dangerous (in case high-optimized + * implementation). + * - This does not affect capabilities, i.e. it does not limit the maximum + * key size. + * - At a lower threshold, on average, the density of keys on leaf pages + * increases and the height of the tree decreases. Thus, this lead the + * less number of pages participating in the search, and the search + * speed increases. + * - On the other hand, there is no universal gold ratio here and with a + * smaller threshold, we will create more overflows/large pages, + * i.e. the database size will be larger as will the IOPS volume. + * + * So, the lower threshold is not a silver bullet, but it allow implementation + * to be much simple and robust, without adding a flaws. */ + if (node_bytes > env->me_branch_nodemax) { + /* put on overflow page */ + node_bytes = node_size(key, nullptr) + sizeof(pgno_t); + } + + return node_bytes + sizeof(indx_t); +} + +/* Calculate the size of a branch node. 
+ * + * The size should depend on the environment's page size but since + * we currently don't support spilling large keys onto overflow + * pages, it's simply the size of the MDBX_node header plus the + * size of the key. Sizes are always rounded up to an even number + * of bytes, to guarantee 2-byte alignment of the MDBX_node headers. + * + * [in] env The environment handle. + * [in] key The key for the node. + * + * Returns The number of bytes needed to store the node. */ +static __pure_function __inline size_t branch_size(const MDBX_env *env, + const MDBX_val *key) { + /* Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. */ + size_t node_bytes = node_size(key, nullptr); + if (unlikely(node_bytes > env->me_branch_nodemax)) { + /* put on overflow page */ + /* not implemented */ + mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __func__, + __LINE__); + node_bytes = node_size(key, nullptr) + sizeof(pgno_t); + } + + return node_bytes + sizeof(indx_t); +} + +/*----------------------------------------------------------------------------*/ + +static __pure_function __inline size_t pgno2bytes(const MDBX_env *env, + pgno_t pgno) { mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize); return ((size_t)pgno) << env->me_psize2log; } -static __inline MDBX_page *pgno2page(const MDBX_env *env, pgno_t pgno) { +static __pure_function __inline MDBX_page *pgno2page(const MDBX_env *env, + pgno_t pgno) { return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); } -static __inline pgno_t bytes2pgno(const MDBX_env *env, size_t bytes) { +static __pure_function __inline pgno_t bytes2pgno(const MDBX_env *env, + size_t bytes) { mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1); return (pgno_t)(bytes >> env->me_psize2log); } -static __inline size_t pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) { - return mdbx_roundup2(pgno2bytes(env, pgno), env->me_os_psize); +static __pure_function 
__inline size_t pgno_align2os_bytes(const MDBX_env *env, + pgno_t pgno) { + return roundup_powerof2(pgno2bytes(env, pgno), env->me_os_psize); } -static __inline pgno_t pgno_align2os_pgno(const MDBX_env *env, pgno_t pgno) { +static __pure_function __inline pgno_t pgno_align2os_pgno(const MDBX_env *env, + pgno_t pgno) { return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); } -/* Perform act while tracking temporary cursor mn */ -#define WITH_CURSOR_TRACKING(mn, act) \ - do { \ - mdbx_cassert(&(mn), \ - mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ - MDBX_cursor mc_dummy, *tracked, \ - **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ - if ((mn).mc_flags & C_SUB) { \ - mc_dummy.mc_flags = C_INITIALIZED; \ - mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn); \ - tracked = &mc_dummy; \ - } else { \ - tracked = &(mn); \ - } \ - tracked->mc_next = *tp; \ - *tp = tracked; \ - { act; } \ - *tp = tracked->mc_next; \ - } while (0) +static __pure_function __inline size_t bytes_align2os_bytes(const MDBX_env *env, + size_t bytes) { + return roundup_powerof2(roundup_powerof2(bytes, env->me_psize), + env->me_os_psize); +} -/*----------------------------------------------------------------------------*/ -/* LY: temporary workaround for Elbrus's memcmp() bug. 
*/ +/* Address of first usable data byte in a page, after the header */ +static __pure_function __inline void *page_data(const MDBX_page *mp) { + return (char *)mp + PAGEHDRSZ; +} + +static __pure_function __inline const MDBX_page *data_page(const void *data) { + return container_of(data, MDBX_page, mp_ptrs); +} + +static __pure_function __inline MDBX_meta *page_meta(MDBX_page *mp) { + return (MDBX_meta *)page_data(mp); +} + +/* Number of nodes on a page */ +static __pure_function __inline unsigned page_numkeys(const MDBX_page *mp) { + return mp->mp_lower >> 1; +} + +/* The amount of space remaining in the page */ +static __pure_function __inline unsigned page_room(const MDBX_page *mp) { + return mp->mp_upper - mp->mp_lower; +} + +static __pure_function __inline unsigned page_space(const MDBX_env *env) { + STATIC_ASSERT(PAGEHDRSZ % 2 == 0); + return env->me_psize - PAGEHDRSZ; +} + +static __pure_function __inline unsigned page_used(const MDBX_env *env, + const MDBX_page *mp) { + return page_space(env) - page_room(mp); +} + +/* The percentage of space used in the page, in a percents. */ +static __pure_function __maybe_unused __inline double +page_fill(const MDBX_env *env, const MDBX_page *mp) { + return page_used(env, mp) * 100.0 / page_space(env); +} + +static __pure_function __inline bool +page_fill_enough(const MDBX_page *mp, unsigned spaceleft_threshold, + unsigned minkeys_threshold) { + return page_room(mp) < spaceleft_threshold && + page_numkeys(mp) >= minkeys_threshold; +} + +/* The number of overflow pages needed to store the given size. 
*/ +static __pure_function __inline pgno_t number_of_ovpages(const MDBX_env *env, + size_t bytes) { + return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1; +} + +/* Address of node i in page p */ +static __pure_function __inline MDBX_node *page_node(const MDBX_page *mp, + unsigned i) { + assert((mp->mp_flags & (P_LEAF2 | P_OVERFLOW | P_META)) == 0); + assert(page_numkeys(mp) > (unsigned)(i)); + assert(mp->mp_ptrs[i] % 2 == 0); + return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ); +} + +/* The address of a key in a LEAF2 page. + * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. + * There are no node headers, keys are stored contiguously. */ +static __pure_function __inline void * +page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) { + assert(mp->mp_leaf2_ksize == keysize); + (void)keysize; + return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize); +} + +/* Set the node's key into keyptr. */ +static __inline void get_key(const MDBX_node *node, MDBX_val *keyptr) { + keyptr->iov_len = node_ks(node); + keyptr->iov_base = node_key(node); +} + +/* Set the node's key into keyptr, if requested. */ +static __inline void get_key_optional(const MDBX_node *node, + MDBX_val *keyptr /* __may_null */) { + if (keyptr) + get_key(node, keyptr); +} + +/*------------------------------------------------------------------------------ + * LY: temporary workaround for Elbrus's memcmp() bug. */ #if defined(__e2k__) && !__GLIBC_PREREQ(2, 24) int __hot mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, @@ -272,13 +690,229 @@ size_t __hot mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { } #endif /* Elbrus's memcmp() bug. */ +/*------------------------------------------------------------------------------ + * safe read/write volatile 64-bit fields on 32-bit architectures. 
*/ + +static __inline void atomic_yield(void) { +#if defined(_WIN32) || defined(_WIN64) + YieldProcessor(); +#elif defined(__x86_64__) || defined(__i386__) || defined(__e2k__) + __builtin_ia32_pause(); +#elif defined(__ia64__) +#if defined(__HP_cc__) || defined(__HP_aCC__) + _Asm_hint(_HINT_PAUSE); +#else + __asm__ __volatile__("hint @pause"); +#endif +#elif defined(__arm__) || defined(__aarch64__) +#ifdef __CC_ARM + __yield(); +#else + __asm__ __volatile__("yield"); +#endif +#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && \ + __mips_isa_rev >= 2 + __asm__ __volatile__("pause"); +#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ + defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ + defined(__MWERKS__) || defined(__sgi) + __asm__ __volatile__(".word 0x00000140"); +#else + pthread_yield(); +#endif +} + +#if MDBX_64BIT_CAS +static __inline bool atomic_cas64(volatile uint64_t *p, uint64_t c, + uint64_t v) { +#if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_LLONG_LOCK_FREE) + STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); + STATIC_ASSERT(atomic_is_lock_free(p)); + return atomic_compare_exchange_strong((_Atomic uint64_t *)p, &c, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_bool_compare_and_swap(p, c, v); +#elif defined(_MSC_VER) + return c == + (uint64_t)_InterlockedCompareExchange64((volatile int64_t *)p, v, c); +#elif defined(__APPLE__) + return OSAtomicCompareAndSwap64Barrier(c, v, (volatile uint64_t *)p); +#else +#error FIXME: Unsupported compiler +#endif +} +#endif /* MDBX_64BIT_CAS */ + +static __inline bool atomic_cas32(volatile uint32_t *p, uint32_t c, + uint32_t v) { +#if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) + STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); + STATIC_ASSERT(atomic_is_lock_free(p)); + return atomic_compare_exchange_strong((_Atomic uint32_t *)p, &c, v); +#elif defined(__GNUC__) || defined(__clang__) + return 
__sync_bool_compare_and_swap(p, c, v); +#elif defined(_MSC_VER) + STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); + return c == (uint32_t)_InterlockedCompareExchange((volatile long *)p, v, c); +#elif defined(__APPLE__) + return OSAtomicCompareAndSwap32Barrier(c, v, (volatile int32_t *)p); +#else +#error FIXME: Unsupported compiler +#endif +} + +static __inline uint32_t atomic_add32(volatile uint32_t *p, uint32_t v) { +#if defined(ATOMIC_VAR_INIT) || defined(ATOMIC_INT_LOCK_FREE) + STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); + STATIC_ASSERT(atomic_is_lock_free(p)); + return atomic_fetch_add((_Atomic uint32_t *)p, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_fetch_and_add(p, v); +#elif defined(_MSC_VER) + STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); + return _InterlockedExchangeAdd((volatile long *)p, v); +#elif defined(__APPLE__) + return OSAtomicAdd32Barrier(v, (volatile int32_t *)p); +#else +#error FIXME: Unsupported compiler +#endif +} + +#define atomic_sub32(p, v) atomic_add32(p, 0 - (v)) + +static __maybe_unused __inline bool safe64_is_valid(uint64_t v) { +#if MDBX_WORDBITS >= 64 + return v < SAFE64_INVALID_THRESHOLD; +#else + return (v >> 32) != UINT32_MAX; +#endif /* MDBX_WORDBITS */ +} + +static __maybe_unused __inline bool +safe64_is_valid_ptr(const mdbx_safe64_t *ptr) { + mdbx_compiler_barrier(); +#if MDBX_64BIT_ATOMIC + return ptr->atomic < SAFE64_INVALID_THRESHOLD; +#else + return ptr->high != UINT32_MAX; +#endif /* MDBX_64BIT_ATOMIC */ +} + +static __inline uint64_t safe64_txnid_next(uint64_t txnid) { + txnid += MDBX_TXNID_STEP; +#if !MDBX_64BIT_CAS + /* avoid overflow of low-part in safe64_reset() */ + txnid += (UINT32_MAX == (uint32_t)txnid); +#endif + return txnid; +} + +static __inline void safe64_reset(mdbx_safe64_t *ptr, bool single_writer) { + mdbx_compiler_barrier(); +#if !MDBX_64BIT_CAS + if (!single_writer) { + STATIC_ASSERT(MDBX_TXNID_STEP > 1); + /* it is safe to increment 
low-part to avoid ABA, since MDBX_TXNID_STEP > 1 + * and overflow was preserved in safe64_txnid_next() */ + atomic_add32(&ptr->low, 1) /* avoid ABA in safe64_reset_compare() */; + ptr->high = UINT32_MAX /* atomically make >= SAFE64_INVALID_THRESHOLD */; + atomic_add32(&ptr->low, 1) /* avoid ABA in safe64_reset_compare() */; + } else +#else + (void)single_writer; +#endif /* !MDBX_64BIT_CAS */ +#if MDBX_64BIT_ATOMIC + ptr->atomic = UINT64_MAX; +#else + /* atomically make value >= SAFE64_INVALID_THRESHOLD */ + ptr->high = UINT32_MAX; +#endif /* MDBX_64BIT_ATOMIC */ + assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); + mdbx_flush_incoherent_cpu_writeback(); + mdbx_jitter4testing(true); +} + +static __inline bool safe64_reset_compare(mdbx_safe64_t *ptr, txnid_t compare) { + mdbx_compiler_barrier(); + /* LY: This function is used to reset `mr_txnid` from OOM-kick in case + * the asynchronously cancellation of read transaction. Therefore, + * there may be a collision between the cleanup performed here and + * asynchronous termination and restarting of the read transaction + * in another proces/thread. In general we MUST NOT reset the `mr_txnid` + * if a new transaction was started (i.e. if `mr_txnid` was changed). */ +#if MDBX_64BIT_CAS + bool rc = atomic_cas64(&ptr->inconsistent, compare, UINT64_MAX); + mdbx_flush_incoherent_cpu_writeback(); +#else + /* LY: There is no gold ratio here since shared mutex is too costly, + * in such way we must acquire/release it for every update of mr_txnid, + * i.e. twice for each read transaction). 
*/ + bool rc = false; + if (likely(ptr->low == (uint32_t)compare && + atomic_cas32(&ptr->high, (uint32_t)(compare >> 32), UINT32_MAX))) { + if (unlikely(ptr->low != (uint32_t)compare)) + atomic_cas32(&ptr->high, UINT32_MAX, (uint32_t)(compare >> 32)); + else + rc = true; + } +#endif /* MDBX_64BIT_CAS */ + mdbx_jitter4testing(true); + return rc; +} + +static __inline void safe64_write(mdbx_safe64_t *ptr, const uint64_t v) { + mdbx_compiler_barrier(); + assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); +#if MDBX_64BIT_ATOMIC + ptr->atomic = v; +#else /* MDBX_64BIT_ATOMIC */ + /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ + ptr->low = (uint32_t)v; + assert(ptr->inconsistent >= SAFE64_INVALID_THRESHOLD); + mdbx_flush_incoherent_cpu_writeback(); + mdbx_jitter4testing(true); + /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */ + ptr->high = (uint32_t)(v >> 32); +#endif /* MDBX_64BIT_ATOMIC */ + assert(ptr->inconsistent == v); + mdbx_flush_incoherent_cpu_writeback(); + mdbx_jitter4testing(true); +} + +static __always_inline uint64_t safe64_read(const mdbx_safe64_t *ptr) { + mdbx_compiler_barrier(); + mdbx_jitter4testing(true); + uint64_t v; +#if MDBX_64BIT_ATOMIC + v = ptr->atomic; +#else /* MDBX_64BIT_ATOMIC */ + uint32_t hi, lo; + do { + hi = ptr->high; + mdbx_compiler_barrier(); + mdbx_jitter4testing(true); + lo = ptr->low; + mdbx_compiler_barrier(); + mdbx_jitter4testing(true); + } while (unlikely(hi != ptr->high)); + v = lo | (uint64_t)hi << 32; +#endif /* MDBX_64BIT_ATOMIC */ + mdbx_jitter4testing(true); + return v; +} + +static __inline void safe64_update(mdbx_safe64_t *ptr, const uint64_t v) { + safe64_reset(ptr, true); + safe64_write(ptr, v); +} + /*----------------------------------------------------------------------------*/ /* rthc (tls keys and destructors) */ typedef struct rthc_entry_t { MDBX_reader *begin; MDBX_reader *end; - mdbx_thread_key_t key; + mdbx_thread_key_t thr_tls_key; + bool key_valid; } rthc_entry_t; 
#if MDBX_DEBUG @@ -287,8 +921,11 @@ typedef struct rthc_entry_t { #define RTHC_INITIAL_LIMIT 16 #endif +static bin128_t bootid; + #if defined(_WIN32) || defined(_WIN64) static CRITICAL_SECTION rthc_critical_section; +static CRITICAL_SECTION lcklist_critical_section; #else int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, void *dso_symbol) __attribute__((__weak__)); @@ -302,12 +939,13 @@ int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, } #endif /* __APPLE__ */ -static pthread_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t mdbx_rthc_cond = PTHREAD_COND_INITIALIZER; -static mdbx_thread_key_t mdbx_rthc_key; -static volatile uint32_t mdbx_rthc_pending; +static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; +static mdbx_thread_key_t rthc_key; +static volatile uint32_t rthc_pending; -static void __cold mdbx_workaround_glibc_bug21031(void) { +static void __cold workaround_glibc_bug21031(void) { /* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031 * * Due race between pthread_key_delete() and __nptl_deallocate_tsd() @@ -327,23 +965,23 @@ static unsigned rthc_count, rthc_limit; static rthc_entry_t *rthc_table; static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT]; -static __cold void mdbx_rthc_lock(void) { +static __inline void rthc_lock(void) { #if defined(_WIN32) || defined(_WIN64) EnterCriticalSection(&rthc_critical_section); #else - mdbx_ensure(nullptr, pthread_mutex_lock(&mdbx_rthc_mutex) == 0); + mdbx_ensure(nullptr, pthread_mutex_lock(&rthc_mutex) == 0); #endif } -static __cold void mdbx_rthc_unlock(void) { +static __inline void rthc_unlock(void) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(&rthc_critical_section); #else - mdbx_ensure(nullptr, pthread_mutex_unlock(&mdbx_rthc_mutex) == 0); + mdbx_ensure(nullptr, 
pthread_mutex_unlock(&rthc_mutex) == 0); #endif } -static __inline int mdbx_thread_key_create(mdbx_thread_key_t *key) { +static __inline int thread_key_create(mdbx_thread_key_t *key) { int rc; #if defined(_WIN32) || defined(_WIN64) *key = TlsAlloc(); @@ -351,21 +989,22 @@ static __inline int mdbx_thread_key_create(mdbx_thread_key_t *key) { #else rc = pthread_key_create(key, nullptr); #endif - mdbx_trace("&key = %p, value 0x%x, rc %d", key, (unsigned)*key, rc); + mdbx_trace("&key = %p, value 0x%x, rc %d", __Wpedantic_format_voidptr(key), + (unsigned)*key, rc); return rc; } -static __inline void mdbx_thread_key_delete(mdbx_thread_key_t key) { +static __inline void thread_key_delete(mdbx_thread_key_t key) { mdbx_trace("key = 0x%x", (unsigned)key); #if defined(_WIN32) || defined(_WIN64) mdbx_ensure(nullptr, TlsFree(key)); #else mdbx_ensure(nullptr, pthread_key_delete(key) == 0); - mdbx_workaround_glibc_bug21031(); + workaround_glibc_bug21031(); #endif } -static __inline void *mdbx_thread_rthc_get(mdbx_thread_key_t key) { +static __inline void *thread_rthc_get(mdbx_thread_key_t key) { #if defined(_WIN32) || defined(_WIN64) return TlsGetValue(key); #else @@ -373,7 +1012,7 @@ static __inline void *mdbx_thread_rthc_get(mdbx_thread_key_t key) { #endif } -static void mdbx_thread_rthc_set(mdbx_thread_key_t key, const void *value) { +static void thread_rthc_set(mdbx_thread_key_t key, const void *value) { #if defined(_WIN32) || defined(_WIN64) mdbx_ensure(nullptr, TlsSetValue(key, (void *)value)); #else @@ -389,12 +1028,12 @@ static void mdbx_thread_rthc_set(mdbx_thread_key_t key, const void *value) { &thread_registration_state, (void *)&mdbx_version /* dso_anchor */)) { mdbx_ensure(nullptr, pthread_setspecific( - mdbx_rthc_key, &thread_registration_state) == 0); + rthc_key, &thread_registration_state) == 0); thread_registration_state = MDBX_THREAD_RTHC_COUNTED; - const unsigned count_before = mdbx_atomic_add32(&mdbx_rthc_pending, 1); + const unsigned count_before = 
atomic_add32(&rthc_pending, 1); mdbx_ensure(nullptr, count_before < INT_MAX); mdbx_trace("fallback to pthreads' tsd, key 0x%x, count %u", - (unsigned)mdbx_rthc_key, count_before); + (unsigned)rthc_key, count_before); (void)count_before; } } @@ -407,24 +1046,43 @@ __cold void mdbx_rthc_global_init(void) { rthc_table = rthc_table_static; #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(&rthc_critical_section); + InitializeCriticalSection(&lcklist_critical_section); #else mdbx_ensure(nullptr, - pthread_key_create(&mdbx_rthc_key, mdbx_rthc_thread_dtor) == 0); + pthread_key_create(&rthc_key, mdbx_rthc_thread_dtor) == 0); mdbx_trace("pid %d, &mdbx_rthc_key = %p, value 0x%x", mdbx_getpid(), - &mdbx_rthc_key, (unsigned)mdbx_rthc_key); + __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); #endif + /* checking time conversion, this also avoids racing on 32-bit architectures + * during writing calculated 64-bit ratio(s) into memory. */ + uint32_t proba = UINT32_MAX; + while (true) { + unsigned time_conversion_checkup = + mdbx_osal_monotime_to_16dot16(mdbx_osal_16dot16_to_monotime(proba)); + unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; + unsigned one_less = (proba > 0) ? proba - 1 : proba; + mdbx_ensure(nullptr, time_conversion_checkup >= one_less && + time_conversion_checkup <= one_more); + if (proba == 0) + break; + proba >>= 1; + } + + bootid = mdbx_osal_bootid(); } /* dtor called for thread, i.e. 
for all mdbx's environment objects */ __cold void mdbx_rthc_thread_dtor(void *ptr) { - mdbx_rthc_lock(); + rthc_lock(); mdbx_trace(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", mdbx_getpid(), (uintptr_t)mdbx_thread_self(), ptr); - const mdbx_pid_t self_pid = mdbx_getpid(); + const uint32_t self_pid = mdbx_getpid(); for (unsigned i = 0; i < rthc_count; ++i) { - const mdbx_thread_key_t key = rthc_table[i].key; - MDBX_reader *const rthc = mdbx_thread_rthc_get(key); + if (!rthc_table[i].key_valid) + continue; + const mdbx_thread_key_t key = rthc_table[i].thr_tls_key; + MDBX_reader *const rthc = thread_rthc_get(key); if (rthc < rthc_table[i].begin || rthc >= rthc_table[i].end) continue; #if !defined(_WIN32) && !defined(_WIN64) @@ -439,12 +1097,14 @@ __cold void mdbx_rthc_thread_dtor(void *ptr) { mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, [%i], %p ... %p (%+i), rtch-pid %i, " "current-pid %i", - (uintptr_t)mdbx_thread_self(), rthc, i, rthc_table[i].begin, - rthc_table[i].end, (int)(rthc - rthc_table[i].begin), - rthc->mr_pid, self_pid); + (uintptr_t)mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), + i, __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), + (int)(rthc - rthc_table[i].begin), rthc->mr_pid, self_pid); if (rthc->mr_pid == self_pid) { mdbx_trace("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", - (uintptr_t)mdbx_thread_self(), rthc); + (uintptr_t)mdbx_thread_self(), + __Wpedantic_format_voidptr(rthc)); rthc->mr_pid = 0; } } @@ -452,7 +1112,7 @@ __cold void mdbx_rthc_thread_dtor(void *ptr) { #if defined(_WIN32) || defined(_WIN64) mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", (uintptr_t)mdbx_thread_self(), ptr); - mdbx_rthc_unlock(); + rthc_unlock(); #else const char self_registration = *(char *)ptr; *(char *)ptr = MDBX_THREAD_RTHC_ZERO; @@ -460,12 +1120,12 @@ __cold void mdbx_rthc_thread_dtor(void *ptr) { (uintptr_t)mdbx_thread_self(), ptr, mdbx_getpid(), self_registration); if (self_registration == 
MDBX_THREAD_RTHC_COUNTED) - mdbx_ensure(nullptr, mdbx_atomic_sub32(&mdbx_rthc_pending, 1) > 0); + mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0); - if (mdbx_rthc_pending == 0) { + if (rthc_pending == 0) { mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", (uintptr_t)mdbx_thread_self(), ptr, mdbx_getpid()); - mdbx_ensure(nullptr, pthread_cond_broadcast(&mdbx_rthc_cond) == 0); + mdbx_ensure(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); } mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", (uintptr_t)mdbx_thread_self(), @@ -474,28 +1134,24 @@ __cold void mdbx_rthc_thread_dtor(void *ptr) { * instead of a call for pthread_mutex_unlock() and therefore CPU could not * return to current DSO's code section, which may be unloaded immediately * after the mutex got released. */ - pthread_mutex_unlock(&mdbx_rthc_mutex); + pthread_mutex_unlock(&rthc_mutex); #endif } __cold void mdbx_rthc_global_dtor(void) { - mdbx_trace( - ">> pid %d, &mdbx_rthc_global_dtor %p, &mdbx_rthc_thread_dtor = %p, " - "&mdbx_rthc_remove = %p", - mdbx_getpid(), &mdbx_rthc_global_dtor, &mdbx_rthc_thread_dtor, - &mdbx_rthc_remove); + mdbx_trace(">> pid %d", mdbx_getpid()); - mdbx_rthc_lock(); + rthc_lock(); #if !defined(_WIN32) && !defined(_WIN64) - char *rthc = (char *)pthread_getspecific(mdbx_rthc_key); + char *rthc = (char *)pthread_getspecific(rthc_key); mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %d", - (uintptr_t)mdbx_thread_self(), rthc, mdbx_getpid(), - rthc ? *rthc : -1); + (uintptr_t)mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), + mdbx_getpid(), rthc ? 
*rthc : -1); if (rthc) { const char self_registration = *(char *)rthc; *rthc = MDBX_THREAD_RTHC_ZERO; if (self_registration == MDBX_THREAD_RTHC_COUNTED) - mdbx_ensure(nullptr, mdbx_atomic_sub32(&mdbx_rthc_pending, 1) > 0); + mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0); } struct timespec abstime; @@ -509,29 +1165,33 @@ __cold void mdbx_rthc_global_dtor(void) { abstime.tv_sec += 600; #endif - for (unsigned left; (left = mdbx_rthc_pending) > 0;) { + for (unsigned left; (left = rthc_pending) > 0;) { mdbx_trace("pid %d, pending %u, wait for...", mdbx_getpid(), left); - const int rc = - pthread_cond_timedwait(&mdbx_rthc_cond, &mdbx_rthc_mutex, &abstime); + const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); if (rc && rc != EINTR) break; } - mdbx_thread_key_delete(mdbx_rthc_key); + thread_key_delete(rthc_key); #endif - const mdbx_pid_t self_pid = mdbx_getpid(); + const uint32_t self_pid = mdbx_getpid(); for (unsigned i = 0; i < rthc_count; ++i) { - const mdbx_thread_key_t key = rthc_table[i].key; - mdbx_thread_key_delete(key); + if (!rthc_table[i].key_valid) + continue; + const mdbx_thread_key_t key = rthc_table[i].thr_tls_key; + thread_key_delete(key); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { mdbx_trace("== [%i] = key %zu, %p ... 
%p, rthc %p (%+i), " "rthc-pid %i, current-pid %i", - i, (size_t)key, rthc_table[i].begin, rthc_table[i].end, rthc, + i, (size_t)key, + __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), + __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), rthc->mr_pid, self_pid); if (rthc->mr_pid == self_pid) { rthc->mr_pid = 0; - mdbx_trace("== cleanup %p", rthc); + mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } } @@ -540,14 +1200,15 @@ __cold void mdbx_rthc_global_dtor(void) { if (rthc_table != rthc_table_static) mdbx_free(rthc_table); rthc_table = nullptr; - mdbx_rthc_unlock(); + rthc_unlock(); #if defined(_WIN32) || defined(_WIN64) + DeleteCriticalSection(&lcklist_critical_section); DeleteCriticalSection(&rthc_critical_section); #else /* LY: yielding a few timeslices to give a more chance * to racing destructor(s) for completion. */ - mdbx_workaround_glibc_bug21031(); + workaround_glibc_bug21031(); #endif mdbx_trace("<< pid %d\n", mdbx_getpid()); @@ -555,15 +1216,19 @@ __cold void mdbx_rthc_global_dtor(void) { __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, MDBX_reader *end) { + int rc; + if (key) { #ifndef NDEBUG - *key = (mdbx_thread_key_t)0xBADBADBAD; + *key = (mdbx_thread_key_t)0xBADBADBAD; #endif /* NDEBUG */ - int rc = mdbx_thread_key_create(key); - if (rc != MDBX_SUCCESS) - return rc; + rc = thread_key_create(key); + if (rc != MDBX_SUCCESS) + return rc; + } - mdbx_rthc_lock(); - mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (size_t)*key, + rthc_lock(); + const mdbx_thread_key_t new_key = key ? *key : 0; + mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (size_t)new_key, rthc_count, rthc_limit); if (rthc_count == rthc_limit) { rthc_entry_t *new_table = @@ -578,40 +1243,44 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, rthc_table = new_table; rthc_limit *= 2; } - mdbx_trace("== [%i] = key %zu, %p ... 
%p", rthc_count, (size_t)*key, begin, - end); - rthc_table[rthc_count].key = *key; + mdbx_trace("== [%i] = key %zu, %p ... %p", rthc_count, (size_t)new_key, + __Wpedantic_format_voidptr(begin), + __Wpedantic_format_voidptr(end)); + rthc_table[rthc_count].key_valid = key ? true : false; + rthc_table[rthc_count].thr_tls_key = key ? new_key : 0; rthc_table[rthc_count].begin = begin; rthc_table[rthc_count].end = end; ++rthc_count; - mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)*key, + mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)new_key, rthc_count, rthc_limit); - mdbx_rthc_unlock(); + rthc_unlock(); return MDBX_SUCCESS; bailout: - mdbx_thread_key_delete(*key); - mdbx_rthc_unlock(); + if (key) + thread_key_delete(*key); + rthc_unlock(); return rc; } __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { - mdbx_thread_key_delete(key); - mdbx_rthc_lock(); + thread_key_delete(key); + rthc_lock(); mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (size_t)key, rthc_count, rthc_limit); for (unsigned i = 0; i < rthc_count; ++i) { - if (key == rthc_table[i].key) { - const mdbx_pid_t self_pid = mdbx_getpid(); - mdbx_trace("== [%i], %p ...%p, current-pid %d", i, rthc_table[i].begin, - rthc_table[i].end, self_pid); + if (rthc_table[i].key_valid && key == rthc_table[i].thr_tls_key) { + const uint32_t self_pid = mdbx_getpid(); + mdbx_trace("== [%i], %p ...%p, current-pid %d", i, + __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { if (rthc->mr_pid == self_pid) { rthc->mr_pid = 0; - mdbx_trace("== cleanup %p", rthc); + mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } if (--rthc_count > 0) @@ -627,17 +1296,320 @@ __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, rthc_count, rthc_limit); - 
mdbx_rthc_unlock(); + rthc_unlock(); +} + +//------------------------------------------------------------------------------ + +#define RTHC_ENVLIST_END ((MDBX_env *)((size_t)50459)) +static MDBX_env *inprocess_lcklist_head = RTHC_ENVLIST_END; + +static __inline void lcklist_lock(void) { +#if defined(_WIN32) || defined(_WIN64) + EnterCriticalSection(&lcklist_critical_section); +#else + mdbx_ensure(nullptr, pthread_mutex_lock(&lcklist_mutex) == 0); +#endif +} + +static __inline void lcklist_unlock(void) { +#if defined(_WIN32) || defined(_WIN64) + LeaveCriticalSection(&lcklist_critical_section); +#else + mdbx_ensure(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); +#endif +} + +static uint64_t rrxmrrxmsx_0(uint64_t v) { + /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */ + v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50); + v *= UINT64_C(0xA24BAED4963EE407); + v ^= (v << 40 | v >> 24) ^ (v << 15 | v >> 49); + v *= UINT64_C(0x9FB21C651E98DF25); + return v ^ v >> 28; +} + +static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { + int rc; + uint64_t bait; + if (pending->address) { + bait = pending->lck->mti_bait_uniqueness; + rc = MDBX_SUCCESS; + } else { + bait = 0 /* hush MSVC warning */; + rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), true); + if (rc == MDBX_SUCCESS) + rc = + mdbx_pread(pending->fd, &bait, sizeof(scan->lck->mti_bait_uniqueness), + offsetof(MDBX_lockinfo, mti_bait_uniqueness)); + } + if (likely(rc == MDBX_SUCCESS) && bait == scan->lck->mti_bait_uniqueness) + rc = MDBX_RESULT_TRUE; + + mdbx_trace("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", + pending->lck ? "mem" : "file", bait, + (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); + return rc; +} + +static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan, + uint64_t *abra) { + if (*abra == 0) { + const size_t tid = mdbx_thread_self(); + size_t uit = 0; + memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? 
sizeof(tid) : sizeof(uit)); + *abra = + rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit); + } + const uint64_t cadabra = + rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)mdbx_getpid()) + << 24 | + *abra >> 40; + scan->lck->mti_bait_uniqueness = cadabra; + mdbx_flush_incoherent_cpu_writeback(); + *abra = *abra * UINT64_C(6364136223846793005) + 1; + return uniq_peek(pending, scan); +} + +__cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { + *found = nullptr; + uint64_t salt = 0; + for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; + scan = scan->me_lcklist_next) { + int err = scan->me_lck_mmap.lck->mti_bait_uniqueness + ? uniq_peek(pending, &scan->me_lck_mmap) + : uniq_poke(pending, &scan->me_lck_mmap, &salt); + if (err == MDBX_ENODATA) { + uint64_t length; + if (likely(mdbx_filesize(pending->fd, &length) == MDBX_SUCCESS && + length == 0)) { + /* LY: skip checking since LCK-file is empty, i.e. just created. */ + mdbx_debug("uniq-probe: %s", "unique (new/empty lck)"); + return MDBX_RESULT_TRUE; + } + } + if (err == MDBX_RESULT_TRUE) + err = uniq_poke(pending, &scan->me_lck_mmap, &salt); + if (err == MDBX_RESULT_TRUE) { + (void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), false); + err = uniq_poke(pending, &scan->me_lck_mmap, &salt); + } + if (err == MDBX_RESULT_TRUE) { + err = uniq_poke(pending, &scan->me_lck_mmap, &salt); + *found = scan; + mdbx_debug("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); + return MDBX_RESULT_FALSE; + } + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_debug("uniq-probe: failed rc %d", err); + return err; + } + } + + mdbx_debug("uniq-probe: %s", "unique"); + return MDBX_RESULT_TRUE; } +static int lcklist_detach_locked(MDBX_env *env) { + MDBX_env *inprocess_neighbor = nullptr; + int rc = MDBX_SUCCESS; + if (env->me_lcklist_next != nullptr) { + mdbx_ensure(env, env->me_lcklist_next != nullptr); + mdbx_ensure(env, inprocess_lcklist_head != 
RTHC_ENVLIST_END); + for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END; + ptr = &(*ptr)->me_lcklist_next) { + if (*ptr == env) { + *ptr = env->me_lcklist_next; + env->me_lcklist_next = nullptr; + break; + } + } + mdbx_ensure(env, env->me_lcklist_next == nullptr); + } + + rc = likely(mdbx_getpid() == env->me_pid) + ? uniq_check(&env->me_lck_mmap, &inprocess_neighbor) + : MDBX_PANIC; + if (!inprocess_neighbor && env->me_live_reader) + (void)mdbx_rpid_clear(env); + if (!MDBX_IS_ERROR(rc)) + rc = mdbx_lck_destroy(env, inprocess_neighbor); + return rc; +} + +/*------------------------------------------------------------------------------ + * LY: State of the art quicksort-based sorting, with internal stack and + * shell-insertion-sort for small chunks (less than half of SORT_THRESHOLD). + */ + +/* LY: Large threshold give some boost due less overhead in the inner qsort + * loops, but also a penalty in cases reverse-sorted data. + * So, 42 is magically but reasonable: + * - 0-3% faster than std::sort (from GNU C++ STL 2018) in most cases. + * - slower by a few ticks in a few cases for sequences shorter than 21. 
*/ +#define SORT_THRESHOLD 42 + +#define SORT_SWAP(TYPE, a, b) \ + do { \ + const TYPE swap_tmp = (a); \ + (a) = (b); \ + (b) = swap_tmp; \ + } while (0) + +#define SORT_SHELLPASS(TYPE, CMP, begin, end, gap) \ + for (TYPE *i = begin + gap; i < end; ++i) { \ + for (TYPE *j = i - (gap); j >= begin && CMP(*i, *j); j -= gap) { \ + const TYPE tmp = *i; \ + do { \ + j[gap] = *j; \ + j -= gap; \ + } while (j >= begin && CMP(tmp, *j)); \ + j[gap] = tmp; \ + break; \ + } \ + } + +#define SORT_PUSH(low, high) \ + do { \ + top->lo = (low); \ + top->hi = (high); \ + ++top; \ + } while (0) + +#define SORT_POP(low, high) \ + do { \ + --top; \ + low = top->lo; \ + high = top->hi; \ + } while (0) + +#define SORT_IMPL(NAME, TYPE, CMP) \ + \ + typedef struct { \ + TYPE *lo, *hi; \ + } NAME##_stack; \ + \ + static __hot void NAME(TYPE *const begin, TYPE *const end) { \ + const ptrdiff_t length = end - begin; \ + if (length < 2) \ + return; \ + \ + if (length > SORT_THRESHOLD / 2) { \ + NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *top = stack; \ + \ + TYPE *hi = end - 1; \ + TYPE *lo = begin; \ + while (true) { \ + TYPE *mid = lo + ((hi - lo) >> 1); \ + if (CMP(*mid, *lo)) \ + SORT_SWAP(TYPE, *mid, *lo); \ + if (CMP(*hi, *mid)) { \ + SORT_SWAP(TYPE, *hi, *mid); \ + if (CMP(*mid, *lo)) \ + SORT_SWAP(TYPE, *mid, *lo); \ + } \ + \ + TYPE *right = hi - 1; \ + TYPE *left = lo + 1; \ + do { \ + while (CMP(*mid, *right)) \ + --right; \ + while (CMP(*left, *mid)) \ + ++left; \ + if (left < right) { \ + SORT_SWAP(TYPE, *left, *right); \ + if (mid == left) \ + mid = right; \ + else if (mid == right) \ + mid = left; \ + ++left; \ + --right; \ + } else if (left == right) { \ + ++left; \ + --right; \ + break; \ + } \ + } while (left <= right); \ + \ + if (lo + SORT_THRESHOLD > right) { \ + if (left + SORT_THRESHOLD > hi) { \ + if (top == stack) \ + break; \ + else \ + SORT_POP(lo, hi); \ + } else \ + lo = left; \ + } else if (left + SORT_THRESHOLD > hi) \ + hi = right; \ + else if (right - lo 
> hi - left) { \ + SORT_PUSH(lo, right); \ + lo = left; \ + } else { \ + SORT_PUSH(left, hi); \ + hi = right; \ + } \ + } \ + } \ + \ + SORT_SHELLPASS(TYPE, CMP, begin, end, 8); \ + SORT_SHELLPASS(TYPE, CMP, begin, end, 1); \ + for (TYPE *scan = begin + 1; scan < end; ++scan) \ + assert(CMP(scan[-1], scan[0])); \ + } + +/*------------------------------------------------------------------------------ + * LY: Binary search */ + +#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ + static __always_inline TYPE_LIST *NAME(TYPE_LIST *first, unsigned length, \ + const TYPE_ARG item) { \ + TYPE_LIST *const begin = first, *const end = begin + length; \ + \ + while (length > 3) { \ + const unsigned half = length >> 1; \ + TYPE_LIST *const middle = first + half; \ + if (CMP(*middle, item)) { \ + first = middle + 1; \ + length -= half + 1; \ + } else \ + length = half; \ + } \ + \ + switch (length) { \ + case 3: \ + if (!CMP(*first, item)) \ + break; \ + ++first; \ + /* fall through */ \ + __fallthrough; \ + case 2: \ + if (!CMP(*first, item)) \ + break; \ + ++first; \ + /* fall through */ \ + __fallthrough; \ + case 1: \ + if (CMP(*first, item)) \ + ++first; \ + } \ + \ + for (TYPE_LIST *scan = begin; scan < first; ++scan) \ + assert(CMP(*scan, item)); \ + for (TYPE_LIST *scan = first; scan < end; ++scan) \ + assert(!CMP(*scan, item)); \ + (void)begin, (void)end; \ + \ + return first; \ + } + /*----------------------------------------------------------------------------*/ static __inline size_t pnl2bytes(const size_t size) { assert(size > 0 && size <= MDBX_PNL_MAX * 2); - size_t bytes = - mdbx_roundup2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2), - MDBX_PNL_GRANULATE * sizeof(pgno_t)) - - MDBX_ASSUME_MALLOC_OVERHEAD; + size_t bytes = roundup_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + + sizeof(pgno_t) * (size + 2), + MDBX_PNL_GRANULATE * sizeof(pgno_t)) - + MDBX_ASSUME_MALLOC_OVERHEAD; return bytes; } @@ -648,12 +1620,12 @@ static __inline pgno_t bytes2pnl(const 
size_t bytes) { } static MDBX_PNL mdbx_pnl_alloc(size_t size) { - const size_t bytes = pnl2bytes(size); + size_t bytes = pnl2bytes(size); MDBX_PNL pl = mdbx_malloc(bytes); if (likely(pl)) { -#if __GLIBC_PREREQ(2, 12) - const size_t bytes = malloc_usable_size(pl); -#endif +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(pl); +#endif /* malloc_usable_size */ pl[0] = bytes2pnl(bytes); assert(pl[0] >= size); pl[1] = 0; @@ -674,13 +1646,13 @@ static void mdbx_pnl_shrink(MDBX_PNL *ppl) { MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); MDBX_PNL_SIZE(*ppl) = 0; if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > - MDBX_PNL_INITIAL + MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { - const size_t bytes = pnl2bytes(MDBX_PNL_INITIAL); + MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { + size_t bytes = pnl2bytes(MDBX_PNL_INITIAL); MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); if (likely(pl)) { -#if __GLIBC_PREREQ(2, 12) - const size_t bytes = malloc_usable_size(pl); -#endif +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(pl); +#endif /* malloc_usable_size */ *pl = bytes2pnl(bytes); *ppl = pl + 1; } @@ -701,12 +1673,12 @@ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { const size_t size = (wanna + wanna - allocated < MDBX_PNL_MAX) ? 
wanna + wanna - allocated : MDBX_PNL_MAX; - const size_t bytes = pnl2bytes(size); + size_t bytes = pnl2bytes(size); MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); if (likely(pl)) { -#if __GLIBC_PREREQ(2, 12) - const size_t bytes = malloc_usable_size(pl); -#endif +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(pl); +#endif /* malloc_usable_size */ *pl = bytes2pnl(bytes); assert(*pl >= wanna); *ppl = pl + 1; @@ -727,211 +1699,184 @@ static __inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl, : mdbx_pnl_reserve(ppl, wanna); } -static __inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t id) { +static __inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) { assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); + if (mdbx_audit_enabled()) { + for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i) + assert(pgno != pl[i]); + } MDBX_PNL_SIZE(pl) += 1; - MDBX_PNL_LAST(pl) = id; + MDBX_PNL_LAST(pl) = pgno; } -/* Append an ID onto an PNL */ -static int __must_check_result mdbx_pnl_append(MDBX_PNL *ppl, pgno_t id) { +/* Append an pgno onto an unsorted PNL */ +static __hot int __must_check_result mdbx_pnl_append(MDBX_PNL *ppl, + pgno_t pgno) { /* Too big? 
*/ if (unlikely(MDBX_PNL_SIZE(*ppl) == MDBX_PNL_ALLOCLEN(*ppl))) { int rc = mdbx_pnl_need(ppl, MDBX_PNL_GRANULATE); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - mdbx_pnl_xappend(*ppl, id); + mdbx_pnl_xappend(*ppl, pgno); return MDBX_SUCCESS; } -/* Append an PNL onto an PNL */ +/* Append an PNL onto an unsorted PNL */ static int __must_check_result mdbx_pnl_append_list(MDBX_PNL *ppl, MDBX_PNL append) { - int rc = mdbx_pnl_need(ppl, MDBX_PNL_SIZE(append)); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + const unsigned len = MDBX_PNL_SIZE(append); + if (likely(len)) { + int rc = mdbx_pnl_need(ppl, MDBX_PNL_SIZE(append)); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - memcpy(MDBX_PNL_END(*ppl), MDBX_PNL_BEGIN(append), - MDBX_PNL_SIZE(append) * sizeof(pgno_t)); - MDBX_PNL_SIZE(*ppl) += MDBX_PNL_SIZE(append); + const MDBX_PNL pnl = *ppl; + unsigned w = MDBX_PNL_SIZE(pnl), r = 1; + do + pnl[++w] = append[r]; + while (++r <= len); + MDBX_PNL_SIZE(pnl) = w; + } return MDBX_SUCCESS; } -/* Append an ID range onto an PNL */ -static int __must_check_result mdbx_pnl_append_range(MDBX_PNL *ppl, pgno_t id, - size_t n) { +/* Append an pgno range onto an unsorted PNL */ +static __hot int __must_check_result mdbx_pnl_append_range(MDBX_PNL *ppl, + pgno_t pgno, + unsigned n) { + assert(n > 0); int rc = mdbx_pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) return rc; - pgno_t *ap = MDBX_PNL_END(*ppl); - MDBX_PNL_SIZE(*ppl) += (unsigned)n; - for (pgno_t *const end = MDBX_PNL_END(*ppl); ap < end;) - *ap++ = id++; + const MDBX_PNL pnl = *ppl; +#if MDBX_PNL_ASCENDING + unsigned w = MDBX_PNL_SIZE(pnl); + do + pnl[++w] = pgno++; + while (--n); + MDBX_PNL_SIZE(pnl) = w; +#else + unsigned w = MDBX_PNL_SIZE(pnl) + n; + MDBX_PNL_SIZE(pnl) = w; + do + pnl[w--] = --n + pgno; + while (n); +#endif + return MDBX_SUCCESS; } -static bool mdbx_pnl_check(MDBX_PNL pl, bool allocated) { - if (pl) { - assert(MDBX_PNL_SIZE(pl) <= MDBX_PNL_MAX); - if (allocated) { - assert(MDBX_PNL_ALLOCLEN(pl) 
>= MDBX_PNL_SIZE(pl)); - } - for (const pgno_t *scan = &MDBX_PNL_LAST(pl); --scan > pl;) { - assert(MDBX_PNL_ORDERED(scan[0], scan[1])); - assert(scan[0] >= NUM_METAS); - if (unlikely(MDBX_PNL_DISORDERED(scan[0], scan[1]) || - scan[0] < NUM_METAS)) - return false; - } - } - return true; -} +/* Append an pgno range into the sorted PNL */ +static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, + pgno_t pgno, + unsigned n) { + assert(n > 0); + int rc = mdbx_pnl_need(ppl, n); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -/* Merge an PNL onto an PNL. The destination PNL must be big enough */ -static void __hot mdbx_pnl_xmerge(MDBX_PNL pnl, MDBX_PNL merge) { - assert(mdbx_pnl_check(pnl, true)); - assert(mdbx_pnl_check(merge, false)); - pgno_t old_id, merge_id, i = MDBX_PNL_SIZE(merge), j = MDBX_PNL_SIZE(pnl), - k = i + j, total = k; - pnl[0] = - MDBX_PNL_ASCENDING ? 0 : ~(pgno_t)0; /* delimiter for pl scan below */ - old_id = pnl[j]; - while (i) { - merge_id = merge[i--]; - for (; MDBX_PNL_ORDERED(merge_id, old_id); old_id = pnl[--j]) - pnl[k--] = old_id; - pnl[k--] = merge_id; - } - MDBX_PNL_SIZE(pnl) = total; - assert(mdbx_pnl_check(pnl, true)); -} - -/* Sort an PNL */ -static void __hot mdbx_pnl_sort(MDBX_PNL pnl) { - /* Max possible depth of int-indexed tree * 2 items/level */ - int istack[sizeof(int) * CHAR_BIT * 2]; - int i, j, k, l, ir, jstack; - pgno_t a; - -/* Quicksort + Insertion sort for small arrays */ -#define PNL_SMALL 8 -#define PNL_SWAP(a, b) \ - do { \ - pgno_t tmp_pgno = (a); \ - (a) = (b); \ - (b) = tmp_pgno; \ - } while (0) + const MDBX_PNL pnl = *ppl; + unsigned r = MDBX_PNL_SIZE(pnl), w = r + n; + MDBX_PNL_SIZE(pnl) = w; + while (r && MDBX_PNL_DISORDERED(pnl[r], pgno)) + pnl[w--] = pnl[r--]; - ir = (int)MDBX_PNL_SIZE(pnl); - l = 1; - jstack = 0; - while (1) { - if (ir - l < PNL_SMALL) { /* Insertion sort */ - for (j = l + 1; j <= ir; j++) { - a = pnl[j]; - for (i = j - 1; i >= 1; i--) { - if (MDBX_PNL_DISORDERED(a, pnl[i])) - 
break; - pnl[i + 1] = pnl[i]; - } - pnl[i + 1] = a; - } - if (jstack == 0) - break; - ir = istack[jstack--]; - l = istack[jstack--]; - } else { - k = (l + ir) >> 1; /* Choose median of left, center, right */ - PNL_SWAP(pnl[k], pnl[l + 1]); - if (MDBX_PNL_ORDERED(pnl[ir], pnl[l])) - PNL_SWAP(pnl[l], pnl[ir]); + for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + n : pgno; w > r; --w) + pnl[w] = MDBX_PNL_ASCENDING ? --fill : fill++; - if (MDBX_PNL_ORDERED(pnl[ir], pnl[l + 1])) - PNL_SWAP(pnl[l + 1], pnl[ir]); - - if (MDBX_PNL_ORDERED(pnl[l + 1], pnl[l])) - PNL_SWAP(pnl[l], pnl[l + 1]); + return MDBX_SUCCESS; +} - i = l + 1; - j = ir; - a = pnl[l + 1]; - while (1) { - do - i++; - while (MDBX_PNL_ORDERED(pnl[i], a)); - do - j--; - while (MDBX_PNL_ORDERED(a, pnl[j])); - if (j < i) - break; - PNL_SWAP(pnl[i], pnl[j]); - } - pnl[l + 1] = pnl[j]; - pnl[j] = a; - jstack += 2; - if (ir - i + 1 >= j - l) { - istack[jstack] = ir; - istack[jstack - 1] = i; - ir = j - 1; - } else { - istack[jstack] = j - 1; - istack[jstack - 1] = l; - l = i; +static bool __hot mdbx_pnl_check(const MDBX_PNL pl, const pgno_t limit) { + assert(limit >= MIN_PAGENO && limit <= MAX_PAGENO + 1); + if (likely(MDBX_PNL_SIZE(pl))) { + assert(MDBX_PNL_LEAST(pl) >= MIN_PAGENO); + assert(MDBX_PNL_MOST(pl) < limit); + assert(MDBX_PNL_SIZE(pl) <= MDBX_PNL_MAX); + if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PNL_MAX * 3 / 2)) + return false; + if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO)) + return false; + if (unlikely(MDBX_PNL_MOST(pl) >= limit)) + return false; + if (mdbx_audit_enabled()) { + for (const pgno_t *scan = &MDBX_PNL_LAST(pl); --scan > pl;) { + assert(MDBX_PNL_ORDERED(scan[0], scan[1])); + if (unlikely(!MDBX_PNL_ORDERED(scan[0], scan[1]))) + return false; } } } -#undef PNL_SMALL -#undef PNL_SWAP - assert(mdbx_pnl_check(pnl, false)); + return true; } -/* Search for an ID in an PNL. - * [in] pl The PNL to search. - * [in] id The ID to search for. 
- * Returns The index of the first ID greater than or equal to id. */ -static unsigned __hot mdbx_pnl_search(MDBX_PNL pnl, pgno_t id) { - assert(mdbx_pnl_check(pnl, true)); - - /* binary search of id in pl - * if found, returns position of id - * if not found, returns first position greater than id */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = MDBX_PNL_SIZE(pnl); - - while (n > 0) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = MDBX_PNL_ASCENDING ? mdbx_cmp2int(id, pnl[cursor]) - : mdbx_cmp2int(pnl[cursor], id); - - if (val < 0) { - n = pivot; - } else if (val > 0) { - base = cursor; - n -= pivot + 1; - } else { - return cursor; - } - } - - if (val > 0) - ++cursor; +static __inline bool mdbx_pnl_check4assert(const MDBX_PNL pl, + const pgno_t limit) { + if (unlikely(pl == nullptr)) + return true; + assert(MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl)); + if (unlikely(MDBX_PNL_ALLOCLEN(pl) < MDBX_PNL_SIZE(pl))) + return false; + return mdbx_pnl_check(pl, limit); +} - return cursor; +/* Merge an PNL onto an PNL. The destination PNL must be big enough */ +static void __hot mdbx_pnl_xmerge(MDBX_PNL dst, const MDBX_PNL src) { + assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); + assert(mdbx_pnl_check(src, MAX_PAGENO + 1)); + const size_t total = MDBX_PNL_SIZE(dst) + MDBX_PNL_SIZE(src); + assert(MDBX_PNL_ALLOCLEN(dst) >= total); + pgno_t *w = dst + total; + pgno_t *d = dst + MDBX_PNL_SIZE(dst); + const pgno_t *s = src + MDBX_PNL_SIZE(src); + dst[0] = /* detent for scan below */ (MDBX_PNL_ASCENDING ? 
0 : ~(pgno_t)0); + while (s > src) { + while (MDBX_PNL_ORDERED(*s, *d)) + *w-- = *d--; + *w-- = *s--; + } + MDBX_PNL_SIZE(dst) = (pgno_t)total; + assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); +} + +SORT_IMPL(pgno_sort, pgno_t, MDBX_PNL_ORDERED) +static __hot void mdbx_pnl_sort(MDBX_PNL pnl) { + pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); + assert(mdbx_pnl_check(pnl, MAX_PAGENO + 1)); +} + +/* Search for an pgno in an PNL. + * Returns The index of the first item greater than or equal to pgno. */ +SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) + +static __hot unsigned mdbx_pnl_search(MDBX_PNL pnl, pgno_t id) { + assert(mdbx_pnl_check4assert(pnl, MAX_PAGENO + 1)); + pgno_t *begin = MDBX_PNL_BEGIN(pnl); + pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), id); + pgno_t *end = begin + MDBX_PNL_SIZE(pnl); + assert(it >= begin && it <= end); + if (it != begin) + assert(MDBX_PNL_ORDERED(it[-1], id)); + if (it != end) + assert(!MDBX_PNL_ORDERED(it[0], id)); + return (unsigned)(it - begin + 1); +} + +static __hot unsigned mdbx_pnl_exist(MDBX_PNL pnl, pgno_t id) { + unsigned n = mdbx_pnl_search(pnl, id); + return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == id) ? 
n : 0; } /*----------------------------------------------------------------------------*/ static __inline size_t txl2bytes(const size_t size) { assert(size > 0 && size <= MDBX_TXL_MAX * 2); - size_t bytes = - mdbx_roundup2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2), - MDBX_TXL_GRANULATE * sizeof(txnid_t)) - - MDBX_ASSUME_MALLOC_OVERHEAD; + size_t bytes = roundup_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + + sizeof(txnid_t) * (size + 2), + MDBX_TXL_GRANULATE * sizeof(txnid_t)) - + MDBX_ASSUME_MALLOC_OVERHEAD; return bytes; } @@ -942,12 +1887,12 @@ static __inline size_t bytes2txl(const size_t bytes) { } static MDBX_TXL mdbx_txl_alloc(void) { - const size_t bytes = txl2bytes(MDBX_TXL_INITIAL); + size_t bytes = txl2bytes(MDBX_TXL_INITIAL); MDBX_TXL tl = mdbx_malloc(bytes); if (likely(tl)) { -#if __GLIBC_PREREQ(2, 12) - const size_t bytes = malloc_usable_size(tl); -#endif +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(tl); +#endif /* malloc_usable_size */ tl[0] = bytes2txl(bytes); assert(tl[0] >= MDBX_TXL_INITIAL); tl[1] = 0; @@ -974,12 +1919,12 @@ static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX) ? 
wanna + wanna - allocated : MDBX_TXL_MAX; - const size_t bytes = txl2bytes(size); + size_t bytes = txl2bytes(size); MDBX_TXL tl = mdbx_realloc(*ptl - 1, bytes); if (likely(tl)) { -#if __GLIBC_PREREQ(2, 12) - const size_t bytes = malloc_usable_size(tl); -#endif +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(tl); +#endif /* malloc_usable_size */ *tl = bytes2txl(bytes); assert(*tl >= wanna); *ptl = tl + 1; @@ -1005,15 +1950,10 @@ static __inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) { MDBX_PNL_LAST(tl) = id; } -static int mdbx_txl_cmp(const void *pa, const void *pb) { - const txnid_t a = *(MDBX_TXL)pa; - const txnid_t b = *(MDBX_TXL)pb; - return mdbx_cmp2int(b, a); -} - -static void mdbx_txl_sort(MDBX_TXL ptr) { - /* LY: temporary */ - qsort(ptr + 1, (size_t)ptr[0], sizeof(*ptr), mdbx_txl_cmp); +#define TXNID_SORT_CMP(first, last) ((first) > (last)) +SORT_IMPL(txnid_sort, txnid_t, TXNID_SORT_CMP) +static void mdbx_txl_sort(MDBX_TXL tl) { + txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl)); } static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { @@ -1026,133 +1966,149 @@ static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { return MDBX_SUCCESS; } -static int __must_check_result mdbx_txl_append_list(MDBX_TXL *ptl, - MDBX_TXL append) { - int rc = mdbx_txl_need(ptl, (size_t)MDBX_PNL_SIZE(append)); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +/*----------------------------------------------------------------------------*/ - memcpy(MDBX_PNL_END(*ptl), MDBX_PNL_BEGIN(append), - (size_t)MDBX_PNL_SIZE(append) * sizeof(txnid_t)); - MDBX_PNL_SIZE(*ptl) += MDBX_PNL_SIZE(append); - return MDBX_SUCCESS; +#define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) +SORT_IMPL(dp_sort, MDBX_DP, DP_SORT_CMP) +static __inline MDBX_DPL mdbx_dpl_sort(MDBX_DPL dl) { + assert(dl->length <= MDBX_DPL_TXNFULL); + assert(dl->sorted <= dl->length); + if (dl->sorted != 
dl->length) { + dl->sorted = dl->length; + dp_sort(dl + 1, dl + dl->length + 1); + } + return dl; } -/*----------------------------------------------------------------------------*/ - /* Returns the index of the first dirty-page whose pgno * member is greater than or equal to id. */ -static unsigned __hot mdbx_dpl_search(MDBX_DPL dl, pgno_t id) { - /* binary search of id in array - * if found, returns position of id - * if not found, returns first position greater than id */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = dl->length; +#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) +SEARCH_IMPL(dp_bsearch, MDBX_DP, pgno_t, DP_SEARCH_CMP) -#if MDBX_DEBUG - for (const MDBX_DP *ptr = dl + dl->length; --ptr > dl;) { - assert(ptr[0].pgno < ptr[1].pgno); - assert(ptr[0].pgno >= NUM_METAS); - } -#endif +static unsigned __hot mdbx_dpl_search(MDBX_DPL dl, pgno_t pgno) { + if (dl->sorted < dl->length) { + /* unsorted tail case */ + if (mdbx_audit_enabled()) { + for (const MDBX_DP *ptr = dl + dl->sorted; --ptr > dl;) { + assert(ptr[0].pgno < ptr[1].pgno); + assert(ptr[0].pgno >= NUM_METAS); + } + } - while (n > 0) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = mdbx_cmp2int(id, dl[cursor].pgno); + /* try linear search until the threshold */ + if (dl->length - dl->sorted < SORT_THRESHOLD / 2) { + unsigned i = dl->length; + while (i - dl->sorted > 7) { + if (dl[i].pgno == pgno) + return i; + if (dl[i - 1].pgno == pgno) + return i - 1; + if (dl[i - 2].pgno == pgno) + return i - 2; + if (dl[i - 3].pgno == pgno) + return i - 3; + if (dl[i - 4].pgno == pgno) + return i - 4; + if (dl[i - 5].pgno == pgno) + return i - 5; + if (dl[i - 6].pgno == pgno) + return i - 6; + if (dl[i - 7].pgno == pgno) + return i - 7; + i -= 8; + } + while (i > dl->sorted) { + if (dl[i].pgno == pgno) + return i; + --i; + } - if (val < 0) { - n = pivot; - } else if (val > 0) { - base = cursor; - n -= pivot + 1; - } else { - return cursor; + MDBX_DPL it = 
dp_bsearch(dl + 1, i, pgno); + return (unsigned)(it - dl); } - } - if (val > 0) - ++cursor; + /* sort a whole */ + dl->sorted = dl->length; + dp_sort(dl + 1, dl + dl->length + 1); + } - return cursor; -} + if (mdbx_audit_enabled()) { + for (const MDBX_DP *ptr = dl + dl->length; --ptr > dl;) { + assert(ptr[0].pgno < ptr[1].pgno); + assert(ptr[0].pgno >= NUM_METAS); + } + } -static int mdbx_dpl_cmp(const void *pa, const void *pb) { - const MDBX_DP a = *(MDBX_DPL)pa; - const MDBX_DP b = *(MDBX_DPL)pb; - return mdbx_cmp2int(a.pgno, b.pgno); + MDBX_DPL it = dp_bsearch(dl + 1, dl->length, pgno); + return (unsigned)(it - dl); } -static void mdbx_dpl_sort(MDBX_DPL dl) { - assert(dl->length <= MDBX_DPL_TXNFULL); - /* LY: temporary */ - qsort(dl + 1, dl->length, sizeof(*dl), mdbx_dpl_cmp); +static __inline MDBX_page *mdbx_dpl_find(MDBX_DPL dl, pgno_t pgno) { + const unsigned i = mdbx_dpl_search(dl, pgno); + assert((int)i > 0); + return (i <= dl->length && dl[i].pgno == pgno) ? dl[i].ptr : nullptr; } -static int __must_check_result mdbx_dpl_insert(MDBX_DPL dl, pgno_t pgno, - MDBX_page *page) { - assert(dl->length <= MDBX_DPL_TXNFULL); - unsigned x = mdbx_dpl_search(dl, pgno); - assert((int)x > 0); - if (unlikely(dl[x].pgno == pgno && x <= dl->length)) - return /* duplicate */ MDBX_PROBLEM; - - if (unlikely(dl->length == MDBX_DPL_TXNFULL)) - return MDBX_TXN_FULL; - - /* insert page */ - for (unsigned i = dl->length += 1; i > x; --i) - dl[i] = dl[i - 1]; - - dl[x].pgno = pgno; - dl[x].ptr = page; - return MDBX_SUCCESS; +static __hot MDBX_page *mdbx_dpl_remove(MDBX_DPL dl, pgno_t prno) { + unsigned i = mdbx_dpl_search(dl, prno); + assert((int)i > 0); + MDBX_page *mp = nullptr; + if (i <= dl->length && dl[i].pgno == prno) { + dl->sorted -= dl->sorted >= i; + mp = dl[i].ptr; + while (i < dl->length) { + dl[i] = dl[i + 1]; + ++i; + } + dl->length -= 1; + } + return mp; } -static int __must_check_result mdbx_dpl_append(MDBX_DPL dl, pgno_t pgno, - MDBX_page *page) { +static __inline 
int __must_check_result mdbx_dpl_append(MDBX_DPL dl, + pgno_t pgno, + MDBX_page *page) { assert(dl->length <= MDBX_DPL_TXNFULL); -#if MDBX_DEBUG - for (unsigned i = dl->length; i > 0; --i) { - assert(dl[i].pgno != pgno); - if (unlikely(dl[i].pgno == pgno)) - return MDBX_PROBLEM; + if (mdbx_audit_enabled()) { + for (unsigned i = dl->length; i > 0; --i) { + assert(dl[i].pgno != pgno); + if (unlikely(dl[i].pgno == pgno)) + return MDBX_PROBLEM; + } } -#endif if (unlikely(dl->length == MDBX_DPL_TXNFULL)) return MDBX_TXN_FULL; /* append page */ - const unsigned i = dl->length += 1; - dl[i].pgno = pgno; - dl[i].ptr = page; + const unsigned n = dl->length + 1; + if (n == 1 || (dl->sorted >= dl->length && dl[n - 1].pgno < pgno)) + dl->sorted = n; + dl->length = n; + dl[n].pgno = pgno; + dl[n].ptr = page; return MDBX_SUCCESS; } -/*----------------------------------------------------------------------------*/ +static __inline void mdbx_dpl_clear(MDBX_DPL dl) { + dl->sorted = dl->length = 0; +} -int mdbx_runtime_flags = MDBX_DBG_PRINT -#if MDBX_DEBUG - | MDBX_DBG_ASSERT -#endif -#if MDBX_DEBUG > 1 - | MDBX_DBG_TRACE -#endif -#if MDBX_DEBUG > 2 - | MDBX_DBG_AUDIT -#endif -#if MDBX_DEBUG > 3 - | MDBX_DBG_EXTRA -#endif - ; +/*----------------------------------------------------------------------------*/ +#ifndef MDBX_ALLOY +uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; +uint8_t mdbx_loglevel = MDBX_DEBUG; MDBX_debug_func *mdbx_debug_logger; +#endif /* MDBX_ALLOY */ + +static bool mdbx_refund(MDBX_txn *txn); +static __must_check_result int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp); +static __must_check_result int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp); +static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, + MDBX_page **const mp, int flags); +static txnid_t mdbx_oomkick(MDBX_env *env, const txnid_t laggard); -static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, - int flags); static int mdbx_page_new(MDBX_cursor *mc, uint32_t 
flags, unsigned num, MDBX_page **mp); static int mdbx_page_touch(MDBX_cursor *mc); @@ -1193,6 +2149,8 @@ static int __must_check_result mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags); static int __must_check_result mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst); +static int __must_check_result mdbx_page_flush(MDBX_txn *txn, + const unsigned keep); #define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ static int __must_check_result mdbx_page_split(MDBX_cursor *mc, @@ -1201,10 +2159,11 @@ static int __must_check_result mdbx_page_split(MDBX_cursor *mc, pgno_t newpgno, unsigned nflags); static int __must_check_result mdbx_read_header(MDBX_env *env, MDBX_meta *meta, - uint64_t *filesize); + uint64_t *filesize, + const int lck_exclusive); static int __must_check_result mdbx_sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending); -static void mdbx_env_close0(MDBX_env *env); +static int mdbx_env_close0(MDBX_env *env); static MDBX_node *mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, int *exactp); @@ -1227,10 +2186,6 @@ static int __must_check_result mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft); static int __must_check_result mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, MDBX_val *data); -static size_t mdbx_leaf_size(MDBX_env *env, const MDBX_val *key, - const MDBX_val *data); -static size_t mdbx_branch_size(MDBX_env *env, const MDBX_val *key); - static int __must_check_result mdbx_rebalance(MDBX_cursor *mc); static int __must_check_result mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key); @@ -1238,6 +2193,17 @@ static int __must_check_result mdbx_update_key(MDBX_cursor *mc, static void mdbx_cursor_pop(MDBX_cursor *mc); static int __must_check_result mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp); +static int __must_check_result mdbx_audit_ex(MDBX_txn *txn, + unsigned retired_stored, + bool dont_filter_gc); +static __maybe_unused __inline int __must_check_result +mdbx_audit(MDBX_txn *txn) { + 
return mdbx_audit_ex(txn, 0, (txn->mt_flags & MDBX_RDONLY) != 0); +} + +static int __must_check_result mdbx_page_check(MDBX_env *env, + const MDBX_page *const mp, + bool maybe_unfinished); static int __must_check_result mdbx_cursor_check(MDBX_cursor *mc, bool pending); static int __must_check_result mdbx_cursor_del0(MDBX_cursor *mc); static int __must_check_result mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, @@ -1267,11 +2233,13 @@ static int __must_check_result mdbx_xcursor_init1(MDBX_cursor *mc, static int __must_check_result mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, int force); +static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); static int __must_check_result mdbx_drop0(MDBX_cursor *mc, int subs); +static int __must_check_result mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); -static MDBX_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, - mdbx_cmp_int_a2, mdbx_cmp_int_ua; +static MDBX_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_align4, + mdbx_cmp_int_align2, mdbx_cmp_int_unaligned; static const char *__mdbx_strerr(int errnum) { /* Table of descriptions for MDBX errors */ @@ -1280,15 +2248,15 @@ static const char *__mdbx_strerr(int errnum) { "MDBX_NOTFOUND: No matching key/data pair found", "MDBX_PAGE_NOTFOUND: Requested page not found", "MDBX_CORRUPTED: Database is corrupted", - "MDBX_PANIC: Update of meta page failed or environment had fatal error", + "MDBX_PANIC: Environment had fatal error", "MDBX_VERSION_MISMATCH: DB version mismatch libmdbx", "MDBX_INVALID: File is not an MDBX file", "MDBX_MAP_FULL: Environment mapsize limit reached", "MDBX_DBS_FULL: Too may DBI (maxdbs reached)", "MDBX_READERS_FULL: Too many readers (maxreaders reached)", NULL /* MDBX_TLS_FULL (-30789): unused in MDBX */, - "MDBX_TXN_FULL: Transaction has too many dirty pages - transaction too " - "big", + "MDBX_TXN_FULL: Transaction has too many dirty pages, " + "i.e transaction too big", "MDBX_CURSOR_FULL: Internal error - cursor stack limit 
reached", "MDBX_PAGE_FULL: Internal error - page has no more space", "MDBX_MAP_RESIZED: Database contents grew beyond environment mapsize", @@ -1299,7 +2267,8 @@ static const char *__mdbx_strerr(int errnum) { "DUPFIXED size", "MDBX_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", "MDBX_PROBLEM: Unexpected problem - txn should abort", - "MDBX_BUSY: Another write transaction is started", + "MDBX_BUSY: Another write transaction is running or " + "environment is already used while opening with MDBX_EXCLUSIVE flag", }; if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_LAST_ERRCODE) { @@ -1333,69 +2302,85 @@ static const char *__mdbx_strerr(int errnum) { const char *__cold mdbx_strerror_r(int errnum, char *buf, size_t buflen) { const char *msg = __mdbx_strerr(errnum); - if (!msg) { - if (!buflen || buflen > INT_MAX) - return NULL; + if (!msg && buflen > 0 && buflen < INT_MAX) { #if defined(_WIN32) || defined(_WIN64) - size_t size = FormatMessageA( + const DWORD size = FormatMessageA( FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, NULL); return size ? 
buf : NULL; #elif defined(_GNU_SOURCE) && defined(__GLIBC__) /* GNU-specific */ - msg = strerror_r(errnum, buf, buflen); + if (errnum > 0) + msg = strerror_r(errnum, buf, buflen); #elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) /* XSI-compliant */ - int rc = strerror_r(errnum, buf, buflen); - if (rc) { - rc = snprintf(buf, buflen, "error %d", errnum); - assert(rc > 0); - } - return buf; + if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0) + msg = buf; #else - strncpy(buf, strerror(errnum), buflen); - buf[buflen - 1] = '\0'; - return buf; + if (errnum > 0) { + msg = strerror(errnum); + if (msg) { + strncpy(buf, msg, buflen); + msg = buf; + } + } #endif + if (!msg) { + (void)snprintf(buf, buflen, "error %d", errnum); + msg = buf; + } + buf[buflen - 1] = '\0'; } return msg; } const char *__cold mdbx_strerror(int errnum) { - const char *msg = __mdbx_strerr(errnum); - if (!msg) { #if defined(_WIN32) || defined(_WIN64) - static char buffer[1024]; - size_t size = FormatMessageA( - FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, - errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buffer, - sizeof(buffer), NULL); - if (size) - msg = buffer; + static char buf[1024]; + return mdbx_strerror_r(errnum, buf, sizeof(buf)); #else - if (errnum < 0) { - static char buffer[32]; - int rc = snprintf(buffer, sizeof(buffer) - 1, "unknown error %d", errnum); - assert(rc > 0); - (void)rc; - return buffer; - } - msg = strerror(errnum); + const char *msg = __mdbx_strerr(errnum); + if (!msg) { + if (errnum > 0) + msg = strerror(errnum); + if (!msg) { + static char buf[32]; + (void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum); + msg = buf; + } + } + return msg; #endif +} + +#if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */ +const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) { + const char *msg = __mdbx_strerr(errnum); + if (!msg && buflen > 0 && buflen < INT_MAX) { + const DWORD size = FormatMessageA( + 
FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, + errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, + NULL); + if (size && CharToOemBuffA(buf, buf, size)) + msg = buf; } return msg; } -static txnid_t mdbx_oomkick(MDBX_env *env, const txnid_t laggard); +const char *mdbx_strerror_ANSI2OEM(int errnum) { + static char buf[1024]; + return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf)); +} +#endif /* Bit of madness for Windows */ -void __cold mdbx_debug_log(int type, const char *function, int line, +void __cold mdbx_debug_log(int level, const char *function, int line, const char *fmt, ...) { va_list args; va_start(args, fmt); if (mdbx_debug_logger) - mdbx_debug_logger(type, function, line, fmt, args); + mdbx_debug_logger(level, function, line, fmt, args); else { #if defined(_WIN32) || defined(_WIN64) if (IsDebuggerPresent()) { @@ -1433,7 +2418,8 @@ void __cold mdbx_debug_log(int type, const char *function, int line, } /* Dump a key in ascii or hexadecimal. */ -char *mdbx_dkey(const MDBX_val *key, char *const buf, const size_t bufsize) { +const char *mdbx_dump_val(const MDBX_val *key, char *const buf, + const size_t bufsize) { if (!key) return "<null>"; if (!buf || bufsize < 4) @@ -1474,16 +2460,20 @@ char *mdbx_dkey(const MDBX_val *key, char *const buf, const size_t bufsize) { return buf; } -#if 0 /* LY: debug stuff */ +/*------------------------------------------------------------------------------ + LY: debug stuff */ + static const char *mdbx_leafnode_type(MDBX_node *n) { - static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; - return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" - : tp[F_ISSET(n->mn_flags, F_DUPDATA)] - [F_ISSET(n->mn_flags, F_SUBDATA)]; + static const char *const tp[2][2] = {{"", ": DB"}, + {": sub-page", ": sub-DB"}}; + return F_ISSET(node_flags(n), F_BIGDATA) + ? 
": overflow page" + : tp[F_ISSET(node_flags(n), F_DUPDATA)] + [F_ISSET(node_flags(n), F_SUBDATA)]; } /* Display all the keys in the page. */ -static void mdbx_page_list(MDBX_page *mp) { +static __maybe_unused void mdbx_page_list(MDBX_page *mp) { pgno_t pgno = mp->mp_pgno; const char *type, *state = IS_DIRTY(mp) ? ", dirty" : ""; MDBX_node *node; @@ -1509,78 +2499,88 @@ static void mdbx_page_list(MDBX_page *mp) { type = "Leaf2 sub-page"; break; case P_OVERFLOW: - mdbx_print("Overflow page %" PRIu64 " pages %u%s\n", pgno, mp->mp_pages, - state); + mdbx_verbose("Overflow page %" PRIaPGNO " pages %u%s\n", pgno, mp->mp_pages, + state); return; case P_META: - mdbx_print("Meta-page %" PRIu64 " txnid %" PRIu64 "\n", pgno, - ((MDBX_meta *)PAGEDATA(mp))->mm_txnid); + mdbx_verbose("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, + page_meta(mp)->mm_txnid_a.inconsistent); return; default: - mdbx_print("Bad page %" PRIu64 " flags 0x%X\n", pgno, mp->mp_flags); + mdbx_verbose("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); return; } - nkeys = NUMKEYS(mp); - mdbx_print("%s %" PRIu64 " numkeys %u%s\n", type, pgno, nkeys, state); + nkeys = page_numkeys(mp); + mdbx_verbose("%s %" PRIaPGNO " numkeys %u%s\n", type, pgno, nkeys, state); for (i = 0; i < nkeys; i++) { if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ key.iov_len = nsize = mp->mp_leaf2_ksize; - key.iov_base = LEAF2KEY(mp, i, nsize); + key.iov_base = page_leaf2key(mp, i, nsize); total += nsize; - mdbx_print("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); + mdbx_verbose("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); continue; } - node = NODEPTR(mp, i); - key.iov_len = node->mn_ksize; + node = page_node(mp, i); + key.iov_len = node_ks(node); key.iov_base = node->mn_data; - nsize = NODESIZE + key.iov_len; + nsize = (unsigned)(NODESIZE + key.iov_len); if (IS_BRANCH(mp)) { - mdbx_print("key %u: page %" PRIu64 ", %s\n", i, NODEPGNO(node), - DKEY(&key)); + mdbx_verbose("key %u: page %" 
PRIaPGNO ", %s\n", i, node_pgno(node), + DKEY(&key)); total += nsize; } else { - if (F_ISSET(node->mn_flags, F_BIGDATA)) + if (F_ISSET(node_flags(node), F_BIGDATA)) nsize += sizeof(pgno_t); else - nsize += NODEDSZ(node); + nsize += (unsigned)node_ds(node); total += nsize; nsize += sizeof(indx_t); - mdbx_print("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), - mdbx_leafnode_type(node)); + mdbx_verbose("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), + mdbx_leafnode_type(node)); } total = EVEN(total); } - mdbx_print("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, - SIZELEFT(mp)); + mdbx_verbose("Total: header %u + contents %u + unused %u\n", + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, + page_room(mp)); } -static void mdbx_cursor_chk(MDBX_cursor *mc) { - unsigned i; - MDBX_node *node; - MDBX_page *mp; +/*----------------------------------------------------------------------------*/ - if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) - return; - for (i = 0; i < mc->mc_top; i++) { - mp = mc->mc_pg[i]; - node = NODEPTR(mp, mc->mc_ki[i]); - if (unlikely(NODEPGNO(node) != mc->mc_pg[i + 1]->mp_pgno)) - mdbx_print("oops!\n"); - } - if (unlikely(mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))) - mdbx_print("ack!\n"); - if (XCURSOR_INITED(mc)) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (((node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) && - mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { - mdbx_print("blah!\n"); - } - } -} -#endif /* 0 */ +/* Check if there is an inited xcursor, so XCURSOR_REFRESH() is proper */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + +/* Update sub-page pointer, if any, in mc->mc_xcursor. + * Needed when the node which contains the sub-page may have moved. + * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. 
*/ +#define XCURSOR_REFRESH(mc, mp, ki) \ + do { \ + MDBX_page *xr_pg = (mp); \ + MDBX_node *xr_node = page_node(xr_pg, ki); \ + if ((node_flags(xr_node) & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = node_data(xr_node); \ + } while (0) + +/* Perform act while tracking temporary cursor mn */ +#define WITH_CURSOR_TRACKING(mn, act) \ + do { \ + mdbx_cassert(&(mn), \ + mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ + MDBX_cursor mc_dummy, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + MDBX_cursor *tracked = &(mn); \ + if ((mn).mc_flags & C_SUB) { \ + mc_dummy.mc_flags = C_INITIALIZED; \ + mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn); \ + tracked = &mc_dummy; \ + } \ + tracked->mc_next = *tp; \ + *tp = tracked; \ + { act; } \ + *tp = tracked->mc_next; \ + } while (0) int mdbx_cmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { @@ -1637,7 +2637,7 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { /* Free a dirty page */ static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned pages) { #if MDBX_DEBUG - dp->mp_pgno = MAX_PAGENO; + dp->mp_pgno = MAX_PAGENO + 1; #endif if (pages == 1) { dp->mp_next = env->me_dpages; @@ -1653,33 +2653,15 @@ static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned pages) { /* Return all dirty pages to dpage list */ static void mdbx_dlist_free(MDBX_txn *txn) { MDBX_env *env = txn->mt_env; - MDBX_DPL dl = txn->mt_rw_dirtylist; - size_t i, n = dl->length; + const MDBX_DPL dl = txn->tw.dirtylist; + const size_t n = dl->length; - for (i = 1; i <= n; i++) { + for (size_t i = 1; i <= n; i++) { MDBX_page *dp = dl[i].ptr; mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? 
dp->mp_pages : 1); } - dl->length = 0; -} - -static size_t bytes_align2os_bytes(const MDBX_env *env, size_t bytes) { - return mdbx_roundup2(mdbx_roundup2(bytes, env->me_psize), env->me_os_psize); -} - -static void __cold mdbx_kill_page(MDBX_env *env, MDBX_page *mp) { - const size_t len = env->me_psize - PAGEHDRSZ; - void *ptr = (env->me_flags & MDBX_WRITEMAP) - ? &mp->mp_data - : (void *)((uint8_t *)env->me_pbuf + env->me_psize); - memset(ptr, 0x6F /* 'o', 111 */, len); - if (ptr != &mp->mp_data) - (void)mdbx_pwrite(env->me_fd, ptr, len, - pgno2bytes(env, mp->mp_pgno) + PAGEHDRSZ); - - VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_data, len); - ASAN_POISON_MEMORY_REGION(&mp->mp_data, len); + mdbx_dpl_clear(dl); } static __inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) { @@ -1691,105 +2673,437 @@ static __inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) { return couple->outer.mc_db; } -static int mdbx_page_befree(MDBX_cursor *mc, MDBX_page *mp) { - MDBX_txn *txn = mc->mc_txn; +static __cold __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) { + if (!mdbx_audit_enabled()) + return true; + + unsigned loose = 0; + for (unsigned i = txn->tw.dirtylist->length; i > 0; --i) { + const MDBX_page *const dp = txn->tw.dirtylist[i].ptr; + if (!dp) + continue; + mdbx_tassert(txn, dp->mp_pgno == txn->tw.dirtylist[i].pgno); + if (unlikely(dp->mp_pgno != txn->tw.dirtylist[i].pgno)) + return false; + + mdbx_tassert(txn, dp->mp_flags & P_DIRTY); + if (unlikely((dp->mp_flags & P_DIRTY) == 0)) + return false; + if (dp->mp_flags & P_LOOSE) { + mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY)); + if (unlikely(dp->mp_flags != (P_LOOSE | P_DIRTY))) + return false; + loose += 1; + } + + const unsigned num = IS_OVERFLOW(dp) ? 
dp->mp_pages : 1; + mdbx_tassert(txn, txn->mt_next_pgno >= dp->mp_pgno + num); + if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num)) + return false; + + if (i < txn->tw.dirtylist->sorted) { + mdbx_tassert(txn, txn->tw.dirtylist[i + 1].pgno >= dp->mp_pgno + num); + if (unlikely(txn->tw.dirtylist[i + 1].pgno < dp->mp_pgno + num)) + return false; + } + + const unsigned rpa = mdbx_pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno); + mdbx_tassert(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) || + txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); + if (rpa <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && + unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno)) + return false; + if (num > 1) { + const unsigned rpb = + mdbx_pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno + num - 1); + mdbx_tassert(txn, rpa == rpb); + if (unlikely(rpa != rpb)) + return false; + } + } + + mdbx_tassert(txn, loose == txn->tw.loose_count); + if (unlikely(loose != txn->tw.loose_count)) + return false; + + if (txn->tw.dirtylist->length - txn->tw.dirtylist->sorted < + SORT_THRESHOLD / 2) { + for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { + const MDBX_page *const dp = + mdbx_dpl_find(txn->tw.dirtylist, txn->tw.retired_pages[i]); + mdbx_tassert(txn, !dp); + if (unlikely(dp)) + return false; + } + } + + return true; +} + +static void mdbx_refund_reclaimed(MDBX_txn *txn) { + /* Scanning in descend order */ + pgno_t next_pgno = txn->mt_next_pgno; + const MDBX_PNL pnl = txn->tw.reclaimed_pglist; + mdbx_tassert(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); +#if MDBX_PNL_ASCENDING + unsigned i = MDBX_PNL_SIZE(pnl); + mdbx_tassert(txn, pnl[i] == next_pgno - 1); + while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1) + ; + MDBX_PNL_SIZE(pnl) = i; +#else + unsigned i = 1; + mdbx_tassert(txn, pnl[i] == next_pgno - 1); + unsigned len = MDBX_PNL_SIZE(pnl); + while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1) + ; + MDBX_PNL_SIZE(pnl) = len -= i - 1; + for 
(unsigned move = 0; move < len; ++move) + pnl[1 + move] = pnl[i + move]; +#endif + mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, + txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); + txn->mt_next_pgno = next_pgno; + mdbx_tassert( + txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); +} + +static void mdbx_refund_loose(MDBX_txn *txn) { + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + mdbx_tassert(txn, txn->tw.loose_pages != nullptr); + mdbx_tassert(txn, txn->tw.loose_count > 0); + + const MDBX_DPL dl = txn->tw.dirtylist; + mdbx_tassert(txn, dl->length >= txn->tw.loose_count); + mdbx_tassert(txn, txn->tw.spill_pages == nullptr || + dl->length >= MDBX_PNL_SIZE(txn->tw.spill_pages)); + + pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; + MDBX_PNL suitable = onstack; + + if (dl->length - dl->sorted > txn->tw.loose_count) { + /* Dirty list is useless since unsorted. */ + if (bytes2pnl(sizeof(onstack)) < txn->tw.loose_count) { + suitable = mdbx_pnl_alloc(txn->tw.loose_count); + if (unlikely(!suitable)) + return /* this is not a reason for transaction fail */; + } + + /* Collect loose-pages which may be refunded. */ + mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); + pgno_t most = MIN_PAGENO; + unsigned w = 0; + for (const MDBX_page *dp = txn->tw.loose_pages; dp; dp = dp->mp_next) { + mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY)); + mdbx_tassert(txn, txn->mt_next_pgno > dp->mp_pgno); + if (likely(txn->mt_next_pgno - txn->tw.loose_count <= dp->mp_pgno)) { + mdbx_tassert(txn, + w < ((suitable == onstack) ? bytes2pnl(sizeof(onstack)) + : MDBX_PNL_ALLOCLEN(suitable))); + suitable[++w] = dp->mp_pgno; + most = (dp->mp_pgno > most) ? dp->mp_pgno : most; + } + } + + if (most + 1 == txn->mt_next_pgno) { + /* Sort suitable list and refund pages at the tail. 
*/ + MDBX_PNL_SIZE(suitable) = w; + mdbx_pnl_sort(suitable); - mdbx_cassert(mc, (mc->mc_flags & C_SUB) == 0); - if (IS_BRANCH(mp)) { - mc->mc_db->md_branch_pages--; - } else if (IS_LEAF(mp)) { - mc->mc_db->md_leaf_pages--; + /* Scanning in descend order */ + const int step = MDBX_PNL_ASCENDING ? -1 : 1; + const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1; + const int end = MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_SIZE(suitable) + 1; + mdbx_tassert(txn, suitable[begin] >= suitable[end - step]); + mdbx_tassert(txn, most == suitable[begin]); + + for (int i = begin + step; i != end; i += step) { + if (suitable[i] != most - 1) + break; + most -= 1; + } + const unsigned refunded = txn->mt_next_pgno - most; + mdbx_verbose("refund-sorted %u pages %" PRIaPGNO " -> %" PRIaPGNO, + refunded, most, txn->mt_next_pgno); + txn->tw.loose_count -= refunded; + txn->tw.dirtyroom += refunded; + txn->mt_next_pgno = most; + + /* Filter-out dirty list */ + unsigned r = 0; + w = 0; + if (dl->sorted) { + do { + if (dl[++r].pgno < most) { + if (++w != r) + dl[w] = dl[r]; + } + } while (r < dl->sorted); + dl->sorted = w; + } + while (r < dl->length) { + if (dl[++r].pgno < most) { + if (++w != r) + dl[w] = dl[r]; + } + } + dl->length = w; + mdbx_tassert(txn, txn->mt_parent || + txn->tw.dirtyroom + txn->tw.dirtylist->length == + MDBX_DPL_TXNFULL); + goto unlink_loose; + } } else { - mdbx_cassert(mc, IS_OVERFLOW(mp)); - mc->mc_db->md_overflow_pages -= mp->mp_pages; - return mdbx_pnl_append_range(&txn->mt_befree_pages, mp->mp_pgno, - mp->mp_pages); + /* Dirtylist is mostly sorted, just refund loose pages at the end. */ + mdbx_dpl_sort(dl); + mdbx_tassert(txn, dl->length < 2 || dl[1].pgno < dl[dl->length].pgno); + mdbx_tassert(txn, dl->sorted == dl->length); + + /* Scan dirtylist tail-forward and cutoff suitable pages. 
*/ + while (dl->length && dl[dl->length].pgno == txn->mt_next_pgno - 1 && + dl[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) { + MDBX_page *dp = dl[dl->length].ptr; + mdbx_verbose("refund-unsorted page %" PRIaPGNO, dp->mp_pgno); + mdbx_tassert(txn, dp->mp_pgno == dl[dl->length].pgno); + dl->length -= 1; + } + + if (dl->sorted != dl->length) { + const unsigned refunded = dl->sorted - dl->length; + dl->sorted = dl->length; + txn->tw.loose_count -= refunded; + txn->tw.dirtyroom += refunded; + txn->mt_next_pgno -= refunded; + mdbx_tassert(txn, txn->mt_parent || + txn->tw.dirtyroom + txn->tw.dirtylist->length == + MDBX_DPL_TXNFULL); + + /* Filter-out loose chain & dispose refunded pages. */ + unlink_loose: + for (MDBX_page **link = &txn->tw.loose_pages; *link;) { + MDBX_page *dp = *link; + mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY)); + if (txn->mt_next_pgno > dp->mp_pgno) { + link = &dp->mp_next; + } else { + *link = dp->mp_next; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(txn->mt_env, dp, 1); + } + } + } } - return mdbx_pnl_append(&txn->mt_befree_pages, mp->mp_pgno); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + mdbx_tassert(txn, txn->mt_parent || + txn->tw.dirtyroom + txn->tw.dirtylist->length == + MDBX_DPL_TXNFULL); + if (suitable != onstack) + mdbx_pnl_free(suitable); + txn->tw.loose_refund_wl = txn->mt_next_pgno; } -/* Loosen or free a single page. 
+static bool mdbx_refund(MDBX_txn *txn) { + const pgno_t before = txn->mt_next_pgno; + + if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno) + mdbx_refund_loose(txn); + + while (true) { + if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 || + MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1) + break; + + mdbx_refund_reclaimed(txn); + if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->mt_next_pgno) + break; + + const pgno_t memo = txn->mt_next_pgno; + mdbx_refund_loose(txn); + if (memo == txn->mt_next_pgno) + break; + } + + return before != txn->mt_next_pgno; +} + +static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno, + unsigned npages) { + mdbx_assert(env, pgno >= NUM_METAS && npages); + if (IS_DIRTY(mp) || (env->me_flags & MDBX_WRITEMAP)) { + const size_t bytes = pgno2bytes(env, npages); + memset(mp, 0, bytes); + mp->mp_pgno = pgno; + if ((env->me_flags & MDBX_WRITEMAP) == 0) + mdbx_pwrite(env->me_fd, mp, bytes, pgno2bytes(env, pgno)); + } else { + struct iovec iov[MDBX_COMMIT_PAGES]; + iov[0].iov_len = env->me_psize; + iov[0].iov_base = (char *)env->me_pbuf + env->me_psize; + size_t iov_off = pgno2bytes(env, pgno); + unsigned n = 1; + while (--npages) { + iov[n] = iov[0]; + if (++n == MDBX_COMMIT_PAGES) { + mdbx_pwritev(env->me_fd, iov, MDBX_COMMIT_PAGES, iov_off, + pgno2bytes(env, MDBX_COMMIT_PAGES)); + iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); + n = 0; + } + } + mdbx_pwritev(env->me_fd, iov, n, iov_off, pgno2bytes(env, n)); + } +} + +/* Retire, loosen or free a single page. * * Saves single pages to a list for future reuse - * in this same txn. It has been pulled from the freeDB + * in this same txn. It has been pulled from the GC * and already resides on the dirty list, but has been * deleted. Use these pages first before pulling again - * from the freeDB. + * from the GC. * * If the page wasn't dirtied in this txn, just add it * to this txn's free list. 
*/ -static int mdbx_page_loose(MDBX_cursor *mc, MDBX_page *mp) { - int loose = 0; + +static __hot int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp) { + const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; const pgno_t pgno = mp->mp_pgno; - MDBX_txn *txn = mc->mc_txn; - if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = mdbx_outer_db(mc); - if (IS_BRANCH(mp)) - outer->md_branch_pages--; - else { - mdbx_cassert(mc, IS_LEAF(mp)); - outer->md_leaf_pages--; + if (txn->mt_parent) { + mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0); + mdbx_tassert(txn, mp != pgno2page(txn->mt_env, pgno)); + /* If txn has a parent, make sure the page is in our dirty list. */ + MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno); + /* TODO: use extended flag-mask to track parent's dirty-pages */ + if (dp == nullptr) { + mp->mp_next = txn->tw.retired2parent_pages; + txn->tw.retired2parent_pages = mp; + txn->tw.retired2parent_count += npages; + return MDBX_SUCCESS; } + if (unlikely(mp != dp)) { /* bad cursor? */ + mdbx_error( + "wrong page 0x%p #%" PRIaPGNO " in the dirtylist, expecting %p", + __Wpedantic_format_voidptr(dp), pgno, __Wpedantic_format_voidptr(mp)); + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; + } + /* ok, it's ours */ } - if (IS_BRANCH(mp)) - mc->mc_db->md_branch_pages--; - else { - mdbx_cassert(mc, IS_LEAF(mp)); - mc->mc_db->md_leaf_pages--; + mdbx_debug("loosen page %" PRIaPGNO, pgno); + const bool is_dirty = IS_DIRTY(mp); + if (MDBX_DEBUG || unlikely((txn->mt_env->me_flags & MDBX_PAGEPERTURB) != 0)) { + mdbx_kill_page(txn->mt_env, mp, pgno, npages); + VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); } - - if (IS_DIRTY(mp)) { - if (txn->mt_parent) { - /* LY: TODO: use dedicated flag for tracking parent's dirty pages */ - mdbx_cassert(mc, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0); - MDBX_DP *dl = txn->mt_rw_dirtylist; - /* If txn has a parent, - * make sure the page is in our dirty list. 
*/ - if (dl->length) { - unsigned x = mdbx_dpl_search(dl, pgno); - if (x <= dl->length && dl[x].pgno == pgno) { - if (unlikely(mp != dl[x].ptr)) { /* bad cursor? */ - mdbx_error("wrong page 0x%p #%" PRIaPGNO - " in the dirtylist[%d], expecting %p", - dl[x].ptr, pgno, x, mp); - mc->mc_flags &= ~(C_INITIALIZED | C_EOF); - txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PROBLEM; - } - /* ok, it's ours */ - loose = 1; - } + VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + ASAN_POISON_MEMORY_REGION(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + + if (unlikely(npages > + 1 /* overflow pages doesn't comes to the loose-list */)) { + if (is_dirty) { + /* Remove from dirty list */ + MDBX_page *dp = mdbx_dpl_remove(txn->tw.dirtylist, pgno); + if (unlikely(dp != mp)) { + mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist", + __Wpedantic_format_voidptr(mp), pgno); + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; } - } else { - /* no parent txn, so it's just ours */ - loose = 1; + txn->tw.dirtyroom++; + mdbx_tassert(txn, txn->mt_parent || + txn->tw.dirtyroom + txn->tw.dirtylist->length == + MDBX_DPL_TXNFULL); + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(txn->mt_env, mp, npages); } - } - if (loose) { - mdbx_debug("loosen db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); - MDBX_page **link = &NEXT_LOOSE_PAGE(mp); - if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) - mdbx_kill_page(txn->mt_env, mp); - mp->mp_flags = P_LOOSE | P_DIRTY; - VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); - ASAN_UNPOISON_MEMORY_REGION(link, sizeof(*link)); - *link = txn->mt_loose_pages; - txn->mt_loose_pages = mp; - txn->mt_loose_count++; - } else { - int rc = mdbx_pnl_append(&txn->mt_befree_pages, pgno); - mdbx_tassert(txn, rc == MDBX_SUCCESS); + if (unlikely(pgno + npages == txn->mt_next_pgno)) { + txn->mt_next_pgno = pgno; + mdbx_refund(txn); + return MDBX_SUCCESS; + } + + int rc = 
mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); if (unlikely(rc != MDBX_SUCCESS)) return rc; + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); + return MDBX_SUCCESS; } + mp->mp_flags = P_LOOSE | P_DIRTY; + mp->mp_next = txn->tw.loose_pages; + txn->tw.loose_pages = mp; + txn->tw.loose_count++; + if (unlikely(txn->mt_next_pgno == mp->mp_pgno + 1)) + mdbx_refund(txn); + return MDBX_SUCCESS; } +static __hot int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { + const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; + const pgno_t pgno = mp->mp_pgno; + MDBX_txn *const txn = mc->mc_txn; + + if (unlikely(mc->mc_flags & C_SUB)) { + MDBX_db *outer = mdbx_outer_db(mc); + mdbx_cassert(mc, !IS_BRANCH(mp) || outer->md_branch_pages > 0); + outer->md_branch_pages -= IS_BRANCH(mp); + mdbx_cassert(mc, !IS_LEAF(mp) || outer->md_leaf_pages > 0); + outer->md_leaf_pages -= IS_LEAF(mp); + mdbx_cassert(mc, !IS_OVERFLOW(mp)); + } + mdbx_cassert(mc, !IS_BRANCH(mp) || mc->mc_db->md_branch_pages > 0); + mc->mc_db->md_branch_pages -= IS_BRANCH(mp); + mdbx_cassert(mc, !IS_LEAF(mp) || mc->mc_db->md_leaf_pages > 0); + mc->mc_db->md_leaf_pages -= IS_LEAF(mp); + mdbx_cassert(mc, !IS_OVERFLOW(mp) || mc->mc_db->md_overflow_pages >= npages); + mc->mc_db->md_overflow_pages -= IS_OVERFLOW(mp) ? 
npages : 0; + + if (IS_DIRTY(mp)) { + int rc = mdbx_page_loose(txn, mp); + if (unlikely(rc != MDBX_SUCCESS)) + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + return rc; + } + + if (txn->tw.spill_pages) { + const unsigned i = mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1); + if (i) { + /* This page is no longer spilled */ + mdbx_tassert(txn, i == MDBX_PNL_SIZE(txn->tw.spill_pages) || + txn->tw.spill_pages[i + 1] >= (pgno + npages) << 1); + txn->tw.spill_pages[i] |= 1; + if (i == MDBX_PNL_SIZE(txn->tw.spill_pages)) + MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1; + int rc = mdbx_page_loose(txn, mp); + if (unlikely(rc != MDBX_SUCCESS)) + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + return rc; + } + } + + mdbx_tassert(txn, mp == pgno2page(txn->mt_env, pgno)); + int rc = mdbx_pnl_append_range(&txn->tw.retired_pages, pgno, npages); + mdbx_tassert(txn, mdbx_dpl_find(txn->tw.dirtylist, pgno) == nullptr); + return rc; +} + +static __must_check_result __inline int mdbx_retire_pgno(MDBX_cursor *mc, + const pgno_t pgno) { + MDBX_page *mp; + int rc = mdbx_page_get(mc, pgno, &mp, NULL); + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_page_retire(mc, mp); + return rc; +} + /* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. * * [in] mc A cursor handle for the current operation. 
@@ -1805,9 +3119,8 @@ static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) { MDBX_cursor *m3, *m0 = mc; MDBX_xcursor *mx; MDBX_page *dp, *mp; - MDBX_node *leaf; unsigned i, j; - int rc = MDBX_SUCCESS, level; + int rc = MDBX_SUCCESS; /* Mark pages seen by cursors: First m0, then tracked cursors */ for (i = txn->mt_numdbs;;) { @@ -1825,8 +3138,7 @@ static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) { break; if (!(mp && IS_LEAF(mp))) break; - leaf = NODEPTR(mp, m3->mc_ki[j - 1]); - if (!(leaf->mn_flags & F_SUBDATA)) + if (!(node_flags(page_node(mp, m3->mc_ki[j - 1])) & F_SUBDATA)) break; } } @@ -1844,6 +3156,7 @@ mark_done: pgno_t pgno = txn->mt_dbs[i].md_root; if (pgno == P_INVALID) continue; + int level; if (unlikely((rc = mdbx_page_get(m0, pgno, &dp, &level)) != MDBX_SUCCESS)) break; @@ -1856,8 +3169,6 @@ mark_done: return rc; } -static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep); - /* Spill pages from the dirty list back to disk. * This is intended to prevent running into MDBX_TXN_FULL situations, * but note that they may still occur in a few cases: @@ -1886,40 +3197,40 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep); * the child hasn't committed yet, and we'd have no way to undo it if * the child aborted. * - * [in] m0 cursor A cursor handle identifying the transaction and + * [in] mc cursor A cursor handle identifying the transaction and * database for which we are checking space. * [in] key For a put operation, the key being stored. * [in] data For a put operation, the data being stored. * * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { - MDBX_txn *txn = m0->mc_txn; - MDBX_DPL dl = txn->mt_rw_dirtylist; - - if (m0->mc_flags & C_SUB) +static int mdbx_page_spill(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { + if (mc->mc_flags & C_SUB) return MDBX_SUCCESS; + MDBX_txn *txn = mc->mc_txn; + MDBX_DPL dl = txn->tw.dirtylist; + /* Estimate how much space this op will take */ - pgno_t i = m0->mc_db->md_depth; + pgno_t i = mc->mc_db->md_depth; /* Named DBs also dirty the main DB */ - if (m0->mc_dbi >= CORE_DBS) + if (mc->mc_dbi >= CORE_DBS) i += txn->mt_dbs[MAIN_DBI].md_depth; /* For puts, roughly factor in the key+data size */ if (key) - i += bytes2pgno(txn->mt_env, LEAFSIZE(key, data) + txn->mt_env->me_psize); + i += bytes2pgno(txn->mt_env, node_size(key, data) + txn->mt_env->me_psize); i += i; /* double it for good measure */ pgno_t need = i; - if (txn->mt_dirtyroom > i) + if (txn->tw.dirtyroom > i) return MDBX_SUCCESS; - if (!txn->mt_spill_pages) { - txn->mt_spill_pages = mdbx_pnl_alloc(MDBX_DPL_TXNFULL); - if (unlikely(!txn->mt_spill_pages)) + if (!txn->tw.spill_pages) { + txn->tw.spill_pages = mdbx_pnl_alloc(MDBX_DPL_TXNFULL / 8); + if (unlikely(!txn->tw.spill_pages)) return MDBX_ENOMEM; } else { /* purge deleted slots */ - MDBX_PNL sl = txn->mt_spill_pages; + MDBX_PNL sl = txn->tw.spill_pages; pgno_t num = MDBX_PNL_SIZE(sl), j = 0; for (i = 1; i <= num; i++) { if ((sl[i] & 1) == 0) @@ -1929,7 +3240,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { } /* Preserve pages which may soon be dirtied again */ - int rc = mdbx_pages_xkeep(m0, P_DIRTY, true); + int rc = mdbx_pages_xkeep(mc, P_DIRTY, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -1952,26 +3263,23 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { /* Can't spill twice, * make sure it's not already in a parent's spill list. 
*/ if (txn->mt_parent) { - MDBX_txn *tx2; - for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { - if (tx2->mt_spill_pages) { - unsigned j = mdbx_pnl_search(tx2->mt_spill_pages, pn); - if (j <= MDBX_PNL_SIZE(tx2->mt_spill_pages) && - tx2->mt_spill_pages[j] == pn) { - dp->mp_flags |= P_KEEP; - break; - } + MDBX_txn *parent; + for (parent = txn->mt_parent; parent; parent = parent->mt_parent) { + if (parent->tw.spill_pages && + mdbx_pnl_exist(parent->tw.spill_pages, pn)) { + dp->mp_flags |= P_KEEP; + break; } } - if (tx2) + if (parent) continue; } - rc = mdbx_pnl_append(&txn->mt_spill_pages, pn); + rc = mdbx_pnl_append(&txn->tw.spill_pages, pn); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; need--; } - mdbx_pnl_sort(txn->mt_spill_pages); + mdbx_pnl_sort(txn->tw.spill_pages); /* Flush the spilled part of dirty list */ rc = mdbx_page_flush(txn, i); @@ -1979,7 +3287,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { goto bailout; /* Reset any dirty pages we kept that page_flush didn't see */ - rc = mdbx_pages_xkeep(m0, P_DIRTY | P_KEEP, i != 0); + rc = mdbx_pages_xkeep(mc, P_DIRTY | P_KEEP, i != 0); bailout: txn->mt_flags |= rc ? MDBX_TXN_ERROR : MDBX_TXN_SPILLS; @@ -1988,15 +3296,26 @@ bailout: /*----------------------------------------------------------------------------*/ -#define METAPAGE(env, n) (&pgno2page(env, n)->mp_meta) +static __inline bool meta_bootid_match(const MDBX_meta *meta) { + return meta->mm_bootid.x == bootid.x && meta->mm_bootid.y == bootid.y && + (bootid.x | bootid.y) != 0; +} + +static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta, + const int lck_exlusive) { + return lck_exlusive ? 
/* exclusive lock */ meta_bootid_match(meta) + : /* db already opened */ env->me_lck && + (env->me_lck->mti_envmode & MDBX_RDONLY) == 0; +} +#define METAPAGE(env, n) page_meta(pgno2page(env, n)) #define METAPAGE_END(env) METAPAGE(env, NUM_METAS) static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta, - bool allow_volatile) { + const bool allow_volatile) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - txnid_t a = meta->mm_txnid_a; - txnid_t b = meta->mm_txnid_b; + txnid_t a = safe64_read(&meta->mm_txnid_a); + txnid_t b = safe64_read(&meta->mm_txnid_b); if (allow_volatile) return (a == b) ? a : 0; mdbx_assert(env, a == b); @@ -2016,28 +3335,32 @@ static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env, static __inline void mdbx_meta_update_begin(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - mdbx_assert(env, meta->mm_txnid_a < txnid && meta->mm_txnid_b < txnid); - meta->mm_txnid_a = txnid; + mdbx_assert(env, meta->mm_txnid_a.inconsistent < txnid && + meta->mm_txnid_b.inconsistent < txnid); (void)env; - mdbx_flush_noncoherent_cpu_writeback(); + safe64_update(&meta->mm_txnid_a, txnid); } static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - mdbx_assert(env, meta->mm_txnid_a == txnid); - mdbx_assert(env, meta->mm_txnid_b < txnid); - + mdbx_assert(env, meta->mm_txnid_a.inconsistent == txnid); + mdbx_assert(env, meta->mm_txnid_b.inconsistent < txnid); + (void)env; mdbx_jitter4testing(true); - meta->mm_txnid_b = txnid; - mdbx_flush_noncoherent_cpu_writeback(); + meta->mm_bootid = bootid; + safe64_update(&meta->mm_txnid_b, txnid); } static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { mdbx_assert(env, meta < METAPAGE(env, 0) || meta > METAPAGE_END(env)); - meta->mm_txnid_a = 
txnid; - meta->mm_txnid_b = txnid; + (void)env; + /* update inconsistent since this function used ONLY for filling meta-image + * for writing, but not the actual meta-page */ + meta->mm_bootid = bootid; + meta->mm_txnid_a.inconsistent = txnid; + meta->mm_txnid_b.inconsistent = txnid; } static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { @@ -2051,7 +3374,7 @@ static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { return (sign > MDBX_DATASIGN_WEAK) ? sign : ~sign; } -enum meta_choise_mode { prefer_last, prefer_noweak, prefer_steady }; +enum meta_choise_mode { prefer_last, prefer_steady }; static __inline bool mdbx_meta_ot(const enum meta_choise_mode mode, const MDBX_env *env, const MDBX_meta *a, @@ -2072,15 +3395,10 @@ static __inline bool mdbx_meta_ot(const enum meta_choise_mode mode, return META_IS_STEADY(b); /* fall through */ __fallthrough; - case prefer_noweak: - if (META_IS_WEAK(a) != META_IS_WEAK(b)) - return !META_IS_WEAK(b); - /* fall through */ - __fallthrough; case prefer_last: mdbx_jitter4testing(true); if (txnid_a == txnid_b) - return META_IS_STEADY(b) || (META_IS_WEAK(a) && !META_IS_WEAK(b)); + return META_IS_STEADY(b); return txnid_a < txnid_b; } } @@ -2148,22 +3466,33 @@ static __hot MDBX_meta *mdbx_meta_head(const MDBX_env *env) { return mdbx_meta_mostrecent(prefer_last, env); } -static __hot txnid_t mdbx_reclaiming_detent(const MDBX_env *env) { - if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC)) - return likely(env->me_txn0->mt_owner == mdbx_thread_self()) - ? 
env->me_txn0->mt_txnid - 1 - : mdbx_meta_txnid_fluid(env, mdbx_meta_head(env)); +static __hot txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { + while (true) { + const MDBX_meta *head = mdbx_meta_head(env); + const txnid_t recent = mdbx_meta_txnid_fluid(env, head); + mdbx_compiler_barrier(); + if (likely(head == mdbx_meta_head(env) && + recent == mdbx_meta_txnid_fluid(env, head))) + return recent; + } +} - return mdbx_meta_txnid_stable(env, mdbx_meta_steady(env)); +static __hot txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) { + while (true) { + const MDBX_meta *head = mdbx_meta_steady(env); + const txnid_t recent = mdbx_meta_txnid_fluid(env, head); + mdbx_compiler_barrier(); + if (likely(head == mdbx_meta_steady(env) && + recent == mdbx_meta_txnid_fluid(env, head))) + return recent; + } } static const char *mdbx_durable_str(const MDBX_meta *const meta) { - if (META_IS_WEAK(meta)) - return "Weak"; if (META_IS_STEADY(meta)) return (meta->mm_datasync_sign == mdbx_meta_sign(meta)) ? 
"Steady" : "Tainted"; - return "Legacy"; + return "Weak"; } /*----------------------------------------------------------------------------*/ @@ -2172,7 +3501,7 @@ static const char *mdbx_durable_str(const MDBX_meta *const meta) { static txnid_t mdbx_find_oldest(MDBX_txn *txn) { mdbx_tassert(txn, (txn->mt_flags & MDBX_RDONLY) == 0); MDBX_env *env = txn->mt_env; - const txnid_t edge = mdbx_reclaiming_detent(env); + const txnid_t edge = mdbx_recent_steady_txnid(env); mdbx_tassert(txn, edge <= txn->mt_txnid); MDBX_lockinfo *const lck = env->me_lck; @@ -2192,12 +3521,12 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { txnid_t oldest = edge; lck->mti_readers_refresh_flag = nothing_changed; - mdbx_flush_noncoherent_cpu_writeback(); + mdbx_flush_incoherent_cpu_writeback(); const unsigned snap_nreaders = lck->mti_numreaders; for (unsigned i = 0; i < snap_nreaders; ++i) { if (lck->mti_readers[i].mr_pid) { /* mdbx_jitter4testing(true); */ - const txnid_t snap = lck->mti_readers[i].mr_txnid; + const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid); if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) { oldest = snap; if (oldest == last_oldest) @@ -2224,10 +3553,10 @@ static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { if (lck->mti_readers[i].mr_pid) { /* mdbx_jitter4testing(true); */ const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages_used; - const txnid_t snap_txnid = lck->mti_readers[i].mr_txnid; + const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); mdbx_memory_barrier(); if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages_used || - snap_txnid != lck->mti_readers[i].mr_txnid)) + snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) goto retry; if (largest < snap_pages && lck->mti_oldest_reader <= /* ignore pending updates */ snap_txnid && @@ -2242,35 +3571,136 @@ static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { /* Add a page to the txn's dirty list */ 
static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { - int (*const adder)(MDBX_DPL, pgno_t pgno, MDBX_page * page) = - (txn->mt_flags & MDBX_TXN_WRITEMAP) ? mdbx_dpl_append : mdbx_dpl_insert; - const int rc = adder(txn->mt_rw_dirtylist, mp->mp_pgno, mp); + const int rc = mdbx_dpl_append(txn->tw.dirtylist, mp->mp_pgno, mp); if (unlikely(rc != MDBX_SUCCESS)) { txn->mt_flags |= MDBX_TXN_ERROR; return rc; } - txn->mt_dirtyroom--; + txn->tw.dirtyroom--; + mdbx_tassert(txn, txn->mt_parent || + txn->tw.dirtyroom + txn->tw.dirtylist->length == + MDBX_DPL_TXNFULL); return MDBX_SUCCESS; } -__cold static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, +#if !(defined(_WIN32) || defined(_WIN64)) +static __inline __maybe_unused int ignore_enosys(int err) { +#ifdef ENOSYS + if (err == ENOSYS) + return MDBX_RESULT_TRUE; +#endif /* ENOSYS */ +#ifdef ENOIMPL + if (err == ENOIMPL) + return MDBX_RESULT_TRUE; +#endif /* ENOIMPL */ +#ifdef ENOTSUP + if (err == ENOTSUP) + return MDBX_RESULT_TRUE; +#endif /* ENOTSUP */ +#ifdef ENOSUPP + if (err == ENOSUPP) + return MDBX_RESULT_TRUE; +#endif /* ENOSUPP */ +#ifdef EOPNOTSUPP + if (err == EOPNOTSUPP) + return MDBX_RESULT_TRUE; +#endif /* EOPNOTSUPP */ + if (err == EAGAIN) + return MDBX_RESULT_TRUE; + return err; +} +#endif /* defined(_WIN32) || defined(_WIN64) */ + +/* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ +static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset, + const size_t length, const bool enable) { + assert(length > 0); + mdbx_notice("readahead %s %u..%u", enable ? 
"ON" : "OFF", + bytes2pgno(env, offset), bytes2pgno(env, offset + length)); + +#if defined(F_RDAHEAD) + if (unlikely(fcntl(env->me_fd, F_RDAHEAD, enable) == -1)) + return errno; +#endif /* F_RDAHEAD */ + + if (enable) { +#if defined(F_RDADVISE) + struct radvisory hint; + hint.ra_offset = offset; + hint.ra_count = length; + (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( + env->me_fd, F_RDADVISE, &hint); +#endif /* F_RDADVISE */ +#if defined(MADV_WILLNEED) + int err = madvise(env->me_map + offset, length, MADV_WILLNEED) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_WILLNEED) + int err = ignore_enosys( + posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(_WIN32) || defined(_WIN64) + if (mdbx_PrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY hint; + hint.VirtualAddress = env->me_map + offset; + hint.NumberOfBytes = length; + (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); + } +#elif defined(POSIX_FADV_WILLNEED) + int err = ignore_enosys( + posix_fadvise(env->me_fd, offset, length, POSIX_FADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif /* MADV_WILLNEED */ + } else { +#if defined(MADV_RANDOM) + int err = madvise(env->me_map + offset, length, MADV_RANDOM) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_RANDOM) + int err = ignore_enosys( + posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_FADV_RANDOM) + int err = ignore_enosys( + posix_fadvise(env->me_fd, offset, length, POSIX_FADV_RANDOM)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif /* MADV_RANDOM */ + } + return MDBX_SUCCESS; +} + +static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, + const pgno_t size_pgno, const pgno_t limit_pgno) { -#ifdef USE_VALGRIND - const size_t prev_mapsize = env->me_mapsize; - void *const prev_mapaddr = env->me_map; -#endif + if ((env->me_flags & MDBX_WRITEMAP) && *env->me_unsynced_pages) { + int err = mdbx_msync(&env->me_dxb_mmap, 0, + pgno_align2os_bytes(env, used_pgno), true); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); - mdbx_info("resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR, - env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, limit_bytes); + mdbx_verbose("resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR, + env->me_dxb_mmap.current, size_bytes, env->me_dxb_mmap.limit, + limit_bytes); mdbx_assert(env, limit_bytes >= size_bytes); - mdbx_assert(env, bytes2pgno(env, size_bytes) == size_pgno); - mdbx_assert(env, bytes2pgno(env, limit_bytes) == limit_pgno); + mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno); + mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno); + const size_t prev_limit = env->me_dxb_mmap.limit; + const void *const prev_addr = env->me_map; #if defined(_WIN32) || defined(_WIN64) /* Acquire guard in exclusive mode for: @@ -2280,9 +3710,9 @@ __cold static int 
mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, mdbx_handle_array_t *suspended = NULL; mdbx_handle_array_t array_onstack; int rc = MDBX_SUCCESS; - if (limit_bytes == env->me_dxb_mmap.length && + if (limit_bytes == env->me_dxb_mmap.limit && size_bytes == env->me_dxb_mmap.current && - env->me_dxb_mmap.current == env->me_dxb_mmap.filesize) + size_bytes == env->me_dxb_mmap.filesize) goto bailout; /* 1) Windows allows only extending a read-write section, but not a @@ -2304,49 +3734,100 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, /* Acquire guard to avoid collision between read and write txns * around env->me_dbgeo */ int rc = mdbx_fastmutex_acquire(&env->me_remap_guard); - if (rc != MDBX_SUCCESS) + if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (limit_bytes == env->me_dxb_mmap.length && - bytes2pgno(env, size_bytes) == env->me_dbgeo.now) + if (limit_bytes == env->me_dxb_mmap.limit && + size_bytes == env->me_dxb_mmap.current) goto bailout; #endif /* Windows */ + const size_t prev_size = env->me_dxb_mmap.current; + if (size_bytes < prev_size) { + mdbx_notice("resize-MADV_%s %u..%u", + (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", + size_pgno, bytes2pgno(env, prev_size)); + rc = MDBX_RESULT_TRUE; +#if defined(MADV_REMOVE) + if (env->me_flags & MDBX_WRITEMAP) + rc = + madvise(env->me_map + size_bytes, prev_size - size_bytes, MADV_REMOVE) + ? ignore_enosys(errno) + : MDBX_SUCCESS; +#endif /* MADV_REMOVE */ +#if defined(MADV_DONTNEED) + if (rc == MDBX_RESULT_TRUE) + rc = madvise(env->me_map + size_bytes, prev_size - size_bytes, + MADV_DONTNEED) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; +#elif defined(POSIX_MADV_DONTNEED) + if (rc == MDBX_RESULT_TRUE) + rc = ignore_enosys(posix_madvise(env->me_map + size_bytes, + prev_size - size_bytes, + POSIX_MADV_DONTNEED)); +#elif defined(POSIX_FADV_DONTNEED) + if (rc == MDBX_RESULT_TRUE) + rc = ignore_enosys(posix_fadvise( + env->me_fd, size_bytes, prev_size - size_bytes, POSIX_FADV_DONTNEED)); +#endif /* MADV_DONTNEED */ + if (unlikely(MDBX_IS_ERROR(rc))) + goto bailout; + if (*env->me_discarded_tail > size_pgno) + *env->me_discarded_tail = size_pgno; + } + rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); + if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_NORDAHEAD) == 0) { + const int readahead = mdbx_is_readahead_reasonable(size_bytes, 0); + if (readahead == MDBX_RESULT_FALSE) + rc = mdbx_set_readahead( + env, 0, (size_bytes > prev_size) ? size_bytes : prev_size, false); + else if (readahead == MDBX_RESULT_TRUE) { + const size_t readahead_pivot = + (limit_bytes != prev_limit || env->me_dxb_mmap.address != prev_addr +#if defined(_WIN32) || defined(_WIN64) + || prev_size > size_bytes +#endif /* Windows */ + ) + ? 
0 /* reassign readahead to the entire map + because it was remapped */ + : prev_size; + if (size_bytes > readahead_pivot) { + *env->me_discarded_tail = size_pgno; + rc = mdbx_set_readahead(env, readahead_pivot, + size_bytes - readahead_pivot, true); + } + } + } bailout: if (rc == MDBX_SUCCESS) { #if defined(_WIN32) || defined(_WIN64) mdbx_assert(env, size_bytes == env->me_dxb_mmap.current); mdbx_assert(env, size_bytes <= env->me_dxb_mmap.filesize); - mdbx_assert(env, limit_bytes == env->me_dxb_mmap.length); -#endif - env->me_dbgeo.now = size_bytes; - env->me_dbgeo.upper = limit_bytes; - if (env->me_txn) { - mdbx_tassert(env->me_txn, size_pgno >= env->me_txn->mt_next_pgno); - env->me_txn->mt_end_pgno = env->me_txn0->mt_end_pgno = size_pgno; - } -#ifdef USE_VALGRIND - if (prev_mapsize != env->me_mapsize || prev_mapaddr != env->me_map) { + mdbx_assert(env, limit_bytes == env->me_dxb_mmap.limit); +#endif /* Windows */ +#ifdef MDBX_USE_VALGRIND + if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = 0; - if (env->me_mapsize) + if (env->me_dxb_mmap.limit) env->me_valgrind_handle = - VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "mdbx"); + VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); } -#endif +#endif /* MDBX_USE_VALGRIND */ } else { if (rc != MDBX_RESULT_TRUE) { mdbx_error("failed resize datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, + env->me_dxb_mmap.current, size_bytes, env->me_dxb_mmap.limit, limit_bytes, rc); } else { mdbx_notice("unable resize datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, + env->me_dxb_mmap.current, size_bytes, env->me_dxb_mmap.limit, limit_bytes, rc); } if (!env->me_dxb_mmap.address) { @@ -2375,15 
+3856,61 @@ bailout: return rc; } -/* Allocate page numbers and memory for writing. Maintain me_last_reclaimed, - * me_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. +static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, + MDBX_meta *const meta) { + const uint64_t wipe = MDBX_DATASIGN_NONE; + if (META_IS_STEADY(meta) && + mdbx_meta_txnid_stable(env, meta) <= last_steady) { + mdbx_notice("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, + data_page(meta)->mp_pgno); + if (env->me_flags & MDBX_WRITEMAP) + meta->mm_datasync_sign = wipe; + else + return mdbx_pwrite(env->me_fd, &wipe, sizeof(meta->mm_datasync_sign), + (uint8_t *)&meta->mm_datasync_sign - env->me_map); + } + return MDBX_SUCCESS; +} + +__cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { + int err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 0)); + if (unlikely(err != MDBX_SUCCESS)) + return err; + err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 1)); + if (unlikely(err != MDBX_SUCCESS)) + return err; + err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 2)); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + if (env->me_flags & MDBX_WRITEMAP) { + mdbx_flush_incoherent_cpu_writeback(); + return mdbx_msync(&env->me_dxb_mmap, 0, pgno2bytes(env, NUM_METAS), false); + } + +#if defined(__linux__) || defined(__gnu_linux__) + if (sync_file_range(env->me_fd, 0, pgno2bytes(env, NUM_METAS), + SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) + err = errno; +#else + err = mdbx_filesync(env->me_fd, MDBX_SYNC_DATA); +#endif + if (unlikely(err != MDBX_SUCCESS)) + return err; + mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); + return MDBX_SUCCESS; +} + +/* Allocate page numbers and memory for writing. Maintain mt_last_reclaimed, + * mt_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. 
* * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the freedB, just merge freeDB records into me_reclaimed_pglist - * and move me_last_reclaimed to say which records were consumed. Only this - * function can create me_reclaimed_pglist and move - * me_last_reclaimed/mt_next_pgno. + * Do not modify the GC, just merge GC records into mt_reclaimed_pglist + * and move mt_last_reclaimed to say which records were consumed. Only this + * function can create mt_reclaimed_pglist and move + * mt_last_reclaimed/mt_next_pgno. * * [in] mc cursor A cursor handle identifying the transaction and * database for which we are allocating. @@ -2396,12 +3923,10 @@ bailout: #define MDBX_ALLOC_CACHE 1 #define MDBX_ALLOC_GC 2 #define MDBX_ALLOC_NEW 4 -#define MDBX_ALLOC_KICK 8 -#define MDBX_ALLOC_ALL \ - (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW | MDBX_ALLOC_KICK) +#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW) -static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, - int flags) { +static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, + MDBX_page **const mp, int flags) { int rc; MDBX_txn *txn = mc->mc_txn; MDBX_env *env = txn->mt_env; @@ -2410,10 +3935,9 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (likely(flags & MDBX_ALLOC_GC)) { flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); if (unlikely(mc->mc_flags & C_RECLAIMING)) { - /* If mc is updating the freeDB, then the befree-list cannot play + /* If mc is updating the GC, then the retired-list cannot play * catch-up with itself by growing while trying to save it. 
*/ - flags &= - ~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM); + flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE | MDBX_LIFORECLAIM); } else if (unlikely(txn->mt_dbs[FREE_DBI].md_entries == 0)) { /* avoid (recursive) search inside empty tree and while tree is updating, * https://github.com/leo-yuriev/libmdbx/issues/31 */ @@ -2421,31 +3945,40 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } } - if (likely(flags & MDBX_ALLOC_CACHE)) { + if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) { /* If there are any loose pages, just use them */ mdbx_assert(env, mp && num); - if (likely(num == 1 && txn->mt_loose_pages)) { - np = txn->mt_loose_pages; - txn->mt_loose_pages = NEXT_LOOSE_PAGE(np); - txn->mt_loose_count--; + if (likely(txn->tw.loose_pages)) { + if (txn->tw.loose_refund_wl > txn->mt_next_pgno) { + mdbx_refund(txn); + if (unlikely(!txn->tw.loose_pages)) + goto skip_cache; + } + + np = txn->tw.loose_pages; + txn->tw.loose_pages = np->mp_next; + txn->tw.loose_count--; mdbx_debug("db %d use loose page %" PRIaPGNO, DDBI(mc), np->mp_pgno); - ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize); mdbx_tassert(txn, np->mp_pgno < txn->mt_next_pgno); mdbx_ensure(env, np->mp_pgno >= NUM_METAS); + VALGRIND_MAKE_MEM_UNDEFINED(page_data(np), page_space(txn->mt_env)); + ASAN_UNPOISON_MEMORY_REGION(page_data(np), page_space(txn->mt_env)); *mp = np; return MDBX_SUCCESS; } } +skip_cache: - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - pgno_t pgno, *repg_list = env->me_reclaimed_pglist; - unsigned repg_pos = 0, repg_len = repg_list ? 
MDBX_PNL_SIZE(repg_list) : 0; + mdbx_tassert( + txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); + pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; + unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list); txnid_t oldest = 0, last = 0; const unsigned wanna_range = num - 1; - while (1) { /* oom-kick retry loop */ + while (true) { /* oom-kick retry loop */ /* If our dirty list is already full, we can't do anything */ - if (unlikely(txn->mt_dirtyroom == 0)) { + if (unlikely(txn->tw.dirtyroom == 0)) { rc = MDBX_TXN_FULL; goto fail; } @@ -2457,23 +3990,35 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, /* Seek a big enough contiguous page range. * Prefer pages with lower pgno. */ - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - if (likely(flags & MDBX_ALLOC_CACHE) && repg_len > wanna_range && + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); + if (likely(flags & MDBX_ALLOC_CACHE) && re_len > wanna_range && (!(flags & MDBX_COALESCE) || op == MDBX_FIRST)) { + mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && + MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); + range_begin = MDBX_PNL_ASCENDING ? 
1 : re_len; + pgno = MDBX_PNL_LEAST(re_list); + if (likely(wanna_range == 0)) + goto done; #if MDBX_PNL_ASCENDING - for (repg_pos = 1; repg_pos <= repg_len - wanna_range; ++repg_pos) { - pgno = repg_list[repg_pos]; - if (likely(repg_list[repg_pos + wanna_range - 1] == - pgno + wanna_range - 1)) + mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); + while (true) { + unsigned range_end = range_begin + wanna_range; + if (re_list[range_end] - pgno == wanna_range) goto done; + if (range_end == re_len) + break; + pgno = re_list[++range_begin]; } #else - repg_pos = repg_len; - do { - pgno = repg_list[repg_pos]; - if (likely(repg_list[repg_pos - wanna_range] == pgno + wanna_range)) + mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len); + while (true) { + if (re_list[range_begin - wanna_range] - pgno == wanna_range) goto done; - } while (--repg_pos > wanna_range); + if (range_begin == wanna_range) + break; + pgno = re_list[--range_begin]; + } #endif /* MDBX_PNL sort-order */ } @@ -2493,9 +4038,9 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, last = oldest - 1; op = MDBX_SET_RANGE; } - } else if (env->me_last_reclaimed) { - /* Continue lookup from env->me_last_reclaimed to oldest reader */ - last = env->me_last_reclaimed; + } else if (txn->tw.last_reclaimed) { + /* Continue lookup from txn->tw.last_reclaimed to oldest reader */ + last = txn->tw.last_reclaimed; op = MDBX_SET_RANGE; } @@ -2516,8 +4061,9 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (rc == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { if (op == MDBX_SET_RANGE) continue; - if (oldest < mdbx_find_oldest(txn)) { - oldest = *env->me_oldest; + txnid_t snap = mdbx_find_oldest(txn); + if (oldest < snap) { + oldest = snap; last = oldest - 1; key.iov_base = &last; key.iov_len = sizeof(last); @@ -2531,7 +4077,15 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, goto fail; } - last = *(txnid_t 
*)key.iov_base; + if (unlikely(key.iov_len != sizeof(txnid_t))) { + rc = MDBX_CORRUPTED; + goto fail; + } + last = unaligned_peek_u64(4, key.iov_base); + if (unlikely(last < 1 || last >= SAFE64_INVALID_THRESHOLD)) { + rc = MDBX_CORRUPTED; + goto fail; + } if (oldest <= last) { oldest = mdbx_find_oldest(txn); if (oldest <= last) { @@ -2543,215 +4097,234 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (flags & MDBX_LIFORECLAIM) { /* skip IDs of records that already reclaimed */ - if (txn->mt_lifo_reclaimed) { - unsigned i; - for (i = (unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed); i > 0; --i) - if (txn->mt_lifo_reclaimed[i] == last) + if (txn->tw.lifo_reclaimed) { + size_t i; + for (i = (size_t)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); i > 0; --i) + if (txn->tw.lifo_reclaimed[i] == last) break; if (i) continue; } } - /* Reading next FreeDB record */ + /* Reading next GC record */ np = recur.mc_pg[recur.mc_top]; - MDBX_node *leaf = NODEPTR(np, recur.mc_ki[recur.mc_top]); - if (unlikely((rc = mdbx_node_read(&recur, leaf, &data)) != MDBX_SUCCESS)) + if (unlikely((rc = mdbx_node_read( + &recur, page_node(np, recur.mc_ki[recur.mc_top]), + &data)) != MDBX_SUCCESS)) goto fail; - if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { - txn->mt_lifo_reclaimed = mdbx_txl_alloc(); - if (unlikely(!txn->mt_lifo_reclaimed)) { + if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) { + txn->tw.lifo_reclaimed = mdbx_txl_alloc(); + if (unlikely(!txn->tw.lifo_reclaimed)) { rc = MDBX_ENOMEM; goto fail; } } - /* Append PNL from FreeDB record to me_reclaimed_pglist */ + /* Append PNL from GC record to me_reclaimed_pglist */ mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); - pgno_t *re_pnl = (pgno_t *)data.iov_base; - mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(re_pnl)); - mdbx_tassert(txn, mdbx_pnl_check(re_pnl, false)); - repg_pos = MDBX_PNL_SIZE(re_pnl); - if (!repg_list) { - if (unlikely(!(env->me_reclaimed_pglist = repg_list = - 
mdbx_pnl_alloc(repg_pos)))) { - rc = MDBX_ENOMEM; - goto fail; - } - } else { - if (unlikely( - (rc = mdbx_pnl_need(&env->me_reclaimed_pglist, repg_pos)) != 0)) - goto fail; - repg_list = env->me_reclaimed_pglist; + pgno_t *gc_pnl = (pgno_t *)data.iov_base; + mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); + if (unlikely(data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || + !mdbx_pnl_check(gc_pnl, txn->mt_next_pgno))) { + rc = MDBX_CORRUPTED; + goto fail; } + const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); + rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + re_list = txn->tw.reclaimed_pglist; - /* Remember ID of FreeDB record */ + /* Remember ID of GC record */ if (flags & MDBX_LIFORECLAIM) { - if ((rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, last)) != 0) + rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, last); + if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - env->me_last_reclaimed = last; + txn->tw.last_reclaimed = last; - if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { + if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { mdbx_debug_extra("PNL read txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL", - last, txn->mt_dbs[FREE_DBI].md_root, repg_pos); + last, txn->mt_dbs[FREE_DBI].md_root, gc_len); unsigned i; - for (i = repg_pos; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO, re_pnl[i]); - mdbx_debug_extra_print("\n"); + for (i = gc_len; i; i--) + mdbx_debug_extra_print(" %" PRIaPGNO, gc_pnl[i]); + mdbx_debug_extra_print("%s", "\n"); } /* Merge in descending sorted order */ - mdbx_pnl_xmerge(repg_list, re_pnl); - repg_len = MDBX_PNL_SIZE(repg_list); - if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { - /* Done for a kick-reclaim mode, actually no page needed */ - return MDBX_SUCCESS; + const unsigned prev_re_len = MDBX_PNL_SIZE(re_list); + mdbx_pnl_xmerge(re_list, gc_pnl); + /* re-check to avoid duplicates */ + if (unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) { + rc = MDBX_CORRUPTED; + goto fail; } - 
mdbx_tassert(txn, - repg_len == 0 || repg_list[repg_len] < txn->mt_next_pgno); - if (repg_len) { + re_len = MDBX_PNL_SIZE(re_list); + mdbx_tassert(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); + if (re_len && unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { /* Refund suitable pages into "unallocated" space */ - pgno_t tail = txn->mt_next_pgno; - pgno_t *const begin = repg_list + 1; - pgno_t *const end = begin + repg_len; - pgno_t *higest; -#if MDBX_PNL_ASCENDING - for (higest = end; --higest >= begin;) { -#else - for (higest = begin; higest < end; ++higest) { -#endif /* MDBX_PNL sort-order */ - mdbx_tassert(txn, *higest >= NUM_METAS && *higest < tail); - if (*higest != tail - 1) - break; - tail -= 1; - } - if (tail != txn->mt_next_pgno) { -#if MDBX_PNL_ASCENDING - repg_len = (unsigned)(higest + 1 - begin); -#else - repg_len -= (unsigned)(higest - begin); - for (pgno_t *move = begin; higest < end; ++move, ++higest) - *move = *higest; -#endif /* MDBX_PNL sort-order */ - MDBX_PNL_SIZE(repg_list) = repg_len; - mdbx_info("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, - tail - txn->mt_next_pgno, tail, txn->mt_next_pgno); - txn->mt_next_pgno = tail; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - } + mdbx_refund(txn); + re_list = txn->tw.reclaimed_pglist; + re_len = MDBX_PNL_SIZE(re_list); + } + + if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { + /* Done for a kick-reclaim mode, actually no page needed */ + return MDBX_SUCCESS; } /* Don't try to coalesce too much. 
*/ - if (unlikely(repg_len > MDBX_DPL_TXNFULL / 4)) + if (unlikely(re_len > MDBX_DPL_TXNFULL / 4)) break; - if (repg_len /* current size */ >= env->me_maxgc_ov1page || - repg_pos /* prev size */ >= env->me_maxgc_ov1page / 2) + if (re_len /* current size */ >= env->me_maxgc_ov1page || + (re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >= + env->me_maxgc_ov1page / 2)) flags &= ~MDBX_COALESCE; } - if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == - (MDBX_COALESCE | MDBX_ALLOC_CACHE) && - repg_len > wanna_range) { + if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE && + re_len > wanna_range) { + range_begin = MDBX_PNL_ASCENDING ? 1 : re_len; + pgno = MDBX_PNL_LEAST(re_list); + if (likely(wanna_range == 0)) + goto done; #if MDBX_PNL_ASCENDING - for (repg_pos = 1; repg_pos <= repg_len - wanna_range; ++repg_pos) { - pgno = repg_list[repg_pos]; - if (likely(repg_list[repg_pos + wanna_range - 1] == - pgno + wanna_range - 1)) + mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); + while (true) { + unsigned range_end = range_begin + wanna_range; + if (re_list[range_end] - pgno == wanna_range) goto done; + if (range_end == re_len) + break; + pgno = re_list[++range_begin]; } #else - repg_pos = repg_len; - do { - pgno = repg_list[repg_pos]; - if (likely(repg_list[repg_pos - wanna_range] == pgno + wanna_range)) + mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len); + while (true) { + if (re_list[range_begin - wanna_range] - pgno == wanna_range) goto done; - } while (--repg_pos > wanna_range); + if (range_begin == wanna_range) + break; + pgno = re_list[--range_begin]; + } #endif /* MDBX_PNL sort-order */ } - /* Use new pages from the map when nothing suitable in the freeDB */ - repg_pos = 0; + /* There is no suitable pages in the GC and to be able to allocate + * we should CHOICE one of: + * - make a new steady checkpoint if reclaiming was stopped by + * the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode; 
+ * - kick lagging reader(s) if reclaiming was stopped by ones of it. + * - extend the database file. */ + + /* Will use new pages from the map if nothing is suitable in the GC. */ + range_begin = 0; pgno = txn->mt_next_pgno; - rc = MDBX_MAP_FULL; const pgno_t next = pgno_add(pgno, num); - if (likely(next <= txn->mt_end_pgno)) { - rc = MDBX_NOTFOUND; - if (likely(flags & MDBX_ALLOC_NEW)) - goto done; - } - const MDBX_meta *head = mdbx_meta_head(env); - if ((flags & MDBX_ALLOC_GC) && - ((flags & MDBX_ALLOC_KICK) || rc == MDBX_MAP_FULL)) { - MDBX_meta *steady = mdbx_meta_steady(env); - - if (oldest == mdbx_meta_txnid_stable(env, steady) && - !META_IS_STEADY(head) && META_IS_STEADY(steady)) { - /* LY: Here an oom was happened: - * - all pages had allocated; - * - reclaiming was stopped at the last steady-sync; - * - the head-sync is weak. - * Now we need make a sync to resume reclaiming. If both - * MDBX_NOSYNC and MDBX_MAPASYNC flags are set, then assume that - * utterly no-sync write mode was requested. In such case - * don't make a steady-sync, but only a legacy-mode checkpoint, - * just for resume reclaiming only, not for data consistency. */ - - mdbx_debug("kick-gc: head %" PRIaTXN "-%s, tail %" PRIaTXN + if (flags & MDBX_ALLOC_GC) { + const MDBX_meta *const head = mdbx_meta_head(env); + MDBX_meta *const steady = mdbx_meta_steady(env); + /* does reclaiming stopped at the last steady point? */ + if (head != steady && META_IS_STEADY(steady) && + oldest == mdbx_meta_txnid_stable(env, steady)) { + mdbx_debug("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN "-%s, oldest %" PRIaTXN, mdbx_meta_txnid_stable(env, head), mdbx_durable_str(head), mdbx_meta_txnid_stable(env, steady), mdbx_durable_str(steady), oldest); - - const unsigned syncflags = F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) - ? 
env->me_flags - : env->me_flags & MDBX_WRITEMAP; - MDBX_meta meta = *head; - if (mdbx_sync_locked(env, syncflags, &meta) == MDBX_SUCCESS) { - txnid_t snap = mdbx_find_oldest(txn); - if (snap > oldest) - continue; + rc = MDBX_RESULT_TRUE; + const pgno_t autosync_threshold = *env->me_autosync_threshold; + const uint64_t autosync_period = *env->me_autosync_period; + /* wipe the last steady-point if: + * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified + * otherwise, make a new steady-point if: + * - auto-sync threshold is specified and reached; + * - OR upper limit of database size is reached; + * - OR database is full (with the current file size) + * AND auto-sync threshold it NOT specified */ + if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && + (autosync_threshold | autosync_period) == 0) { + /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode + * without any auto-sync treshold(s). */ + rc = mdbx_wipe_steady(env, oldest); + mdbx_debug("gc-wipe-steady, rc %d", rc); + mdbx_assert(env, steady != mdbx_meta_steady(env)); + } else if ((flags & MDBX_ALLOC_NEW) == 0 || + (autosync_threshold && + *env->me_unsynced_pages >= autosync_threshold) || + (autosync_period && + mdbx_osal_monotime() - *env->me_sync_timestamp >= + autosync_period) || + next >= txn->mt_geo.upper || + (next >= txn->mt_end_pgno && + (autosync_threshold | autosync_period) == 0)) { + /* make steady checkpoint. */ + MDBX_meta meta = *head; + rc = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); + mdbx_debug("gc-make-steady, rc %d", rc); + mdbx_assert(env, steady != mdbx_meta_steady(env)); } - } - - if (rc == MDBX_MAP_FULL && oldest < txn->mt_txnid - 1) { - if (mdbx_oomkick(env, oldest) > oldest) - continue; + if (rc == MDBX_SUCCESS) { + if (mdbx_find_oldest(txn) > oldest) + continue; + /* it is reasonable check/kick lagging reader(s) here, + * since we made a new steady point or wipe the last. 
*/ + if (oldest < txn->mt_txnid - MDBX_TXNID_STEP && + mdbx_oomkick(env, oldest) > oldest) + continue; + } else if (unlikely(rc != MDBX_RESULT_TRUE)) + goto fail; } } - if (rc == MDBX_MAP_FULL && next < head->mm_geo.upper) { - mdbx_assert(env, next > txn->mt_end_pgno); - pgno_t aligned = pgno_align2os_pgno( - env, pgno_add(next, head->mm_geo.grow - next % head->mm_geo.grow)); + /* don't kick lagging reader(s) if is enough unallocated space + * at the end of database file. */ + if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno) + goto done; + if ((flags & MDBX_ALLOC_GC) && oldest < txn->mt_txnid - MDBX_TXNID_STEP && + mdbx_oomkick(env, oldest) > oldest) + continue; - if (aligned > head->mm_geo.upper) - aligned = head->mm_geo.upper; - mdbx_assert(env, aligned > txn->mt_end_pgno); + rc = MDBX_NOTFOUND; + if (flags & MDBX_ALLOC_NEW) { + rc = MDBX_MAP_FULL; + if (next <= txn->mt_geo.upper) { + mdbx_assert(env, next > txn->mt_end_pgno); + pgno_t aligned = pgno_align2os_pgno( + env, pgno_add(next, txn->mt_geo.grow - next % txn->mt_geo.grow)); + + if (aligned > txn->mt_geo.upper) + aligned = txn->mt_geo.upper; + mdbx_assert(env, aligned > txn->mt_end_pgno); + + mdbx_verbose("try growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO + ")", + aligned, aligned - txn->mt_end_pgno); + rc = mdbx_mapresize(env, txn->mt_next_pgno, aligned, txn->mt_geo.upper); + if (rc == MDBX_SUCCESS) { + env->me_txn->mt_end_pgno = aligned; + goto done; + } - mdbx_info("try growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO ")", - aligned, aligned - txn->mt_end_pgno); - rc = mdbx_mapresize(env, aligned, head->mm_geo.upper); - if (rc == MDBX_SUCCESS) { - mdbx_tassert(env->me_txn, txn->mt_end_pgno >= next); - if (!mp) - return rc; - goto done; + mdbx_warning("unable growth datafile to %" PRIaPGNO + " pages (+%" PRIaPGNO "), errcode %d", + aligned, aligned - txn->mt_end_pgno, rc); + } else { + mdbx_debug("gc-alloc: next %u > upper %u", next, txn->mt_geo.upper); } - - mdbx_warning("unable 
growth datafile to %" PRIaPGNO "pages (+%" PRIaPGNO - "), errcode %d", - aligned, aligned - txn->mt_end_pgno, rc); } fail: - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - if (mp) { - *mp = NULL; + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); + if (likely(mp)) { + *mp = nullptr; txn->mt_flags |= MDBX_TXN_ERROR; } mdbx_assert(env, rc != MDBX_SUCCESS); @@ -2759,11 +4332,12 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } done: - mdbx_tassert(txn, mp && num); + if (unlikely(mp == nullptr)) + return MDBX_SUCCESS; mdbx_ensure(env, pgno >= NUM_METAS); if (env->me_flags & MDBX_WRITEMAP) { np = pgno2page(env, pgno); - /* LY: reset no-access flag from mdbx_kill_page() */ + /* LY: reset no-access flag from mdbx_loose_page() */ VALGRIND_MAKE_MEM_UNDEFINED(np, pgno2bytes(env, num)); ASAN_UNPOISON_MEMORY_REGION(np, pgno2bytes(env, num)); } else { @@ -2773,22 +4347,29 @@ done: } } - if (repg_pos) { + if (range_begin) { mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); mdbx_tassert(txn, pgno < txn->mt_next_pgno); - mdbx_tassert(txn, pgno == repg_list[repg_pos]); + mdbx_tassert(txn, pgno == re_list[range_begin]); /* Cutoff allocated pages from me_reclaimed_pglist */ - MDBX_PNL_SIZE(repg_list) = repg_len -= num; - for (unsigned i = repg_pos - num; i < repg_len;) - repg_list[++i] = repg_list[++repg_pos]; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); +#if MDBX_PNL_ASCENDING + for (unsigned i = range_begin + num; i <= re_len;) + re_list[range_begin++] = re_list[i++]; + MDBX_PNL_SIZE(re_list) = re_len = range_begin - 1; +#else + MDBX_PNL_SIZE(re_list) = re_len -= num; + for (unsigned i = range_begin - num; i < re_len;) + re_list[++i] = re_list[++range_begin]; +#endif + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); } else { txn->mt_next_pgno = pgno + num; mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno); } - if 
(env->me_flags & MDBX_PAGEPERTURB) - memset(np, 0x71 /* 'q', 113 */, pgno2bytes(env, num)); + if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) + memset(np, -1, pgno2bytes(env, num)); VALGRIND_MAKE_MEM_UNDEFINED(np, pgno2bytes(env, num)); np->mp_pgno = pgno; @@ -2800,31 +4381,30 @@ done: goto fail; *mp = np; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); + mdbx_tassert( + txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); return MDBX_SUCCESS; } -/* Copy the used portions of a non-overflow page. - * [in] dst page to copy into - * [in] src page to copy from - * [in] psize size of a page */ +/* Copy the used portions of a non-overflow page. */ __hot static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) { STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); - STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 42); - enum { Align = sizeof(pgno_t) }; - indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; - - /* If page isn't full, just copy the used portion. Adjust - * alignment so memcpy may copy words instead of bytes. */ - if ((unused &= -Align) && !IS_LEAF2(src)) { - upper = (upper + PAGEHDRSZ) & -Align; - memcpy(dst, src, (lower + PAGEHDRSZ + (Align - 1)) & -Align); - memcpy((pgno_t *)((char *)dst + upper), (pgno_t *)((char *)src + upper), - psize - upper); - } else { - memcpy(dst, src, psize - unused); + STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); + if (!IS_LEAF2(src)) { + size_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; + + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. 
*/ + if (unused > sizeof(void *) * 42) { + lower = roundup_powerof2(lower + PAGEHDRSZ, sizeof(void *)); + upper = (upper + PAGEHDRSZ) & ~(sizeof(void *) - 1); + memcpy(dst, src, lower); + memcpy((char *)dst + upper, (char *)src + upper, psize - upper); + return; + } } + memcpy(dst, src, psize); } /* Pull a page off the txn's spill list, if present. @@ -2840,52 +4420,44 @@ __hot static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { MDBX_env *env = txn->mt_env; - const MDBX_txn *tx2; - unsigned x; pgno_t pgno = mp->mp_pgno, pn = pgno << 1; - for (tx2 = txn; tx2; tx2 = tx2->mt_parent) { - if (!tx2->mt_spill_pages) + for (const MDBX_txn *tx2 = txn; tx2; tx2 = tx2->mt_parent) { + if (!tx2->tw.spill_pages) continue; - x = mdbx_pnl_search(tx2->mt_spill_pages, pn); - if (x <= MDBX_PNL_SIZE(tx2->mt_spill_pages) && - tx2->mt_spill_pages[x] == pn) { - MDBX_page *np; - int num; - if (txn->mt_dirtyroom == 0) - return MDBX_TXN_FULL; - num = IS_OVERFLOW(mp) ? mp->mp_pages : 1; - if (env->me_flags & MDBX_WRITEMAP) { - np = mp; - } else { - np = mdbx_page_malloc(txn, num); - if (unlikely(!np)) - return MDBX_ENOMEM; - if (unlikely(num > 1)) - memcpy(np, mp, pgno2bytes(env, num)); - else - mdbx_page_copy(np, mp, env->me_psize); - } - mdbx_debug("unspill page %" PRIaPGNO, mp->mp_pgno); - if (tx2 == txn) { - /* If in current txn, this page is no longer spilled. - * If it happens to be the last page, truncate the spill list. - * Otherwise mark it as deleted by setting the LSB. 
*/ - if (x == MDBX_PNL_SIZE(txn->mt_spill_pages)) - MDBX_PNL_SIZE(txn->mt_spill_pages)--; - else - txn->mt_spill_pages[x] |= 1; - } /* otherwise, if belonging to a parent txn, the - * page remains spilled until child commits */ - - int rc = mdbx_page_dirty(txn, np); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - + unsigned i = mdbx_pnl_exist(tx2->tw.spill_pages, pn); + if (!i) + continue; + if (txn->tw.dirtyroom == 0) + return MDBX_TXN_FULL; + unsigned num = IS_OVERFLOW(mp) ? mp->mp_pages : 1; + MDBX_page *np = mp; + if ((env->me_flags & MDBX_WRITEMAP) == 0) { + np = mdbx_page_malloc(txn, num); + if (unlikely(!np)) + return MDBX_ENOMEM; + if (unlikely(num > 1)) + memcpy(np, mp, pgno2bytes(env, num)); + else + mdbx_page_copy(np, mp, env->me_psize); + } + mdbx_debug("unspill page %" PRIaPGNO, mp->mp_pgno); + if (tx2 == txn) { + /* If in current txn, this page is no longer spilled. + * If it happens to be the last page, truncate the spill list. + * Otherwise mark it as deleted by setting the LSB. 
*/ + txn->tw.spill_pages[i] |= 1; + if (i == MDBX_PNL_SIZE(txn->tw.spill_pages)) + MDBX_PNL_SIZE(txn->tw.spill_pages) -= 1; + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits */ + + int rc = mdbx_page_dirty(txn, np); + if (likely(rc == MDBX_SUCCESS)) { np->mp_flags |= P_DIRTY; *ret = np; - break; } + return rc; } return MDBX_SUCCESS; } @@ -2914,51 +4486,50 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { goto done; } - if (unlikely((rc = mdbx_pnl_need(&txn->mt_befree_pages, 1)) || + if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, 1)) || (rc = mdbx_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL)))) goto fail; pgno = np->mp_pgno; mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), mp->mp_pgno, pgno); mdbx_cassert(mc, mp->mp_pgno != pgno); - mdbx_pnl_xappend(txn->mt_befree_pages, mp->mp_pgno); + mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + mdbx_tassert(txn, mdbx_dpl_find(txn->tw.dirtylist, mp->mp_pgno) == nullptr); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; - MDBX_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top - 1]); - SETPGNO(node, pgno); + MDBX_node *node = page_node(parent, mc->mc_ki[mc->mc_top - 1]); + node_set_pgno(node, pgno); } else { mc->mc_db->md_root = pgno; } } else if (txn->mt_parent && !IS_SUBP(mp)) { mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0); - MDBX_DP *dl = txn->mt_rw_dirtylist; pgno = mp->mp_pgno; /* If txn has a parent, make sure the page is in our dirty list. */ - if (dl->length) { - unsigned x = mdbx_dpl_search(dl, pgno); - if (x <= dl->length && dl[x].pgno == pgno) { - if (unlikely(mp != dl[x].ptr)) { /* bad cursor? 
*/ - mdbx_error("wrong page 0x%p #%" PRIaPGNO - " in the dirtylist[%d], expecting %p", - dl[x].ptr, pgno, x, mp); - mc->mc_flags &= ~(C_INITIALIZED | C_EOF); - rc = MDBX_PROBLEM; - goto fail; - } - return MDBX_SUCCESS; + const MDBX_page *const dp = mdbx_dpl_find(txn->tw.dirtylist, pgno); + if (dp) { + if (unlikely(mp != dp)) { /* bad cursor? */ + mdbx_error("wrong page 0x%p #%" PRIaPGNO + " in the dirtylist, expecting %p", + __Wpedantic_format_voidptr(dp), pgno, + __Wpedantic_format_voidptr(mp)); + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + rc = MDBX_PROBLEM; + goto fail; } + return MDBX_SUCCESS; } mdbx_debug("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); - mdbx_cassert(mc, dl->length <= MDBX_DPL_TXNFULL); + mdbx_cassert(mc, txn->tw.dirtylist->length <= MDBX_DPL_TXNFULL); /* No - copy it */ np = mdbx_page_malloc(txn, 1); if (unlikely(!np)) { rc = MDBX_ENOMEM; goto fail; } - rc = mdbx_dpl_insert(dl, pgno, np); + rc = mdbx_dpl_append(txn->tw.dirtylist, pgno, np); if (unlikely(rc)) { mdbx_dpage_free(txn->mt_env, np, 1); goto fail; @@ -3003,33 +4574,48 @@ fail: return rc; } -__cold static int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { +__cold int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { if (unlikely(!env)) return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; +#if MDBX_TXN_CHECKPID + if (unlikely(env->me_pid != mdbx_getpid())) { + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; + } +#endif /* MDBX_TXN_CHECKPID */ + unsigned flags = env->me_flags & ~MDBX_NOMETASYNC; if (unlikely(flags & (MDBX_RDONLY | MDBX_FATAL_ERROR))) return MDBX_EACCESS; - const bool outside_txn = - (!env->me_txn0 || env->me_txn0->mt_owner != mdbx_thread_self()); + if (unlikely(!env->me_map)) + return MDBX_EPERM; + + int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; + bool need_unlock = false; + if (nonblock && *env->me_unsynced_pages == 0) + goto fastpath; + const bool outside_txn = 
(env->me_txn0->mt_owner != mdbx_thread_self()); if (outside_txn) { - int rc = mdbx_txn_lock(env, nonblock); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + int err = mdbx_txn_lock(env, nonblock); + if (unlikely(err != MDBX_SUCCESS)) + return err; + need_unlock = true; } const MDBX_meta *head = mdbx_meta_head(env); pgno_t unsynced_pages = *env->me_unsynced_pages; if (!META_IS_STEADY(head) || unsynced_pages) { const pgno_t autosync_threshold = *env->me_autosync_threshold; - const uint64_t unsynced_timeout = *env->me_unsynced_timeout; + const uint64_t autosync_period = *env->me_autosync_period; if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || - (unsynced_timeout && mdbx_osal_monotime() >= unsynced_timeout)) + (autosync_period && + mdbx_osal_monotime() - *env->me_sync_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; if (outside_txn) { @@ -3041,15 +4627,15 @@ __cold static int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { mdbx_txn_unlock(env); /* LY: pre-sync without holding lock to reduce latency for writer(s) */ - int rc = (flags & MDBX_WRITEMAP) - ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false) - : mdbx_filesync(env->me_fd, MDBX_SYNC_DATA); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + int err = (flags & MDBX_WRITEMAP) + ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false) + : mdbx_filesync(env->me_fd, MDBX_SYNC_DATA); + if (unlikely(err != MDBX_SUCCESS)) + return err; - rc = mdbx_txn_lock(env, nonblock); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = mdbx_txn_lock(env, nonblock); + if (unlikely(err != MDBX_SUCCESS)) + return err; /* LY: head and unsynced_pages may be changed. 
*/ head = mdbx_meta_head(env); @@ -3057,42 +4643,61 @@ __cold static int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { } env->me_txn0->mt_txnid = meta_txnid(env, head, false); mdbx_find_oldest(env->me_txn0); + rc = MDBX_RESULT_FALSE /* means "some data was synced" */; } if (!META_IS_STEADY(head) || ((flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) == 0 && unsynced_pages)) { mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, - container_of(head, MDBX_page, mp_data)->mp_pgno, - mdbx_durable_str(head), unsynced_pages); + data_page(head)->mp_pgno, mdbx_durable_str(head), + unsynced_pages); MDBX_meta meta = *head; - int rc = mdbx_sync_locked(env, flags | MDBX_SHRINK_ALLOWED, &meta); - if (unlikely(rc != MDBX_SUCCESS)) { - if (outside_txn) + int err = mdbx_sync_locked(env, flags | MDBX_SHRINK_ALLOWED, &meta); + if (unlikely(err != MDBX_SUCCESS)) { + if (need_unlock) mdbx_txn_unlock(env); - return rc; + return err; } + rc = MDBX_RESULT_FALSE /* means "some data was synced" */; } } - if (outside_txn) +fastpath: + /* LY: sync meta-pages if MDBX_NOMETASYNC enabled + * and someone was not synced above. */ + if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_NOMETASYNC) != 0) { + const txnid_t head_txnid = mdbx_recent_committed_txnid(env); + if (*env->me_meta_sync_txnid != (uint32_t)head_txnid) { + rc = (flags & MDBX_WRITEMAP) + ? 
mdbx_msync(&env->me_dxb_mmap, 0, pgno2bytes(env, NUM_METAS), + false) + : mdbx_filesync(env->me_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (likely(rc == MDBX_SUCCESS)) + *env->me_meta_sync_txnid = (uint32_t)head_txnid; + } + } + if (need_unlock) mdbx_txn_unlock(env); - return MDBX_SUCCESS; + return rc; +} + +__cold int mdbx_env_sync(MDBX_env *env) { + return mdbx_env_sync_ex(env, true, false); } -__cold int mdbx_env_sync(MDBX_env *env, int force) { - return mdbx_env_sync_ex(env, force, false); +__cold int mdbx_env_sync_poll(MDBX_env *env) { + return mdbx_env_sync_ex(env, false, true); } /* Back up parent txn's cursors, then grab the originals for tracking */ static int mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { MDBX_cursor *mc, *bk; MDBX_xcursor *mx; - size_t size; - int i; - for (i = src->mt_numdbs; --i >= 0;) { + for (int i = src->mt_numdbs; --i >= 0;) { + dst->mt_cursors[i] = NULL; if ((mc = src->mt_cursors[i]) != NULL) { - size = sizeof(MDBX_cursor); + size_t size = sizeof(MDBX_cursor); if (mc->mc_xcursor) size += sizeof(MDBX_xcursor); for (; mc; mc = bk->mc_next) { @@ -3133,9 +4738,11 @@ static void mdbx_cursors_eot(MDBX_txn *txn, unsigned merge) { for (i = txn->mt_numdbs; --i >= 0;) { for (mc = cursors[i]; mc; mc = next) { unsigned stage = mc->mc_signature; - mdbx_ensure(NULL, + mdbx_ensure(txn->mt_env, stage == MDBX_MC_SIGNATURE || stage == MDBX_MC_WAIT4EOT); next = mc->mc_next; + mdbx_tassert(txn, !next || next->mc_signature == MDBX_MC_SIGNATURE || + next->mc_signature == MDBX_MC_WAIT4EOT); if ((bk = mc->mc_backup) != NULL) { if (merge) { /* Commit changes to parent txn */ @@ -3167,6 +4774,77 @@ static void mdbx_cursors_eot(MDBX_txn *txn, unsigned merge) { } } +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +/* Find largest mvcc-snapshot still referenced by this process. 
*/ +static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { + MDBX_lockinfo *const lck = env->me_lck; + if (likely(lck != NULL /* exclusive mode */)) { + const unsigned snap_nreaders = lck->mti_numreaders; + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (lck->mti_readers[i].mr_pid == env->me_pid) { + /* mdbx_jitter4testing(true); */ + const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages_used; + const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); + mdbx_memory_barrier(); + if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages_used || + snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) + goto retry; + if (largest < snap_pages && + lck->mti_oldest_reader <= /* ignore pending updates */ snap_txnid && + snap_txnid < SAFE64_INVALID_THRESHOLD) + largest = snap_pages; + } + } + } + return largest; +} + +static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { +#if !defined(__SANITIZE_ADDRESS__) + if (!RUNNING_ON_VALGRIND) + return; +#endif + + if (txn) { /* transaction start */ + if (env->me_poison_edge < txn->mt_next_pgno) + env->me_poison_edge = txn->mt_next_pgno; + VALGRIND_MAKE_MEM_DEFINED(env->me_map, pgno2bytes(env, txn->mt_next_pgno)); + ASAN_UNPOISON_MEMORY_REGION(env->me_map, + pgno2bytes(env, txn->mt_next_pgno)); + /* don't touch more, it should be already poisoned */ + } else { /* transaction end */ + bool should_unlock = false; + pgno_t last = MAX_PAGENO; + if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) { + /* inside write-txn */ + MDBX_meta *head = mdbx_meta_head(env); + last = head->mm_geo.next; + } else if (mdbx_txn_lock(env, true) == MDBX_SUCCESS) { + /* no write-txn */ + last = NUM_METAS; + should_unlock = true; + } else { + /* write txn is running, therefore shouldn't poison any memory range */ + return; + } + + last = mdbx_find_largest_this(env, last); + const pgno_t edge = env->me_poison_edge; + if (edge > last) { + mdbx_assert(env, last >= NUM_METAS); 
+ env->me_poison_edge = last; + VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last), + pgno2bytes(env, edge - last)); + ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, last), + pgno2bytes(env, edge - last)); + } + if (should_unlock) + mdbx_txn_unlock(env); + } +} +#endif /* MDBX_USE_VALGRIND */ + /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { MDBX_env *env = txn->mt_env; @@ -3180,25 +4858,27 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { #endif /* MDBX_TXN_CHECKPID */ STATIC_ASSERT(sizeof(MDBX_reader) == 32); -#ifdef MDBX_OSAL_LOCK - STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wmutex) % MDBX_CACHELINE_SIZE == 0); - STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rmutex) % MDBX_CACHELINE_SIZE == 0); +#if MDBX_LOCKING > 0 + STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wlock) % MDBX_CACHELINE_SIZE == 0); + STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rlock) % MDBX_CACHELINE_SIZE == 0); #else STATIC_ASSERT( offsetof(MDBX_lockinfo, mti_oldest_reader) % MDBX_CACHELINE_SIZE == 0); STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE == 0); -#endif +#endif /* MDBX_LOCKING */ STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE == 0); - pgno_t upper_limit_pgno = 0; - if (flags & MDBX_TXN_RDONLY) { - txn->mt_flags = MDBX_TXN_RDONLY; - MDBX_reader *r = txn->mt_ro_reader; + mdbx_assert(env, (flags & ~(MDBX_TXN_BEGIN_FLAGS | MDBX_TXN_SPILLS | + MDBX_WRITEMAP)) == 0); + if (flags & MDBX_RDONLY) { + txn->mt_flags = MDBX_RDONLY | (env->me_flags & MDBX_NOTLS); + MDBX_reader *r = txn->to.reader; + STATIC_ASSERT(sizeof(size_t) == sizeof(r->mr_tid)); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { mdbx_assert(env, !(env->me_flags & MDBX_NOTLS)); - r = mdbx_thread_rthc_get(env->me_txkey); + r = thread_rthc_get(env->me_txkey); if (likely(r)) { mdbx_assert(env, r->mr_pid == env->me_pid); mdbx_assert(env, r->mr_tid == mdbx_thread_self()); @@ -3208,11 
+4888,12 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } if (likely(r)) { - if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != ~(txnid_t)0)) + if (unlikely(r->mr_pid != env->me_pid || + r->mr_txnid.inconsistent < SAFE64_INVALID_THRESHOLD)) return MDBX_BAD_RSLOT; } else if (env->me_lck) { unsigned slot, nreaders; - const mdbx_tid_t tid = mdbx_thread_self(); + const size_t tid = mdbx_thread_self(); mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); @@ -3263,17 +4944,16 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { * that, it is safe for mdbx_env_close() to touch it. * When it will be closed, we can finally claim it. */ r->mr_pid = 0; - r->mr_txnid = ~(txnid_t)0; - r->mr_tid = tid; - mdbx_flush_noncoherent_cpu_writeback(); + safe64_reset(&r->mr_txnid, true); if (slot == nreaders) env->me_lck->mti_numreaders = ++nreaders; + r->mr_tid = tid; r->mr_pid = env->me_pid; mdbx_rdt_unlock(env); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { mdbx_assert(env, env->me_live_reader == env->me_pid); - mdbx_thread_rthc_set(env->me_txkey, r); + thread_rthc_set(env->me_txkey, r); } } @@ -3283,23 +4963,23 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); mdbx_jitter4testing(false); if (likely(r)) { + safe64_reset(&r->mr_txnid, false); r->mr_snapshot_pages_used = meta->mm_geo.next; - r->mr_txnid = snap; + r->mr_snapshot_pages_retired = meta->mm_pages_retired; + safe64_write(&r->mr_txnid, snap); mdbx_jitter4testing(false); mdbx_assert(env, r->mr_pid == mdbx_getpid()); mdbx_assert(env, r->mr_tid == mdbx_thread_self()); - mdbx_assert(env, r->mr_txnid == snap); + mdbx_assert(env, r->mr_txnid.inconsistent == snap); mdbx_compiler_barrier(); env->me_lck->mti_readers_refresh_flag = true; - mdbx_flush_noncoherent_cpu_writeback(); + mdbx_flush_incoherent_cpu_writeback(); } mdbx_jitter4testing(true); /* 
Snap the state from current meta-head */ txn->mt_txnid = snap; - txn->mt_next_pgno = meta->mm_geo.next; - txn->mt_end_pgno = meta->mm_geo.now; - upper_limit_pgno = meta->mm_geo.upper; + txn->mt_geo = meta->mm_geo; memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); txn->mt_canary = meta->mm_canary; @@ -3313,13 +4993,14 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } } - if (unlikely(txn->mt_txnid == 0)) { - mdbx_error("environment corrupted by died writer, must shutdown!"); + if (unlikely(txn->mt_txnid == 0 || + txn->mt_txnid >= SAFE64_INVALID_THRESHOLD)) { + mdbx_error("%s", "environment corrupted by died writer, must shutdown!"); rc = MDBX_WANNA_RECOVERY; goto bailout; } mdbx_assert(env, txn->mt_txnid >= *env->me_oldest); - txn->mt_ro_reader = r; + txn->to.reader = r; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ mdbx_ensure(env, txn->mt_txnid >= /* paranoia is appropriate here */ *env->me_oldest); @@ -3345,33 +5026,32 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { mdbx_jitter4testing(false); txn->mt_canary = meta->mm_canary; const txnid_t snap = mdbx_meta_txnid_stable(env, meta); - txn->mt_txnid = snap + 1; - if (unlikely(txn->mt_txnid < snap)) { - mdbx_debug("txnid overflow!"); + txn->mt_txnid = safe64_txnid_next(snap); + if (unlikely(txn->mt_txnid >= SAFE64_INVALID_THRESHOLD)) { + mdbx_debug("%s", "txnid overflow!"); rc = MDBX_TXN_FULL; goto bailout; } txn->mt_flags = flags; txn->mt_child = NULL; - txn->mt_loose_pages = NULL; - txn->mt_loose_count = 0; - txn->mt_dirtyroom = MDBX_DPL_TXNFULL; - txn->mt_rw_dirtylist = env->me_dirtylist; - txn->mt_rw_dirtylist->length = 0; - txn->mt_befree_pages = env->me_free_pgs; - MDBX_PNL_SIZE(txn->mt_befree_pages) = 0; - txn->mt_spill_pages = NULL; - if (txn->mt_lifo_reclaimed) - MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) = 0; + txn->tw.loose_pages = NULL; + txn->tw.loose_count = 0; + txn->tw.dirtyroom = MDBX_DPL_TXNFULL; + txn->tw.dirtylist = env->me_dirtylist; + 
mdbx_dpl_clear(txn->tw.dirtylist); + MDBX_PNL_SIZE(txn->tw.retired_pages) = 0; + txn->tw.spill_pages = NULL; + txn->tw.last_reclaimed = 0; + if (txn->tw.lifo_reclaimed) + MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0; env->me_txn = txn; memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned)); /* Copy the DB info and flags */ memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); /* Moved to here to avoid a data race in read TXNs */ - txn->mt_next_pgno = meta->mm_geo.next; - txn->mt_end_pgno = meta->mm_geo.now; - upper_limit_pgno = meta->mm_geo.upper; + txn->mt_geo = meta->mm_geo; + txn->tw.loose_refund_wl = txn->mt_next_pgno; } /* Setup db info */ @@ -3387,32 +5067,39 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_dbflags[FREE_DBI] = DB_VALID; if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - mdbx_warning("environment had fatal error, must shutdown!"); + mdbx_warning("%s", "environment had fatal error, must shutdown!"); rc = MDBX_PANIC; } else { const size_t size = pgno2bytes(env, txn->mt_end_pgno); - if (unlikely(size > env->me_mapsize)) { - if (upper_limit_pgno > MAX_PAGENO || - bytes2pgno(env, pgno2bytes(env, upper_limit_pgno)) != - upper_limit_pgno) { + if (unlikely(size > env->me_dxb_mmap.limit)) { + if (txn->mt_geo.upper > MAX_PAGENO || + bytes2pgno(env, pgno2bytes(env, txn->mt_geo.upper)) != + txn->mt_geo.upper) { rc = MDBX_MAP_RESIZED; goto bailout; } - rc = mdbx_mapresize(env, txn->mt_end_pgno, upper_limit_pgno); + rc = mdbx_mapresize(env, txn->mt_next_pgno, txn->mt_end_pgno, + txn->mt_geo.upper); if (rc != MDBX_SUCCESS) { if (rc == MDBX_RESULT_TRUE) rc = MDBX_MAP_RESIZED; goto bailout; } } - txn->mt_owner = mdbx_thread_self(); + if (txn->mt_flags & MDBX_RDONLY) { #if defined(_WIN32) || defined(_WIN64) - if ((txn->mt_flags & MDBX_TXN_RDONLY) != 0 && size > env->me_dbgeo.lower && - env->me_dbgeo.shrink) { - txn->mt_flags |= MDBX_SHRINK_ALLOWED; - mdbx_srwlock_AcquireShared(&env->me_remap_guard); + if (size > 
env->me_dbgeo.lower && env->me_dbgeo.shrink) { + txn->mt_flags |= MDBX_SHRINK_ALLOWED; + mdbx_srwlock_AcquireShared(&env->me_remap_guard); + } +#endif + } else { + env->me_dxb_mmap.current = size; } +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + mdbx_txn_valgrind(env, txn); #endif + txn->mt_owner = mdbx_thread_self(); return MDBX_SUCCESS; } bailout: @@ -3421,6 +5108,46 @@ bailout: return rc; } +static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { + if (unlikely(!txn)) + return MDBX_EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_EBADSIGN; + + if (unlikely(txn->mt_flags & bad_bits)) + return MDBX_BAD_TXN; + +#if MDBX_TXN_CHECKOWNER + if ((txn->mt_flags & MDBX_NOTLS) == 0 && + unlikely(txn->mt_owner != mdbx_thread_self())) + return txn->mt_owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; +#endif /* MDBX_TXN_CHECKOWNER */ + + return MDBX_SUCCESS; +} + +static __always_inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) { + if (unlikely(!txn)) + return MDBX_EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_EBADSIGN; + + if (unlikely(txn->mt_flags & bad_bits)) + return MDBX_BAD_TXN; + + if (unlikely(F_ISSET(txn->mt_flags, MDBX_RDONLY))) + return MDBX_EACCESS; + +#if MDBX_TXN_CHECKOWNER + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return txn->mt_owner ? 
MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; +#endif /* MDBX_TXN_CHECKOWNER */ + + return MDBX_SUCCESS; +} + int mdbx_txn_renew(MDBX_txn *txn) { int rc; @@ -3430,18 +5157,18 @@ int mdbx_txn_renew(MDBX_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) + if (unlikely((txn->mt_flags & MDBX_RDONLY) == 0)) return MDBX_EINVAL; if (unlikely(txn->mt_owner != 0)) return MDBX_THREAD_MISMATCH; - rc = mdbx_txn_renew0(txn, MDBX_TXN_RDONLY); + rc = mdbx_txn_renew0(txn, MDBX_RDONLY); if (rc == MDBX_SUCCESS) { txn->mt_owner = mdbx_thread_self(); mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, - txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', + txn->mt_txnid, (txn->mt_flags & MDBX_RDONLY) ? 'r' : 'w', (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); } @@ -3451,14 +5178,15 @@ int mdbx_txn_renew(MDBX_txn *txn) { int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, MDBX_txn **ret) { MDBX_txn *txn; - MDBX_ntxn *ntxn; int rc; unsigned size, tsize; - if (unlikely(!env || !ret)) + if (unlikely(!ret)) return MDBX_EINVAL; - *ret = NULL; + + if (unlikely(!env)) + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; @@ -3471,39 +5199,36 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, return MDBX_PANIC; #if !defined(_WIN32) && !defined(_WIN64) - /* Don't check env->me_map until lock to avoid race with re-mapping for - * shrinking */ + /* Don't check env->me_map until lock to + * avoid race with re-mapping for shrinking */ if (unlikely(!env->me_map)) return MDBX_EPERM; #endif /* Windows */ - flags &= MDBX_TXN_BEGIN_FLAGS; - flags |= env->me_flags & MDBX_WRITEMAP; + if (unlikely(flags & ~MDBX_TXN_BEGIN_FLAGS)) + return MDBX_EINVAL; if (unlikely(env->me_flags & MDBX_RDONLY & ~flags)) /* write txn in RDONLY env */ return 
MDBX_EACCESS; - if (parent) { - if (unlikely(parent->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + flags |= env->me_flags & MDBX_WRITEMAP; - if (unlikely(parent->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + if (parent) { + /* Nested transactions: Max 1 child, write txns only, no writemap */ + rc = check_txn_rw(parent, MDBX_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; #if defined(_WIN32) || defined(_WIN64) if (unlikely(!env->me_map)) return MDBX_EPERM; #endif /* Windows */ - /* Nested transactions: Max 1 child, write txns only, no writemap */ - flags |= parent->mt_flags; - if (unlikely(flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED))) - return (parent->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EINVAL : MDBX_BAD_TXN; - + flags |= parent->mt_flags & (MDBX_TXN_BEGIN_FLAGS | MDBX_TXN_SPILLS); /* Child txns save MDBX_pgstate and use own copy of cursors */ size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); - size += tsize = sizeof(MDBX_ntxn); + size += tsize = sizeof(MDBX_txn); } else if (flags & MDBX_RDONLY) { if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == mdbx_thread_self())) return MDBX_BUSY; @@ -3529,48 +5254,59 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, txn->mt_env = env; if (parent) { - unsigned i; + mdbx_tassert(txn, mdbx_dirtylist_check(parent)); txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = parent->mt_dbiseqs; - txn->mt_rw_dirtylist = - mdbx_malloc(sizeof(MDBX_DP) * (MDBX_DPL_TXNFULL + 1)); - if (!txn->mt_rw_dirtylist || - !(txn->mt_befree_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL))) { - mdbx_free(txn->mt_rw_dirtylist); + txn->tw.dirtylist = mdbx_malloc(sizeof(MDBX_DP) * (MDBX_DPL_TXNFULL + 1)); + txn->tw.reclaimed_pglist = + mdbx_pnl_alloc(MDBX_PNL_ALLOCLEN(parent->tw.reclaimed_pglist)); + if (!txn->tw.dirtylist || !txn->tw.reclaimed_pglist) { + mdbx_pnl_free(txn->tw.reclaimed_pglist); 
+ mdbx_free(txn->tw.dirtylist); mdbx_free(txn); return MDBX_ENOMEM; } + mdbx_dpl_clear(txn->tw.dirtylist); + memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, + MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); + mdbx_assert(env, mdbx_pnl_check4assert( + txn->tw.reclaimed_pglist, + (txn->mt_next_pgno /* LY: intentional assigment here, + only for assertion */ + = parent->mt_next_pgno))); + + txn->tw.last_reclaimed = parent->tw.last_reclaimed; + if (parent->tw.lifo_reclaimed) { + txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed; + parent->tw.lifo_reclaimed = + (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.lifo_reclaimed); + } + + txn->tw.retired_pages = parent->tw.retired_pages; + parent->tw.retired_pages = + (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.retired_pages); + txn->mt_txnid = parent->mt_txnid; - txn->mt_dirtyroom = parent->mt_dirtyroom; - txn->mt_rw_dirtylist->length = 0; - txn->mt_spill_pages = NULL; - txn->mt_next_pgno = parent->mt_next_pgno; - txn->mt_end_pgno = parent->mt_end_pgno; + txn->tw.dirtyroom = parent->tw.dirtyroom; + txn->mt_geo = parent->mt_geo; + txn->tw.loose_refund_wl = parent->tw.loose_refund_wl; + txn->mt_canary = parent->mt_canary; parent->mt_flags |= MDBX_TXN_HAS_CHILD; parent->mt_child = txn; txn->mt_parent = parent; txn->mt_numdbs = parent->mt_numdbs; + txn->mt_owner = parent->mt_owner; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); /* Copy parent's mt_dbflags, but clear DB_NEW */ - for (i = 0; i < txn->mt_numdbs; i++) + for (unsigned i = 0; i < txn->mt_numdbs; i++) txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~(DB_FRESH | DB_CREAT); - rc = 0; - ntxn = (MDBX_ntxn *)txn; - ntxn->mnt_pgstate = - env->me_pgstate; /* save parent me_reclaimed_pglist & co */ - if (env->me_reclaimed_pglist) { - size = MDBX_PNL_SIZEOF(env->me_reclaimed_pglist); - env->me_reclaimed_pglist = - mdbx_pnl_alloc(MDBX_PNL_SIZE(env->me_reclaimed_pglist)); - if (likely(env->me_reclaimed_pglist)) - memcpy(env->me_reclaimed_pglist, 
ntxn->mnt_pgstate.mf_reclaimed_pglist, - size); - else - rc = MDBX_ENOMEM; - } - if (likely(!rc)) - rc = mdbx_cursor_shadow(parent, txn); - if (unlikely(rc)) + mdbx_tassert(parent, + parent->mt_parent || + parent->tw.dirtyroom + parent->tw.dirtylist->length == + MDBX_DPL_TXNFULL); + env->me_txn = txn; + rc = mdbx_cursor_shadow(parent, txn); + if (unlikely(rc != MDBX_SUCCESS)) mdbx_txn_end(txn, MDBX_END_FAIL_BEGINCHILD); } else { /* MDBX_RDONLY */ txn->mt_dbiseqs = env->me_dbiseqs; @@ -3582,8 +5318,9 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, if (txn != env->me_txn0) mdbx_free(txn); } else { - mdbx_assert(env, (txn->mt_flags & ~(MDBX_TXN_RDONLY | MDBX_TXN_WRITEMAP | - MDBX_SHRINK_ALLOWED)) == 0); + mdbx_assert(env, (txn->mt_flags & + ~(MDBX_RDONLY | MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | + MDBX_NOMETASYNC | MDBX_NOSYNC | MDBX_MAPASYNC)) == 0); txn->mt_signature = MDBX_MT_SIGNATURE; *ret = txn; mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO @@ -3596,22 +5333,136 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, return rc; } +int mdbx_txn_info(MDBX_txn *txn, MDBX_txn_info *info, int scan_rlt) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!info)) + return MDBX_EINVAL; + + MDBX_env *const env = txn->mt_env; +#if MDBX_TXN_CHECKPID + if (unlikely(env->me_pid != mdbx_getpid())) { + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; + } +#endif /* MDBX_TXN_CHECKPID */ + + info->txn_id = txn->mt_txnid; + info->txn_space_used = pgno2bytes(env, txn->mt_geo.next); + + if (txn->mt_flags & MDBX_RDONLY) { + const MDBX_meta *head_meta; + txnid_t head_txnid; + uint64_t head_retired; + do { + /* fetch info from volatile head */ + head_meta = mdbx_meta_head(env); + head_txnid = mdbx_meta_txnid_fluid(env, head_meta); + head_retired = head_meta->mm_pages_retired; + info->txn_space_limit_soft = pgno2bytes(env, 
head_meta->mm_geo.now); + info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper); + info->txn_space_leftover = + pgno2bytes(env, head_meta->mm_geo.now - head_meta->mm_geo.next); + mdbx_compiler_barrier(); + } while (unlikely(head_meta != mdbx_meta_head(env) || + head_txnid != mdbx_meta_txnid_fluid(env, head_meta))); + + info->txn_reader_lag = head_txnid - info->txn_id; + info->txn_space_dirty = info->txn_space_retired = 0; + if (txn->to.reader && + head_retired > txn->to.reader->mr_snapshot_pages_retired) { + info->txn_space_dirty = info->txn_space_retired = + pgno2bytes(env, (pgno_t)(head_retired - + txn->to.reader->mr_snapshot_pages_retired)); + + size_t retired_next_reader = 0; + MDBX_lockinfo *const lck = env->me_lck; + if (scan_rlt && info->txn_reader_lag > 1 && lck) { + /* find next more recent reader */ + txnid_t next_reader = head_txnid; + const unsigned snap_nreaders = lck->mti_numreaders; + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (lck->mti_readers[i].mr_pid) { + mdbx_jitter4testing(true); + const txnid_t snap_txnid = + safe64_read(&lck->mti_readers[i].mr_txnid); + const uint64_t snap_retired = + lck->mti_readers[i].mr_snapshot_pages_retired; + mdbx_compiler_barrier(); + if (unlikely(snap_retired != + lck->mti_readers[i].mr_snapshot_pages_retired) || + snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)) + goto retry; + if (snap_txnid <= txn->mt_txnid) { + retired_next_reader = 0; + break; + } + if (snap_txnid < next_reader) { + next_reader = snap_txnid; + retired_next_reader = pgno2bytes( + env, (pgno_t)(snap_retired - + txn->to.reader->mr_snapshot_pages_retired)); + } + } + } + } + info->txn_space_dirty = retired_next_reader; + } + } else { + info->txn_space_limit_soft = pgno2bytes(env, txn->mt_geo.now); + info->txn_space_limit_hard = pgno2bytes(env, txn->mt_geo.upper); + info->txn_space_retired = pgno2bytes( + env, txn->mt_child ? 
(unsigned)(uintptr_t)txn->tw.retired_pages + : MDBX_PNL_SIZE(txn->tw.retired_pages)); + info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); + info->txn_space_dirty = + pgno2bytes(env, MDBX_DPL_TXNFULL - txn->tw.dirtyroom); + info->txn_reader_lag = INT64_MAX; + MDBX_lockinfo *const lck = env->me_lck; + if (scan_rlt && lck) { + txnid_t oldest_snapshot = txn->mt_txnid; + const unsigned snap_nreaders = lck->mti_numreaders; + if (snap_nreaders) { + oldest_snapshot = mdbx_find_oldest(txn); + if (oldest_snapshot == txn->mt_txnid - 1) { + /* check if there is at least one reader */ + bool exists = false; + for (unsigned i = 0; i < snap_nreaders; ++i) { + if (lck->mti_readers[i].mr_pid && + txn->mt_txnid > safe64_read(&lck->mti_readers[i].mr_txnid)) { + exists = true; + break; + } + } + oldest_snapshot += !exists; + } + } + info->txn_reader_lag = txn->mt_txnid - oldest_snapshot; + } + } + + return MDBX_SUCCESS; +} + MDBX_env *mdbx_txn_env(MDBX_txn *txn) { - if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE || + txn->mt_env->me_signature != MDBX_ME_SIGNATURE)) return NULL; return txn->mt_env; } uint64_t mdbx_txn_id(MDBX_txn *txn) { if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return ~(txnid_t)0; + return 0; return txn->mt_txnid; } int mdbx_txn_flags(MDBX_txn *txn) { if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) return -1; - return txn->mt_flags; } @@ -3673,80 +5524,121 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO "/%" PRIaPGNO, names[mode & MDBX_END_OPMASK], txn->mt_txnid, - (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, + (txn->mt_flags & MDBX_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); mdbx_ensure(env, txn->mt_txnid >= /* paranoia is appropriate here */ *env->me_oldest); - if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { -#if defined(_WIN32) || defined(_WIN64) - if (txn->mt_flags & MDBX_SHRINK_ALLOWED) - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + + int rc = MDBX_SUCCESS; + if (F_ISSET(txn->mt_flags, MDBX_RDONLY)) { + if (txn->to.reader) { + MDBX_reader *slot = txn->to.reader; + mdbx_assert(env, slot->mr_pid == env->me_pid); + if (likely(!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED))) { + mdbx_assert(env, txn->mt_txnid == slot->mr_txnid.inconsistent && + slot->mr_txnid.inconsistent >= + env->me_lck->mti_oldest_reader); +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + mdbx_txn_valgrind(env, nullptr); #endif - if (txn->mt_ro_reader) { - mdbx_ensure(env, /* paranoia is appropriate here */ - txn->mt_txnid == txn->mt_ro_reader->mr_txnid && - txn->mt_ro_reader->mr_txnid >= - env->me_lck->mti_oldest_reader); - txn->mt_ro_reader->mr_snapshot_pages_used = 0; - txn->mt_ro_reader->mr_txnid = ~(txnid_t)0; - mdbx_memory_barrier(); - env->me_lck->mti_readers_refresh_flag = true; + slot->mr_snapshot_pages_used = 0; + safe64_reset(&slot->mr_txnid, false); + env->me_lck->mti_readers_refresh_flag = true; + mdbx_flush_incoherent_cpu_writeback(); + } else { + mdbx_assert(env, slot->mr_pid == env->me_pid); + mdbx_assert(env, + slot->mr_txnid.inconsistent >= SAFE64_INVALID_THRESHOLD); + } if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) - txn->mt_ro_reader->mr_pid = 0; - txn->mt_ro_reader = NULL; + slot->mr_pid = 0; + txn->to.reader = NULL; } } - mdbx_flush_noncoherent_cpu_writeback(); +#if defined(_WIN32) || defined(_WIN64) + if (txn->mt_flags & MDBX_SHRINK_ALLOWED) + mdbx_srwlock_ReleaseShared(&env->me_remap_guard); +#endif txn->mt_numdbs = 0; /* prevent further DBI activity */ - txn->mt_flags = MDBX_TXN_RDONLY | 
MDBX_TXN_FINISHED; + txn->mt_flags = MDBX_RDONLY | MDBX_TXN_FINISHED; txn->mt_owner = 0; } else if (!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED)) { +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + if (txn == env->me_txn0) + mdbx_txn_valgrind(env, nullptr); +#endif /* Export or close DBI handles created in this txn */ mdbx_dbis_update(txn, mode & MDBX_END_UPDATE); - - pgno_t *pghead = env->me_reclaimed_pglist; - if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ mdbx_cursors_eot(txn, 0); - if (!(env->me_flags & MDBX_WRITEMAP)) { + if (!(env->me_flags & MDBX_WRITEMAP)) mdbx_dlist_free(txn); - } - if (txn->mt_lifo_reclaimed) { - MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) = 0; - if (txn != env->me_txn0) { - mdbx_txl_free(txn->mt_lifo_reclaimed); - txn->mt_lifo_reclaimed = NULL; - } - } - txn->mt_numdbs = 0; txn->mt_flags = MDBX_TXN_FINISHED; txn->mt_owner = 0; - - if (!txn->mt_parent) { - mdbx_pnl_shrink(&txn->mt_befree_pages); - env->me_free_pgs = txn->mt_befree_pages; - /* me_pgstate: */ - env->me_reclaimed_pglist = NULL; - env->me_last_reclaimed = 0; - - env->me_txn = NULL; - + env->me_txn = txn->mt_parent; + if (txn == env->me_txn0) { + mdbx_assert(env, txn->mt_parent == NULL); + mdbx_pnl_shrink(&txn->tw.retired_pages); + mdbx_pnl_shrink(&txn->tw.reclaimed_pglist); /* The writer mutex was locked in mdbx_txn_begin. 
*/ mdbx_txn_unlock(env); } else { - txn->mt_parent->mt_child = NULL; - txn->mt_parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; - env->me_pgstate = ((MDBX_ntxn *)txn)->mnt_pgstate; - mdbx_pnl_free(txn->mt_befree_pages); - mdbx_pnl_free(txn->mt_spill_pages); - mdbx_free(txn->mt_rw_dirtylist); - } + mdbx_assert(env, txn->mt_parent != NULL); + mdbx_assert(env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); + MDBX_txn *const parent = txn->mt_parent; + env->me_txn->mt_child = NULL; + env->me_txn->mt_flags &= ~MDBX_TXN_HAS_CHILD; + mdbx_pnl_free(txn->tw.reclaimed_pglist); + mdbx_pnl_free(txn->tw.spill_pages); + + if (txn->tw.lifo_reclaimed) { + mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= + (unsigned)(uintptr_t)parent->tw.lifo_reclaimed); + MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = + (unsigned)(uintptr_t)parent->tw.lifo_reclaimed; + parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; + } + + if (txn->tw.retired_pages) { + mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >= + (unsigned)(uintptr_t)parent->tw.retired_pages); + MDBX_PNL_SIZE(txn->tw.retired_pages) = + (unsigned)(uintptr_t)parent->tw.retired_pages; + parent->tw.retired_pages = txn->tw.retired_pages; + } - mdbx_pnl_free(pghead); + mdbx_free(txn->tw.dirtylist); + + if (parent->mt_geo.upper != txn->mt_geo.upper || + parent->mt_geo.now != txn->mt_geo.now) { + /* undo resize performed by child txn */ + rc = mdbx_mapresize(env, parent->mt_next_pgno, parent->mt_geo.now, + parent->mt_geo.upper); + if (rc == MDBX_RESULT_TRUE) { + /* unable undo resize (it is regular for Windows), + * therefore promote size changes from child to the parent txn */ + mdbx_notice("unable undo resize performed by child txn, promote to " + "the parent (%u->%u, %u->%u)", + txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, + parent->mt_geo.upper); + parent->mt_geo.now = txn->mt_geo.now; + parent->mt_geo.upper = txn->mt_geo.upper; + rc = MDBX_SUCCESS; + } else if (unlikely(rc != MDBX_SUCCESS)) { + 
mdbx_error("error %d while undo resize performed by child txn, fail " + "the parent", + rc); + parent->mt_flags |= MDBX_TXN_ERROR; + if (!env->me_dxb_mmap.address) + env->me_flags |= MDBX_FATAL_ERROR; + } + } + } } mdbx_assert(env, txn == env->me_txn0 || txn->mt_owner == 0); @@ -3755,25 +5647,20 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { mdbx_free(txn); } - return MDBX_SUCCESS; + return rc; } int mdbx_txn_reset(MDBX_txn *txn) { - if (unlikely(!txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; /* This call is only valid for read-only txns */ - if (unlikely(!(txn->mt_flags & MDBX_TXN_RDONLY))) + if (unlikely((txn->mt_flags & MDBX_RDONLY) == 0)) return MDBX_EINVAL; - /* LY: don't close DBI-handles in MDBX mode */ - int rc = mdbx_txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); + /* LY: don't close DBI-handles */ + rc = mdbx_txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); if (rc == MDBX_SUCCESS) { mdbx_tassert(txn, txn->mt_signature == MDBX_MT_SIGNATURE); mdbx_tassert(txn, txn->mt_owner == 0); @@ -3782,17 +5669,12 @@ int mdbx_txn_reset(MDBX_txn *txn) { } int mdbx_txn_abort(MDBX_txn *txn) { - if (unlikely(!txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) - /* LY: don't close DBI-handles in MDBX mode */ + if (F_ISSET(txn->mt_flags, MDBX_RDONLY)) + /* LY: don't close DBI-handles */ return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE); @@ -3802,79 +5684,19 @@ int mdbx_txn_abort(MDBX_txn *txn) { return mdbx_txn_end(txn, 
MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); } -static __inline int mdbx_backlog_size(MDBX_txn *txn) { - int reclaimed_and_loose = - txn->mt_env->me_reclaimed_pglist - ? MDBX_PNL_SIZE(txn->mt_env->me_reclaimed_pglist) + - txn->mt_loose_count - : 0; - return reclaimed_and_loose; -} - -static __inline int mdbx_backlog_extragap(MDBX_env *env) { - /* LY: extra page(s) for b-tree rebalancing */ - return (env->me_flags & MDBX_LIFORECLAIM) ? 2 : 1; -} - -/* LY: Prepare a backlog of pages to modify FreeDB itself, - * while reclaiming is prohibited. It should be enough to prevent search - * in mdbx_page_alloc() during a deleting, when freeDB tree is unbalanced. */ -static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { - /* LY: extra page(s) for b-tree rebalancing */ - const int extra = - mdbx_backlog_extragap(txn->mt_env) + - MDBX_PNL_SIZEOF(txn->mt_befree_pages) / txn->mt_env->me_maxkey_limit; - - if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) { - mc->mc_flags &= ~C_RECLAIMING; - int rc = mdbx_cursor_touch(mc); - if (unlikely(rc)) - return rc; - - while (unlikely(mdbx_backlog_size(txn) < extra)) { - rc = mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC); - if (unlikely(rc)) { - if (rc != MDBX_NOTFOUND) - return rc; - break; - } - } - mc->mc_flags |= C_RECLAIMING; - } - - return MDBX_SUCCESS; -} - -static void mdbx_prep_backlog_data(MDBX_txn *txn, MDBX_cursor *mc, - size_t bytes) { - const int wanna = - (int)OVPAGES(txn->mt_env, bytes) + mdbx_backlog_extragap(txn->mt_env); - if (unlikely(wanna > mdbx_backlog_size(txn))) { - mc->mc_flags &= ~C_RECLAIMING; - do { - if (mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC) != MDBX_SUCCESS) - break; - } while (wanna > mdbx_backlog_size(txn)); - mc->mc_flags |= C_RECLAIMING; - } -} - /* Count all the pages in each DB and in the freelist and make sure * it matches the actual number of pages being used. * All named DBs must be open for a correct count. 
*/ -static __cold int mdbx_audit(MDBX_txn *txn, unsigned befree_stored) { - MDBX_val key, data; - - const pgno_t pending = - (txn->mt_flags & MDBX_RDONLY) - ? 0 - : txn->mt_loose_count + - (txn->mt_env->me_reclaimed_pglist - ? MDBX_PNL_SIZE(txn->mt_env->me_reclaimed_pglist) - : 0) + - (txn->mt_befree_pages - ? MDBX_PNL_SIZE(txn->mt_befree_pages) - befree_stored - : 0); +static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, + bool dont_filter_gc) { + pgno_t pending = 0; + if ((txn->mt_flags & MDBX_RDONLY) == 0) { + pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + + (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored) + + txn->tw.retired2parent_count; + for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) + pending += parent->tw.loose_count; + } MDBX_cursor_couple cx; int rc = mdbx_cursor_init(&cx.outer, txn, FREE_DBI); @@ -3882,10 +5704,28 @@ static __cold int mdbx_audit(MDBX_txn *txn, unsigned befree_stored) { return rc; pgno_t freecount = 0; - while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) + MDBX_val key, data; + while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { + if (!dont_filter_gc) { + if (unlikely(key.iov_len != sizeof(txnid_t))) + return MDBX_CORRUPTED; + txnid_t id = unaligned_peek_u64(4, key.iov_base); + if (txn->tw.lifo_reclaimed) { + for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); ++i) + if (id == txn->tw.lifo_reclaimed[i]) + goto skip; + } else if (id <= txn->tw.last_reclaimed) + goto skip; + } + freecount += *(pgno_t *)data.iov_base; + skip:; + } mdbx_tassert(txn, rc == MDBX_NOTFOUND); + for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) + txn->mt_dbflags[i] &= ~DB_AUDITED; + pgno_t count = 0; for (MDBX_dbi i = FREE_DBI; i <= MAIN_DBI; i++) { if (!(txn->mt_dbflags[i] & DB_VALID)) @@ -3893,27 +5733,36 @@ static __cold int mdbx_audit(MDBX_txn *txn, unsigned befree_stored) { rc = mdbx_cursor_init(&cx.outer, txn, i); if 
(unlikely(rc != MDBX_SUCCESS)) return rc; + txn->mt_dbflags[i] |= DB_AUDITED; if (txn->mt_dbs[i].md_root == P_INVALID) continue; count += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + txn->mt_dbs[i].md_overflow_pages; + if (i != MAIN_DBI) + continue; rc = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST); while (rc == MDBX_SUCCESS) { MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; - for (unsigned j = 0; j < NUMKEYS(mp); j++) { - MDBX_node *leaf = NODEPTR(mp, j); - if ((leaf->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_SUBDATA) { + for (unsigned j = 0; j < page_numkeys(mp); j++) { + MDBX_node *node = page_node(mp, j); + if (node_flags(node) == F_SUBDATA) { + if (unlikely(node_ds(node) < sizeof(MDBX_db))) + return MDBX_CORRUPTED; MDBX_db db_copy, *db; - memcpy(db = &db_copy, NODEDATA(leaf), sizeof(db_copy)); + memcpy(db = &db_copy, node_data(node), sizeof(db_copy)); if ((txn->mt_flags & MDBX_RDONLY) == 0) { for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) { - if ((txn->mt_dbflags[k] & MDBX_TBL_DIRTY) && + if ((txn->mt_dbflags[k] & DB_VALID) && /* txn->mt_dbxs[k].md_name.iov_len > 0 && */ - NODEKSZ(leaf) == txn->mt_dbxs[k].md_name.iov_len && - memcmp(NODEKEY(leaf), txn->mt_dbxs[k].md_name.iov_base, - NODEKSZ(leaf)) == 0) { - db = txn->mt_dbs + k; + node_ks(node) == txn->mt_dbxs[k].md_name.iov_len && + memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base, + node_ks(node)) == 0) { + txn->mt_dbflags[k] |= DB_AUDITED; + if (txn->mt_dbflags[k] & DB_DIRTY) { + mdbx_tassert(txn, (txn->mt_dbflags[k] & DB_STALE) == 0); + db = txn->mt_dbs + k; + } break; } } @@ -3927,18 +5776,33 @@ static __cold int mdbx_audit(MDBX_txn *txn, unsigned befree_stored) { mdbx_tassert(txn, rc == MDBX_NOTFOUND); } + for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) { + if ((txn->mt_dbflags[i] & (DB_VALID | DB_AUDITED | DB_STALE)) != DB_VALID) + continue; + if (F_ISSET(txn->mt_dbflags[i], DB_DIRTY | DB_CREAT)) { + count += txn->mt_dbs[i].md_branch_pages + 
txn->mt_dbs[i].md_leaf_pages + + txn->mt_dbs[i].md_overflow_pages; + } else { + mdbx_warning("audit %s@%" PRIaTXN + ": unable account dbi %d / \"%*s\", state 0x%02x", + txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, + (int)txn->mt_dbxs[i].md_name.iov_len, + (const char *)txn->mt_dbxs[i].md_name.iov_base, + txn->mt_dbflags[i]); + } + } + if (pending + freecount + count + NUM_METAS == txn->mt_next_pgno) return MDBX_SUCCESS; if ((txn->mt_flags & MDBX_RDONLY) == 0) mdbx_error("audit @%" PRIaTXN ": %u(pending) = %u(loose-count) + " - "%u(reclaimed-list) + %u(befree-pending) - %u(befree-stored)", - txn->mt_txnid, pending, txn->mt_loose_count, - txn->mt_env->me_reclaimed_pglist - ? MDBX_PNL_SIZE(txn->mt_env->me_reclaimed_pglist) - : 0, - txn->mt_befree_pages ? MDBX_PNL_SIZE(txn->mt_befree_pages) : 0, - befree_stored); + "%u(reclaimed-list) + %u(retired-pending) - %u(retired-stored) " + "+ %u(retired2parent)", + txn->mt_txnid, pending, txn->tw.loose_count, + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), + txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, + retired_stored, txn->tw.retired2parent_count); mdbx_error("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO "(free) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO "(next-pgno)", @@ -3947,20 +5811,70 @@ static __cold int mdbx_audit(MDBX_txn *txn, unsigned befree_stored) { return MDBX_PROBLEM; } -/* Cleanup reclaimed GC records, than save the befree-list as of this +static __inline unsigned backlog_size(MDBX_txn *txn) { + return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; +} + +static __inline unsigned gctree_backlog(MDBX_txn *txn) { + return /* for split upto root page */ txn->mt_dbs[FREE_DBI].md_depth + + /* for rebalance */ 2 + /* for grow */ 1; +} + +/* LY: Prepare a backlog of pages to modify GC itself, + * while reclaiming is prohibited. It should be enough to prevent search + * in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. 
*/ +static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor, + const size_t pnl_bytes) { + const unsigned linear = number_of_ovpages( + txn->mt_env, + pnl_bytes ? pnl_bytes : MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + const unsigned backlog = linear + gctree_backlog(txn); + + if (likely( + linear == 1 && + backlog_size(txn) > + (pnl_bytes + ? backlog + : backlog + /* for COW */ txn->mt_dbs[FREE_DBI].md_depth))) + return MDBX_SUCCESS; + + gc_cursor->mc_flags &= ~C_RECLAIMING; + + int err = mdbx_cursor_touch(gc_cursor); + if (err == MDBX_SUCCESS && linear > 1) + err = mdbx_page_alloc(gc_cursor, linear, nullptr, MDBX_ALLOC_ALL); + + while (err == MDBX_SUCCESS && backlog_size(txn) < backlog) + err = mdbx_page_alloc(gc_cursor, 1, NULL, MDBX_ALLOC_GC); + + gc_cursor->mc_flags |= C_RECLAIMING; + return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; +} + +static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) { + /* PNL is initially empty, zero out at least the length */ + memset(pnl.iov_base, 0, sizeof(pgno_t)); + if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) + /* zero out to avoid leaking values from uninitialized malloc'ed memory + * to the file in non-writemap mode if length of the saving page-list + * was changed during space reservation. */ + memset(pnl.iov_base, 0, pnl.iov_len); +} + +/* Cleanup reclaimed GC records, than save the retired-list as of this * transaction to GC (aka freeDB). This recursive changes the reclaimed-list - * loose-list and befree-list. Keep trying until it stabilizes. */ + * loose-list and retired-list. Keep trying until it stabilizes. */ static int mdbx_update_gc(MDBX_txn *txn) { - /* env->me_reclaimed_pglist[] can grow and shrink during this call. - * env->me_last_reclaimed and txn->mt_befree_pages[] can only grow. - * Page numbers cannot disappear from txn->mt_befree_pages[]. */ + /* txn->tw.reclaimed_pglist[] can grow and shrink during this call. 
+ * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. + * Page numbers cannot disappear from txn->tw.retired_pages[]. */ MDBX_env *const env = txn->mt_env; const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; const char *dbg_prefix_mode = lifo ? " lifo" : " fifo"; (void)dbg_prefix_mode; mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); - unsigned befree_stored = 0, loop = 0; + unsigned retired_stored = 0, loop = 0; MDBX_cursor mc; int rc = mdbx_cursor_init(&mc, txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) @@ -3971,8 +5885,12 @@ static int mdbx_update_gc(MDBX_txn *txn) { txn->mt_cursors[FREE_DBI] = &mc; retry: - mdbx_trace(" >> restart"); - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); + mdbx_trace("%s", " >> restart"); + mdbx_tassert( + txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + MDBX_DPL_TXNFULL); if (unlikely(/* paranoia */ ++loop > 42)) { mdbx_error("too more loops %u, bailout", loop); rc = MDBX_PROBLEM; @@ -3981,22 +5899,23 @@ retry: unsigned settled = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, filled_gc_slot = ~0u; - txnid_t cleaned_gc_id = 0, head_gc_id = env->me_last_reclaimed; - while (1) { - /* Come back here after each Put() in case befree-list changed */ + txnid_t cleaned_gc_id = 0, gc_rid = txn->tw.last_reclaimed; + while (true) { + /* Come back here after each Put() in case retired-list changed */ MDBX_val key, data; - mdbx_trace(" >> continue"); + mdbx_trace("%s", " >> continue"); - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - if (txn->mt_lifo_reclaimed) { - if (cleaned_gc_slot < MDBX_PNL_SIZE(txn->mt_lifo_reclaimed)) { + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); + if (txn->tw.lifo_reclaimed) { + if (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { settled = 0; cleaned_gc_slot = 
0; reused_gc_slot = 0; filled_gc_slot = ~0u; /* LY: cleanup reclaimed records. */ do { - cleaned_gc_id = txn->mt_lifo_reclaimed[++cleaned_gc_slot]; + cleaned_gc_id = txn->tw.lifo_reclaimed[++cleaned_gc_slot]; mdbx_tassert(txn, cleaned_gc_slot > 0 && cleaned_gc_id < *env->me_oldest); key.iov_base = &cleaned_gc_id; @@ -4006,7 +5925,7 @@ retry: continue; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - rc = mdbx_prep_backlog(txn, &mc); + rc = mdbx_prep_backlog(txn, &mc, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); @@ -4015,325 +5934,330 @@ retry: rc = mdbx_cursor_del(&mc, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } while (cleaned_gc_slot < MDBX_PNL_SIZE(txn->mt_lifo_reclaimed)); - mdbx_txl_sort(txn->mt_lifo_reclaimed); - head_gc_id = MDBX_PNL_LAST(txn->mt_lifo_reclaimed); + } while (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + mdbx_txl_sort(txn->tw.lifo_reclaimed); } } else { - /* If using records from freeDB which we have not yet deleted, + /* If using records from GC which we have not yet deleted, * now delete them and any we reserved for me_reclaimed_pglist. 
*/ - while (cleaned_gc_id < env->me_last_reclaimed) { + while (cleaned_gc_id <= txn->tw.last_reclaimed) { + gc_rid = cleaned_gc_id; + settled = 0; rc = mdbx_cursor_first(&mc, &key, NULL); - if (unlikely(rc != MDBX_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND) + break; goto bailout; - rc = mdbx_prep_backlog(txn, &mc); - if (unlikely(rc != MDBX_SUCCESS)) + } + if (unlikely(key.iov_len != sizeof(txnid_t))) { + rc = MDBX_CORRUPTED; + goto bailout; + } + cleaned_gc_id = unaligned_peek_u64(4, key.iov_base); + if (unlikely(cleaned_gc_id < 1 || + cleaned_gc_id >= SAFE64_INVALID_THRESHOLD)) { + rc = MDBX_CORRUPTED; goto bailout; - cleaned_gc_id = head_gc_id = *(txnid_t *)key.iov_base; - mdbx_tassert(txn, cleaned_gc_id <= env->me_last_reclaimed); + } + if (cleaned_gc_id > txn->tw.last_reclaimed) + break; + if (cleaned_gc_id < txn->tw.last_reclaimed) { + rc = mdbx_prep_backlog(txn, &mc, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + mdbx_tassert(txn, cleaned_gc_id <= txn->tw.last_reclaimed); mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, cleaned_gc_id); rc = mdbx_cursor_del(&mc, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - settled = 0; } } - // handle loose pages - put ones into the reclaimed- or befree-list - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + MDBX_DPL_TXNFULL); if (mdbx_audit_enabled()) { - rc = mdbx_audit(txn, befree_stored); + rc = mdbx_audit_ex(txn, retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - if (txn->mt_loose_pages) { - /* Return loose page numbers to me_reclaimed_pglist, - * though usually none are left at this point. - * The pages themselves remain in dirtylist. 
*/ - if (unlikely(!env->me_reclaimed_pglist) && !txn->mt_lifo_reclaimed && - env->me_last_reclaimed < 1) { - /* Put loose page numbers in mt_befree_pages, - * since unable to return them to me_reclaimed_pglist. */ - if (unlikely((rc = mdbx_pnl_need(&txn->mt_befree_pages, - txn->mt_loose_count)) != 0)) - goto bailout; - for (MDBX_page *mp = txn->mt_loose_pages; mp; mp = NEXT_LOOSE_PAGE(mp)) - mdbx_pnl_xappend(txn->mt_befree_pages, mp->mp_pgno); - mdbx_trace("%s: append %u loose-pages to befree-pages", dbg_prefix_mode, - txn->mt_loose_count); - } else { - /* Room for loose pages + temp PNL with same */ - if (likely(env->me_reclaimed_pglist != NULL)) { - rc = mdbx_pnl_need(&env->me_reclaimed_pglist, - 2 * txn->mt_loose_count + 2); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - MDBX_PNL loose = env->me_reclaimed_pglist + - MDBX_PNL_ALLOCLEN(env->me_reclaimed_pglist) - - txn->mt_loose_count - 1; - unsigned count = 0; - for (MDBX_page *mp = txn->mt_loose_pages; mp; - mp = NEXT_LOOSE_PAGE(mp)) - loose[++count] = mp->mp_pgno; - MDBX_PNL_SIZE(loose) = count; - mdbx_pnl_sort(loose); - mdbx_pnl_xmerge(env->me_reclaimed_pglist, loose); - } else { - env->me_reclaimed_pglist = mdbx_pnl_alloc(txn->mt_loose_count); - if (unlikely(env->me_reclaimed_pglist == NULL)) { - rc = MDBX_ENOMEM; - goto bailout; - } - for (MDBX_page *mp = txn->mt_loose_pages; mp; - mp = NEXT_LOOSE_PAGE(mp)) - mdbx_pnl_xappend(env->me_reclaimed_pglist, mp->mp_pgno); - mdbx_pnl_sort(env->me_reclaimed_pglist); - } - mdbx_trace("%s: append %u loose-pages to reclaimed-pages", - dbg_prefix_mode, txn->mt_loose_count); - } - - // filter-out list of dirty-pages from loose-pages - MDBX_DPL dl = txn->mt_rw_dirtylist; - mdbx_dpl_sort(dl); - unsigned left = dl->length; - for (MDBX_page *mp = txn->mt_loose_pages; mp;) { - mdbx_tassert(txn, mp->mp_pgno < txn->mt_next_pgno); - mdbx_ensure(env, mp->mp_pgno >= NUM_METAS); - - if (left > 0) { - const unsigned i = mdbx_dpl_search(dl, mp->mp_pgno); - if (i <= dl->length 
&& dl[i].pgno == mp->mp_pgno) { - mdbx_tassert(txn, i > 0 && dl[i].ptr != dl); - dl[i].ptr = dl /* mark for deletion */; - } - left -= 1; - } - - MDBX_page *dp = mp; - mp = NEXT_LOOSE_PAGE(mp); - if ((env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(env, dp, 1); - } - - if (left > 0) { - MDBX_DPL r, w, end = dl + dl->length; - for (r = w = dl + 1; r <= end; r++) { - if (r->ptr != dl) { - if (r != w) - *w = *r; - ++w; - } - } - mdbx_tassert(txn, w - dl == (int)left + 1); - } - - if (left != dl->length) - mdbx_trace("%s: filtered-out loose-pages from %u -> %u dirty-pages", - dbg_prefix_mode, dl->length, left); - dl->length = left; - txn->mt_loose_pages = NULL; - txn->mt_loose_count = 0; + /* return suitable into unallocated space */ + if (mdbx_refund(txn)) { + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); if (mdbx_audit_enabled()) { - rc = mdbx_audit(txn, befree_stored); + rc = mdbx_audit_ex(txn, retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } - // handle reclaimed pages - return suitable into unallocated space - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - if (env->me_reclaimed_pglist) { - pgno_t tail = txn->mt_next_pgno; - pgno_t *const begin = MDBX_PNL_BEGIN(env->me_reclaimed_pglist); - pgno_t *const end = MDBX_PNL_END(env->me_reclaimed_pglist); - pgno_t *higest; -#if MDBX_PNL_ASCENDING - for (higest = end; --higest >= begin;) { -#else - for (higest = begin; higest < end; ++higest) { -#endif /* MDBX_PNL sort-order */ - mdbx_tassert(txn, *higest >= NUM_METAS && *higest < tail); - if (*higest != tail - 1) - break; - tail -= 1; - } - if (tail != txn->mt_next_pgno) { -#if MDBX_PNL_ASCENDING - MDBX_PNL_SIZE(env->me_reclaimed_pglist) = - (unsigned)(higest + 1 - begin); -#else - MDBX_PNL_SIZE(env->me_reclaimed_pglist) -= (unsigned)(higest - begin); - for (pgno_t *move = begin; higest < end; ++move, ++higest) - *move = *higest; -#endif /* MDBX_PNL sort-order */ - 
mdbx_info( - "%s.refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, - dbg_prefix_mode, txn->mt_next_pgno - tail, tail, txn->mt_next_pgno); - txn->mt_next_pgno = tail; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - if (mdbx_audit_enabled()) { - rc = mdbx_audit(txn, befree_stored); - if (unlikely(rc != MDBX_SUCCESS)) + /* handle loose pages - put ones into the reclaimed- or retired-list */ + if (txn->tw.loose_pages) { + /* Return loose page numbers to me_reclaimed_pglist, + * though usually none are left at this point. + * The pages themselves remain in dirtylist. */ + if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { + if (txn->tw.loose_count > 0) { + /* Put loose page numbers in tw.retired_pages, + * since unable to return them to me_reclaimed_pglist. */ + if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, + txn->tw.loose_count)) != 0)) goto bailout; + for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) + mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + mdbx_trace("%s: append %u loose-pages to retired-pages", + dbg_prefix_mode, txn->tw.loose_count); } + } else { + /* Room for loose pages + temp PNL with same */ + rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, + 2 * txn->tw.loose_count + 2); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + MDBX_PNL loose = txn->tw.reclaimed_pglist + + MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) - + txn->tw.loose_count - 1; + unsigned count = 0; + for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { + mdbx_tassert(txn, mp->mp_flags == (P_LOOSE | P_DIRTY)); + loose[++count] = mp->mp_pgno; + } + mdbx_tassert(txn, count == txn->tw.loose_count); + MDBX_PNL_SIZE(loose) = count; + mdbx_pnl_sort(loose); + mdbx_pnl_xmerge(txn->tw.reclaimed_pglist, loose); + mdbx_trace("%s: append %u loose-pages to reclaimed-pages", + dbg_prefix_mode, txn->tw.loose_count); } - } - // handle befree-list - store ones into singe gc-record - if (befree_stored < 
MDBX_PNL_SIZE(txn->mt_befree_pages)) { - if (unlikely(!befree_stored)) { - /* Make sure last page of freeDB is touched and on befree-list */ + /* filter-out list of dirty-pages from loose-pages */ + const MDBX_DPL dl = txn->tw.dirtylist; + unsigned w = 0; + for (unsigned r = w; ++r <= dl->length;) { + MDBX_page *dp = dl[r].ptr; + mdbx_tassert(txn, (dp->mp_flags & P_DIRTY)); + mdbx_tassert(txn, dl[r].pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1) <= + txn->mt_next_pgno); + if ((dp->mp_flags & P_LOOSE) == 0) { + if (++w != r) + dl[w] = dl[r]; + } else { + mdbx_tassert(txn, dp->mp_flags == (P_LOOSE | P_DIRTY)); + if ((env->me_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(env, dp, 1); + } + } + mdbx_trace("%s: filtered-out loose-pages from %u -> %u dirty-pages", + dbg_prefix_mode, dl->length, w); + mdbx_tassert(txn, txn->tw.loose_count == dl->length - w); + dl->length = w; + dl->sorted = 0; + txn->tw.dirtyroom += txn->tw.loose_count; + txn->tw.loose_pages = NULL; + txn->tw.loose_count = 0; + } + + const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); + /* handle retired-list - store ones into single gc-record */ + if (retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { + if (unlikely(!retired_stored)) { + /* Make sure last page of GC is touched and on retired-list */ mc.mc_flags &= ~C_RECLAIMING; rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); mc.mc_flags |= C_RECLAIMING; if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } - /* Write to last page of freeDB */ + /* Write to last page of GC */ key.iov_len = sizeof(txn->mt_txnid); key.iov_base = &txn->mt_txnid; do { - data.iov_len = MDBX_PNL_SIZEOF(txn->mt_befree_pages); - mdbx_prep_backlog_data(txn, &mc, data.iov_len); + data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); + mdbx_prep_backlog(txn, &mc, data.iov_len); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - /* Retry if mt_befree_pages[] grew 
during the Put() */ - } while (data.iov_len < MDBX_PNL_SIZEOF(txn->mt_befree_pages)); + /* Retry if tw.retired_pages[] grew during the Put() */ + } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); - befree_stored = (unsigned)MDBX_PNL_SIZE(txn->mt_befree_pages); - mdbx_pnl_sort(txn->mt_befree_pages); - memcpy(data.iov_base, txn->mt_befree_pages, data.iov_len); + retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); + mdbx_pnl_sort(txn->tw.retired_pages); + mdbx_assert(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); - mdbx_trace("%s.put-befree #%u @ %" PRIaTXN, dbg_prefix_mode, - befree_stored, txn->mt_txnid); + mdbx_trace("%s.put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, + retired_stored, txn->mt_txnid); - if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { - unsigned i = befree_stored; + if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { + unsigned i = retired_stored; mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO " num %u, PNL", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO, txn->mt_befree_pages[i]); - mdbx_debug_extra_print("\n"); + mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); + mdbx_debug_extra_print("%s", "\n"); } + if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) + goto retry /* rare case, but avoids GC fragmentation and one loop. 
*/; continue; } - // handle reclaimed and loost pages - merge and store both into gc - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - mdbx_tassert(txn, txn->mt_loose_count == 0); + /* handle reclaimed and loost pages - merge and store both into gc */ + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); + mdbx_tassert(txn, txn->tw.loose_count == 0); - mdbx_trace(" >> reserving"); + mdbx_trace("%s", " >> reserving"); if (mdbx_audit_enabled()) { - rc = mdbx_audit(txn, befree_stored); + rc = mdbx_audit_ex(txn, retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - const unsigned amount = - env->me_reclaimed_pglist ? MDBX_PNL_SIZE(env->me_reclaimed_pglist) : 0; const unsigned left = amount - settled; mdbx_trace("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " "reused-gc-slots %u", dbg_prefix_mode, amount, settled, (int)left, - txn->mt_lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) + txn->tw.lifo_reclaimed + ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0, reused_gc_slot); if (0 >= (int)left) break; - if (unlikely(head_gc_id == 0)) { - head_gc_id = mdbx_find_oldest(txn) - 1; - if (txn->mt_lifo_reclaimed == NULL) { - rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_FIRST); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc != MDBX_NOTFOUND) - goto bailout; - } else if (unlikely(key.iov_len != sizeof(txnid_t))) { - rc = MDBX_CORRUPTED; - goto bailout; - } else { - txnid_t first_txn; - memcpy(&first_txn, key.iov_base, sizeof(txnid_t)); - if (head_gc_id >= first_txn) - head_gc_id = first_txn - 1; - } - } - } - const unsigned prefer_max_scatter = 257; txnid_t reservation_gc_id; if (lifo) { - mdbx_tassert(txn, txn->mt_lifo_reclaimed != NULL); - if (unlikely(!txn->mt_lifo_reclaimed)) { - txn->mt_lifo_reclaimed = mdbx_txl_alloc(); - if (unlikely(!txn->mt_lifo_reclaimed)) { + if (txn->tw.lifo_reclaimed == nullptr) { + txn->tw.lifo_reclaimed = mdbx_txl_alloc(); + if (unlikely(!txn->tw.lifo_reclaimed)) { rc = MDBX_ENOMEM; goto bailout; } } - - if (head_gc_id > 1 && - MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) < prefer_max_scatter && - left > ((unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) - + if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < + prefer_max_scatter && + left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot) * env->me_maxgc_ov1page) { + /* LY: need just a txn-id for save page list. 
*/ mc.mc_flags &= ~C_RECLAIMING; - rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); + bool need_cleanup = false; + do { + rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC); + if (likely(rc == MDBX_SUCCESS)) { + mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, + MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); + need_cleanup = true; + } + } while (rc == MDBX_SUCCESS && + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < + prefer_max_scatter && + left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - + reused_gc_slot) * + env->me_maxgc_ov1page); mc.mc_flags |= C_RECLAIMING; + if (likely(rc == MDBX_SUCCESS)) { - /* LY: ok, reclaimed from freedb. */ - mdbx_trace("%s: took @%" PRIaTXN " from GC, continue", - dbg_prefix_mode, MDBX_PNL_LAST(txn->mt_lifo_reclaimed)); + mdbx_trace("%s: got enough from GC.", dbg_prefix_mode); continue; - } - if (unlikely(rc != MDBX_NOTFOUND)) - /* LY: other troubles... */ + } else if (unlikely(rc != MDBX_NOTFOUND)) + /* LY: some troubles... */ goto bailout; - /* LY: freedb is empty, will look any free txn-id in high2low order. */ - do { - --head_gc_id; - mdbx_assert(env, - MDBX_PNL_IS_EMPTY(txn->mt_lifo_reclaimed) || - MDBX_PNL_LAST(txn->mt_lifo_reclaimed) > head_gc_id); - rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, head_gc_id); + if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { + if (need_cleanup) + mdbx_txl_sort(txn->tw.lifo_reclaimed); + gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); + } else { + mdbx_tassert(txn, txn->tw.last_reclaimed == 0); + txn->tw.last_reclaimed = gc_rid = mdbx_find_oldest(txn) - 1; + mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, + dbg_prefix_mode, gc_rid); + } + + /* LY: GC is empty, will look any free txn-id in high2low order. 
*/ + while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && + left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - + reused_gc_slot) * + env->me_maxgc_ov1page) { + if (unlikely(gc_rid < 2)) { + if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= + reused_gc_slot)) { + mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " + "lifo_reclaimed %u" PRIaTXN, + reused_gc_slot, + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + goto retry; + } + break; + } + + mdbx_tassert(txn, gc_rid > 1 && gc_rid < SAFE64_INVALID_THRESHOLD); + rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, --gc_rid); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - cleaned_gc_slot += 1 /* mark GC cleanup is not needed. */; + + if (reused_gc_slot) + /* rare case, but it is better to clear and re-create GC entries + * with less fragmentation. */ + need_cleanup = true; + else + cleaned_gc_slot += + 1 /* mark cleanup is not needed for added slot. */; mdbx_trace("%s: append @%" PRIaTXN " to lifo-reclaimed, cleaned-gc-slot = %u", - dbg_prefix_mode, head_gc_id, cleaned_gc_slot); - } while (head_gc_id > 1 && - MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) < prefer_max_scatter && - left > ((unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) - - reused_gc_slot) * - env->me_maxgc_ov1page); - } + dbg_prefix_mode, gc_rid, cleaned_gc_slot); + } - if ((unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) <= reused_gc_slot) { - mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " - "lifo_reclaimed %u" PRIaTXN, - reused_gc_slot, - (unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed)); - goto retry; + if (need_cleanup) { + cleaned_gc_slot = 0; + mdbx_trace("%s: restart inner-loop to clear and re-create GC entries", + dbg_prefix_mode); + continue; + } } + const unsigned i = - (unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) - reused_gc_slot; - mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->mt_lifo_reclaimed)); - reservation_gc_id = txn->mt_lifo_reclaimed[i]; + 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot; + mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + reservation_gc_id = txn->tw.lifo_reclaimed[i]; mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", dbg_prefix_mode, reservation_gc_id, i); } else { - mdbx_tassert(txn, txn->mt_lifo_reclaimed == NULL); - reservation_gc_id = head_gc_id--; + mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL); + if (unlikely(gc_rid == 0)) { + gc_rid = mdbx_find_oldest(txn) - 1; + rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_FIRST); + if (rc == MDBX_SUCCESS) { + if (unlikely(key.iov_len != sizeof(txnid_t))) { + rc = MDBX_CORRUPTED; + goto bailout; + } + txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + if (unlikely(gc_first < 1 || gc_first >= SAFE64_INVALID_THRESHOLD)) { + rc = MDBX_CORRUPTED; + goto bailout; + } + if (gc_rid >= gc_first) + gc_rid = gc_first - 1; + if (unlikely(gc_rid == 0)) { + mdbx_error("%s", "** no GC tail-space to store"); + goto retry; + } + } else if (rc != MDBX_NOTFOUND) + goto bailout; + txn->tw.last_reclaimed = gc_rid; + } + reservation_gc_id = gc_rid--; mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, reservation_gc_id); } @@ -4341,18 +6265,18 @@ retry: unsigned chunk = left; if (unlikely(chunk > env->me_maxgc_ov1page)) { - const unsigned avail_gs_slots = - txn->mt_lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) - + const unsigned avail_gc_slots = + txn->tw.lifo_reclaimed + ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot + 1 - : (head_gc_id < INT16_MAX) ? (unsigned)head_gc_id : INT16_MAX; - if (avail_gs_slots > 1) { + : (gc_rid < INT16_MAX) ? (unsigned)gc_rid : INT16_MAX; + if (avail_gc_slots > 1) { if (chunk < env->me_maxgc_ov1page * 2) chunk /= 2; else { const unsigned threshold = - env->me_maxgc_ov1page * ((avail_gs_slots < prefer_max_scatter) - ? avail_gs_slots + env->me_maxgc_ov1page * ((avail_gc_slots < prefer_max_scatter) + ? 
avail_gc_slots : prefer_max_scatter); if (left < threshold) chunk = env->me_maxgc_ov1page; @@ -4364,9 +6288,9 @@ retry: if (tail > avail) { for (unsigned i = amount - span; i > 0; --i) { if (MDBX_PNL_ASCENDING - ? (env->me_reclaimed_pglist[i] + span) - : (env->me_reclaimed_pglist[i] - span) == - env->me_reclaimed_pglist[i + span]) { + ? (txn->tw.reclaimed_pglist[i] + span) + : (txn->tw.reclaimed_pglist[i] - span) == + txn->tw.reclaimed_pglist[i + span]) { span += 1; avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / sizeof(pgno_t)) - @@ -4378,7 +6302,7 @@ retry: } chunk = (avail >= tail) ? tail - span - : (avail_gs_slots > 3 && + : (avail_gc_slots > 3 && reused_gc_slot < prefer_max_scatter - 3) ? avail - span : tail; @@ -4388,9 +6312,9 @@ retry: } mdbx_tassert(txn, chunk > 0); - mdbx_trace("%s: head_gc_id %" PRIaTXN ", reused_gc_slot %u, reservation-id " + mdbx_trace("%s: rc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " "%" PRIaTXN, - dbg_prefix_mode, head_gc_id, reused_gc_slot, reservation_gc_id); + dbg_prefix_mode, gc_rid, reused_gc_slot, reservation_gc_id); mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, env->me_maxgc_ov1page); @@ -4398,7 +6322,7 @@ retry: mdbx_tassert(txn, reservation_gc_id < *env->me_oldest); if (unlikely(reservation_gc_id < 1 || reservation_gc_id >= *env->me_oldest)) { - /* LY: not any txn in the past of freedb. 
*/ + mdbx_error("%s", "** internal error (reservation_gc_id)"); rc = MDBX_PROBLEM; goto bailout; } @@ -4408,22 +6332,22 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, settled + 1, settled + chunk + 1, reservation_gc_id); - mdbx_prep_backlog_data(txn, &mc, data.iov_len); + mdbx_prep_backlog(txn, &mc, data.iov_len); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - /* PNL is initially empty, zero out at least the length */ - memset(data.iov_base, 0, sizeof(pgno_t)); + clean_reserved_gc_pnl(env, data); settled += chunk; mdbx_trace("%s.settled %u (+%u), continue", dbg_prefix_mode, settled, chunk); - if (txn->mt_lifo_reclaimed && - unlikely(amount < MDBX_PNL_SIZE(env->me_reclaimed_pglist))) { + if (txn->tw.lifo_reclaimed && + unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount, - (unsigned)MDBX_PNL_SIZE(env->me_reclaimed_pglist)); + (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); goto retry; } @@ -4433,24 +6357,26 @@ retry: mdbx_tassert( txn, cleaned_gc_slot == - (txn->mt_lifo_reclaimed ? MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) : 0)); + (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); - mdbx_trace(" >> filling"); + mdbx_trace("%s", " >> filling"); /* Fill in the reserved records */ filled_gc_slot = - txn->mt_lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) - reused_gc_slot + txn->tw.lifo_reclaimed + ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot : reused_gc_slot; rc = MDBX_SUCCESS; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - if (env->me_reclaimed_pglist && MDBX_PNL_SIZE(env->me_reclaimed_pglist)) { + mdbx_tassert( + txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; - const unsigned amount = MDBX_PNL_SIZE(env->me_reclaimed_pglist); + const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); unsigned left = amount; - if (txn->mt_lifo_reclaimed == nullptr) { + if (txn->tw.lifo_reclaimed == nullptr) { mdbx_tassert(txn, lifo == 0); rc = mdbx_cursor_first(&mc, &key, &data); if (unlikely(rc != MDBX_SUCCESS)) @@ -4459,31 +6385,31 @@ retry: mdbx_tassert(txn, lifo != 0); } - while (1) { + while (true) { txnid_t fill_gc_id; mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left, - (unsigned)MDBX_PNL_SIZE(env->me_reclaimed_pglist)); - if (txn->mt_lifo_reclaimed == nullptr) { + (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + if (txn->tw.lifo_reclaimed == nullptr) { mdbx_tassert(txn, lifo == 0); - fill_gc_id = *(txnid_t *)key.iov_base; - if (filled_gc_slot-- == 0 || fill_gc_id > env->me_last_reclaimed) { + fill_gc_id = unaligned_peek_u64(4, key.iov_base); + if (filled_gc_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { mdbx_notice( "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN " > last_reclaimed %" PRIaTXN, - filled_gc_slot, fill_gc_id, env->me_last_reclaimed); + filled_gc_slot, fill_gc_id, txn->tw.last_reclaimed); goto retry; } } else { mdbx_tassert(txn, lifo != 0); if (++filled_gc_slot > - (unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed)) { + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > " 
"lifo_reclaimed %u" PRIaTXN, filled_gc_slot, - (unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed)); + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); goto retry; } - fill_gc_id = txn->mt_lifo_reclaimed[filled_gc_slot]; + fill_gc_id = txn->tw.lifo_reclaimed[filled_gc_slot]; mdbx_trace("%s.seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", dbg_prefix_mode, fill_gc_id, filled_gc_slot); key.iov_base = &fill_gc_id; @@ -4493,8 +6419,8 @@ retry: goto bailout; } mdbx_tassert(txn, cleaned_gc_slot == - (txn->mt_lifo_reclaimed - ? MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) + (txn->tw.lifo_reclaimed + ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); mdbx_tassert(txn, fill_gc_id > 0 && fill_gc_id < *env->me_oldest); key.iov_base = &fill_gc_id; @@ -4518,37 +6444,36 @@ retry: mc.mc_flags &= ~C_GCFREEZE; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + clean_reserved_gc_pnl(env, data); - if (unlikely(txn->mt_loose_count || - amount != MDBX_PNL_SIZE(env->me_reclaimed_pglist))) { - memset(data.iov_base, 0, sizeof(pgno_t)); + if (unlikely(txn->tw.loose_count || + amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { mdbx_notice("** restart: reclaimed-list changed (%u -> %u, %u)", amount, - MDBX_PNL_SIZE(env->me_reclaimed_pglist), - txn->mt_loose_count); + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), + txn->tw.loose_count); goto retry; } - if (unlikely(txn->mt_lifo_reclaimed - ? cleaned_gc_slot < MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) - : cleaned_gc_id < env->me_last_reclaimed)) { - memset(data.iov_base, 0, sizeof(pgno_t)); - mdbx_notice("** restart: reclaimed-slots changed"); + if (unlikely(txn->tw.lifo_reclaimed + ? 
cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : cleaned_gc_id < txn->tw.last_reclaimed)) { + mdbx_notice("%s", "** restart: reclaimed-slots changed"); goto retry; } pgno_t *dst = data.iov_base; *dst++ = chunk; - pgno_t *src = MDBX_PNL_BEGIN(env->me_reclaimed_pglist) + left - chunk; + pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk; memcpy(dst, src, chunk * sizeof(pgno_t)); pgno_t *from = src, *to = src + chunk; mdbx_trace("%s.fill: %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO "] @%" PRIaTXN, dbg_prefix_mode, chunk, - (unsigned)(from - env->me_reclaimed_pglist), from[0], - (unsigned)(to - env->me_reclaimed_pglist), to[-1], fill_gc_id); + (unsigned)(from - txn->tw.reclaimed_pglist), from[0], + (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], fill_gc_id); left -= chunk; if (mdbx_audit_enabled()) { - rc = mdbx_audit(txn, befree_stored + amount - left); + rc = mdbx_audit_ex(txn, retired_stored + amount - left, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -4557,7 +6482,7 @@ retry: break; } - if (txn->mt_lifo_reclaimed == nullptr) { + if (txn->tw.lifo_reclaimed == nullptr) { mdbx_tassert(txn, lifo == 0); rc = mdbx_cursor_next(&mc, &key, &data, MDBX_NEXT); if (unlikely(rc != MDBX_SUCCESS)) @@ -4569,33 +6494,43 @@ retry: } mdbx_tassert(txn, rc == MDBX_SUCCESS); - if (unlikely(txn->mt_loose_count != 0 || + if (unlikely(txn->tw.loose_count != 0 || filled_gc_slot != - (txn->mt_lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) + (txn->tw.lifo_reclaimed + ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0))) { mdbx_notice("** restart: reserve excess (filled-slot %u, loose-count %u)", - filled_gc_slot, txn->mt_loose_count); + filled_gc_slot, txn->tw.loose_count); goto retry; } mdbx_tassert(txn, - txn->mt_lifo_reclaimed == NULL || - cleaned_gc_slot == MDBX_PNL_SIZE(txn->mt_lifo_reclaimed)); + txn->tw.lifo_reclaimed == NULL || + cleaned_gc_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); bailout: txn->mt_cursors[FREE_DBI] = mc.mc_next; bailout_notracking: - if (txn->mt_lifo_reclaimed) { - MDBX_PNL_SIZE(txn->mt_lifo_reclaimed) = 0; - if (txn != env->me_txn0) { - mdbx_txl_free(txn->mt_lifo_reclaimed); - txn->mt_lifo_reclaimed = NULL; - } + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; + mdbx_trace("<<< %u loops, rc = %d", loop, rc); + return rc; +} + +static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov, + unsigned iov_items, size_t iov_off, + size_t iov_bytes) { + MDBX_env *const env = txn->mt_env; + int rc = mdbx_pwritev(env->me_fd, iov, iov_items, iov_off, iov_bytes); + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_error("Write error: %s", mdbx_strerror(rc)); + txn->mt_flags |= MDBX_TXN_ERROR; } - mdbx_trace("<<< %u loops, rc = %d", loop, rc); + for (unsigned i = 0; i < iov_items; i++) + mdbx_dpage_free(env, (MDBX_page *)iov[i].iov_base, + bytes2pgno(env, iov[i].iov_len)); + return rc; } @@ -4603,119 +6538,93 @@ bailout_notracking: * [in] txn the transaction that's being committed * [in] keep number of initial pages in dirtylist to keep dirty. * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) { - MDBX_env *env = txn->mt_env; - MDBX_DPL dl = txn->mt_rw_dirtylist; - unsigned i, j, pagecount = dl->length; - int rc; - size_t size = 0, pos = 0; - pgno_t pgno = 0; - MDBX_page *dp = NULL; +static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { struct iovec iov[MDBX_COMMIT_PAGES]; - intptr_t wpos = 0, wsize = 0; - size_t next_pos = 1; /* impossible pos, so pos != next_pos */ - int n = 0; - - j = i = keep; - - if (env->me_flags & MDBX_WRITEMAP) { - /* Clear dirty flags */ - while (++i <= pagecount) { - dp = dl[i].ptr; - /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE | P_KEEP)) { - dp->mp_flags &= ~P_KEEP; - dl[++j] = dl[i]; - continue; - } - dp->mp_flags &= ~P_DIRTY; - dp->mp_validator = 0 /* TODO */; - *env->me_unsynced_pages += IS_OVERFLOW(dp) ? dp->mp_pages : 1; + const MDBX_DPL dl = (keep || txn->tw.loose_count > 1) + ? mdbx_dpl_sort(txn->tw.dirtylist) + : txn->tw.dirtylist; + MDBX_env *const env = txn->mt_env; + pgno_t flush_begin = MAX_PAGENO; + pgno_t flush_end = MIN_PAGENO; + unsigned iov_items = 0; + size_t iov_bytes = 0; + size_t iov_off = 0; + unsigned r, w; + for (r = w = keep; ++r <= dl->length;) { + MDBX_page *dp = dl[r].ptr; + mdbx_tassert(txn, + dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); + mdbx_tassert(txn, dp->mp_flags & P_DIRTY); + + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE | P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[++w] = dl[r]; + continue; } - goto done; - } - - /* Write the pages */ - for (;;) { - if (++i <= pagecount) { - dp = dl[i].ptr; - /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE | P_KEEP)) { - dp->mp_flags &= ~P_KEEP; - dl[i].pgno = 0; - continue; - } - pgno = dl[i].pgno; - mdbx_tassert(txn, pgno >= MIN_PAGENO); - /* clear dirty flag */ - dp->mp_flags &= ~P_DIRTY; - dp->mp_validator = 0 /* TODO */; - pos = pgno2bytes(env, pgno); - const unsigned npages = IS_OVERFLOW(dp) ? 
dp->mp_pages : 1; - *env->me_unsynced_pages += npages; - size = pgno2bytes(env, npages); - } - /* Write up to MDBX_COMMIT_PAGES dirty pages at a time. */ - if (pos != next_pos || n == MDBX_COMMIT_PAGES || wsize + size > MAX_WRITE) { - if (n) { - /* Write previous page(s) */ - rc = mdbx_pwritev(env->me_fd, iov, n, wpos, wsize); - if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_debug("Write error: %s", mdbx_strerror(rc)); - return rc; - } -#if MDBX_CPU_CACHE_MMAP_NONCOHERENT + const unsigned npages = IS_OVERFLOW(dp) ? dp->mp_pages : 1; + flush_begin = (flush_begin < dp->mp_pgno) ? flush_begin : dp->mp_pgno; + flush_end = + (flush_end > dp->mp_pgno + npages) ? flush_end : dp->mp_pgno + npages; + *env->me_unsynced_pages += npages; + dp->mp_flags &= ~P_DIRTY; + dp->mp_validator = 0 /* TODO */; + + if ((env->me_flags & MDBX_WRITEMAP) == 0) { + const size_t size = pgno2bytes(env, npages); + if (iov_off + iov_bytes != pgno2bytes(env, dp->mp_pgno) || + iov_items == ARRAY_LENGTH(iov) || iov_bytes + size > MAX_WRITE) { + if (iov_items) { + int rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; #if defined(__linux__) || defined(__gnu_linux__) - if (linux_kernel_version >= 0x02060b00) - /* Linux kernels older than version 2.6.11 ignore the addr and nbytes - * arguments, making this function fairly expensive. Therefore, the - * whole cache is always flushed. */ + if (mdbx_linux_kernel_version >= 0x02060b00) + /* Linux kernels older than version 2.6.11 ignore the addr and nbytes + * arguments, making this function fairly expensive. Therefore, the + * whole cache is always flushed. 
*/ #endif /* Linux */ - mdbx_invalidate_mmap_noncoherent_cache(env->me_map + wpos, wsize); -#endif /* MDBX_CPU_CACHE_MMAP_NONCOHERENT */ - - n = 0; + mdbx_flush_incoherent_mmap(env->me_map + iov_off, iov_bytes, + env->me_os_psize); + iov_items = 0; + iov_bytes = 0; + } + iov_off = pgno2bytes(env, dp->mp_pgno); } - if (i > pagecount) - break; - wpos = pos; - wsize = 0; + iov[iov_items].iov_base = (void *)dp; + iov[iov_items].iov_len = size; + iov_items += 1; + iov_bytes += size; } - mdbx_debug("committing page %" PRIaPGNO, pgno); - next_pos = pos + size; - iov[n].iov_len = size; - iov[n].iov_base = (char *)dp; - wsize += size; - n++; } -#if MDBX_CPU_CACHE_MMAP_NONCOHERENT && \ - (defined(__linux__) || defined(__gnu_linux__)) - if (linux_kernel_version < 0x02060b00) { - /* Linux kernels older than version 2.6.11 ignore the addr and nbytes - * arguments, making this function fairly expensive. Therefore, the whole - * cache is always flushed. */ - mdbx_invalidate_mmap_noncoherent_cache(env->me_map, - pgno2bytes(env, txn->mt_next_pgno)); - } -#endif /* MDBX_CPU_CACHE_MMAP_NONCOHERENT && Linux */ - - for (i = keep; ++i <= pagecount;) { - dp = dl[i].ptr; - /* This is a page we skipped above */ - if (!dl[i].pgno) { - dl[++j] = dl[i]; - dl[j].pgno = dp->mp_pgno; - continue; - } - mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? dp->mp_pages : 1); + if (iov_items) { + int rc = mdbx_flush_iov(txn, iov, iov_items, iov_off, iov_bytes); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } -done: - i--; - txn->mt_dirtyroom += i - j; - dl->length = j; +#if defined(__linux__) || defined(__gnu_linux__) + if ((env->me_flags & MDBX_WRITEMAP) == 0 && + mdbx_linux_kernel_version < 0x02060b00) + /* Linux kernels older than version 2.6.11 ignore the addr and nbytes + * arguments, making this function fairly expensive. Therefore, the + * whole cache is always flushed. 
*/ + mdbx_flush_incoherent_mmap(env->me_map + pgno2bytes(env, flush_begin), + pgno2bytes(env, flush_end - flush_begin), + env->me_os_psize); +#endif /* Linux */ + + /* TODO: use flush_begin & flush_end for msync() & sync_file_range(). */ + (void)flush_begin; + (void)flush_end; + + txn->tw.dirtyroom += r - 1 - w; + dl->length = w; + mdbx_tassert(txn, txn->mt_parent || + txn->tw.dirtyroom + txn->tw.dirtylist->length == + MDBX_DPL_TXNFULL); return MDBX_SUCCESS; } @@ -4758,16 +6667,9 @@ static __inline bool TXN_DBI_EXIST(MDBX_txn *txn, MDBX_dbi dbi, } int mdbx_txn_commit(MDBX_txn *txn) { - int rc; - - if (unlikely(txn == NULL)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; MDBX_env *env = txn->mt_env; #if MDBX_TXN_CHECKPID @@ -4777,58 +6679,54 @@ int mdbx_txn_commit(MDBX_txn *txn) { } #endif /* MDBX_TXN_CHECKPID */ + /* mdbx_txn_end() mode for a commit which writes nothing */ + unsigned end_mode = + MDBX_END_EMPTY_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; + if (unlikely(F_ISSET(txn->mt_flags, MDBX_RDONLY))) + goto done; + if (txn->mt_child) { rc = mdbx_txn_commit(txn->mt_child); - txn->mt_child = NULL; + mdbx_tassert(txn, txn->mt_child == NULL); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - /* mdbx_txn_end() mode for a commit which writes nothing */ - unsigned end_mode = - MDBX_END_EMPTY_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) - goto done; - - if (unlikely(txn->mt_flags & (MDBX_TXN_FINISHED | MDBX_TXN_ERROR))) { - mdbx_debug("error flag is set, can't commit"); - if (txn->mt_parent) - txn->mt_parent->mt_flags |= MDBX_TXN_ERROR; - rc = MDBX_BAD_TXN; + if (unlikely(txn != env->me_txn)) { + mdbx_debug("%s", "attempt to commit 
unknown transaction"); + rc = MDBX_EINVAL; goto fail; } if (txn->mt_parent) { - MDBX_txn *parent = txn->mt_parent; - MDBX_page **lp; - MDBX_DPL dst, src; - MDBX_PNL pspill; - unsigned i, x, y, len, ps_len; - - /* Append our reclaim list to parent's */ - if (txn->mt_lifo_reclaimed) { - if (parent->mt_lifo_reclaimed) { - rc = mdbx_txl_append_list(&parent->mt_lifo_reclaimed, - txn->mt_lifo_reclaimed); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - mdbx_txl_free(txn->mt_lifo_reclaimed); - } else - parent->mt_lifo_reclaimed = txn->mt_lifo_reclaimed; - txn->mt_lifo_reclaimed = NULL; + MDBX_txn *const parent = txn->mt_parent; + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + + /* Preserve space for spill list to avoid parent's state corruption + * if allocation fails. */ + if (txn->tw.spill_pages && parent->tw.spill_pages) { + rc = mdbx_pnl_need(&parent->tw.spill_pages, + MDBX_PNL_SIZE(txn->tw.spill_pages)); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; } - /* Append our free list to parent's */ - rc = mdbx_pnl_append_list(&parent->mt_befree_pages, txn->mt_befree_pages); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - mdbx_pnl_free(txn->mt_befree_pages); - /* Failures after this must either undo the changes - * to the parent or set MDBX_TXN_ERROR in the parent. 
*/ + //------------------------------------------------------------------------- + + parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; + txn->tw.lifo_reclaimed = NULL; + + parent->tw.retired_pages = txn->tw.retired_pages; + txn->tw.retired_pages = NULL; + + mdbx_pnl_free(parent->tw.reclaimed_pglist); + parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist; + txn->tw.reclaimed_pglist = NULL; + parent->tw.last_reclaimed = txn->tw.last_reclaimed; - parent->mt_next_pgno = txn->mt_next_pgno; - parent->mt_end_pgno = txn->mt_end_pgno; - parent->mt_flags = txn->mt_flags; + parent->mt_geo = txn->mt_geo; + parent->mt_canary = txn->mt_canary; + parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY; /* Merge our cursors into parent's and close them */ mdbx_cursors_eot(txn, 1); @@ -4838,123 +6736,229 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_numdbs = txn->mt_numdbs; parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; - for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { /* preserve parent's DB_NEW status */ parent->mt_dbflags[i] = txn->mt_dbflags[i] | (parent->mt_dbflags[i] & (DB_CREAT | DB_FRESH)); } - dst = parent->mt_rw_dirtylist; - src = txn->mt_rw_dirtylist; + /* Remove refunded pages from parent's dirty & spill lists */ + MDBX_DPL dst = mdbx_dpl_sort(parent->tw.dirtylist); + while (dst->length && dst[dst->length].pgno >= parent->mt_next_pgno) { + MDBX_page *mp = dst[dst->length].ptr; + if (mp && (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(txn->mt_env, mp, IS_OVERFLOW(mp) ? 
mp->mp_pages : 1); + dst->length -= 1; + } + parent->tw.dirtyroom += dst->sorted - dst->length; + dst->sorted = dst->length; + mdbx_tassert(parent, + parent->mt_parent || + parent->tw.dirtyroom + parent->tw.dirtylist->length == + MDBX_DPL_TXNFULL); + + if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0 && + MDBX_PNL_MOST(parent->tw.spill_pages) >= parent->mt_next_pgno << 1) { + const MDBX_PNL ps = parent->tw.spill_pages; +#if MDBX_PNL_ASCENDING + unsigned i = MDBX_PNL_SIZE(ps); + assert(MDBX_PNL_MOST(ps) == MDBX_PNL_LAST(ps)); + do + i -= 1; + while (i && ps[i] >= parent->mt_next_pgno << 1); + MDBX_PNL_SIZE(ps) = i; +#else + assert(MDBX_PNL_MOST(ps) == MDBX_PNL_FIRST(ps)); + unsigned i = 1, len = MDBX_PNL_SIZE(ps); + while (i < len && ps[i + 1] >= parent->mt_next_pgno << 1) + ++i; + MDBX_PNL_SIZE(ps) = len -= i; + for (unsigned k = 1; k <= len; ++k) + ps[k] = ps[k + i]; +#endif + } + /* Remove anything in our dirty list from parent's spill list */ - if ((pspill = parent->mt_spill_pages) && (ps_len = MDBX_PNL_SIZE(pspill))) { - x = y = ps_len; - MDBX_PNL_SIZE(pspill) = ~(pgno_t)0; + MDBX_DPL src = mdbx_dpl_sort(txn->tw.dirtylist); + if (likely(src->length > 0) && parent->tw.spill_pages && + MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) { + MDBX_PNL sp = parent->tw.spill_pages; + assert(mdbx_pnl_check4assert(sp, txn->mt_next_pgno)); + + const unsigned len = MDBX_PNL_SIZE(parent->tw.spill_pages); + MDBX_PNL_SIZE(sp) = ~(pgno_t)0; + /* Mark our dirty pages as deleted in parent spill list */ - for (i = 0, len = src->length; ++i <= len;) { + unsigned r, w, i = 1; + w = r = len; + do { pgno_t pn = src[i].pgno << 1; - while (pn > pspill[x]) - x--; - if (pn == pspill[x]) { - pspill[x] = 1; - y = --x; + while (pn > sp[r]) + r--; + if (pn == sp[r]) { + sp[r] = 1; + w = --r; } - } + } while (++i <= src->length); + /* Squash deleted pagenums if we deleted any */ - for (x = y; ++x <= ps_len;) - if ((pspill[x] & 1) == 0) - pspill[++y] = pspill[x]; - 
MDBX_PNL_SIZE(pspill) = y; + for (r = w; ++r <= len;) + if ((sp[r] & 1) == 0) + sp[++w] = sp[r]; + MDBX_PNL_SIZE(sp) = w; + assert(mdbx_pnl_check4assert(sp, txn->mt_next_pgno << 1)); } /* Remove anything in our spill list from parent's dirty list */ - if (txn->mt_spill_pages && MDBX_PNL_SIZE(txn->mt_spill_pages)) { - for (i = 1; i <= MDBX_PNL_SIZE(txn->mt_spill_pages); i++) { - pgno_t pn = txn->mt_spill_pages[i]; - if (pn & 1) - continue; /* deleted spillpg */ - pn >>= 1; - y = mdbx_dpl_search(dst, pn); - if (y <= dst->length && dst[y].pgno == pn) { - mdbx_free(dst[y].ptr); - while (y < dst->length) { - dst[y] = dst[y + 1]; - y++; - } - dst->length--; + if (txn->tw.spill_pages && MDBX_PNL_SIZE(txn->tw.spill_pages) > 0) { + const MDBX_PNL sp = txn->tw.spill_pages; + mdbx_pnl_sort(sp); + /* Scanning in ascend order */ + const int step = MDBX_PNL_ASCENDING ? 1 : -1; + const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(sp); + const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sp) + 1 : 0; + mdbx_tassert(txn, sp[begin] <= sp[end - step]); + + unsigned r, w = r = mdbx_dpl_search(dst, sp[begin] >> 1); + mdbx_tassert(txn, dst->sorted == dst->length); + for (int i = begin; r <= dst->length;) { + mdbx_tassert(txn, (sp[i] & 1) == 0); + const pgno_t pgno = sp[i] >> 1; + if (dst[r].pgno < pgno) { + dst[w++] = dst[r++]; + } else if (dst[r].pgno > pgno) { + i += step; + if (i == end) + while (r <= dst->length) + dst[w++] = dst[r++]; + } else { + MDBX_page *dp = dst[r++].ptr; + if ((env->me_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? 
dp->mp_pages : 1); } } - } - - /* Find len = length of merging our dirty list with parent's */ - x = dst->length; - dst->length = 0; /* simplify loops */ - if (parent->mt_parent) { - len = x + src->length; - y = mdbx_dpl_search(src, dst[x].pgno + 1) - 1; - for (i = x; y && i; y--) { - pgno_t yp = src[y].pgno; - while (yp < dst[i].pgno) - i--; - if (yp == dst[i].pgno) { - i--; - len--; - } + mdbx_tassert(txn, r == dst->length + 1); + dst->length = w; + parent->tw.dirtyroom += r - w; + } + assert(dst->sorted == dst->length); + mdbx_tassert(parent, + parent->mt_parent || + parent->tw.dirtyroom + parent->tw.dirtylist->length == + MDBX_DPL_TXNFULL); + + unsigned d, s, l; + /* Find length of merging our dirty list with parent's */ + for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0; ++l) { + const pgno_t s_pgno = src[s].pgno; + const pgno_t d_pgno = dst[d].pgno; + d -= d_pgno >= s_pgno; + s -= d_pgno <= s_pgno; + } + assert(dst->sorted == dst->length); + dst->sorted = l += d + s; + assert(dst->sorted >= dst->length); + parent->tw.dirtyroom -= dst->sorted - dst->length; + + /* Merge our dirty list into parent's */ + for (d = dst->length, s = src->length; d > 0 && s > 0; --l) { + if (dst[d].pgno > src[s].pgno) + dst[l] = dst[d--]; + else if (dst[d].pgno < src[s].pgno) + dst[l] = src[s--]; + else { + MDBX_page *dp = dst[d--].ptr; + if (dp && (env->me_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(env, dp, IS_OVERFLOW(dp) ? 
dp->mp_pgno : 1); + dst[l] = src[s--]; } - } else { /* Simplify the above for single-ancestor case */ - len = MDBX_DPL_TXNFULL - txn->mt_dirtyroom; - } - /* Merge our dirty list with parent's */ - y = src->length; - for (i = len; y; dst[i--] = src[y--]) { - pgno_t yp = src[y].pgno; - while (yp < dst[x].pgno) - dst[i--] = dst[x--]; - if (yp == dst[x].pgno) - mdbx_free(dst[x--].ptr); - } - mdbx_tassert(txn, i == x); - dst->length = len; - mdbx_free(txn->mt_rw_dirtylist); - parent->mt_dirtyroom = txn->mt_dirtyroom; - if (txn->mt_spill_pages) { - if (parent->mt_spill_pages) { - /* TODO: Prevent failure here, so parent does not fail */ - rc = mdbx_pnl_append_list(&parent->mt_spill_pages, txn->mt_spill_pages); - if (unlikely(rc != MDBX_SUCCESS)) - parent->mt_flags |= MDBX_TXN_ERROR; - mdbx_pnl_free(txn->mt_spill_pages); - mdbx_pnl_sort(parent->mt_spill_pages); + } + if (s) { + do + dst[l--] = src[s--]; + while (s > 0); + } else if (d) { + do + dst[l--] = dst[d--]; + while (d > 0); + } + assert(l == 0); + dst->length = dst->sorted; + mdbx_free(txn->tw.dirtylist); + txn->tw.dirtylist = nullptr; + mdbx_tassert(parent, + parent->mt_parent || + parent->tw.dirtyroom + parent->tw.dirtylist->length == + MDBX_DPL_TXNFULL); + + if (txn->tw.spill_pages) { + if (parent->tw.spill_pages) { + /* Must not fail since space was preserved above. 
*/ + rc = mdbx_pnl_append_list(&parent->tw.spill_pages, txn->tw.spill_pages); + mdbx_assert(env, rc == MDBX_SUCCESS); + (void)rc; + mdbx_pnl_free(txn->tw.spill_pages); + mdbx_pnl_sort(parent->tw.spill_pages); } else { - parent->mt_spill_pages = txn->mt_spill_pages; + parent->tw.spill_pages = txn->tw.spill_pages; } } + parent->mt_flags &= ~(MDBX_TXN_SPILLS | MDBX_TXN_HAS_CHILD); + if (parent->tw.spill_pages) { + assert(mdbx_pnl_check4assert(parent->tw.spill_pages, + parent->mt_next_pgno << 1)); + if (MDBX_PNL_SIZE(parent->tw.spill_pages)) + parent->mt_flags |= MDBX_TXN_SPILLS; + } + /* Append our loose page list to parent's */ - for (lp = &parent->mt_loose_pages; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) - ; - *lp = txn->mt_loose_pages; - parent->mt_loose_count += txn->mt_loose_count; + if (txn->tw.loose_pages) { + MDBX_page **lp = &parent->tw.loose_pages; + while (*lp) + lp = &(*lp)->mp_next; + *lp = txn->tw.loose_pages; + parent->tw.loose_count += txn->tw.loose_count; + } + if (txn->tw.retired2parent_pages) { + MDBX_page *mp = txn->tw.retired2parent_pages; + do { + MDBX_page *next = mp->mp_next; + rc = mdbx_page_loose(parent, mp); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + mp = next; + } while (mp); + } + env->me_txn = parent; parent->mt_child = NULL; - mdbx_pnl_free(((MDBX_ntxn *)txn)->mnt_pgstate.mf_reclaimed_pglist); txn->mt_signature = 0; mdbx_free(txn); - return rc; - } + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - if (unlikely(txn != env->me_txn)) { - mdbx_debug("attempt to commit unknown transaction"); - rc = MDBX_EINVAL; - goto fail; + /* Scan parent's loose page for suitable for refund */ + for (MDBX_page *mp = parent->tw.loose_pages; mp; mp = mp->mp_next) { + if (mp->mp_pgno == parent->mt_next_pgno - 1) { + mdbx_refund(parent); + break; + } + } + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + return MDBX_SUCCESS; } + mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + MDBX_DPL_TXNFULL); mdbx_cursors_eot(txn, 0); end_mode 
|= MDBX_END_EOTDONE; - if (txn->mt_rw_dirtylist->length == 0 && - !(txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS))) + if (txn->tw.dirtylist->length == 0 && + (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { + for (int i = txn->mt_numdbs; --i >= 0;) + mdbx_tassert(txn, (txn->mt_dbflags[i] & DB_DIRTY) == 0); goto done; + } mdbx_debug("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO "/%" PRIaPGNO, @@ -4964,20 +6968,21 @@ int mdbx_txn_commit(MDBX_txn *txn) { /* Update DB root pointers */ if (txn->mt_numdbs > CORE_DBS) { MDBX_cursor mc; - MDBX_dbi i; MDBX_val data; data.iov_len = sizeof(MDBX_db); rc = mdbx_cursor_init(&mc, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + for (MDBX_dbi i = CORE_DBS; i < txn->mt_numdbs; i++) { if (txn->mt_dbflags[i] & DB_DIRTY) { if (unlikely(TXN_DBI_CHANGED(txn, i))) { rc = MDBX_BAD_DBI; goto fail; } - data.iov_base = &txn->mt_dbs[i]; + MDBX_db *db = &txn->mt_dbs[i]; + db->md_mod_txnid = txn->mt_txnid; + data.iov_base = db; WITH_CURSOR_TRACKING(mc, rc = mdbx_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, F_SUBDATA)); @@ -4991,28 +6996,26 @@ int mdbx_txn_commit(MDBX_txn *txn) { if (unlikely(rc != MDBX_SUCCESS)) goto fail; - mdbx_pnl_free(env->me_reclaimed_pglist); - env->me_reclaimed_pglist = NULL; - mdbx_pnl_shrink(&txn->mt_befree_pages); - if (mdbx_audit_enabled()) { - rc = mdbx_audit(txn, 0); + rc = mdbx_audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } rc = mdbx_page_flush(txn, 0); if (likely(rc == MDBX_SUCCESS)) { - MDBX_meta meta, *head = mdbx_meta_head(env); + if (txn->mt_dbs[MAIN_DBI].md_flags & DB_DIRTY) + txn->mt_dbs[MAIN_DBI].md_mod_txnid = txn->mt_txnid; + MDBX_meta meta, *head = mdbx_meta_head(env); meta.mm_magic_and_version = head->mm_magic_and_version; meta.mm_extra_flags = head->mm_extra_flags; meta.mm_validator_id = head->mm_validator_id; meta.mm_extra_pagehdr = 
head->mm_extra_pagehdr; + meta.mm_pages_retired = + head->mm_pages_retired + MDBX_PNL_SIZE(txn->tw.retired_pages); - meta.mm_geo = head->mm_geo; - meta.mm_geo.next = txn->mt_next_pgno; - meta.mm_geo.now = txn->mt_end_pgno; + meta.mm_geo = txn->mt_geo; meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_canary = txn->mt_canary; @@ -5038,28 +7041,209 @@ fail: return rc; } +static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, + uint64_t *filesize, + const MDBX_page *const page, + const unsigned meta_number, + MDBX_meta *dest, + const unsigned guess_pagesize) { + if (meta->mm_magic_and_version != MDBX_DATA_MAGIC && + meta->mm_magic_and_version != MDBX_DATA_MAGIC_DEVEL) { + mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, + meta->mm_magic_and_version); + return ((meta->mm_magic_and_version >> 8) != MDBX_MAGIC) + ? MDBX_INVALID + : MDBX_VERSION_MISMATCH; + } + + if (page->mp_pgno != meta_number) { + mdbx_error("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, + page->mp_pgno); + return MDBX_INVALID; + } + + if (page->mp_flags != P_META) { + mdbx_error("page #%u not a meta-page", meta_number); + return MDBX_INVALID; + } + + /* LY: check pagesize */ + if (!is_powerof2(meta->mm_psize) || meta->mm_psize < MIN_PAGESIZE || + meta->mm_psize > MAX_PAGESIZE) { + mdbx_notice("meta[%u] has invalid pagesize (%u), skip it", meta_number, + meta->mm_psize); + return is_powerof2(meta->mm_psize) ? 
MDBX_VERSION_MISMATCH : MDBX_INVALID; + } + + if (dest && meta_number == 0 && guess_pagesize != meta->mm_psize) { + dest->mm_psize = meta->mm_psize; + mdbx_verbose("meta[%u] took pagesize %u", meta_number, meta->mm_psize); + } + + if (safe64_read(&meta->mm_txnid_a) != safe64_read(&meta->mm_txnid_b)) { + mdbx_warning("meta[%u] not completely updated, skip it", meta_number); + return MDBX_RESULT_TRUE; + } + + /* LY: check signature as a checksum */ + if (META_IS_STEADY(meta) && meta->mm_datasync_sign != mdbx_meta_sign(meta)) { + mdbx_notice("meta[%u] has invalid steady-checksum (0x%" PRIx64 + " != 0x%" PRIx64 "), skip it", + meta_number, meta->mm_datasync_sign, mdbx_meta_sign(meta)); + return MDBX_RESULT_TRUE; + } + + mdbx_debug("read meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, + meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, + meta->mm_geo.next, meta->mm_geo.now, meta->mm_geo.upper, + meta->mm_geo.grow, meta->mm_geo.shrink, + meta->mm_txnid_a.inconsistent, mdbx_durable_str(meta)); + + /* LY: check min-pages value */ + if (meta->mm_geo.lower < MIN_PAGENO || meta->mm_geo.lower > MAX_PAGENO) { + mdbx_notice("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.lower); + return MDBX_INVALID; + } + + /* LY: check max-pages value */ + if (meta->mm_geo.upper < MIN_PAGENO || meta->mm_geo.upper > MAX_PAGENO || + meta->mm_geo.upper < meta->mm_geo.lower) { + mdbx_notice("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.upper); + return MDBX_INVALID; + } + + /* LY: check last_pgno */ + if (meta->mm_geo.next < MIN_PAGENO || meta->mm_geo.next - 1 > MAX_PAGENO) { + mdbx_notice("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next); + return MDBX_CORRUPTED; + } + + /* LY: check filesize & used_bytes */ + const 
uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize; + if (used_bytes > *filesize) { + /* Here could be a race with DB-shrinking performed by other process */ + int err = mdbx_filesize(env->me_fd, filesize); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (used_bytes > *filesize) { + mdbx_notice("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 + "), skip it", + meta_number, used_bytes, *filesize); + return MDBX_CORRUPTED; + } + } + + /* LY: check mapsize limits */ + const uint64_t mapsize_min = meta->mm_geo.lower * (uint64_t)meta->mm_psize; + STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MAX_PAGESIZE); + STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); + if (mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE) { + mdbx_notice("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_min); + return MDBX_VERSION_MISMATCH; + } + + const uint64_t mapsize_max = meta->mm_geo.upper * (uint64_t)meta->mm_psize; + STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); + if (mapsize_max > MAX_MAPSIZE || + MAX_PAGENO < roundup_powerof2((size_t)mapsize_max, env->me_os_psize) / + (size_t)meta->mm_psize) { + if (meta->mm_geo.next - 1 > MAX_PAGENO || used_bytes > MAX_MAPSIZE) { + mdbx_notice("meta[%u] has too large max-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_max); + return MDBX_TOO_LARGE; + } + + /* allow to open large DB from a 32-bit environment */ + mdbx_notice("meta[%u] has too large max-mapsize (%" PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_max, used_bytes); + meta->mm_geo.upper = (pgno_t)(MAX_MAPSIZE / meta->mm_psize); + } + + /* LY: check and silently put mm_geo.now into [geo.lower...geo.upper]. + * + * Copy-with-compaction by previous version of libmdbx could produce DB-file + * less than meta.geo.lower bound, in case actual filling is low or no data + * at all. This is not a problem as there is no damage or loss of data. 
+ * Therefore it is better not to consider such situation as an error, but + * silently correct it. */ + if (meta->mm_geo.now < meta->mm_geo.lower) + meta->mm_geo.now = meta->mm_geo.lower; + if (meta->mm_geo.now > meta->mm_geo.upper && + meta->mm_geo.next <= meta->mm_geo.upper) + meta->mm_geo.now = meta->mm_geo.upper; + + if (meta->mm_geo.next > meta->mm_geo.now) { + mdbx_notice("meta[%u] next-pageno (%" PRIaPGNO + ") is beyond end-pgno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next, meta->mm_geo.now); + return MDBX_CORRUPTED; + } + + /* LY: GC root */ + if (meta->mm_dbs[FREE_DBI].md_root == P_INVALID) { + if (meta->mm_dbs[FREE_DBI].md_branch_pages || + meta->mm_dbs[FREE_DBI].md_depth || meta->mm_dbs[FREE_DBI].md_entries || + meta->mm_dbs[FREE_DBI].md_leaf_pages || + meta->mm_dbs[FREE_DBI].md_overflow_pages) { + mdbx_notice("meta[%u] has false-empty GC, skip it", meta_number); + return MDBX_CORRUPTED; + } + } else if (meta->mm_dbs[FREE_DBI].md_root >= meta->mm_geo.next) { + mdbx_notice("meta[%u] has invalid GC-root %" PRIaPGNO ", skip it", + meta_number, meta->mm_dbs[FREE_DBI].md_root); + return MDBX_CORRUPTED; + } + + /* LY: MainDB root */ + if (meta->mm_dbs[MAIN_DBI].md_root == P_INVALID) { + if (meta->mm_dbs[MAIN_DBI].md_branch_pages || + meta->mm_dbs[MAIN_DBI].md_depth || meta->mm_dbs[MAIN_DBI].md_entries || + meta->mm_dbs[MAIN_DBI].md_leaf_pages || + meta->mm_dbs[MAIN_DBI].md_overflow_pages) { + mdbx_notice("meta[%u] has false-empty maindb", meta_number); + return MDBX_CORRUPTED; + } + } else if (meta->mm_dbs[MAIN_DBI].md_root >= meta->mm_geo.next) { + mdbx_notice("meta[%u] has invalid maindb-root %" PRIaPGNO ", skip it", + meta_number, meta->mm_dbs[MAIN_DBI].md_root); + return MDBX_CORRUPTED; + } + + if (safe64_read(&meta->mm_txnid_a) == 0) { + mdbx_warning("meta[%u] has zero txnid, skip it", meta_number); + return MDBX_RESULT_TRUE; + } + + return MDBX_SUCCESS; +} + /* Read the environment parameters of a DB environment * before mapping it into 
memory. */ -static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, - uint64_t *filesize) { - STATIC_ASSERT(offsetof(MDBX_page, mp_meta) == PAGEHDRSZ); - +static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest, + uint64_t *filesize, + const int lck_exclusive) { int rc = mdbx_filesize(env->me_fd, filesize); if (unlikely(rc != MDBX_SUCCESS)) return rc; - memset(meta, 0, sizeof(MDBX_meta)); - meta->mm_datasync_sign = MDBX_DATASIGN_WEAK; + memset(dest, 0, sizeof(MDBX_meta)); + dest->mm_datasync_sign = MDBX_DATASIGN_WEAK; rc = MDBX_CORRUPTED; /* Read twice all meta pages so we can find the latest one. */ unsigned loop_limit = NUM_METAS * 2; for (unsigned loop_count = 0; loop_count < loop_limit; ++loop_count) { - MDBX_page page; - /* We don't know the page size on first time. * So, just guess it. */ - unsigned guess_pagesize = meta->mm_psize; + unsigned guess_pagesize = dest->mm_psize; if (guess_pagesize == 0) guess_pagesize = (loop_count > NUM_METAS) ? env->me_psize : env->me_os_psize; @@ -5067,49 +7251,35 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, const unsigned meta_number = loop_count % NUM_METAS; const unsigned offset = guess_pagesize * meta_number; + char buffer[MIN_PAGESIZE]; unsigned retryleft = 42; while (1) { mdbx_trace("reading meta[%d]: offset %u, bytes %u, retry-left %u", - meta_number, offset, (unsigned)sizeof(page), retryleft); - int err = mdbx_pread(env->me_fd, &page, sizeof(page), offset); + meta_number, offset, MIN_PAGESIZE, retryleft); + int err = mdbx_pread(env->me_fd, buffer, MIN_PAGESIZE, offset); if (err != MDBX_SUCCESS) { - mdbx_error("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(page), - err, mdbx_strerror(err)); + if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && + *filesize == 0 && (env->me_flags & MDBX_RDONLY) == 0) + mdbx_notice("read meta: empty file (%d, %s)", err, + mdbx_strerror(err)); + else + mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + 
mdbx_strerror(err)); return err; } - MDBX_page again; - err = mdbx_pread(env->me_fd, &again, sizeof(again), offset); + char again[MIN_PAGESIZE]; + err = mdbx_pread(env->me_fd, again, MIN_PAGESIZE, offset); if (err != MDBX_SUCCESS) { - mdbx_error("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(again), - err, mdbx_strerror(err)); + mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); return err; } - if (memcmp(&page, &again, sizeof(page)) == 0 || --retryleft == 0) + if (memcmp(buffer, again, MIN_PAGESIZE) == 0 || --retryleft == 0) break; - mdbx_info("meta[%u] was updated, re-read it", meta_number); - } - - if (page.mp_meta.mm_magic_and_version != MDBX_DATA_MAGIC && - page.mp_meta.mm_magic_and_version != MDBX_DATA_MAGIC_DEVEL) { - mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, - page.mp_meta.mm_magic_and_version); - return ((page.mp_meta.mm_magic_and_version >> 8) != MDBX_MAGIC) - ? MDBX_INVALID - : MDBX_VERSION_MISMATCH; - } - - if (page.mp_pgno != meta_number) { - mdbx_error("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, - page.mp_pgno); - return MDBX_INVALID; - } - - if (page.mp_flags != P_META) { - mdbx_error("page #%u not a meta-page", meta_number); - return MDBX_INVALID; + mdbx_verbose("meta[%u] was updated, re-read it", meta_number); } if (!retryleft) { @@ -5117,200 +7287,25 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, continue; } - /* LY: check pagesize */ - if (!mdbx_is_power2(page.mp_meta.mm_psize) || - page.mp_meta.mm_psize < MIN_PAGESIZE || - page.mp_meta.mm_psize > MAX_PAGESIZE) { - mdbx_notice("meta[%u] has invalid pagesize (%u), skip it", meta_number, - page.mp_meta.mm_psize); - rc = mdbx_is_power2(page.mp_meta.mm_psize) ? 
MDBX_VERSION_MISMATCH - : MDBX_INVALID; - continue; - } - - if (meta_number == 0 && guess_pagesize != page.mp_meta.mm_psize) { - meta->mm_psize = page.mp_meta.mm_psize; - mdbx_info("meta[%u] took pagesize %u", meta_number, - page.mp_meta.mm_psize); - } - - if (page.mp_meta.mm_txnid_a != page.mp_meta.mm_txnid_b) { - mdbx_warning("meta[%u] not completely updated, skip it", meta_number); - continue; - } - - /* LY: check signature as a checksum */ - if (META_IS_STEADY(&page.mp_meta) && - page.mp_meta.mm_datasync_sign != mdbx_meta_sign(&page.mp_meta)) { - mdbx_notice("meta[%u] has invalid steady-checksum (0x%" PRIx64 - " != 0x%" PRIx64 "), skip it", - meta_number, page.mp_meta.mm_datasync_sign, - mdbx_meta_sign(&page.mp_meta)); - continue; - } - - mdbx_debug("read meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO - ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - page.mp_pgno, page.mp_meta.mm_dbs[MAIN_DBI].md_root, - page.mp_meta.mm_dbs[FREE_DBI].md_root, page.mp_meta.mm_geo.lower, - page.mp_meta.mm_geo.next, page.mp_meta.mm_geo.now, - page.mp_meta.mm_geo.upper, page.mp_meta.mm_geo.grow, - page.mp_meta.mm_geo.shrink, page.mp_meta.mm_txnid_a, - mdbx_durable_str(&page.mp_meta)); - - /* LY: check min-pages value */ - if (page.mp_meta.mm_geo.lower < MIN_PAGENO || - page.mp_meta.mm_geo.lower > MAX_PAGENO) { - mdbx_notice("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", - meta_number, page.mp_meta.mm_geo.lower); - rc = MDBX_INVALID; - continue; - } - - /* LY: check max-pages value */ - if (page.mp_meta.mm_geo.upper < MIN_PAGENO || - page.mp_meta.mm_geo.upper > MAX_PAGENO || - page.mp_meta.mm_geo.upper < page.mp_meta.mm_geo.lower) { - mdbx_notice("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", - meta_number, page.mp_meta.mm_geo.upper); - rc = MDBX_INVALID; - continue; - } - - /* LY: check last_pgno */ - if (page.mp_meta.mm_geo.next < MIN_PAGENO || - page.mp_meta.mm_geo.next - 1 > MAX_PAGENO) { - 
mdbx_notice("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", - meta_number, page.mp_meta.mm_geo.next); - rc = MDBX_CORRUPTED; - continue; - } - - /* LY: check filesize & used_bytes */ - const uint64_t used_bytes = - page.mp_meta.mm_geo.next * (uint64_t)page.mp_meta.mm_psize; - if (used_bytes > *filesize) { - /* Here could be a race with DB-shrinking performed by other process */ - rc = mdbx_filesize(env->me_fd, filesize); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - if (used_bytes > *filesize) { - mdbx_notice("meta[%u] used-bytes (%" PRIu64 - ") beyond filesize (%" PRIu64 "), skip it", - meta_number, used_bytes, *filesize); - rc = MDBX_CORRUPTED; - continue; - } - } - - /* LY: check mapsize limits */ - const uint64_t mapsize_min = - page.mp_meta.mm_geo.lower * (uint64_t)page.mp_meta.mm_psize; - STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MAX_PAGESIZE); - STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); - if (mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE) { - mdbx_notice("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_min); - rc = MDBX_VERSION_MISMATCH; - continue; - } - - const uint64_t mapsize_max = - page.mp_meta.mm_geo.upper * (uint64_t)page.mp_meta.mm_psize; - STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); - if (mapsize_max > MAX_MAPSIZE || - MAX_PAGENO < mdbx_roundup2((size_t)mapsize_max, env->me_os_psize) / - (size_t)page.mp_meta.mm_psize) { - if (page.mp_meta.mm_geo.next - 1 > MAX_PAGENO || - used_bytes > MAX_MAPSIZE) { - mdbx_notice("meta[%u] has too large max-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_max); - rc = MDBX_TOO_LARGE; - continue; - } - - /* allow to open large DB from a 32-bit environment */ - mdbx_notice("meta[%u] has too large max-mapsize (%" PRIu64 "), " - "but size of used space still acceptable (%" PRIu64 ")", - meta_number, mapsize_max, used_bytes); - page.mp_meta.mm_geo.upper = (pgno_t)(MAX_MAPSIZE / page.mp_meta.mm_psize); - } - - /* LY: check and silently put mm_geo.now into 
[geo.lower...geo.upper]. - * - * Copy-with-compaction by previous version of libmdbx could produce DB-file - * less than meta.geo.lower bound, in case actual filling is low or no data - * at all. This is not a problem as there is no damage or loss of data. - * Therefore it is better not to consider such situation as an error, but - * silently correct it. */ - if (page.mp_meta.mm_geo.now < page.mp_meta.mm_geo.lower) - page.mp_meta.mm_geo.now = page.mp_meta.mm_geo.lower; - if (page.mp_meta.mm_geo.now > page.mp_meta.mm_geo.upper) - page.mp_meta.mm_geo.now = page.mp_meta.mm_geo.upper; - - if (page.mp_meta.mm_geo.next > page.mp_meta.mm_geo.now) { - mdbx_notice("meta[%u] next-pageno (%" PRIaPGNO - ") is beyond end-pgno (%" PRIaPGNO "), skip it", - meta_number, page.mp_meta.mm_geo.next, - page.mp_meta.mm_geo.now); - rc = MDBX_CORRUPTED; - continue; - } - - /* LY: FreeDB root */ - if (page.mp_meta.mm_dbs[FREE_DBI].md_root == P_INVALID) { - if (page.mp_meta.mm_dbs[FREE_DBI].md_branch_pages || - page.mp_meta.mm_dbs[FREE_DBI].md_depth || - page.mp_meta.mm_dbs[FREE_DBI].md_entries || - page.mp_meta.mm_dbs[FREE_DBI].md_leaf_pages || - page.mp_meta.mm_dbs[FREE_DBI].md_overflow_pages) { - mdbx_notice("meta[%u] has false-empty freedb, skip it", meta_number); - rc = MDBX_CORRUPTED; - continue; - } - } else if (page.mp_meta.mm_dbs[FREE_DBI].md_root >= - page.mp_meta.mm_geo.next) { - mdbx_notice("meta[%u] has invalid freedb-root %" PRIaPGNO ", skip it", - meta_number, page.mp_meta.mm_dbs[FREE_DBI].md_root); - rc = MDBX_CORRUPTED; - continue; - } - - /* LY: MainDB root */ - if (page.mp_meta.mm_dbs[MAIN_DBI].md_root == P_INVALID) { - if (page.mp_meta.mm_dbs[MAIN_DBI].md_branch_pages || - page.mp_meta.mm_dbs[MAIN_DBI].md_depth || - page.mp_meta.mm_dbs[MAIN_DBI].md_entries || - page.mp_meta.mm_dbs[MAIN_DBI].md_leaf_pages || - page.mp_meta.mm_dbs[MAIN_DBI].md_overflow_pages) { - mdbx_notice("meta[%u] has false-empty maindb", meta_number); - rc = MDBX_CORRUPTED; - continue; - } - } else if 
(page.mp_meta.mm_dbs[MAIN_DBI].md_root >= - page.mp_meta.mm_geo.next) { - mdbx_notice("meta[%u] has invalid maindb-root %" PRIaPGNO ", skip it", - meta_number, page.mp_meta.mm_dbs[MAIN_DBI].md_root); - rc = MDBX_CORRUPTED; - continue; - } - - if (page.mp_meta.mm_txnid_a == 0) { - mdbx_warning("meta[%u] has zero txnid, skip it", meta_number); + MDBX_page *const page = (MDBX_page *)buffer; + MDBX_meta *const meta = page_meta(page); + rc = mdbx_validate_meta(env, meta, filesize, page, meta_number, dest, + guess_pagesize); + if (rc != MDBX_SUCCESS) continue; - } - if (mdbx_meta_ot(prefer_noweak, env, meta, &page.mp_meta)) { - *meta = page.mp_meta; - if (META_IS_WEAK(meta)) + if (mdbx_meta_ot(prefer_steady, env, dest, meta)) { + *dest = *meta; + if (!META_IS_STEADY(dest)) loop_limit += 1; /* LY: should re-read to hush race with update */ - mdbx_info("latch meta[%u]", meta_number); + mdbx_verbose("latch meta[%u]", meta_number); } } - if (META_IS_WEAK(meta)) { - mdbx_error("no usable meta-pages, database is corrupted"); + if (dest->mm_psize == 0 || + (!META_IS_STEADY(dest) && + !meta_weak_acceptable(env, dest, lck_exclusive))) { + mdbx_error("%s", "no usable meta-pages, database is corrupted"); return rc; } @@ -5320,7 +7315,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, unsigned num) { - mdbx_ensure(env, mdbx_is_power2(env->me_psize)); + mdbx_ensure(env, is_powerof2(env->me_psize)); mdbx_ensure(env, env->me_psize >= MIN_PAGESIZE); mdbx_ensure(env, env->me_psize <= MAX_PAGESIZE); mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); @@ -5331,49 +7326,49 @@ static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, memset(model, 0, sizeof(*model)); model->mp_pgno = num; model->mp_flags = P_META; - model->mp_meta.mm_magic_and_version = MDBX_DATA_MAGIC; - - model->mp_meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); - 
model->mp_meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); - model->mp_meta.mm_geo.grow = (uint16_t)bytes2pgno(env, env->me_dbgeo.grow); - model->mp_meta.mm_geo.shrink = - (uint16_t)bytes2pgno(env, env->me_dbgeo.shrink); - model->mp_meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); - model->mp_meta.mm_geo.next = NUM_METAS; - - mdbx_ensure(env, model->mp_meta.mm_geo.lower >= MIN_PAGENO); - mdbx_ensure(env, model->mp_meta.mm_geo.upper <= MAX_PAGENO); - mdbx_ensure(env, model->mp_meta.mm_geo.now >= model->mp_meta.mm_geo.lower); - mdbx_ensure(env, model->mp_meta.mm_geo.now <= model->mp_meta.mm_geo.upper); - mdbx_ensure(env, model->mp_meta.mm_geo.next >= MIN_PAGENO); - mdbx_ensure(env, model->mp_meta.mm_geo.next <= model->mp_meta.mm_geo.now); - mdbx_ensure(env, model->mp_meta.mm_geo.grow == - bytes2pgno(env, env->me_dbgeo.grow)); - mdbx_ensure(env, model->mp_meta.mm_geo.shrink == + MDBX_meta *const model_meta = page_meta(model); + model_meta->mm_magic_and_version = MDBX_DATA_MAGIC; + + model_meta->mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + model_meta->mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + model_meta->mm_geo.grow = (uint16_t)bytes2pgno(env, env->me_dbgeo.grow); + model_meta->mm_geo.shrink = (uint16_t)bytes2pgno(env, env->me_dbgeo.shrink); + model_meta->mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); + model_meta->mm_geo.next = NUM_METAS; + + mdbx_ensure(env, model_meta->mm_geo.lower >= MIN_PAGENO); + mdbx_ensure(env, model_meta->mm_geo.upper <= MAX_PAGENO); + mdbx_ensure(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); + mdbx_ensure(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); + mdbx_ensure(env, model_meta->mm_geo.next >= MIN_PAGENO); + mdbx_ensure(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); + mdbx_ensure(env, + model_meta->mm_geo.grow == bytes2pgno(env, env->me_dbgeo.grow)); + mdbx_ensure(env, model_meta->mm_geo.shrink == bytes2pgno(env, env->me_dbgeo.shrink)); - model->mp_meta.mm_psize = 
env->me_psize; - model->mp_meta.mm_flags = (uint16_t)env->me_flags; - model->mp_meta.mm_flags |= + model_meta->mm_psize = env->me_psize; + model_meta->mm_flags = (uint16_t)env->me_flags; + model_meta->mm_flags |= MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ - model->mp_meta.mm_dbs[FREE_DBI].md_root = P_INVALID; - model->mp_meta.mm_dbs[MAIN_DBI].md_root = P_INVALID; - mdbx_meta_set_txnid(env, &model->mp_meta, MIN_TXNID + num); - model->mp_meta.mm_datasync_sign = mdbx_meta_sign(&model->mp_meta); + model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID; + model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; + mdbx_meta_set_txnid(env, model_meta, MIN_TXNID + num); + model_meta->mm_datasync_sign = mdbx_meta_sign(model_meta); return (MDBX_page *)((uint8_t *)model + env->me_psize); } /* Fill in most of the zeroed meta-pages for an empty database environment. * Return pointer to recenly (head) meta-page. */ -static MDBX_page *__cold mdbx_init_metas(const MDBX_env *env, void *buffer) { +static MDBX_meta *__cold mdbx_init_metas(const MDBX_env *env, void *buffer) { MDBX_page *page0 = (MDBX_page *)buffer; MDBX_page *page1 = mdbx_meta_model(env, page0, 0); MDBX_page *page2 = mdbx_meta_model(env, page1, 1); mdbx_meta_model(env, page2, 2); - mdbx_assert(env, !mdbx_meta_eq(env, &page0->mp_meta, &page1->mp_meta)); - mdbx_assert(env, !mdbx_meta_eq(env, &page1->mp_meta, &page2->mp_meta)); - mdbx_assert(env, !mdbx_meta_eq(env, &page2->mp_meta, &page0->mp_meta)); - return page2; + mdbx_assert(env, !mdbx_meta_eq(env, page_meta(page0), page_meta(page1))); + mdbx_assert(env, !mdbx_meta_eq(env, page_meta(page1), page_meta(page2))); + mdbx_assert(env, !mdbx_meta_eq(env, page_meta(page2), page_meta(page0))); + return page_meta(page2); } static int mdbx_sync_locked(MDBX_env *env, unsigned flags, @@ -5388,43 +7383,93 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); mdbx_assert(env, 
(env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); - mdbx_assert(env, !META_IS_STEADY(head) || *env->me_unsynced_pages != 0); mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); if (flags & (MDBX_NOSYNC | MDBX_MAPASYNC)) { /* Check auto-sync conditions */ const pgno_t autosync_threshold = *env->me_autosync_threshold; - const uint64_t unsynced_timeout = *env->me_unsynced_timeout; + const uint64_t autosync_period = *env->me_autosync_period; if ((autosync_threshold && *env->me_unsynced_pages >= autosync_threshold) || - (unsynced_timeout && mdbx_osal_monotime() >= unsynced_timeout)) + (autosync_period && + mdbx_osal_monotime() - *env->me_sync_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ } - /* LY: check conditions to shrink datafile */ - const pgno_t backlog_gap = - pending->mm_dbs[FREE_DBI].md_depth + mdbx_backlog_extragap(env); pgno_t shrink = 0; - if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink && - pending->mm_geo.now - pending->mm_geo.next > - pending->mm_geo.shrink + backlog_gap) { - const pgno_t largest = mdbx_find_largest( + if (flags & MDBX_SHRINK_ALLOWED) { + /* LY: check conditions to discard unused pages */ + const pgno_t largest_pgno = mdbx_find_largest( env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next : pending->mm_geo.next); - if (pending->mm_geo.now > largest && - pending->mm_geo.now - largest > pending->mm_geo.shrink + backlog_gap) { - const pgno_t aligner = - pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; - const pgno_t with_backlog_gap = largest + backlog_gap; - const pgno_t aligned = pgno_align2os_pgno( - env, with_backlog_gap + aligner - with_backlog_gap % aligner); - const pgno_t bottom = - (aligned > pending->mm_geo.lower) ? 
aligned : pending->mm_geo.lower; - if (pending->mm_geo.now > bottom) { - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ - shrink = pending->mm_geo.now - bottom; - pending->mm_geo.now = bottom; - if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) - mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); + mdbx_assert(env, largest_pgno >= NUM_METAS); +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + const pgno_t edge = env->me_poison_edge; + if (edge > largest_pgno) { + env->me_poison_edge = largest_pgno; + VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, largest_pgno), + pgno2bytes(env, edge - largest_pgno)); + ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, largest_pgno), + pgno2bytes(env, edge - largest_pgno)); + } +#endif /* MDBX_USE_VALGRIND */ +#if defined(MADV_DONTNEED) + const size_t largest_aligned2os_bytes = + pgno_align2os_bytes(env, largest_pgno); + const pgno_t largest_aligned2os_pgno = + bytes2pgno(env, largest_aligned2os_bytes); + const pgno_t prev_discarded_pgno = *env->me_discarded_tail; + if (prev_discarded_pgno > + largest_aligned2os_pgno + + /* 1M threshold to avoid unreasonable madvise() call */ + bytes2pgno(env, MEGABYTE)) { + mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", *env->me_discarded_tail, + largest_pgno); + *env->me_discarded_tail = largest_aligned2os_pgno; + const size_t prev_discarded_bytes = + pgno2bytes(env, prev_discarded_pgno) & ~(env->me_os_psize - 1); + mdbx_ensure(env, prev_discarded_bytes > largest_aligned2os_bytes); + int advise = MADV_DONTNEED; +#if defined(MADV_FREE) && \ + 0 /* MADV_FREE works for only anonymous vma at the moment */ + if ((env->me_flags & MDBX_WRITEMAP) && + mdbx_linux_kernel_version > 0x04050000) + advise = MADV_FREE; +#endif /* MADV_FREE */ + int err = madvise(env->me_map + largest_aligned2os_bytes, + prev_discarded_bytes - largest_aligned2os_bytes, advise) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } +#endif /* MADV_FREE || MADV_DONTNEED */ + + /* LY: check conditions to shrink datafile */ + const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; + if (pending->mm_geo.shrink && pending->mm_geo.now - pending->mm_geo.next > + pending->mm_geo.shrink + backlog_gap) { + if (pending->mm_geo.now > largest_pgno && + pending->mm_geo.now - largest_pgno > + pending->mm_geo.shrink + backlog_gap) { + const pgno_t aligner = pending->mm_geo.grow ? pending->mm_geo.grow + : pending->mm_geo.shrink; + const pgno_t with_backlog_gap = largest_pgno + backlog_gap; + const pgno_t aligned = pgno_align2os_pgno( + env, with_backlog_gap + aligner - with_backlog_gap % aligner); + const pgno_t bottom = + (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; + if (pending->mm_geo.now > bottom) { + if (META_IS_STEADY(mdbx_meta_steady(env))) + /* force steady, but only if steady-checkpoint is present */ + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; + shrink = pending->mm_geo.now - bottom; + pending->mm_geo.now = bottom; + if (mdbx_meta_txnid_stable(env, head) == + pending->mm_txnid_a.inconsistent) + mdbx_meta_set_txnid( + env, pending, + safe64_txnid_next(pending->mm_txnid_a.inconsistent)); + } } } } @@ -5463,20 +7508,14 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (rc == MDBX_RESULT_FALSE /* carry steady */) { pending->mm_datasync_sign = mdbx_meta_sign(pending); *env->me_unsynced_pages = 0; - *env->me_unsynced_timeout = 0; + *env->me_sync_timestamp = mdbx_osal_monotime(); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); - const uint64_t autosync_period = *env->me_autosync_period; - if (autosync_period && *env->me_unsynced_timeout == 0) - *env->me_unsynced_timeout = mdbx_osal_monotime() + autosync_period; - pending->mm_datasync_sign = - (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC - ? 
MDBX_DATASIGN_NONE - : MDBX_DATASIGN_WEAK; + pending->mm_datasync_sign = MDBX_DATASIGN_WEAK; } MDBX_meta *target = nullptr; - if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) { + if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a.inconsistent) { mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, sizeof(head->mm_dbs)) == 0); mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, @@ -5487,7 +7526,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target = head; else { mdbx_ensure(env, mdbx_meta_eq(env, head, pending)); - mdbx_debug("skip update meta"); + mdbx_debug("%s", "skip update meta"); return MDBX_SUCCESS; } } else if (head == meta0) @@ -5503,12 +7542,11 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_debug("writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", - container_of(target, MDBX_page, mp_data)->mp_pgno, - pending->mm_dbs[MAIN_DBI].md_root, + data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, - pending->mm_geo.grow, pending->mm_geo.shrink, pending->mm_txnid_a, - mdbx_durable_str(pending)); + pending->mm_geo.grow, pending->mm_geo.shrink, + pending->mm_txnid_a.inconsistent, mdbx_durable_str(pending)); mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, @@ -5532,13 +7570,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); mdbx_ensure(env, target == head || mdbx_meta_txnid_stable(env, target) < - pending->mm_txnid_a); + pending->mm_txnid_a.inconsistent); if (env->me_flags & MDBX_WRITEMAP) { mdbx_jitter4testing(true); if (likely(target != head)) { /* LY: 'invalidate' the meta. 
*/ + mdbx_meta_update_begin(env, target, pending->mm_txnid_a.inconsistent); target->mm_datasync_sign = MDBX_DATASIGN_WEAK; - mdbx_meta_update_begin(env, target, pending->mm_txnid_a); #ifndef NDEBUG /* debug: provoke failure to catch a violators, but don't touch mm_psize * and mm_flags to allow readers catch actual pagesize. */ @@ -5553,18 +7591,19 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; target->mm_canary = pending->mm_canary; + target->mm_pages_retired = pending->mm_pages_retired; mdbx_jitter4testing(true); - mdbx_flush_noncoherent_cpu_writeback(); + mdbx_flush_incoherent_cpu_writeback(); /* LY: 'commit' the meta */ - mdbx_meta_update_end(env, target, pending->mm_txnid_b); + mdbx_meta_update_end(env, target, pending->mm_txnid_b.inconsistent); mdbx_jitter4testing(true); } else { /* dangerous case (target == head), only mm_datasync_sign could * me updated, check assertions once again */ - mdbx_ensure(env, - mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a && - !META_IS_STEADY(head) && META_IS_STEADY(pending)); + mdbx_ensure(env, mdbx_meta_txnid_stable(env, head) == + pending->mm_txnid_a.inconsistent && + !META_IS_STEADY(head) && META_IS_STEADY(pending)); mdbx_ensure(env, memcmp(&head->mm_geo, &pending->mm_geo, sizeof(head->mm_geo)) == 0); mdbx_ensure(env, memcmp(&head->mm_dbs, &pending->mm_dbs, @@ -5573,21 +7612,21 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, sizeof(head->mm_canary)) == 0); } target->mm_datasync_sign = pending->mm_datasync_sign; - mdbx_flush_noncoherent_cpu_writeback(); + mdbx_flush_incoherent_cpu_writeback(); mdbx_jitter4testing(true); } else { rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); if (unlikely(rc != MDBX_SUCCESS)) { undo: - mdbx_debug("write failed, disk error?"); + mdbx_debug("%s", "write failed, disk error?"); /* On a failure, the pagecache 
still contains the new data. * Try write some old data back, to prevent it from being used. */ mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta), (uint8_t *)target - env->me_map); goto fail; } - mdbx_invalidate_mmap_noncoherent_cache(target, sizeof(MDBX_meta)); + mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); } /* LY: step#3 - sync meta-pages. */ @@ -5595,11 +7634,9 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { - const size_t offset = - ((uint8_t *)container_of(head, MDBX_page, mp_meta)) - - env->me_dxb_mmap.dxb; + const size_t offset = (uint8_t *)data_page(head) - env->me_dxb_mmap.dxb; const size_t paged_offset = offset & ~(env->me_os_psize - 1); - const size_t paged_length = mdbx_roundup2( + const size_t paged_length = roundup_powerof2( env->me_psize + offset - paged_offset, env->me_os_psize); rc = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length, flags & MDBX_MAPASYNC); @@ -5610,13 +7647,15 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (rc != MDBX_SUCCESS) goto undo; } + *env->me_meta_sync_txnid = (uint32_t)pending->mm_txnid_a.inconsistent; } /* LY: shrink datafile if needed */ if (unlikely(shrink)) { - mdbx_info("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", - pending->mm_geo.now, shrink); - rc = mdbx_mapresize(env, pending->mm_geo.now, pending->mm_geo.upper); + mdbx_verbose("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", + pending->mm_geo.now, shrink); + rc = mdbx_mapresize(env, pending->mm_geo.next, pending->mm_geo.now, + pending->mm_geo.upper); if (MDBX_IS_ERROR(rc)) goto fail; } @@ -5628,58 +7667,49 @@ fail: return rc; } -int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { - if (!env || env->me_signature != MDBX_ME_SIGNATURE || !env->me_maxkey_limit) - return (MDBX_EINVAL > 0) ? 
-MDBX_EINVAL : MDBX_EINVAL; - return env->me_maxkey_limit; -} - -#define mdbx_nodemax(pagesize) \ - (((((pagesize)-PAGEHDRSZ) / MDBX_MINKEYS) & ~(uintptr_t)1) - sizeof(indx_t)) - -#define mdbx_maxkey(nodemax) (((nodemax)-NODESIZE - sizeof(MDBX_db)) / 2) - -#define mdbx_maxgc_ov1page(pagesize) \ - (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) - static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE); - STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page)); - mdbx_ensure(env, mdbx_is_power2(pagesize)); + STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page) + sizeof(MDBX_meta)); + mdbx_ensure(env, is_powerof2(pagesize)); mdbx_ensure(env, pagesize >= MIN_PAGESIZE); mdbx_ensure(env, pagesize <= MAX_PAGESIZE); env->me_psize = (unsigned)pagesize; - STATIC_ASSERT(mdbx_maxgc_ov1page(MIN_PAGESIZE) > 42); - STATIC_ASSERT(mdbx_maxgc_ov1page(MAX_PAGESIZE) < MDBX_DPL_TXNFULL); + STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4); + STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_DPL_TXNFULL); const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; mdbx_ensure(env, maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_DPL_TXNFULL); env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; - STATIC_ASSERT(mdbx_nodemax(MIN_PAGESIZE) > 42); - STATIC_ASSERT(mdbx_nodemax(MAX_PAGESIZE) < UINT16_MAX); - const intptr_t nodemax = mdbx_nodemax(pagesize); - mdbx_ensure(env, nodemax > 42 && nodemax < UINT16_MAX && nodemax % 2 == 0); - env->me_nodemax = (unsigned)nodemax; - - STATIC_ASSERT(mdbx_maxkey(MIN_PAGESIZE) > 42); - STATIC_ASSERT(mdbx_maxkey(MIN_PAGESIZE) < MIN_PAGESIZE); - STATIC_ASSERT(mdbx_maxkey(MAX_PAGESIZE) > 42); - STATIC_ASSERT(mdbx_maxkey(MAX_PAGESIZE) < MAX_PAGESIZE); - const intptr_t maxkey_limit = mdbx_maxkey(env->me_nodemax); - mdbx_ensure(env, maxkey_limit > 42 && (size_t)maxkey_limit < pagesize && - maxkey_limit % 2 == 0); - env->me_maxkey_limit = (unsigned)maxkey_limit; - - env->me_psize2log = 
mdbx_log2(pagesize); + STATIC_ASSERT(LEAF_NODEMAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); + STATIC_ASSERT(LEAF_NODEMAX(MAX_PAGESIZE) < UINT16_MAX); + STATIC_ASSERT(LEAF_NODEMAX(MIN_PAGESIZE) > BRANCH_NODEMAX(MIN_PAGESIZE)); + STATIC_ASSERT(BRANCH_NODEMAX(MAX_PAGESIZE) > NODESIZE + 42); + STATIC_ASSERT(BRANCH_NODEMAX(MAX_PAGESIZE) < UINT16_MAX); + const intptr_t branch_nodemax = BRANCH_NODEMAX(pagesize); + mdbx_ensure(env, branch_nodemax > 42 && branch_nodemax < (int)UINT16_MAX && + branch_nodemax % 2 == 0); + env->me_branch_nodemax = (unsigned)branch_nodemax; + env->me_maxkey_nd = (uint16_t)mdbx_limits_keysize_max(env->me_psize, 0); + env->me_maxkey_ds = + (uint16_t)mdbx_limits_keysize_max(env->me_psize, MDBX_DUPSORT); + env->me_maxval_nd = (unsigned)mdbx_limits_valsize_max(env->me_psize, 0); + env->me_maxval_ds = + (unsigned)mdbx_limits_valsize_max(env->me_psize, MDBX_DUPSORT); + mdbx_ensure(env, env->me_maxkey_nd == + env->me_branch_nodemax - NODESIZE - sizeof(pgno_t)); + mdbx_ensure(env, env->me_maxkey_ds == + env->me_branch_nodemax - NODESIZE - sizeof(MDBX_db)); + + env->me_psize2log = log2n(pagesize); mdbx_assert(env, pgno2bytes(env, 1) == pagesize); mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2); } int __cold mdbx_env_create(MDBX_env **penv) { MDBX_env *env = mdbx_calloc(1, sizeof(MDBX_env)); - if (!env) + if (unlikely(!env)) return MDBX_ENOMEM; env->me_maxreaders = DEFAULT_READERS; @@ -5690,7 +7720,7 @@ int __cold mdbx_env_create(MDBX_env **penv) { int rc; const size_t os_psize = mdbx_syspagesize(); - if (!mdbx_is_power2(os_psize) || os_psize < MIN_PAGESIZE) { + if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { mdbx_error("unsuitable system pagesize %" PRIuPTR, os_psize); rc = MDBX_INCOMPATIBLE; goto bailout; @@ -5711,7 +7741,10 @@ int __cold mdbx_env_create(MDBX_env **penv) { mdbx_fastmutex_destroy(&env->me_dbi_lock); goto bailout; } - rc = mdbx_fastmutex_init(&env->me_lckless_stub.wmutex); + +#if MDBX_LOCKING > 
MDBX_LOCKING_SYSV + rc = mdbx_ipclock_stub(&env->me_lckless_stub.wlock); +#endif /* MDBX_LOCKING */ if (unlikely(rc != MDBX_SUCCESS)) { mdbx_fastmutex_destroy(&env->me_remap_guard); mdbx_fastmutex_destroy(&env->me_dbi_lock); @@ -5730,113 +7763,6 @@ bailout: return rc; } -static int __cold mdbx_env_map(MDBX_env *env, const int is_exclusive, - const size_t usedsize) { - int rc = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, - env->me_dbgeo.upper); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - -#ifdef MADV_DONTFORK - if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_DONTFORK) != 0)) - return errno; -#endif - -#ifdef MADV_NOHUGEPAGE - (void)madvise(env->me_map, env->me_mapsize, MADV_NOHUGEPAGE); -#endif - -#if defined(MADV_DODUMP) && defined(MADV_DONTDUMP) - const size_t meta_length = pgno2bytes(env, NUM_METAS); - (void)madvise(env->me_map, meta_length, MADV_DODUMP); - if (!(env->me_flags & MDBX_PAGEPERTURB)) - (void)madvise(env->me_map + meta_length, env->me_mapsize - meta_length, - MADV_DONTDUMP); -#endif - - if (is_exclusive && (env->me_flags & MDBX_WRITEMAP) != 0) { -#ifdef MADV_REMOVE_OR_FREE - const size_t used_alined2os = mdbx_roundup2(usedsize, env->me_os_psize); - if (used_alined2os < env->me_mapsize) - (void)madvise(env->me_map + used_alined2os, - env->me_mapsize - used_alined2os, MADV_REMOVE_OR_FREE); -#else - (void)usedsize; -#endif - } - -#ifdef POSIX_FADV_RANDOM - /* this also checks that the file size is valid for a particular FS */ - rc = posix_fadvise(env->me_fd, 0, env->me_dbgeo.upper, POSIX_FADV_RANDOM); - if (unlikely(rc != 0)) - return rc; -#elif defined(F_RDAHEAD) - if (unlikely(fcntl(env->me_fd, F_RDAHEAD, 0) == -1)) - return errno; -#endif - -#if defined(MADV_RANDOM) - if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_RANDOM) != 0)) - return errno; -#elif defined(POSIX_MADV_RANDOM) - rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); - if (unlikely(rc != 0)) - return errno; -#endif - - /* Turn 
on/off readahead. It's harmful when the DB is larger than RAM. */ - if (env->me_flags & MDBX_NORDAHEAD) { -#ifdef POSIX_FADV_DONTNEED - rc = posix_fadvise(env->me_fd, 0, env->me_mapsize, POSIX_FADV_DONTNEED); - if (unlikely(rc != 0)) - return rc; -#endif -#if defined(MADV_DONTNEED) - if (unlikely(madvise(env->me_map, env->me_mapsize, MADV_DONTNEED) != 0)) - return errno; -#elif defined(POSIX_MADV_DONTNEED) - rc = posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_DONTNEED); - if (unlikely(rc != 0)) - return errno; -#endif - } else { -#ifdef POSIX_FADV_WILLNEED - rc = posix_fadvise(env->me_fd, 0, usedsize, POSIX_FADV_WILLNEED); - if (unlikely(rc != 0)) - return rc; -#elif defined(F_RDADVISE) - struct radvisory hint; - hint.ra_offset = 0; - hint.ra_count = usedsize; - if (unlikely(fcntl(env->me_fd, F_RDADVISE, &hint) == -1)) - return errno; -#endif -#if defined(MADV_WILLNEED) - if (unlikely(madvise(env->me_map, usedsize, MADV_WILLNEED) != 0)) - return errno; -#elif defined(POSIX_MADV_WILLNEED) - rc = posix_madvise(env->me_map, usedsize, POSIX_MADV_WILLNEED); - if (unlikely(rc != 0)) - return errno; -#endif -#if defined(_WIN32) || defined(_WIN64) - if (mdbx_PrefetchVirtualMemory) { - WIN32_MEMORY_RANGE_ENTRY hint; - hint.VirtualAddress = env->me_map; - hint.NumberOfBytes = usedsize; - (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); - } -#endif /* Windows */ - } - -#ifdef USE_VALGRIND - env->me_valgrind_handle = - VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "mdbx"); -#endif - - return MDBX_SUCCESS; -} - __cold LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, intptr_t size_upper, intptr_t growth_step, @@ -5869,7 +7795,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, int rc = MDBX_PROBLEM; if (env->me_map) { /* env already mapped */ - if (!env->me_lck || (env->me_flags & MDBX_RDONLY)) + if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; if 
(!inside_txn) { @@ -5884,13 +7810,9 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, mdbx_find_oldest(env->me_txn0); } + /* get untouched params from DB */ if (pagesize < 0) pagesize = env->me_psize; - if (pagesize != (intptr_t)env->me_psize) { - rc = MDBX_EINVAL; - goto bailout; - } - if (size_lower < 0) size_lower = pgno2bytes(env, head->mm_geo.lower); if (size_now < 0) @@ -5902,6 +7824,10 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (shrink_threshold < 0) shrink_threshold = pgno2bytes(env, head->mm_geo.shrink); + if (pagesize != (intptr_t)env->me_psize) { + rc = MDBX_EINVAL; + goto bailout; + } const size_t usedbytes = pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next)); if ((size_t)size_upper < usedbytes) { @@ -5929,7 +7855,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, pagesize = MAX_PAGESIZE; if (pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || - !mdbx_is_power2(pagesize)) { + !is_powerof2(pagesize)) { rc = MDBX_EINVAL; goto bailout; } @@ -5980,13 +7906,13 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, goto bailout; } - size_lower = mdbx_roundup2(size_lower, env->me_os_psize); - size_upper = mdbx_roundup2(size_upper, env->me_os_psize); - size_now = mdbx_roundup2(size_now, env->me_os_psize); + size_lower = roundup_powerof2(size_lower, env->me_os_psize); + size_upper = roundup_powerof2(size_upper, env->me_os_psize); + size_now = roundup_powerof2(size_now, env->me_os_psize); /* LY: подбираем значение size_upper: * - кратное размеру системной страницы - * - без нарушения MAX_MAPSIZE или MAX_PAGENO */ + * - без нарушения MAX_MAPSIZE и MAX_PAGENO */ while (unlikely((size_t)size_upper > MAX_MAPSIZE || (uint64_t)size_upper / pagesize > MAX_PAGENO)) { if ((size_t)size_upper < env->me_os_psize + MIN_MAPSIZE || @@ -6015,28 +7941,18 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t 
size_now, if ((size_t)growth_step > MEGABYTE * 16) growth_step = MEGABYTE * 16; } - growth_step = mdbx_roundup2(growth_step, env->me_os_psize); + if (growth_step == 0 && shrink_threshold > 0) + growth_step = 1; + growth_step = roundup_powerof2(growth_step, env->me_os_psize); if (bytes2pgno(env, growth_step) > UINT16_MAX) growth_step = pgno2bytes(env, UINT16_MAX); - if (shrink_threshold < 0) { + if (shrink_threshold < 0) shrink_threshold = growth_step + growth_step; - if (shrink_threshold < growth_step) - shrink_threshold = growth_step; - } - shrink_threshold = mdbx_roundup2(shrink_threshold, env->me_os_psize); + shrink_threshold = roundup_powerof2(shrink_threshold, env->me_os_psize); if (bytes2pgno(env, shrink_threshold) > UINT16_MAX) shrink_threshold = pgno2bytes(env, UINT16_MAX); -#ifdef POSIX_FADV_RANDOM - if (env->me_fd != INVALID_HANDLE_VALUE) { - /* this also checks that the file size is valid for a particular FS */ - rc = posix_fadvise(env->me_fd, 0, env->me_dbgeo.upper, POSIX_FADV_RANDOM); - if (unlikely(rc != 0)) - goto bailout; - } -#endif - /* save user's geo-params for future open/create */ env->me_dbgeo.lower = size_lower; env->me_dbgeo.now = size_now; @@ -6045,42 +7961,87 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, env->me_dbgeo.shrink = shrink_threshold; rc = MDBX_SUCCESS; - if (env->me_map) { - /* apply new params */ - mdbx_assert(env, pagesize == (intptr_t)env->me_psize); + mdbx_ensure(env, pagesize >= MIN_PAGESIZE); + mdbx_ensure(env, pagesize <= MAX_PAGESIZE); + mdbx_ensure(env, is_powerof2(pagesize)); + mdbx_ensure(env, is_powerof2(env->me_os_psize)); - MDBX_meta *head = mdbx_meta_head(env); - MDBX_meta meta = *head; - meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); - meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); - meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); - meta.mm_geo.grow = (uint16_t)bytes2pgno(env, env->me_dbgeo.grow); - meta.mm_geo.shrink = (uint16_t)bytes2pgno(env, 
env->me_dbgeo.shrink); - - mdbx_assert(env, env->me_dbgeo.lower >= MIN_MAPSIZE); - mdbx_assert(env, meta.mm_geo.lower >= MIN_PAGENO); - mdbx_assert(env, env->me_dbgeo.upper <= MAX_MAPSIZE); - mdbx_assert(env, meta.mm_geo.upper <= MAX_PAGENO); - mdbx_assert(env, meta.mm_geo.now >= meta.mm_geo.next); - mdbx_assert(env, env->me_dbgeo.upper >= env->me_dbgeo.lower); - mdbx_assert(env, meta.mm_geo.upper >= meta.mm_geo.now); - mdbx_assert(env, meta.mm_geo.now >= meta.mm_geo.lower); - mdbx_assert(env, meta.mm_geo.grow == bytes2pgno(env, env->me_dbgeo.grow)); - mdbx_assert(env, - meta.mm_geo.shrink == bytes2pgno(env, env->me_dbgeo.shrink)); + mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + mdbx_ensure(env, env->me_dbgeo.lower / pagesize >= MIN_PAGENO); + mdbx_ensure(env, env->me_dbgeo.lower % pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.lower % env->me_os_psize == 0); - if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { + mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + mdbx_ensure(env, env->me_dbgeo.upper / pagesize <= MAX_PAGENO); + mdbx_ensure(env, env->me_dbgeo.upper % pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.upper % env->me_os_psize == 0); + mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); + mdbx_ensure(env, env->me_dbgeo.now % pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.now % env->me_os_psize == 0); + + mdbx_ensure(env, env->me_dbgeo.grow % pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.grow % env->me_os_psize == 0); + mdbx_ensure(env, env->me_dbgeo.shrink % pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.shrink % env->me_os_psize == 0); + + if (env->me_map) { + /* apply new params to opened environment */ + mdbx_ensure(env, pagesize == (intptr_t)env->me_psize); + MDBX_meta meta; + MDBX_meta *head = nullptr; + const mdbx_geo_t *current_geo; + if (inside_txn) { + current_geo = &env->me_txn->mt_geo; + } else { + head = mdbx_meta_head(env); + meta = 
*head; + current_geo = &meta.mm_geo; + } + + mdbx_geo_t new_geo; + new_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + new_geo.now = bytes2pgno(env, env->me_dbgeo.now); + new_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + new_geo.grow = (uint16_t)bytes2pgno(env, env->me_dbgeo.grow); + new_geo.shrink = (uint16_t)bytes2pgno(env, env->me_dbgeo.shrink); + new_geo.next = current_geo->next; + + mdbx_ensure(env, + pgno_align2os_bytes(env, new_geo.lower) == env->me_dbgeo.lower); + mdbx_ensure(env, + pgno_align2os_bytes(env, new_geo.upper) == env->me_dbgeo.upper); + mdbx_ensure(env, + pgno_align2os_bytes(env, new_geo.now) == env->me_dbgeo.now); + mdbx_ensure(env, + pgno_align2os_bytes(env, new_geo.grow) == env->me_dbgeo.grow); + mdbx_ensure(env, pgno_align2os_bytes(env, new_geo.shrink) == + env->me_dbgeo.shrink); + + mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + mdbx_ensure(env, new_geo.lower >= MIN_PAGENO); + mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + mdbx_ensure(env, new_geo.upper <= MAX_PAGENO); + mdbx_ensure(env, new_geo.now >= new_geo.next); + mdbx_ensure(env, new_geo.upper >= new_geo.now); + mdbx_ensure(env, new_geo.now >= new_geo.lower); + + if (memcmp(current_geo, &new_geo, sizeof(mdbx_geo_t)) != 0) { #if defined(_WIN32) || defined(_WIN64) /* Was DB shrinking disabled before and now it will be enabled? 
*/ - if (meta.mm_geo.lower < meta.mm_geo.upper && meta.mm_geo.shrink && - !(head->mm_geo.lower < head->mm_geo.upper && head->mm_geo.shrink)) { + if (new_geo.lower < new_geo.upper && new_geo.shrink && + !(current_geo->lower < current_geo->upper && current_geo->shrink)) { + if (!env->me_lck) { + rc = MDBX_EPERM; + goto bailout; + } rc = mdbx_rdt_lock(env); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Check if there are any reading threads that do not use the SRWL */ - const mdbx_pid_t CurrentTid = GetCurrentThreadId(); + const size_t CurrentTid = GetCurrentThreadId(); const MDBX_reader *const begin = env->me_lck->mti_readers; const MDBX_reader *const end = begin + env->me_lck->mti_numreaders; for (const MDBX_reader *reader = begin; reader < end; ++reader) { @@ -6098,16 +8059,23 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, } #endif - if (meta.mm_geo.now != head->mm_geo.now || - meta.mm_geo.upper != head->mm_geo.upper) { - rc = mdbx_mapresize(env, meta.mm_geo.now, meta.mm_geo.upper); + if (new_geo.now != current_geo->now || + new_geo.upper != current_geo->upper) { + rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - head = /* base address could be changed */ mdbx_meta_head(env); + mdbx_assert(env, (head == nullptr) == inside_txn); + if (head) + head = /* base address could be changed */ mdbx_meta_head(env); + } + if (inside_txn) { + env->me_txn->mt_geo = new_geo; + env->me_txn->mt_flags |= MDBX_TXN_DIRTY; + } else { + mdbx_meta_set_txnid( + env, &meta, safe64_txnid_next(mdbx_meta_txnid_stable(env, head))); + rc = mdbx_sync_locked(env, env->me_flags, &meta); } - *env->me_unsynced_pages += 1; - mdbx_meta_set_txnid(env, &meta, mdbx_meta_txnid_stable(env, head) + 1); - rc = mdbx_sync_locked(env, env->me_flags, &meta); } } else if (pagesize != (intptr_t)env->me_psize) { mdbx_setup_pagesize(env, pagesize); @@ -6120,7 +8088,7 @@ bailout: } int __cold 
mdbx_env_set_mapsize(MDBX_env *env, size_t size) { - return mdbx_env_set_geometry(env, -1, size, -1, -1, -1, -1); + return mdbx_env_set_geometry(env, size, size, size, -1, -1, -1); } int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { @@ -6141,7 +8109,7 @@ int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { } int __cold mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { - if (unlikely(readers < 1 || readers > INT16_MAX)) + if (unlikely(readers < 1 || readers > MDBX_READERS_LIMIT)) return MDBX_EINVAL; if (unlikely(!env)) @@ -6170,21 +8138,21 @@ int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) { /* Further setup required for opening an MDBX environment */ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { - uint64_t filesize_before_mmap; + uint64_t filesize_before; MDBX_meta meta; int rc = MDBX_RESULT_FALSE; - int err = mdbx_read_header(env, &meta, &filesize_before_mmap); + int err = mdbx_read_header(env, &meta, &filesize_before, lck_rc); if (unlikely(err != MDBX_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || (env->me_flags & MDBX_RDONLY) != 0) return err; - mdbx_debug("create new database"); + mdbx_debug("%s", "create new database"); rc = /* new database */ MDBX_RESULT_TRUE; if (!env->me_dbgeo.now) { /* set defaults if not configured */ - err = mdbx_env_set_mapsize(env, DEFAULT_MAPSIZE); + err = mdbx_env_set_geometry(env, 0, -1, DEFAULT_MAPSIZE, -1, -1, -1); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -6193,65 +8161,66 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { if (!buffer) return MDBX_ENOMEM; - meta = mdbx_init_metas(env, buffer)->mp_meta; + meta = *mdbx_init_metas(env, buffer); err = mdbx_pwrite(env->me_fd, buffer, env->me_psize * NUM_METAS, 0); mdbx_free(buffer); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_ftruncate(env->me_fd, filesize_before_mmap = env->me_dbgeo.now); + err = mdbx_ftruncate(env->me_fd, 
filesize_before = env->me_dbgeo.now); if (unlikely(err != MDBX_SUCCESS)) return err; #ifndef NDEBUG /* just for checking */ - err = mdbx_read_header(env, &meta, &filesize_before_mmap); + err = mdbx_read_header(env, &meta, &filesize_before, lck_rc); if (unlikely(err != MDBX_SUCCESS)) return err; #endif } - mdbx_info("header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO - "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, - meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, - meta.mm_geo.upper, meta.mm_geo.grow, meta.mm_geo.shrink, - meta.mm_txnid_a, mdbx_durable_str(&meta)); + mdbx_verbose("header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, + meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, + meta.mm_geo.upper, meta.mm_geo.grow, meta.mm_geo.shrink, + meta.mm_txnid_a.inconsistent, mdbx_durable_str(&meta)); mdbx_setup_pagesize(env, meta.mm_psize); const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next); + const size_t used_aligned2os_bytes = + roundup_powerof2(used_bytes, env->me_os_psize); if ((env->me_flags & MDBX_RDONLY) /* readonly */ || lck_rc != MDBX_RESULT_TRUE /* not exclusive */) { /* use present params from db */ + const size_t pagesize = meta.mm_psize; err = mdbx_env_set_geometry( - env, meta.mm_geo.lower * (uint64_t)meta.mm_psize, - meta.mm_geo.now * (uint64_t)meta.mm_psize, - meta.mm_geo.upper * (uint64_t)meta.mm_psize, - meta.mm_geo.grow * (uint64_t)meta.mm_psize, - meta.mm_geo.shrink * (uint64_t)meta.mm_psize, meta.mm_psize); + env, meta.mm_geo.lower * pagesize, meta.mm_geo.now * pagesize, + meta.mm_geo.upper * pagesize, meta.mm_geo.grow * pagesize, + meta.mm_geo.shrink * pagesize, meta.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("could not use present dbsize-params from db"); 
+ mdbx_error("%s", "could not use present dbsize-params from db"); return MDBX_INCOMPATIBLE; } } else if (env->me_dbgeo.now) { /* silently growth to last used page */ - if (env->me_dbgeo.now < used_bytes) - env->me_dbgeo.now = used_bytes; - if (env->me_dbgeo.upper < used_bytes) - env->me_dbgeo.upper = used_bytes; + if (env->me_dbgeo.now < used_aligned2os_bytes) + env->me_dbgeo.now = used_aligned2os_bytes; + if (env->me_dbgeo.upper < used_aligned2os_bytes) + env->me_dbgeo.upper = used_aligned2os_bytes; /* apply preconfigured params, but only if substantial changes: * - upper or lower limit changes * - shrink threshold or growth step - * But ignore just chagne just a 'now/current' size. */ + * But ignore change just a 'now/current' size. */ if (bytes_align2os_bytes(env, env->me_dbgeo.upper) != - pgno_align2os_bytes(env, meta.mm_geo.upper) || + pgno2bytes(env, meta.mm_geo.upper) || bytes_align2os_bytes(env, env->me_dbgeo.lower) != - pgno_align2os_bytes(env, meta.mm_geo.lower) || + pgno2bytes(env, meta.mm_geo.lower) || bytes_align2os_bytes(env, env->me_dbgeo.shrink) != - pgno_align2os_bytes(env, meta.mm_geo.shrink) || + pgno2bytes(env, meta.mm_geo.shrink) || bytes_align2os_bytes(env, env->me_dbgeo.grow) != - pgno_align2os_bytes(env, meta.mm_geo.grow)) { + pgno2bytes(env, meta.mm_geo.grow)) { if (env->me_dbgeo.shrink && env->me_dbgeo.now > used_bytes) /* pre-shrink if enabled */ @@ -6262,7 +8231,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { env->me_dbgeo.upper, env->me_dbgeo.grow, env->me_dbgeo.shrink, meta.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("could not apply preconfigured dbsize-params to db"); + mdbx_error("%s", "could not apply preconfigured dbsize-params to db"); return MDBX_INCOMPATIBLE; } @@ -6273,18 +8242,22 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { meta.mm_geo.grow = (uint16_t)bytes2pgno(env, env->me_dbgeo.grow); meta.mm_geo.shrink = (uint16_t)bytes2pgno(env, env->me_dbgeo.shrink); 
- mdbx_info("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO - "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, - meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, - meta.mm_geo.upper, meta.mm_geo.grow, meta.mm_geo.shrink, - meta.mm_txnid_a, mdbx_durable_str(&meta)); + mdbx_verbose("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, + meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, + meta.mm_geo.upper, meta.mm_geo.grow, meta.mm_geo.shrink, + meta.mm_txnid_a.inconsistent, mdbx_durable_str(&meta)); + } else { + /* fetch back 'now/current' size, since it was ignored during comparison + * and may differ. */ + env->me_dbgeo.now = pgno_align2os_bytes(env, meta.mm_geo.now); } mdbx_ensure(env, meta.mm_geo.now >= meta.mm_geo.next); } else { - /* geo-params not pre-configured by user, - * get current values from a meta. */ + /* geo-params are not pre-configured by user, + * get current values from the meta. 
*/ env->me_dbgeo.now = pgno2bytes(env, meta.mm_geo.now); env->me_dbgeo.lower = pgno2bytes(env, meta.mm_geo.lower); env->me_dbgeo.upper = pgno2bytes(env, meta.mm_geo.upper); @@ -6292,56 +8265,86 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { env->me_dbgeo.shrink = pgno2bytes(env, meta.mm_geo.shrink); } - const size_t expected_bytes = - mdbx_roundup2(pgno2bytes(env, meta.mm_geo.now), env->me_os_psize); - mdbx_ensure(env, expected_bytes >= used_bytes); - if (filesize_before_mmap != expected_bytes) { + mdbx_ensure(env, + pgno_align2os_bytes(env, meta.mm_geo.now) == env->me_dbgeo.now); + mdbx_ensure(env, env->me_dbgeo.now >= used_bytes); + if (unlikely(filesize_before != env->me_dbgeo.now)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { - mdbx_info("filesize mismatch (expect %" PRIuPTR "/%" PRIaPGNO - ", have %" PRIu64 "/%" PRIaPGNO "), " - "assume collision in non-exclusive mode", - expected_bytes, bytes2pgno(env, expected_bytes), - filesize_before_mmap, - bytes2pgno(env, (size_t)filesize_before_mmap)); + mdbx_verbose("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p), " + "assume other process working", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); } else { - mdbx_notice("filesize mismatch (expect %" PRIuSIZE "/%" PRIaPGNO - ", have %" PRIu64 "/%" PRIaPGNO ")", - expected_bytes, bytes2pgno(env, expected_bytes), - filesize_before_mmap, - bytes2pgno(env, (size_t)filesize_before_mmap)); - if (filesize_before_mmap < used_bytes) { + mdbx_notice("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p)", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); + if (filesize_before < used_bytes) { mdbx_error("last-page beyond end-of-file (last %" PRIaPGNO ", have %" PRIaPGNO ")", - meta.mm_geo.next, - bytes2pgno(env, 
(size_t)filesize_before_mmap)); + meta.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); return MDBX_CORRUPTED; } if (env->me_flags & MDBX_RDONLY) { - if (filesize_before_mmap % env->me_os_psize) { - mdbx_error("filesize should be rounded-up to system page"); + if (filesize_before & (env->me_os_psize - 1)) { + mdbx_error("%s", "filesize should be rounded-up to system page"); return MDBX_WANNA_RECOVERY; } - mdbx_notice("ignore filesize mismatch in readonly-mode"); + mdbx_warning("%s", "ignore filesize mismatch in readonly-mode"); } else { - mdbx_info("resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO " pages", - expected_bytes, bytes2pgno(env, expected_bytes)); - err = mdbx_ftruncate(env->me_fd, expected_bytes); - if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("error %d, while resize datafile to %" PRIuSIZE - " bytes, %" PRIaPGNO " pages", - rc, expected_bytes, bytes2pgno(env, expected_bytes)); - return err; - } - filesize_before_mmap = expected_bytes; + mdbx_verbose("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO + " pages", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); } } } - err = mdbx_env_map(env, lck_rc /* exclusive status */, expected_bytes); - if (err != MDBX_SUCCESS) + mdbx_verbose("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", + bootid.x, bootid.y, (bootid.x | bootid.y) ? "" : "not-"); + + err = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + env->me_dbgeo.upper, lck_rc ? MMAP_OPTION_TRUNCATE : 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + +#if defined(MADV_DODUMP) && defined(MADV_DONTDUMP) + const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); + err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) return err; + err = madvise(env->me_map + meta_length_aligned2os, + env->me_dxb_mmap.current - meta_length_aligned2os, + (mdbx_runtime_flags & MDBX_DBG_DUMP) ? 
MADV_DODUMP + : MADV_DONTDUMP) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif +#ifdef MDBX_USE_VALGRIND + env->me_valgrind_handle = + VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); +#endif + + mdbx_assert(env, used_bytes >= pgno2bytes(env, NUM_METAS) && + used_bytes <= env->me_dxb_mmap.limit); +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + VALGRIND_MAKE_MEM_NOACCESS(env->me_map + used_bytes, + env->me_dxb_mmap.limit - used_bytes); + ASAN_POISON_MEMORY_REGION(env->me_map + used_bytes, + env->me_dxb_mmap.limit - used_bytes); + env->me_poison_edge = bytes2pgno(env, env->me_dxb_mmap.limit); +#endif /* MDBX_USE_VALGRIND */ + + /* NOTE: AddressSanitizer (at least GCC 7.x, 8.x) could generate + * false-positive alarm here. I have no other explanation for this + * except due to an internal ASAN error, as the problem is reproduced + * in a single-threaded application under the active assert() above. */ const unsigned meta_clash_mask = mdbx_meta_eq_mask(env); if (meta_clash_mask) { mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask); @@ -6349,9 +8352,11 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { } while (1) { - MDBX_meta *head = mdbx_meta_head(env); + MDBX_meta *const head = mdbx_meta_head(env); const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); - if (head_txnid == meta.mm_txnid_a) + MDBX_meta *const steady = mdbx_meta_steady(env); + const txnid_t steady_txnid = mdbx_meta_txnid_fluid(env, steady); + if (head_txnid == steady_txnid) break; if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { @@ -6359,39 +8364,61 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { if (env->me_flags & MDBX_RDONLY) { mdbx_error("rollback needed: (from head %" PRIaTXN " to steady %" PRIaTXN "), but unable in read-only mode", - head_txnid, meta.mm_txnid_a); + head_txnid, steady_txnid); return MDBX_WANNA_RECOVERY /* LY: could not 
recovery/rollback */; } + if (meta_bootid_match(head)) { + MDBX_meta clone = *head; + uint64_t filesize = env->me_dbgeo.now; + err = mdbx_validate_meta( + env, &clone, &filesize, data_page(head), + bytes2pgno(env, (uint8_t *)data_page(head) - env->me_map), nullptr, + env->me_psize); + if (err == MDBX_SUCCESS) { + mdbx_notice("opening after an unclean shutdown, " + "but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH, " + "rollback NOT needed", + bootid.x, bootid.y); + meta = clone; + *env->me_unsynced_pages = meta.mm_geo.next; + break; + } + mdbx_notice("opening after an unclean shutdown, " + "but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH, " + "but last meta not valid, rollback needed", + bootid.x, bootid.y); + } + const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); - txnid_t undo_txnid = 0; + txnid_t undo_txnid = 0 /* zero means undo is unneeded */; while ( (head != meta0 && mdbx_meta_txnid_fluid(env, meta0) == undo_txnid) || (head != meta1 && mdbx_meta_txnid_fluid(env, meta1) == undo_txnid) || (head != meta2 && mdbx_meta_txnid_fluid(env, meta2) == undo_txnid)) - undo_txnid += 1; - if (unlikely(undo_txnid >= meta.mm_txnid_a)) { + undo_txnid = safe64_txnid_next(undo_txnid); + if (unlikely(undo_txnid >= steady_txnid)) { mdbx_fatal("rollback failed: no suitable txnid (0,1,2) < %" PRIaTXN, - meta.mm_txnid_a); + steady_txnid); return MDBX_PANIC /* LY: could not recovery/rollback */; } /* LY: rollback weak checkpoint */ mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN " as %" PRIaTXN, - head_txnid, meta.mm_txnid_a, undo_txnid); + head_txnid, steady_txnid, undo_txnid); mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head)); if (env->me_flags & MDBX_WRITEMAP) { - head->mm_txnid_a = undo_txnid; + /* It is possible to update txnid without safe64_write(), + * since DB opened exclusive for now */ + head->mm_txnid_a.inconsistent = undo_txnid; 
head->mm_datasync_sign = MDBX_DATASIGN_WEAK; - head->mm_txnid_b = undo_txnid; - const size_t offset = - ((uint8_t *)container_of(head, MDBX_page, mp_meta)) - - env->me_dxb_mmap.dxb; + head->mm_txnid_b.inconsistent = undo_txnid; + const size_t offset = (uint8_t *)data_page(head) - env->me_dxb_mmap.dxb; const size_t paged_offset = offset & ~(env->me_os_psize - 1); - const size_t paged_length = mdbx_roundup2( + const size_t paged_length = roundup_powerof2( env->me_psize + offset - paged_offset, env->me_os_psize); err = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length, false); } else { @@ -6401,11 +8428,15 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); } - if (err) + if (err) { + mdbx_error("error %d rollback from %" PRIaTXN ", to %" PRIaTXN + " as %" PRIaTXN, + err, head_txnid, steady_txnid, undo_txnid); return err; + } - mdbx_invalidate_mmap_noncoherent_cache(env->me_map, - pgno2bytes(env, NUM_METAS)); + mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); mdbx_ensure(env, undo_txnid == mdbx_meta_txnid_fluid(env, head)); mdbx_ensure(env, 0 == mdbx_meta_eq_mask(env)); continue; @@ -6414,78 +8445,115 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { if (!env->me_lck) { /* LY: without-lck (read-only) mode, so it is imposible that other * process made weak checkpoint. 
*/ - mdbx_error("without-lck, unable recovery/rollback"); + mdbx_error("%s", "without-lck, unable recovery/rollback"); return MDBX_WANNA_RECOVERY; } /* LY: assume just have a collision with other running process, * or someone make a weak checkpoint */ - mdbx_info("assume collision or online weak checkpoint"); + mdbx_verbose("%s", "assume collision or online weak checkpoint"); break; } const MDBX_meta *head = mdbx_meta_head(env); if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { - /* re-check file size after mmap */ - uint64_t filesize_after_mmap; - err = mdbx_filesize(env->me_fd, &filesize_after_mmap); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (filesize_after_mmap != expected_bytes) { - if (filesize_after_mmap != filesize_before_mmap) - mdbx_info("datafile resized by system to %" PRIu64 " bytes", - filesize_after_mmap); - if (filesize_after_mmap % env->me_os_psize || - filesize_after_mmap > env->me_dbgeo.upper || - filesize_after_mmap < used_bytes) { - mdbx_info("unacceptable/unexpected datafile size %" PRIu64, - filesize_after_mmap); - return MDBX_PROBLEM; - } - if ((env->me_flags & MDBX_RDONLY) == 0) { - meta.mm_geo.now = - bytes2pgno(env, env->me_dbgeo.now = (size_t)filesize_after_mmap); - mdbx_info("update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO - " pages", - env->me_dbgeo.now, meta.mm_geo.now); - } + /* re-check size after mmap */ + if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 || + env->me_dxb_mmap.current < used_bytes) { + mdbx_error("unacceptable/unexpected datafile size %" PRIuPTR, + env->me_dxb_mmap.current); + return MDBX_PROBLEM; + } + if (env->me_dxb_mmap.current != env->me_dbgeo.now && + (env->me_flags & MDBX_RDONLY) == 0) { + meta.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); + mdbx_verbose("update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO + " pages", + env->me_dxb_mmap.current, meta.mm_geo.now); } if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { const txnid_t txnid = 
mdbx_meta_txnid_stable(env, head); - mdbx_info("updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN ")", - head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - head->mm_geo.shrink, head->mm_geo.grow, txnid, - meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, - meta.mm_geo.shrink, meta.mm_geo.grow, txnid + 1); + const txnid_t next_txnid = safe64_txnid_next(txnid); + mdbx_verbose("updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, + head->mm_geo.shrink, head->mm_geo.grow, txnid, + meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, + meta.mm_geo.shrink, meta.mm_geo.grow, next_txnid); mdbx_ensure(env, mdbx_meta_eq(env, &meta, head)); - mdbx_meta_set_txnid(env, &meta, txnid + 1); - *env->me_unsynced_pages += 1; + mdbx_meta_set_txnid(env, &meta, next_txnid); err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); if (err) { - mdbx_info("error %d, while updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN ")", - err, head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - head->mm_geo.shrink, head->mm_geo.grow, txnid, - meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, - meta.mm_geo.shrink, meta.mm_geo.grow, txnid + 1); + mdbx_error("error %d, while updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + err, head->mm_geo.lower, head->mm_geo.now, + head->mm_geo.upper, head->mm_geo.shrink, head->mm_geo.grow, + txnid, meta.mm_geo.lower, 
meta.mm_geo.now, meta.mm_geo.upper, + meta.mm_geo.shrink, meta.mm_geo.grow, next_txnid); return err; } } } + *env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes); + if (used_aligned2os_bytes < env->me_dxb_mmap.current) { +#if defined(MADV_REMOVE) + if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0) { + mdbx_notice("open-MADV_%s %u..%u", "REMOVE", *env->me_discarded_tail, + bytes2pgno(env, env->me_dxb_mmap.current)); + err = + madvise(env->me_map + used_aligned2os_bytes, + env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } +#endif /* MADV_REMOVE */ +#if defined(MADV_DONTNEED) + mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", *env->me_discarded_tail, + bytes2pgno(env, env->me_dxb_mmap.current)); + err = + madvise(env->me_map + used_aligned2os_bytes, + env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_DONTNEED) + err = ignore_enosys(posix_madvise( + env->me_map + used_aligned2os_bytes, + env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_FADV_DONTNEED) + err = ignore_enosys(posix_fadvise( + env->me_fd, used_aligned2os_bytes, + env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif /* MADV_DONTNEED */ + } + + const bool readahead = (env->me_flags & MDBX_NORDAHEAD) == 0 && + mdbx_is_readahead_reasonable(env->me_dxb_mmap.current, + 0) == MDBX_RESULT_TRUE; + err = mdbx_set_readahead(env, 0, used_bytes, readahead); + if (err != MDBX_SUCCESS && lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) + return err; + return rc; } -/****************************************************************************/ 
+/******************************************************************************/ /* Open and/or initialize the lock region for the environment. */ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, @@ -6497,136 +8565,297 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, (env->me_flags & MDBX_EXCLUSIVE) ? true : false); if (err != MDBX_SUCCESS) { if (!(err == MDBX_ENOFILE && (env->me_flags & MDBX_EXCLUSIVE)) && - !(err == MDBX_EROFS && (env->me_flags & MDBX_RDONLY))) + !((err == MDBX_EROFS || err == MDBX_EACCESS || err == MDBX_EPERM) && + (env->me_flags & MDBX_RDONLY))) + return err; + + /* ensure the file system is read-only */ + err = mdbx_check_fs_rdonly(env->me_fd, lck_pathname, err); + if (err != MDBX_SUCCESS) return err; /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ + /* beginning of a locked section ---------------------------------------- */ + lcklist_lock(); + mdbx_assert(env, env->me_lcklist_next == nullptr); env->me_lfd = INVALID_HANDLE_VALUE; const int rc = mdbx_lck_seize(env); - if (MDBX_IS_ERROR(rc)) + if (MDBX_IS_ERROR(rc)) { + /* Calling lcklist_detach_locked() is required to restore POSIX-filelock + * and this job will be done by mdbx_env_close0(). 
*/ + lcklist_unlock(); return rc; + } + /* insert into inprocess lck-list */ + env->me_lcklist_next = inprocess_lcklist_head; + inprocess_lcklist_head = env; + lcklist_unlock(); + /* end of a locked section ---------------------------------------------- */ env->me_oldest = &env->me_lckless_stub.oldest; - env->me_unsynced_timeout = &env->me_lckless_stub.unsynced_timeout; + env->me_sync_timestamp = &env->me_lckless_stub.sync_timestamp; env->me_autosync_period = &env->me_lckless_stub.autosync_period; env->me_unsynced_pages = &env->me_lckless_stub.autosync_pending; env->me_autosync_threshold = &env->me_lckless_stub.autosync_threshold; + env->me_discarded_tail = &env->me_lckless_stub.discarded_tail; + env->me_meta_sync_txnid = &env->me_lckless_stub.meta_sync_txnid; env->me_maxreaders = UINT_MAX; -#ifdef MDBX_OSAL_LOCK - env->me_wmutex = &env->me_lckless_stub.wmutex; -#endif +#if MDBX_LOCKING > 0 + env->me_wlock = &env->me_lckless_stub.wlock; +#endif /* MDBX_LOCKING > 0 */ mdbx_debug("lck-setup:%s%s%s", " lck-less", (env->me_flags & MDBX_RDONLY) ? " readonly" : "", (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); return rc; } + /* beginning of a locked section ------------------------------------------ */ + lcklist_lock(); + mdbx_assert(env, env->me_lcklist_next == nullptr); + /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. */ - const int rc = mdbx_lck_seize(env); - if (MDBX_IS_ERROR(rc)) - return rc; + err = mdbx_lck_seize(env); + if (MDBX_IS_ERROR(err)) { + bailout: + /* Calling lcklist_detach_locked() is required to restore POSIX-filelock + * and this job will be done by mdbx_env_close0(). 
*/ + lcklist_unlock(); + return err; + } + + MDBX_env *inprocess_neighbor = nullptr; + if (err == MDBX_RESULT_TRUE) { + err = uniq_check(&env->me_lck_mmap, &inprocess_neighbor); + if (MDBX_IS_ERROR(err)) + goto bailout; + if (inprocess_neighbor && + ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) { + err = MDBX_BUSY; + goto bailout; + } + } + const int lck_seize_rc = err; mdbx_debug("lck-setup:%s%s%s", " with-lck", (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); + (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" + : " cooperative"); - uint64_t size; + uint64_t size = 0; err = mdbx_filesize(env->me_lfd, &size); if (unlikely(err != MDBX_SUCCESS)) - return err; + goto bailout; - if (rc == MDBX_RESULT_TRUE) { - uint64_t wanna = mdbx_roundup2( - (env->me_maxreaders - 1) * sizeof(MDBX_reader) + sizeof(MDBX_lockinfo), - env->me_os_psize); -#ifndef NDEBUG - err = mdbx_ftruncate(env->me_lfd, size = 0); - if (unlikely(err != MDBX_SUCCESS)) - return err; -#endif + if (lck_seize_rc == MDBX_RESULT_TRUE) { + size = roundup_powerof2(env->me_maxreaders * sizeof(MDBX_reader) + + sizeof(MDBX_lockinfo), + env->me_os_psize); mdbx_jitter4testing(false); - - if (size != wanna) { - err = mdbx_ftruncate(env->me_lfd, wanna); - if (unlikely(err != MDBX_SUCCESS)) - return err; - size = wanna; - } } else { - if (env->me_flags & MDBX_EXCLUSIVE) - return MDBX_BUSY; - if (size > PTRDIFF_MAX || (size & (env->me_os_psize - 1)) || + if (env->me_flags & MDBX_EXCLUSIVE) { + err = MDBX_BUSY; + goto bailout; + } + if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 || size < env->me_os_psize) { - mdbx_notice("lck-file has invalid size %" PRIu64 " bytes", size); - return MDBX_PROBLEM; + mdbx_error("lck-file has invalid size %" PRIu64 " bytes", size); + err = MDBX_PROBLEM; + goto bailout; } } const size_t maxreaders = - ((size_t)size - sizeof(MDBX_lockinfo)) / 
sizeof(MDBX_reader) + 1; - if (maxreaders > UINT16_MAX) { + ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); + if (size > 65536 || maxreaders < 2 || maxreaders > MDBX_READERS_LIMIT) { mdbx_error("lck-size too big (up to %" PRIuPTR " readers)", maxreaders); - return MDBX_PROBLEM; + err = MDBX_PROBLEM; + goto bailout; } env->me_maxreaders = (unsigned)maxreaders; - err = mdbx_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size); + err = mdbx_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size, + lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE + : MMAP_OPTION_SEMAPHORE); if (unlikely(err != MDBX_SUCCESS)) - return err; + goto bailout; #ifdef MADV_DODUMP - (void)madvise(env->me_lck, size, MADV_DODUMP); -#endif - -#ifdef MADV_DONTFORK - if (madvise(env->me_lck, size, MADV_DONTFORK) < 0) - return errno; -#endif + err = madvise(env->me_lck, size, MADV_DODUMP) ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; +#endif /* MADV_DODUMP */ #ifdef MADV_WILLNEED - if (madvise(env->me_lck, size, MADV_WILLNEED) < 0) - return errno; -#endif - -#ifdef MADV_RANDOM - if (madvise(env->me_lck, size, MADV_RANDOM) < 0) - return errno; -#endif - - if (rc == MDBX_RESULT_TRUE) { - /* LY: exlcusive mode, init lck */ - memset(env->me_lck, 0, (size_t)size); - err = mdbx_lck_init(env); - if (err) - return err; + err = madvise(env->me_lck, size, MADV_WILLNEED) ? 
ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; +#endif /* MADV_WILLNEED */ - env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC; - env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT; + struct MDBX_lockinfo *const lck = env->me_lck; + if (lck_seize_rc == MDBX_RESULT_TRUE) { + /* LY: exlcusive mode, check and reset lck content */ + memset(lck, 0, (size_t)size); + mdbx_jitter4testing(false); + lck->mti_magic_and_version = MDBX_LOCK_MAGIC; + lck->mti_os_and_format = MDBX_LOCK_FORMAT; } else { - if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { - mdbx_error("lock region has invalid magic/version"); - return ((env->me_lck->mti_magic_and_version >> 8) != MDBX_MAGIC) - ? MDBX_INVALID - : MDBX_VERSION_MISMATCH; + if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { + mdbx_error("%s", "lock region has invalid magic/version"); + err = ((lck->mti_magic_and_version >> 8) != MDBX_MAGIC) + ? MDBX_INVALID + : MDBX_VERSION_MISMATCH; + goto bailout; } - if (env->me_lck->mti_os_and_format != MDBX_LOCK_FORMAT) { + if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) { mdbx_error("lock region has os/format 0x%" PRIx32 ", expected 0x%" PRIx32, - env->me_lck->mti_os_and_format, MDBX_LOCK_FORMAT); - return MDBX_VERSION_MISMATCH; + lck->mti_os_and_format, MDBX_LOCK_FORMAT); + err = MDBX_VERSION_MISMATCH; + goto bailout; } } - mdbx_assert(env, !MDBX_IS_ERROR(rc)); - env->me_oldest = &env->me_lck->mti_oldest_reader; - env->me_unsynced_timeout = &env->me_lck->mti_unsynced_timeout; - env->me_autosync_period = &env->me_lck->mti_autosync_period; - env->me_unsynced_pages = &env->me_lck->mti_unsynced_pages; - env->me_autosync_threshold = &env->me_lck->mti_autosync_threshold; -#ifdef MDBX_OSAL_LOCK - env->me_wmutex = &env->me_lck->mti_wmutex; + err = mdbx_lck_init(env, inprocess_neighbor, lck_seize_rc); + if (MDBX_IS_ERROR(err)) + goto bailout; + + mdbx_ensure(env, env->me_lcklist_next == nullptr); + /* insert into inprocess lck-list */ + 
env->me_lcklist_next = inprocess_lcklist_head; + inprocess_lcklist_head = env; + lcklist_unlock(); + /* end of a locked section ------------------------------------------------ */ + + mdbx_assert(env, !MDBX_IS_ERROR(lck_seize_rc)); + env->me_oldest = &lck->mti_oldest_reader; + env->me_sync_timestamp = &lck->mti_sync_timestamp; + env->me_autosync_period = &lck->mti_autosync_period; + env->me_unsynced_pages = &lck->mti_unsynced_pages; + env->me_autosync_threshold = &lck->mti_autosync_threshold; + env->me_discarded_tail = &lck->mti_discarded_tail; + env->me_meta_sync_txnid = &lck->mti_meta_sync_txnid; +#if MDBX_LOCKING > 0 + env->me_wlock = &lck->mti_wlock; +#endif /* MDBX_LOCKING > 0 */ + return lck_seize_rc; +} + +__cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { + if (volume <= 1024 * 1024 * 4ul) + return MDBX_RESULT_TRUE; + + const intptr_t pagesize = mdbx_syspagesize(); + if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize))) + return MDBX_INCOMPATIBLE; + +#if defined(_WIN32) || defined(_WIN64) + MEMORYSTATUSEX info; + memset(&info, 0, sizeof(info)); + info.dwLength = sizeof(info); + if (!GlobalMemoryStatusEx(&info)) + return GetLastError(); #endif - return rc; + const int log2page = log2n(pagesize); + +#if defined(_WIN32) || defined(_WIN64) + const intptr_t total_ram_pages = (intptr_t)(info.ullTotalPhys >> log2page); +#elif defined(_SC_PHYS_PAGES) + const intptr_t total_ram_pages = sysconf(_SC_PHYS_PAGES); + if (total_ram_pages == -1) + return errno; +#elif defined(_SC_AIX_REALMEM) + const intptr_t total_ram_Kb = sysconf(_SC_AIX_REALMEM); + if (total_ram_Kb == -1) + return errno; + const intptr_t total_ram_pages = (total_ram_Kb << 10) >> log2page; +#elif defined(HW_USERMEM) || defined(HW_PHYSMEM64) || defined(HW_MEMSIZE) || \ + defined(HW_PHYSMEM) + size_t ram, len = sizeof(ram); + static const int mib[] = { + CTL_HW, +#if defined(HW_USERMEM) + HW_USERMEM +#elif defined(HW_PHYSMEM64) + HW_PHYSMEM64 +#elif defined(HW_MEMSIZE) 
+ HW_MEMSIZE +#else + HW_PHYSMEM +#endif + }; + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &ram, &len, NULL, 0) != 0) + return errno; + if (len != sizeof(ram)) + return MDBX_ENOSYS; + const intptr_t total_ram_pages = (intptr_t)(ram >> log2page); +#else +#error "FIXME: Get User-accessible or physical RAM" +#endif + if (total_ram_pages < 1) + return MDBX_ENOSYS; + + const intptr_t volume_pages = (volume + pagesize - 1) >> log2page; + const intptr_t redundancy_pages = + (redundancy < 0) ? -(intptr_t)((-redundancy + pagesize - 1) >> log2page) + : (intptr_t)(redundancy + pagesize - 1) >> log2page; + if (volume_pages >= total_ram_pages || + volume_pages + redundancy_pages >= total_ram_pages) + return MDBX_RESULT_FALSE; + +#if defined(_WIN32) || defined(_WIN64) + const intptr_t avail_ram_pages = (intptr_t)(info.ullAvailPhys >> log2page); +#elif defined(_SC_AVPHYS_PAGES) + const intptr_t avail_ram_pages = sysconf(_SC_AVPHYS_PAGES); + if (avail_ram_pages == -1) + return errno; +#elif defined(__MACH__) + mach_msg_type_number_t count = HOST_VM_INFO_COUNT; + vm_statistics_data_t vmstat; + mach_port_t mport = mach_host_self(); + kern_return_t kerr = host_statistics(mach_host_self(), HOST_VM_INFO, + (host_info_t)&vmstat, &count); + mach_port_deallocate(mach_task_self(), mport); + if (unlikely(kerr != KERN_SUCCESS)) + return MDBX_ENOSYS; + const intptr_t avail_ram_pages = vmstat.free_count; +#elif defined(VM_TOTAL) || defined(VM_METER) + struct vmtotal info; + size_t len = sizeof(info); + static const int mib[] = { + CTL_VM, +#if defined(VM_TOTAL) + VM_TOTAL +#elif defined(VM_METER) + VM_METER +#endif + }; + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &info, &len, NULL, 0) != 0) + return errno; + if (len != sizeof(info)) + return MDBX_ENOSYS; + const intptr_t avail_ram_pages = info.t_free; +#else +#error "FIXME: Get Available RAM" +#endif + if (avail_ram_pages < 1) + return 
MDBX_ENOSYS; + + return (volume_pages + redundancy_pages >= avail_ram_pages) + ? MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; } /* Only a subset of the mdbx_env flags can be changed @@ -6634,7 +8863,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, * environment and re-opening it with the new flags. */ #define CHANGEABLE \ (MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC | MDBX_NOMEMINIT | \ - MDBX_COALESCE | MDBX_PAGEPERTURB) + MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) #define CHANGELESS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -6685,11 +8914,25 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, /* LY: silently ignore irrelevant flags when * we're only getting read access */ flags &= ~(MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NOSYNC | MDBX_NOMETASYNC | - MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_NOMEMINIT); + MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_NOMEMINIT | MDBX_ACCEDE); } else { - if (!((env->me_free_pgs = mdbx_pnl_alloc(MDBX_PNL_INITIAL)) && - (env->me_dirtylist = - mdbx_calloc(MDBX_DPL_TXNFULL + 1, sizeof(MDBX_DP))))) +#ifdef __OpenBSD__ + /* Temporary `workaround` for OpenBSD kernel's bug. 
+ * See https://github.com/leo-yuriev/libmdbx/issues/67 */ + if ((flags & MDBX_WRITEMAP) == 0) { + if (flags & MDBX_ACCEDE) + flags |= MDBX_WRITEMAP; + else { + mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, + "OpenBSD requires MDBX_WRITEMAP because of an internal " + "bug(s) in a file/buffer/page cache.\n"); + rc = 42 /* ENOPROTOOPT */; + goto bailout; + } + } +#endif /* __OpenBSD__ */ + env->me_dirtylist = mdbx_calloc(MDBX_DPL_TXNFULL + 1, sizeof(MDBX_DP)); + if (!env->me_dirtylist) rc = MDBX_ENOMEM; } @@ -6706,14 +8949,36 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, rc = MDBX_ENOMEM; goto bailout; } - env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_ai; /* aligned MDBX_INTEGERKEY */ + env->me_dbxs[FREE_DBI].md_cmp = + mdbx_cmp_int_align4; /* aligned MDBX_INTEGERKEY */ int oflags; if (F_ISSET(flags, MDBX_RDONLY)) oflags = O_RDONLY; - else if (mode != 0) + else if (mode != 0) { + if ((flags & MDBX_NOSUBDIR) == 0) { +#if defined(_WIN32) || defined(_WIN64) + if (!CreateDirectoryA(path, nullptr)) { + rc = GetLastError(); + if (rc != ERROR_ALREADY_EXISTS) + goto bailout; + } +#else + const mode_t dir_mode = + (/* inherit read/write permissions for group and others */ mode & + (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | + /* always add read/write/search for owner */ S_IRWXU | + ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) | + ((mode & S_IROTH) ? 
/* +search if readable by others */ S_IXOTH : 0); + if (mkdir(path, dir_mode)) { + rc = errno; + if (rc != EEXIST) + goto bailout; + } +#endif + } oflags = O_RDWR | O_CREAT; - else + } else oflags = O_RDWR; rc = mdbx_openfile(dxb_pathname, oflags, mode, &env->me_fd, @@ -6721,12 +8986,61 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, if (rc != MDBX_SUCCESS) goto bailout; +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + env->me_sysv_ipc.key = ftok(dxb_pathname, 42); + if (env->me_sysv_ipc.key == -1) { + rc = errno; + goto bailout; + } +#endif /* MDBX_LOCKING */ + +#if !(defined(_WIN32) || defined(_WIN64)) + if (mode == 0) { + struct stat st; + if (fstat(env->me_fd, &st)) { + rc = errno; + goto bailout; + } + mode = st.st_mode; + } +#endif + const int lck_rc = mdbx_setup_lck(env, lck_pathname, mode); if (MDBX_IS_ERROR(lck_rc)) { rc = lck_rc; goto bailout; } + const unsigned rigorous_flags = MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_MAPASYNC; + const unsigned mode_flags = rigorous_flags | MDBX_NOMETASYNC | + MDBX_LIFORECLAIM | MDBX_COALESCE | MDBX_NORDAHEAD; + + if (env->me_lck && lck_rc != MDBX_RESULT_TRUE && + (env->me_flags & MDBX_RDONLY) == 0) { + while (env->me_lck->mti_envmode == MDBX_RDONLY) { + if (atomic_cas32(&env->me_lck->mti_envmode, MDBX_RDONLY, + env->me_flags & mode_flags)) + break; + atomic_yield(); + } + + if (env->me_flags & MDBX_ACCEDE) { + /* pickup current mode-flags, including MDBX_LIFORECLAIM | + * MDBX_COALESCE | MDBX_NORDAHEAD */ + const unsigned diff = + (env->me_lck->mti_envmode ^ env->me_flags) & mode_flags; + mdbx_notice("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, + env->me_flags ^ diff); + env->me_flags ^= diff; + } + + if ((env->me_lck->mti_envmode ^ env->me_flags) & rigorous_flags) { + mdbx_error("%s", "current mode/flags incompatible with requested"); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + } + const int dxb_rc = mdbx_setup_dxb(env, lck_rc); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; @@ -6735,36 
+9049,17 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, mdbx_debug("opened dbenv %p", (void *)env); if (env->me_lck) { - const unsigned mode_flags = - MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC; if (lck_rc == MDBX_RESULT_TRUE) { env->me_lck->mti_envmode = env->me_flags & (mode_flags | MDBX_RDONLY); - if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { - /* LY: downgrade lock only if exclusive access not requested. - * in case exclusive==1, just leave value as is. */ - rc = mdbx_lck_downgrade(env, true); - mdbx_debug("lck-downgrade-full: rc %i ", rc); - } else { - rc = mdbx_lck_downgrade(env, false); - mdbx_debug("lck-downgrade-partial: rc %i ", rc); - } + rc = mdbx_lck_downgrade(env); + mdbx_debug("lck-downgrade-%s: rc %i", + (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); if (rc != MDBX_SUCCESS) goto bailout; } else { - if ((env->me_flags & MDBX_RDONLY) == 0) { - while (env->me_lck->mti_envmode == MDBX_RDONLY) { - if (mdbx_atomic_compare_and_swap32(&env->me_lck->mti_envmode, - MDBX_RDONLY, - env->me_flags & mode_flags)) - break; - /* TODO: yield/relax cpu */ - } - if ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) { - mdbx_error("current mode/flags incompatible with requested"); - rc = MDBX_INCOMPATIBLE; - goto bailout; - } - } + rc = mdbx_reader_check0(env, false, NULL); + if (MDBX_IS_ERROR(rc)) + goto bailout; } if ((env->me_flags & MDBX_NOTLS) == 0) { @@ -6777,12 +9072,14 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, } if ((flags & MDBX_RDONLY) == 0) { + rc = MDBX_ENOMEM; MDBX_txn *txn; int tsize = sizeof(MDBX_txn), size = tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + sizeof(unsigned) + 1); - if ((env->me_pbuf = mdbx_calloc(2, env->me_psize)) && + if ((env->me_pbuf = mdbx_calloc( + 1 /* page buffer */ + 1 /* page killer bufer */, env->me_psize)) && (txn = mdbx_calloc(1, size))) { txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); txn->mt_cursors = 
(MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); @@ -6792,8 +9089,10 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDBX_TXN_FINISHED; env->me_txn0 = txn; - } else { - rc = MDBX_ENOMEM; + txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + if (txn->tw.retired_pages && txn->tw.reclaimed_pglist) + rc = MDBX_SUCCESS; } } @@ -6805,20 +9104,20 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, mdbx_debug("opened database version %u, pagesize %u", (uint8_t)meta->mm_magic_and_version, env->me_psize); mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN, - container_of(meta, MDBX_page, mp_data)->mp_pgno, - mdbx_meta_txnid_fluid(env, meta)); + data_page(meta)->mp_pgno, mdbx_meta_txnid_fluid(env, meta)); mdbx_debug("depth: %u", db->md_depth); mdbx_debug("entries: %" PRIu64, db->md_entries); mdbx_debug("branch pages: %" PRIaPGNO, db->md_branch_pages); mdbx_debug("leaf pages: %" PRIaPGNO, db->md_leaf_pages); mdbx_debug("overflow pages: %" PRIaPGNO, db->md_overflow_pages); mdbx_debug("root: %" PRIaPGNO, db->md_root); + mdbx_debug("schema_altered: %" PRIaTXN, db->md_mod_txnid); } #endif bailout: if (rc) { - mdbx_env_close0(env); + rc = mdbx_env_close0(env) ? 
MDBX_PANIC : rc; env->me_flags = saved_me_flags | MDBX_FATAL_ERROR; } mdbx_free(lck_pathname); @@ -6826,37 +9125,30 @@ bailout: } /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -static void __cold mdbx_env_close0(MDBX_env *env) { - if (!(env->me_flags & MDBX_ENV_ACTIVE)) - return; - env->me_flags &= ~MDBX_ENV_ACTIVE; - - /* Doing this here since me_dbxs may not exist during mdbx_env_close */ - if (env->me_dbxs) { - for (unsigned i = env->me_maxdbs; --i >= CORE_DBS;) - mdbx_free(env->me_dbxs[i].md_name.iov_base); - mdbx_free(env->me_dbxs); - } - - mdbx_free(env->me_pbuf); - mdbx_free(env->me_dbiseqs); - mdbx_free(env->me_dbflags); - mdbx_free(env->me_path); - mdbx_free(env->me_dirtylist); - if (env->me_txn0) { - mdbx_txl_free(env->me_txn0->mt_lifo_reclaimed); - mdbx_free(env->me_txn0); +static int __cold mdbx_env_close0(MDBX_env *env) { + if (!(env->me_flags & MDBX_ENV_ACTIVE)) { + mdbx_ensure(env, env->me_lcklist_next == nullptr); + return MDBX_SUCCESS; } - mdbx_pnl_free(env->me_free_pgs); + env->me_flags &= ~MDBX_ENV_ACTIVE; + env->me_oldest = nullptr; + env->me_sync_timestamp = nullptr; + env->me_autosync_period = nullptr; + env->me_unsynced_pages = nullptr; + env->me_autosync_threshold = nullptr; + env->me_discarded_tail = nullptr; + env->me_meta_sync_txnid = nullptr; if (env->me_flags & MDBX_ENV_TXKEY) mdbx_rthc_remove(env->me_txkey); - if (env->me_live_reader) - (void)mdbx_rpid_clear(env); + + lcklist_lock(); + const int rc = lcklist_detach_locked(env); + lcklist_unlock(); if (env->me_map) { mdbx_munmap(&env->me_dxb_mmap); -#ifdef USE_VALGRIND +#ifdef MDBX_USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; #endif @@ -6868,18 +9160,31 @@ static void __cold mdbx_env_close0(MDBX_env *env) { if (env->me_lck) mdbx_munmap(&env->me_lck_mmap); - env->me_oldest = nullptr; - env->me_unsynced_timeout = nullptr; - env->me_autosync_period = nullptr; - env->me_unsynced_pages = nullptr; - env->me_autosync_threshold = 
nullptr; - mdbx_lck_destroy(env); if (env->me_lfd != INVALID_HANDLE_VALUE) { (void)mdbx_closefile(env->me_lfd); env->me_lfd = INVALID_HANDLE_VALUE; } + + if (env->me_dbxs) { + for (unsigned i = env->me_maxdbs; --i >= CORE_DBS;) + mdbx_free(env->me_dbxs[i].md_name.iov_base); + mdbx_free(env->me_dbxs); + } + mdbx_free(env->me_pbuf); + mdbx_free(env->me_dbiseqs); + mdbx_free(env->me_dbflags); + mdbx_free(env->me_path); + mdbx_free(env->me_dirtylist); + if (env->me_txn0) { + mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed); + mdbx_pnl_free(env->me_txn0->tw.retired_pages); + mdbx_pnl_free(env->me_txn0->tw.spill_pages); + mdbx_pnl_free(env->me_txn0->tw.reclaimed_pglist); + mdbx_free(env->me_txn0); + } env->me_flags = 0; + return rc; } int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { @@ -6892,9 +9197,17 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if ((env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0) { - if (env->me_txn0 && env->me_txn0->mt_owner && - env->me_txn0->mt_owner != mdbx_thread_self()) +#if MDBX_TXN_CHECKPID || !(defined(_WIN32) || defined(_WIN64)) + /* Check the PID even if MDBX_TXN_CHECKPID=0 on non-Windows + * platforms (i.e. where fork() is available). + * This is required to legitimize a call after fork() + * from a child process, that should be allowed to free resources. */ + if (unlikely(env->me_pid != mdbx_getpid())) + env->me_flags |= MDBX_FATAL_ERROR; +#endif /* MDBX_TXN_CHECKPID */ + + if ((env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0 && env->me_txn0) { + if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != mdbx_thread_self()) return MDBX_BUSY; if (!dont_sync) { #if defined(_WIN32) || defined(_WIN64) @@ -6903,25 +9216,31 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { * Because in the "owner died" condition kernel don't release * file lock immediately. 
*/ rc = mdbx_env_sync_ex(env, true, false); + rc = (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; #else - rc = mdbx_env_sync_ex(env, true, true); - rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY || - rc == EWOULDBLOCK) - ? MDBX_SUCCESS - : rc; + struct stat st; + if (unlikely(fstat(env->me_fd, &st))) + rc = errno; + else if (st.st_nlink > 0 /* don't sync deleted files */) { + rc = mdbx_env_sync_ex(env, true, true); + rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY || + rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE) + ? MDBX_SUCCESS + : rc; + } #endif } } - VALGRIND_DESTROY_MEMPOOL(env); while ((dp = env->me_dpages) != NULL) { ASAN_UNPOISON_MEMORY_REGION(&dp->mp_next, sizeof(dp->mp_next)); VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); env->me_dpages = dp->mp_next; mdbx_free(dp); } + VALGRIND_DESTROY_MEMPOOL(env); - mdbx_env_close0(env); + rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); #if defined(_WIN32) || defined(_WIN64) /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */ @@ -6931,11 +9250,11 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); #endif /* Windows */ -#ifdef MDBX_OSAL_LOCK - mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_lckless_stub.wmutex) == - MDBX_SUCCESS); -#endif +#if MDBX_LOCKING > MDBX_LOCKING_SYSV + mdbx_ensure(env, mdbx_ipclock_destroy(&env->me_lckless_stub.wlock) == 0); +#endif /* MDBX_LOCKING */ + mdbx_ensure(env, env->me_lcklist_next == nullptr); env->me_pid = 0; env->me_signature = 0; mdbx_free(env); @@ -6948,139 +9267,82 @@ __cold int mdbx_env_close(MDBX_env *env) { } /* Compare two items pointing at aligned unsigned int's. 
*/ -static int __hot mdbx_cmp_int_ai(const MDBX_val *a, const MDBX_val *b) { +static int __hot mdbx_cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { mdbx_assert(NULL, a->iov_len == b->iov_len); - mdbx_assert(NULL, 0 == (uintptr_t)a->iov_base % sizeof(int) && - 0 == (uintptr_t)b->iov_base % sizeof(int)); switch (a->iov_len) { case 4: - return mdbx_cmp2int(*(uint32_t *)a->iov_base, *(uint32_t *)b->iov_base); + return CMP2INT(unaligned_peek_u32(4, a->iov_base), + unaligned_peek_u32(4, b->iov_base)); case 8: - return mdbx_cmp2int(*(uint64_t *)a->iov_base, *(uint64_t *)b->iov_base); + return CMP2INT(unaligned_peek_u64(4, a->iov_base), + unaligned_peek_u64(4, b->iov_base)); default: - mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", mdbx_func_, + mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, __LINE__); return 0; } } /* Compare two items pointing at 2-byte aligned unsigned int's. */ -static int __hot mdbx_cmp_int_a2(const MDBX_val *a, const MDBX_val *b) { +static int __hot mdbx_cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { mdbx_assert(NULL, a->iov_len == b->iov_len); - mdbx_assert(NULL, 0 == (uintptr_t)a->iov_base % sizeof(uint16_t) && - 0 == (uintptr_t)b->iov_base % sizeof(uint16_t)); -#if UNALIGNED_OK switch (a->iov_len) { case 4: - return mdbx_cmp2int(*(uint32_t *)a->iov_base, *(uint32_t *)b->iov_base); + return CMP2INT(unaligned_peek_u32(2, a->iov_base), + unaligned_peek_u32(2, b->iov_base)); case 8: - return mdbx_cmp2int(*(uint64_t *)a->iov_base, *(uint64_t *)b->iov_base); + return CMP2INT(unaligned_peek_u64(2, a->iov_base), + unaligned_peek_u64(2, b->iov_base)); default: - mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", mdbx_func_, + mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, __LINE__); return 0; } -#else - mdbx_assert(NULL, 0 == a->iov_len % sizeof(uint16_t)); - { - int diff; - const uint16_t *pa, *pb, *end; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - 
end = (const uint16_t *)a->iov_base; - pa = (const uint16_t *)((char *)a->iov_base + a->iov_len); - pb = (const uint16_t *)((char *)b->iov_base + a->iov_len); - do { - diff = *--pa - *--pb; -#else /* __BYTE_ORDER__ */ - end = (const uint16_t *)((char *)a->iov_base + a->iov_len); - pa = (const uint16_t *)a->iov_base; - pb = (const uint16_t *)b->iov_base; - do { - diff = *pa++ - *pb++; -#endif /* __BYTE_ORDER__ */ - if (likely(diff != 0)) - break; - } while (pa != end); - return diff; - } -#endif /* UNALIGNED_OK */ } /* Compare two items pointing at unsigneds of unknown alignment. * * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ -static int __hot mdbx_cmp_int_ua(const MDBX_val *a, const MDBX_val *b) { +static int __hot mdbx_cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { mdbx_assert(NULL, a->iov_len == b->iov_len); -#if UNALIGNED_OK switch (a->iov_len) { case 4: - return mdbx_cmp2int(*(uint32_t *)a->iov_base, *(uint32_t *)b->iov_base); + return CMP2INT(unaligned_peek_u32(1, a->iov_base), + unaligned_peek_u32(1, b->iov_base)); case 8: - return mdbx_cmp2int(*(uint64_t *)a->iov_base, *(uint64_t *)b->iov_base); + return CMP2INT(unaligned_peek_u64(1, a->iov_base), + unaligned_peek_u64(1, b->iov_base)); default: - mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", mdbx_func_, + mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, __LINE__); return 0; } -#else - mdbx_assert(NULL, a->iov_len == sizeof(int) || a->iov_len == sizeof(size_t)); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - { - int diff; - const uint8_t *pa, *pb; - - pa = (const uint8_t *)a->iov_base + a->iov_len; - pb = (const uint8_t *)b->iov_base + a->iov_len; - - do { - diff = *--pa - *--pb; - if (likely(diff != 0)) - break; - } while (pa != a->iov_base); - return diff; - } -#else /* __BYTE_ORDER__ */ - return memcmp(a->iov_base, b->iov_base, a->iov_len); -#endif /* __BYTE_ORDER__ */ -#endif /* UNALIGNED_OK */ } /* Compare two items 
lexically */ static int __hot mdbx_cmp_memn(const MDBX_val *a, const MDBX_val *b) { -/* LY: assumes that length of keys are NOT equal for most cases, - * if no then branch-prediction should mitigate the problem */ -#if 0 - /* LY: without branch instructions on x86, - * but isn't best for equal length of keys */ - int diff_len = mdbx_cmp2int(a->iov_len, b->iov_len); -#else - /* LY: best when length of keys are equal, - * but got a branch-penalty otherwise */ - if (likely(a->iov_len == b->iov_len)) + if (a->iov_len == b->iov_len) return memcmp(a->iov_base, b->iov_base, a->iov_len); - int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; -#endif - size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + + const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; + const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; int diff_data = memcmp(a->iov_base, b->iov_base, shortest); return likely(diff_data) ? diff_data : diff_len; } /* Compare two items in reverse byte order */ static int __hot mdbx_cmp_memnr(const MDBX_val *a, const MDBX_val *b) { - const uint8_t *pa, *pb, *end; - - pa = (const uint8_t *)a->iov_base + a->iov_len; - pb = (const uint8_t *)b->iov_base + b->iov_len; - size_t minlen = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; - end = pa - minlen; + const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len; + const uint8_t *pb = (const uint8_t *)b->iov_base + b->iov_len; + const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + const uint8_t *const end = pa - shortest; while (pa != end) { int diff = *--pa - *--pb; if (likely(diff)) return diff; } - return mdbx_cmp2int(a->iov_len, b->iov_len); + return CMP2INT(a->iov_len, b->iov_len); } /* Search for key within a page, using binary search. 
@@ -7094,12 +9356,12 @@ static MDBX_node *__hot mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, int low, high; int rc = 0; MDBX_page *mp = mc->mc_pg[mc->mc_top]; - MDBX_node *node = NULL; + MDBX_node *node = nullptr; MDBX_val nodekey; MDBX_cmp_func *cmp; DKBUF; - const unsigned nkeys = NUMKEYS(mp); + const unsigned nkeys = page_numkeys(mp); mdbx_debug("searching %u keys in %s %spage %" PRIaPGNO, nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", @@ -7112,16 +9374,19 @@ static MDBX_node *__hot mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, /* Branch pages have no data, so if using integer keys, * alignment is guaranteed. Use faster mdbx_cmp_int_ai. */ - if (cmp == mdbx_cmp_int_a2 && IS_BRANCH(mp)) - cmp = mdbx_cmp_int_ai; + if (cmp == mdbx_cmp_int_align2 && IS_BRANCH(mp)) + cmp = mdbx_cmp_int_align4; unsigned i = 0; if (IS_LEAF2(mp)) { - nodekey.iov_len = mc->mc_db->md_xsize; - node = NODEPTR(mp, 0); /* fake */ + mdbx_cassert(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); + nodekey.iov_len = mp->mp_leaf2_ksize; + node = (MDBX_node *)(intptr_t)-1; /* fake */ while (low <= high) { i = (low + high) >> 1; - nodekey.iov_base = LEAF2KEY(mp, i, nodekey.iov_len); + nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); + mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= + (char *)nodekey.iov_base + nodekey.iov_len); rc = cmp(key, &nodekey); mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc); if (rc == 0) @@ -7135,16 +9400,18 @@ static MDBX_node *__hot mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, while (low <= high) { i = (low + high) >> 1; - node = NODEPTR(mp, i); - nodekey.iov_len = NODEKSZ(node); - nodekey.iov_base = NODEKEY(node); + node = page_node(mp, i); + nodekey.iov_len = node_ks(node); + nodekey.iov_base = node_key(node); + mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= + (char *)nodekey.iov_base + nodekey.iov_len); rc = cmp(key, &nodekey); if (IS_LEAF(mp)) mdbx_debug("found leaf index %u 
[%s], rc = %i", i, DKEY(&nodekey), rc); else mdbx_debug("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, - DKEY(&nodekey), NODEPGNO(node), rc); + DKEY(&nodekey), node_pgno(node), rc); if (rc == 0) break; if (rc > 0) @@ -7166,8 +9433,8 @@ static MDBX_node *__hot mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, /* There is no entry larger or equal to the key. */ return NULL; - /* nodeptr is fake for LEAF2 */ - return IS_LEAF2(mp) ? node : NODEPTR(mp, i); + /* page_node is fake for LEAF2 */ + return IS_LEAF2(mp) ? node : page_node(mp, i); } #if 0 /* unused for now */ @@ -7230,41 +9497,29 @@ static int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, int *lvl) { MDBX_txn *txn = mc->mc_txn; + if (unlikely(pgno >= txn->mt_next_pgno)) { + mdbx_debug("page %" PRIaPGNO " not found", pgno); + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_NOTFOUND; + } + MDBX_env *env = txn->mt_env; MDBX_page *p = NULL; int level; - - if (!(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_WRITEMAP))) { - MDBX_txn *tx2 = txn; + if ((txn->mt_flags & (MDBX_RDONLY | MDBX_WRITEMAP)) == 0) { level = 1; do { - MDBX_DPL dl = tx2->mt_rw_dirtylist; /* Spilled pages were dirtied in this txn and flushed * because the dirty list got full. Bring this page * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). 
*/ - if (tx2->mt_spill_pages) { - pgno_t pn = pgno << 1; - unsigned x = mdbx_pnl_search(tx2->mt_spill_pages, pn); - if (x <= MDBX_PNL_SIZE(tx2->mt_spill_pages) && - tx2->mt_spill_pages[x] == pn) - goto mapped; - } - if (dl->length) { - unsigned y = mdbx_dpl_search(dl, pgno); - if (y <= dl->length && dl[y].pgno == pgno) { - p = dl[y].ptr; - goto done; - } - } + if (txn->tw.spill_pages && mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) + goto mapped; + p = mdbx_dpl_find(txn->tw.dirtylist, pgno); + if (p) + goto done; level++; - } while ((tx2 = tx2->mt_parent) != NULL); - } - - if (unlikely(pgno >= txn->mt_next_pgno)) { - mdbx_debug("page %" PRIaPGNO " not found", pgno); - txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PAGE_NOTFOUND; + } while ((txn = txn->mt_parent) != NULL); } level = 0; @@ -7272,6 +9527,7 @@ mapped: p = pgno2page(env, pgno); done: + txn = nullptr /* avoid future use */; if (unlikely(p->mp_pgno != pgno)) { mdbx_error("mismatch pgno %" PRIaPGNO " (actual) != %" PRIaPGNO " (expected)", @@ -7279,15 +9535,21 @@ done: return MDBX_CORRUPTED; } - if (unlikely(p->mp_upper < p->mp_lower || + if (unlikely(p->mp_upper < p->mp_lower || ((p->mp_lower | p->mp_upper) & 1) || PAGEHDRSZ + p->mp_upper > env->me_psize) && !IS_OVERFLOW(p)) { mdbx_error("invalid page lower(%u)/upper(%u), pg-limit %u", p->mp_lower, - p->mp_upper, env->me_psize - PAGEHDRSZ); + p->mp_upper, page_space(env)); return MDBX_CORRUPTED; } /* TODO: more checks here, including p->mp_validator */ + if (mdbx_audit_enabled()) { + int err = mdbx_page_check(env, p, true); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + *ret = p; if (lvl) *lvl = level; @@ -7304,20 +9566,20 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, while (IS_BRANCH(mp)) { MDBX_node *node; - indx_t i; + int i; mdbx_debug("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, - NUMKEYS(mp)); - /* Don't assert on branch pages in the FreeDB. 
We can get here - * while in the process of rebalancing a FreeDB branch page; we must + page_numkeys(mp)); + /* Don't assert on branch pages in the GC. We can get here + * while in the process of rebalancing a GC branch page; we must * let that proceed. ITS#8336 */ - mdbx_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); - mdbx_debug("found index 0 to page %" PRIaPGNO, NODEPGNO(NODEPTR(mp, 0))); + mdbx_cassert(mc, !mc->mc_dbi || page_numkeys(mp) > 1); + mdbx_debug("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) { i = 0; if (flags & MDBX_PS_LAST) { - i = NUMKEYS(mp) - 1; + i = page_numkeys(mp) - 1; /* if already init'd, see if we're already in right place */ if (mc->mc_flags & C_INITIALIZED) { if (mc->mc_ki[mc->mc_top] == i) { @@ -7331,7 +9593,7 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, int exact; node = mdbx_node_search(mc, key, &exact); if (node == NULL) - i = NUMKEYS(mp) - 1; + i = page_numkeys(mp) - 1; else { i = mc->mc_ki[mc->mc_top]; if (!exact) { @@ -7342,13 +9604,13 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, mdbx_debug("following index %u for key [%s]", i, DKEY(key)); } - mdbx_cassert(mc, i < NUMKEYS(mp)); - node = NODEPTR(mp, i); + mdbx_cassert(mc, i < (int)page_numkeys(mp)); + node = page_node(mp, i); - if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) + if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, NULL)) != 0)) return rc; - mc->mc_ki[mc->mc_top] = i; + mc->mc_ki[mc->mc_top] = (indx_t)i; if (unlikely(rc = mdbx_cursor_push(mc, mp))) return rc; @@ -7375,18 +9637,54 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, return MDBX_SUCCESS; } +static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { + MDBX_cursor mc; + if (unlikely(TXN_DBI_CHANGED(txn, dbi))) + return MDBX_BAD_DBI; + int rc = mdbx_cursor_init(&mc, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = 
mdbx_page_search(&mc, &txn->mt_dbxs[dbi].md_name, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc; + + MDBX_val data; + int exact = 0; + MDBX_node *node = mdbx_node_search(&mc, &txn->mt_dbxs[dbi].md_name, &exact); + if (unlikely(!exact)) + return MDBX_BAD_DBI; + if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) + return MDBX_INCOMPATIBLE; /* not a named DB */ + rc = mdbx_node_read(&mc, node, &data); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(data.iov_len < sizeof(MDBX_db))) + return MDBX_INCOMPATIBLE; /* not a named DB */ + + uint16_t md_flags = UNALIGNED_PEEK_16(data.iov_base, MDBX_db, md_flags); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. */ + if (unlikely((txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS) != md_flags)) + return MDBX_INCOMPATIBLE; + + memcpy(&txn->mt_dbs[dbi], data.iov_base, sizeof(MDBX_db)); + txn->mt_dbflags[dbi] &= ~DB_STALE; + return MDBX_SUCCESS; +} + /* Search for the lowest key under the current branch page. - * This just bypasses a NUMKEYS check in the current page + * This just bypasses a numkeys check in the current page * before calling mdbx_page_search_root(), because the callers * are all in situations where the current page is known to * be underfilled. */ __hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; mdbx_cassert(mc, IS_BRANCH(mp)); - MDBX_node *node = NODEPTR(mp, 0); + MDBX_node *node = page_node(mp, 0); int rc; - if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) + if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, NULL)) != 0)) return rc; mc->mc_ki[mc->mc_top] = 0; @@ -7416,48 +9714,20 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { /* Make sure the txn is still viable, then find the root from * the txn's db table and set it as the root of the cursor's stack. 
*/ if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) { - mdbx_debug("transaction has failed, must abort"); + mdbx_debug("%s", "transaction has failed, must abort"); return MDBX_BAD_TXN; } /* Make sure we're using an up-to-date root */ if (unlikely(*mc->mc_dbflag & DB_STALE)) { - MDBX_cursor mc2; - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) - return MDBX_BAD_DBI; - rc = mdbx_cursor_init(&mc2, mc->mc_txn, MAIN_DBI); + rc = mdbx_fetch_sdb(mc->mc_txn, mc->mc_dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc; - { - MDBX_val data; - int exact = 0; - MDBX_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); - if (!exact) - return MDBX_BAD_DBI; - if (unlikely((leaf->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) - return MDBX_INCOMPATIBLE; /* not a named DB */ - rc = mdbx_node_read(&mc2, leaf, &data); - if (rc) - return rc; - - uint16_t md_flags; - memcpy(&md_flags, ((char *)data.iov_base + offsetof(MDBX_db, md_flags)), - sizeof(uint16_t)); - /* The txn may not know this DBI, or another process may - * have dropped and recreated the DB with other flags. */ - if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != md_flags)) - return MDBX_INCOMPATIBLE; - memcpy(mc->mc_db, data.iov_base, sizeof(MDBX_db)); - } - *mc->mc_dbflag &= ~DB_STALE; } root = mc->mc_db->md_root; if (unlikely(root == P_INVALID)) { /* Tree is empty. 
*/ - mdbx_debug("tree is empty"); + mdbx_debug("%s", "tree is empty"); return MDBX_NOTFOUND; } @@ -7483,111 +9753,6 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { return mdbx_page_search_root(mc, key, flags); } -static int mdbx_ovpage_free(MDBX_cursor *mc, MDBX_page *mp) { - MDBX_txn *txn = mc->mc_txn; - pgno_t pg = mp->mp_pgno; - unsigned x = 0, ovpages = mp->mp_pages; - MDBX_env *env = txn->mt_env; - MDBX_PNL sl = txn->mt_spill_pages; - pgno_t pn = pg << 1; - int rc; - - mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); - mdbx_cassert(mc, IS_OVERFLOW(mp)); - mdbx_debug("free ov page %" PRIaPGNO " (%u)", pg, ovpages); - - if (mdbx_audit_enabled() && env->me_reclaimed_pglist) { - mdbx_cassert(mc, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - const unsigned a = mdbx_pnl_search(env->me_reclaimed_pglist, pg); - mdbx_cassert(mc, a > MDBX_PNL_SIZE(env->me_reclaimed_pglist) || - env->me_reclaimed_pglist[a] != pg); - if (a <= MDBX_PNL_SIZE(env->me_reclaimed_pglist) && - unlikely(env->me_reclaimed_pglist[a] == pg)) - return MDBX_PROBLEM; - - if (ovpages > 1) { - const unsigned b = - mdbx_pnl_search(env->me_reclaimed_pglist, pg + ovpages - 1); - mdbx_cassert(mc, a == b); - if (unlikely(a != b)) - return MDBX_PROBLEM; - } - } - - /* If the page is dirty or on the spill list we just acquired it, - * so we should give it back to our current free list, if any. - * Otherwise put it onto the list of pages we freed in this txn. - * - * Won't create me_reclaimed_pglist: me_last_reclaimed must be inited along - * with it. - * Unsupported in nested txns: They would need to hide the page - * range in ancestor txns' dirty and spilled lists. 
*/ - if (env->me_reclaimed_pglist && !txn->mt_parent && - (IS_DIRTY(mp) || - (sl && (x = mdbx_pnl_search(sl, pn)) <= MDBX_PNL_SIZE(sl) && - sl[x] == pn))) { - unsigned i, j; - pgno_t *mop; - MDBX_DP *dl, ix, iy; - rc = mdbx_pnl_need(&env->me_reclaimed_pglist, ovpages); - if (unlikely(rc)) - return rc; - - if (!IS_DIRTY(mp)) { - /* This page is no longer spilled */ - if (x == MDBX_PNL_SIZE(sl)) - MDBX_PNL_SIZE(sl)--; - else - sl[x] |= 1; - goto release; - } - /* Remove from dirty list */ - dl = txn->mt_rw_dirtylist; - x = dl->length--; - for (ix = dl[x]; ix.ptr != mp; ix = iy) { - if (likely(x > 1)) { - x--; - iy = dl[x]; - dl[x] = ix; - } else { - mdbx_cassert(mc, x > 1); - mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist", mp, - mp->mp_pgno); - j = dl->length += 1; - dl[j] = ix; /* Unsorted. OK when MDBX_TXN_ERROR. */ - txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PROBLEM; - } - } - txn->mt_dirtyroom++; - if (!(env->me_flags & MDBX_WRITEMAP)) - mdbx_dpage_free(env, mp, IS_OVERFLOW(mp) ? mp->mp_pages : 1); - release: - /* Insert in me_reclaimed_pglist */ - mop = env->me_reclaimed_pglist; - j = MDBX_PNL_SIZE(mop) + ovpages; - for (i = MDBX_PNL_SIZE(mop); i && MDBX_PNL_DISORDERED(mop[i], pg);) - mop[j--] = mop[i--]; - MDBX_PNL_SIZE(mop) += ovpages; - - pgno_t n = MDBX_PNL_ASCENDING ? pg + ovpages : pg; - while (j > i) - mop[j--] = MDBX_PNL_ASCENDING ? --n : n++; - mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); - } else { - rc = mdbx_pnl_append_range(&txn->mt_befree_pages, pg, ovpages); - if (unlikely(rc)) - return rc; - } - - mc->mc_db->md_overflow_pages -= ovpages; - if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = mdbx_outer_db(mc); - outer->md_overflow_pages -= ovpages; - } - return 0; -} - /* Return the data associated with a given node. * * [in] mc The cursor for this operation. @@ -7595,72 +9760,58 @@ static int mdbx_ovpage_free(MDBX_cursor *mc, MDBX_page *mp) { * [out] data Updated to point to the node's data. 
* * Returns 0 on success, non-zero on failure. */ -static __inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, +static __inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *node, MDBX_val *data) { - MDBX_page *omp; /* overflow page */ - pgno_t pgno; - int rc; - - if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { - data->iov_len = NODEDSZ(leaf); - data->iov_base = NODEDATA(leaf); - return MDBX_SUCCESS; - } - - /* Read overflow data. */ - data->iov_len = NODEDSZ(leaf); - memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if (unlikely((rc = mdbx_page_get(mc, pgno, &omp, NULL)) != 0)) { - mdbx_debug("read overflow page %" PRIaPGNO " failed", pgno); - return rc; + data->iov_len = node_ds(node); + data->iov_base = node_data(node); + if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { + /* Read overflow data. */ + MDBX_page *omp; /* overflow page */ + int rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, NULL); + if (unlikely((rc != MDBX_SUCCESS))) { + mdbx_debug("read overflow page %" PRIaPGNO " failed", + node_largedata_pgno(node)); + return rc; + } + data->iov_base = page_data(omp); } - data->iov_base = PAGEDATA(omp); - return MDBX_SUCCESS; } int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { - int exact = 0; DKBUF; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); - if (unlikely(!key || !data || !txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + if (unlikely(!key || !data)) + return MDBX_EINVAL; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; - MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = mdbx_cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) 
return rc; + + int exact = 0; return mdbx_cursor_set(&cx.outer, key, data, MDBX_SET, &exact); } -int mdbx_get2(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { - int exact = 0; +int mdbx_get_nearest(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *data) { DKBUF; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); - if (unlikely(!key || !data || !txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + if (unlikely(!key || !data)) + return MDBX_EINVAL; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -7669,19 +9820,74 @@ int mdbx_get2(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { return MDBX_BAD_TXN; MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = mdbx_cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - const int op = - (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) ? MDBX_GET_BOTH : MDBX_SET_KEY; - rc = mdbx_cursor_set(&cx.outer, key, data, op, &exact); + MDBX_val save_data = *data; + int exact = 0; + rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_RANGE, &exact); if (unlikely(rc != MDBX_SUCCESS)) return rc; + if (exact && (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) != 0) { + *data = save_data; + exact = 0; + rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_GET_BOTH_RANGE, &exact); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + return exact ? 
MDBX_SUCCESS : MDBX_RESULT_TRUE; } +int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, + size_t *values_count) { + DKBUF; + mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!key || !data)) + return MDBX_EINVAL; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return MDBX_EINVAL; + + MDBX_cursor_couple cx; + rc = mdbx_cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + int exact = 0; + rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_KEY, &exact); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND && values_count) + *values_count = 0; + return rc; + } + + if (values_count) { + *values_count = 1; + if (cx.outer.mc_xcursor != NULL) { + MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], + cx.outer.mc_ki[cx.outer.mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + mdbx_tassert(txn, cx.outer.mc_xcursor == &cx.inner && + (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); + *values_count = + (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) || + cx.inner.mx_db.md_entries <= PTRDIFF_MAX) + ? (size_t)cx.inner.mx_db.md_entries + : PTRDIFF_MAX; + } + } + } + return MDBX_SUCCESS; +} + /* Find a sibling for a page. * Replaces the page at the top of the cursor's stack with the specified * sibling, if one exists. @@ -7696,16 +9902,15 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { MDBX_node *indx; MDBX_page *mp; - if (unlikely(mc->mc_snum < 2)) { + if (unlikely(mc->mc_snum < 2)) return MDBX_NOTFOUND; /* root has no siblings */ - } mdbx_cursor_pop(mc); mdbx_debug("parent page is page %" PRIaPGNO ", index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); if (move_right - ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) + ? 
(mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) : (mc->mc_ki[mc->mc_top] == 0)) { mdbx_debug("no more keys left, moving to %s sibling", move_right ? "right" : "left"); @@ -7725,8 +9930,8 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { } mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); - indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0)) { + indx = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (unlikely((rc = mdbx_page_get(mc, node_pgno(indx), &mp, NULL)) != 0)) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED | C_EOF); return rc; @@ -7736,7 +9941,7 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { if (unlikely(rc != MDBX_SUCCESS)) return rc; if (!move_right) - mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; return MDBX_SUCCESS; } @@ -7745,7 +9950,7 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { MDBX_page *mp; - MDBX_node *leaf; + MDBX_node *node; int rc; if ((mc->mc_flags & C_DEL) && op == MDBX_NEXT_DUP) @@ -7756,20 +9961,20 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mp = mc->mc_pg[mc->mc_top]; if (mc->mc_flags & C_EOF) { - if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) + if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp)) return MDBX_NOTFOUND; mc->mc_flags ^= C_EOF; } if (mc->mc_db->md_flags & MDBX_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { rc = mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { if 
(likely(rc == MDBX_SUCCESS)) - MDBX_GET_MAYNULL_KEYPTR(leaf, key); + get_key_optional(node, key); return rc; } } @@ -7787,8 +9992,8 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, goto skip; } - if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { - mdbx_debug("=====> move to next sibling page"); + if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp)) { + mdbx_debug("%s", "=====> move to next sibling page"); if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; @@ -7802,36 +10007,36 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, skip: mdbx_debug("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", - mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]); + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } return MDBX_SUCCESS; } mdbx_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, leaf); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node); if (unlikely(rc != MDBX_SUCCESS)) return rc; } if (data) { - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDBX_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, node, data)) != MDBX_SUCCESS)) return rc; - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (F_ISSET(node_flags(node), F_DUPDATA)) { rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; } } - MDBX_GET_MAYNULL_KEYPTR(leaf, key); + get_key_optional(node, key); return MDBX_SUCCESS; } @@ -7839,7 +10044,7 @@ skip: static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { 
MDBX_page *mp; - MDBX_node *leaf; + MDBX_node *node; int rc; if ((mc->mc_flags & C_DEL) && op == MDBX_PREV_DUP) @@ -7854,15 +10059,15 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mp = mc->mc_pg[mc->mc_top]; if ((mc->mc_db->md_flags & MDBX_DUPSORT) && - mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { + node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { if (op == MDBX_PREV || op == MDBX_PREV_DUP) { rc = mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { if (likely(rc == MDBX_SUCCESS)) { - MDBX_GET_MAYNULL_KEYPTR(leaf, key); + get_key_optional(node, key); mc->mc_flags &= ~C_EOF; } return rc; @@ -7881,12 +10086,12 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_flags &= ~(C_EOF | C_DEL); if (mc->mc_ki[mc->mc_top] == 0) { - mdbx_debug("=====> move to prev sibling page"); + mdbx_debug("%s", "=====> move to prev sibling page"); if ((rc = mdbx_cursor_sibling(mc, 0)) != MDBX_SUCCESS) { return rc; } mp = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; mdbx_debug("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); } else @@ -7894,36 +10099,36 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_debug("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", - mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]); + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } return MDBX_SUCCESS; } mdbx_cassert(mc, IS_LEAF(mp)); - 
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, leaf); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node); if (unlikely(rc != MDBX_SUCCESS)) return rc; } if (data) { - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDBX_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, node, data)) != MDBX_SUCCESS)) return rc; - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (F_ISSET(node_flags(node), F_DUPDATA)) { rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; } } - MDBX_GET_MAYNULL_KEYPTR(leaf, key); + get_key_optional(node, key); return MDBX_SUCCESS; } @@ -7932,7 +10137,7 @@ __hot static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op, int *exactp) { int rc; MDBX_page *mp; - MDBX_node *leaf = NULL; + MDBX_node *node = NULL; DKBUF; if ((mc->mc_db->md_flags & MDBX_INTEGERKEY) && @@ -7951,16 +10156,16 @@ __hot static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); mp = mc->mc_pg[mc->mc_top]; - if (!NUMKEYS(mp)) { + if (!page_numkeys(mp)) { mc->mc_ki[mc->mc_top] = 0; return MDBX_NOTFOUND; } if (IS_LEAF2(mp)) { nodekey.iov_len = mc->mc_db->md_xsize; - nodekey.iov_base = LEAF2KEY(mp, 0, nodekey.iov_len); + nodekey.iov_base = page_leaf2key(mp, 0, nodekey.iov_len); } else { - leaf = NODEPTR(mp, 0); - MDBX_GET_KEYVALUE(leaf, nodekey); + node = page_node(mp, 0); + get_key(node, &nodekey); } rc = mc->mc_dbx->md_cmp(key, &nodekey); if (unlikely(rc == 0)) { @@ -7972,14 +10177,14 @@ __hot static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, goto set1; } if (rc > 0) { - const unsigned nkeys = NUMKEYS(mp); + const unsigned nkeys = page_numkeys(mp); unsigned i; if (nkeys > 1) { if (IS_LEAF2(mp)) { - nodekey.iov_base = LEAF2KEY(mp, nkeys - 1, 
nodekey.iov_len); + nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len); } else { - leaf = NODEPTR(mp, nkeys - 1); - MDBX_GET_KEYVALUE(leaf, nodekey); + node = page_node(mp, nkeys - 1); + get_key(node, &nodekey); } rc = mc->mc_dbx->md_cmp(key, &nodekey); if (rc == 0) { @@ -7991,14 +10196,14 @@ __hot static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, goto set1; } if (rc < 0) { - if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { + if (mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { /* This is definitely the right page, skip search_page */ if (IS_LEAF2(mp)) { nodekey.iov_base = - LEAF2KEY(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len); + page_leaf2key(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len); } else { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDBX_GET_KEYVALUE(leaf, nodekey); + node = page_node(mp, mc->mc_ki[mc->mc_top]); + get_key(node, &nodekey); } rc = mc->mc_dbx->md_cmp(key, &nodekey); if (rc == 0) { @@ -8016,7 +10221,7 @@ __hot static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* If any parents have right-sibs, search. * Otherwise, there's nothing further. */ for (i = 0; i < mc->mc_top; i++) - if (mc->mc_ki[i] < NUMKEYS(mc->mc_pg[i]) - 1) + if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1) break; if (i == mc->mc_top) { /* There are no other pages */ @@ -8046,21 +10251,21 @@ __hot static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, IS_LEAF(mp)); set2: - leaf = mdbx_node_search(mc, key, exactp); + node = mdbx_node_search(mc, key, exactp); if (exactp != NULL && !*exactp) { /* MDBX_SET specified and not an exact match. 
*/ return MDBX_NOTFOUND; } - if (leaf == NULL) { - mdbx_debug("===> inexact leaf not found, goto sibling"); + if (node == NULL) { + mdbx_debug("%s", "===> inexact leaf not found, goto sibling"); if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; /* no entries matched */ } mp = mc->mc_pg[mc->mc_top]; mdbx_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, 0); + node = page_node(mp, 0); } set1: @@ -8070,18 +10275,18 @@ set1: if (IS_LEAF2(mp)) { if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } return MDBX_SUCCESS; } - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, leaf); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node); if (unlikely(rc != MDBX_SUCCESS)) return rc; } if (likely(data)) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (F_ISSET(node_flags(node), F_DUPDATA)) { if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); } else { @@ -8099,7 +10304,7 @@ set1: } } else if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { MDBX_val olddata; - if (unlikely((rc = mdbx_node_read(mc, leaf, &olddata)) != MDBX_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, node, &olddata)) != MDBX_SUCCESS)) return rc; if (unlikely(mc->mc_dbx->md_dcmp == NULL)) return MDBX_EINVAL; @@ -8113,14 +10318,14 @@ set1: } else { if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDBX_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, node, data)) != MDBX_SUCCESS)) return rc; } } /* The key already matches in all other cases */ if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) - MDBX_GET_MAYNULL_KEYPTR(leaf, key); + get_key_optional(node, key); 
mdbx_debug("==> cursor placed on key [%s], data [%s]", DKEY(key), DVAL(data)); return rc; @@ -8129,7 +10334,6 @@ set1: /* Move the cursor to the first item in the database. */ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; - MDBX_node *leaf; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -8141,39 +10345,37 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { } mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; - mc->mc_ki[mc->mc_top] = 0; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->iov_len); + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); return MDBX_SUCCESS; } + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], 0); if (likely(data)) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, leaf); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else { - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDBX_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, node, data)) != MDBX_SUCCESS)) return rc; } } - MDBX_GET_MAYNULL_KEYPTR(leaf, key); + get_key_optional(node, key); return MDBX_SUCCESS; } /* Move the cursor to the last item in the database. 
*/ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; - MDBX_node *leaf; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -8187,56 +10389,55 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); } - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]) - 1; mc->mc_flags |= C_INITIALIZED | C_EOF; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = - LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->iov_len); + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], + key->iov_len); return MDBX_SUCCESS; } + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (likely(data)) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, leaf); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else { - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDBX_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, node, data)) != MDBX_SUCCESS)) return rc; } } - MDBX_GET_MAYNULL_KEYPTR(leaf, key); + get_key_optional(node, key); return MDBX_SUCCESS; } int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { - int rc; - int exact = 0; - int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data); - if (unlikely(mc == NULL)) return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) 
+ return rc; + int exact = 0; + int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data); switch (op) { case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; MDBX_page *mp = mc->mc_pg[mc->mc_top]; - const unsigned nkeys = NUMKEYS(mp); + const unsigned nkeys = page_numkeys(mp); if (mc->mc_ki[mc->mc_top] >= nkeys) { mdbx_cassert(mc, nkeys <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; @@ -8247,14 +10448,14 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = MDBX_SUCCESS; if (IS_LEAF2(mp)) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } else { - MDBX_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDBX_GET_MAYNULL_KEYPTR(leaf, key); + MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + get_key_optional(node, key); if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (F_ISSET(node_flags(node), F_DUPDATA)) { if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - rc = mdbx_xcursor_init1(mc, leaf); + rc = mdbx_xcursor_init1(mc, node); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); @@ -8264,7 +10465,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_GET_CURRENT); } else { - rc = mdbx_node_read(mc, leaf, data); + rc = mdbx_node_read(mc, node, data); } if (unlikely(rc)) return rc; @@ -8278,7 +10479,8 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(mc->mc_xcursor == NULL)) return MDBX_INCOMPATIBLE; - /* FALLTHRU */ + /* fall through */ + __fallthrough; case MDBX_SET: case MDBX_SET_KEY: case MDBX_SET_RANGE: @@ -8308,9 +10510,10 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor *mx; fetchm: mx = 
&mc->mc_xcursor->mx_cursor; - data->iov_len = NUMKEYS(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; - data->iov_base = PAGEDATA(mx->mc_pg[mx->mc_top]); - mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top]) - 1; + data->iov_len = + page_numkeys(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; + data->iov_base = page_data(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = (indx_t)page_numkeys(mx->mc_pg[mx->mc_top]) - 1; } else { rc = MDBX_NOTFOUND; } @@ -8355,15 +10558,15 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(mc->mc_xcursor == NULL)) return MDBX_INCOMPATIBLE; - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + if (mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) { + mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]); return MDBX_NOTFOUND; } { - MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - MDBX_GET_MAYNULL_KEYPTR(leaf, key); - rc = mdbx_node_read(mc, leaf, data); + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(node_flags(node), F_DUPDATA)) { + get_key_optional(node, key); + rc = mdbx_node_read(mc, node, data); break; } } @@ -8420,22 +10623,25 @@ static int mdbx_cursor_touch(MDBX_cursor *mc) { int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, unsigned flags) { MDBX_env *env; - MDBX_page *fp, *sub_root = NULL; - uint16_t fp_flags; + MDBX_page *sub_root = NULL; MDBX_val xdata, *rdata, dkey, olddata; - MDBX_db dummy; + MDBX_db nested_dupdb; unsigned mcount = 0, dcount = 0, nospill; size_t nsize; - int rc = MDBX_SUCCESS, rc2; + int rc2; unsigned nflags; DKBUF; - if (unlikely(mc == NULL || key == NULL)) + if (unlikely(mc == NULL || key == NULL || data == NULL)) return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; + int rc = 
check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + env = mc->mc_txn->mt_env; /* Check this first so counter will always be zero on any early failures. */ @@ -8457,35 +10663,37 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, nospill = flags & MDBX_NOSPILL; flags &= ~MDBX_NOSPILL; - if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS - : MDBX_BAD_TXN; - - if (unlikely(key->iov_len > env->me_maxkey_limit)) - return MDBX_BAD_VALSIZE; + if (unlikely(mc->mc_txn->mt_flags & (MDBX_RDONLY | MDBX_TXN_BLOCKED))) + return (mc->mc_txn->mt_flags & MDBX_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; - if (unlikely(data->iov_len > ((mc->mc_db->md_flags & MDBX_DUPSORT) - ? env->me_maxkey_limit - : MDBX_MAXDATASIZE))) - return MDBX_BAD_VALSIZE; + if ((mc->mc_flags & C_SUB) == 0) { + if (unlikely(key->iov_len > (size_t)((mc->mc_db->md_flags & MDBX_DUPSORT) + ? env->me_maxkey_ds + : env->me_maxkey_nd) || + data->iov_len > ((mc->mc_db->md_flags & MDBX_DUPSORT) + ? 
env->me_maxval_ds + : env->me_maxval_nd))) { + return MDBX_BAD_VALSIZE; + } - if ((mc->mc_db->md_flags & MDBX_INTEGERKEY) && - unlikely(key->iov_len != sizeof(uint32_t) && - key->iov_len != sizeof(uint64_t))) { - mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); - return MDBX_BAD_VALSIZE; - } + if ((mc->mc_db->md_flags & MDBX_INTEGERKEY) && + unlikely(key->iov_len != sizeof(uint32_t) && + key->iov_len != sizeof(uint64_t))) { + mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; + } - if ((mc->mc_db->md_flags & MDBX_INTEGERDUP) && - unlikely(data->iov_len != sizeof(uint32_t) && - data->iov_len != sizeof(uint64_t))) { - mdbx_cassert(mc, !"data-size is invalid MDBX_INTEGERDUP"); - return MDBX_BAD_VALSIZE; + if ((mc->mc_db->md_flags & MDBX_INTEGERDUP) && + unlikely(data->iov_len != sizeof(uint32_t) && + data->iov_len != sizeof(uint64_t))) { + mdbx_cassert(mc, !"data-size is invalid MDBX_INTEGERDUP"); + return MDBX_BAD_VALSIZE; + } } mdbx_debug("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, - DDBI(mc), DKEY(key), key ? key->iov_len : 0, + DDBI(mc), DKEY(key), key->iov_len, DVAL((flags & MDBX_RESERVE) ? 
nullptr : data), data->iov_len); int dupdata_flag = 0; @@ -8503,8 +10711,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_EKEYMISMATCH; if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { - MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { mdbx_cassert(mc, mc->mc_xcursor != NULL && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); @@ -8518,6 +10726,13 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; flags -= MDBX_CURRENT; } + } else if (unlikely(node_size(key, data) > + /* See note inside leaf_size() */ + env->me_branch_nodemax)) { + rc = mdbx_cursor_del(mc, 0); + if (rc != MDBX_SUCCESS) + return rc; + flags -= MDBX_CURRENT; } } } @@ -8574,7 +10789,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (rc == MDBX_NO_ROOT) { MDBX_page *np; /* new database, write a root leaf page */ - mdbx_debug("allocating new root leaf page"); + mdbx_debug("%s", "allocating new root leaf page"); if (unlikely(rc2 = mdbx_page_new(mc, P_LEAF, 1, &np))) { return rc2; } @@ -8596,16 +10811,17 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, bool insert_key, insert_data, do_sub = false; insert_key = insert_data = (rc != MDBX_SUCCESS); + uint16_t fp_flags = P_LEAF | P_DIRTY; + MDBX_page *fp = env->me_pbuf; if (insert_key) { /* The key does not exist */ mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); if ((mc->mc_db->md_flags & MDBX_DUPSORT) && - LEAFSIZE(key, data) > env->me_nodemax) { + node_size(key, data) > + /* See note inside leaf_size() */ env->me_branch_nodemax) { /* Too big for a node, insert in sub-DB. Set up an empty * "old sub-page" for prep_subDB to expand to a full page. 
*/ - fp_flags = P_LEAF | P_DIRTY; - fp = env->me_pbuf; - fp->mp_leaf2_ksize = (uint16_t)data->iov_len; /* used if MDBX_DUPFIXED */ + fp->mp_leaf2_ksize = (uint16_t)data->iov_len /* used if MDBX_DUPFIXED */; fp->mp_lower = fp->mp_upper = 0; olddata.iov_len = PAGEHDRSZ; goto prep_subDB; @@ -8617,7 +10833,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, unsigned ksize = mc->mc_db->md_xsize; if (key->iov_len != ksize) return MDBX_BAD_VALSIZE; - ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); memcpy(ptr, key->iov_base, ksize); fix_parent: /* if overwriting slot 0 of leaf, need to @@ -8649,158 +10865,23 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } more:; - MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - olddata.iov_len = NODEDSZ(leaf); - olddata.iov_base = NODEDATA(leaf); - - /* DB has dups? */ - if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { - /* Prepare (sub-)page/sub-DB to accept the new item, if needed. - * fp: old sub-page or a header faking it. - * mp: new (sub-)page. offset: growth in page size. - * xdata: node data with new page or DB. */ - unsigned i, offset = 0; - MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf; - mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; - - /* Was a single item before, must convert now */ - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - - /* does data match? 
*/ - if (!mc->mc_dbx->md_dcmp(data, &olddata)) { - if (unlikely(flags & (MDBX_NODUPDATA | MDBX_APPENDDUP))) - return MDBX_KEYEXIST; - /* overwrite it */ - goto current; - } - - /* Just overwrite the current item */ - if (flags & MDBX_CURRENT) - goto current; - - /* Back up original data item */ - dupdata_flag = 1; - dkey.iov_len = olddata.iov_len; - dkey.iov_base = memcpy(fp + 1, olddata.iov_base, olddata.iov_len); - - /* Make sub-page header for the dup items, with dummy body */ - fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP; - fp->mp_lower = 0; - xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; - if (mc->mc_db->md_flags & MDBX_DUPFIXED) { - fp->mp_flags |= P_LEAF2; - fp->mp_leaf2_ksize = (uint16_t)data->iov_len; - xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ - } else { - xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + - (dkey.iov_len & 1) + (data->iov_len & 1); - } - fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); - olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ - } else if (leaf->mn_flags & F_SUBDATA) { - /* Data is on sub-DB, just store it */ - flags |= F_DUPDATA | F_SUBDATA; - goto put_sub; - } else { - /* Data is on sub-page */ - fp = olddata.iov_base; - switch (flags) { - default: - if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) { - offset = EVEN(NODESIZE + sizeof(indx_t) + data->iov_len); - break; - } - offset = fp->mp_leaf2_ksize; - if (SIZELEFT(fp) < offset) { - offset *= 4; /* space for 4 more */ - break; - } - /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */ - __fallthrough; - case MDBX_CURRENT | MDBX_NODUPDATA: - case MDBX_CURRENT: - fp->mp_flags |= P_DIRTY; - fp->mp_pgno = mp->mp_pgno; - mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; - flags |= F_DUPDATA; - goto put_sub; - } - xdata.iov_len = olddata.iov_len + offset; - } - - fp_flags = fp->mp_flags; - if (NODESIZE + NODEKSZ(leaf) + xdata.iov_len > env->me_nodemax) { - /* Too big for a sub-page, convert to sub-DB */ - fp_flags &= ~P_SUBP; - prep_subDB: - 
dummy.md_xsize = 0; - dummy.md_flags = 0; - if (mc->mc_db->md_flags & MDBX_DUPFIXED) { - fp_flags |= P_LEAF2; - dummy.md_xsize = fp->mp_leaf2_ksize; - dummy.md_flags = MDBX_DUPFIXED; - if (mc->mc_db->md_flags & MDBX_INTEGERDUP) - dummy.md_flags |= MDBX_INTEGERKEY; - } - dummy.md_depth = 1; - dummy.md_branch_pages = 0; - dummy.md_leaf_pages = 1; - dummy.md_overflow_pages = 0; - dummy.md_entries = NUMKEYS(fp); - xdata.iov_len = sizeof(MDBX_db); - xdata.iov_base = &dummy; - if ((rc = mdbx_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) - return rc; - mc->mc_db->md_leaf_pages += 1; - mdbx_cassert(mc, env->me_psize > olddata.iov_len); - offset = env->me_psize - (unsigned)olddata.iov_len; - flags |= F_DUPDATA | F_SUBDATA; - dummy.md_root = mp->mp_pgno; - dummy.md_seq = dummy.md_merkle = 0; - sub_root = mp; - } - if (mp != fp) { - mp->mp_flags = fp_flags | P_DIRTY; - mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; - mp->mp_lower = fp->mp_lower; - mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); - mp->mp_upper = (indx_t)(fp->mp_upper + offset); - if (unlikely(fp_flags & P_LEAF2)) { - memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize); - } else { - memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, - (char *)fp + fp->mp_upper + PAGEHDRSZ, - olddata.iov_len - fp->mp_upper - PAGEHDRSZ); - memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), - NUMKEYS(fp) * sizeof(mp->mp_ptrs[0])); - for (i = 0; i < NUMKEYS(fp); i++) { - mdbx_cassert(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); - mp->mp_ptrs[i] += (indx_t)offset; - } - } - } + if (mdbx_audit_enabled()) { + int err = mdbx_cursor_check(mc, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - rdata = &xdata; - flags |= F_DUPDATA; - do_sub = true; - if (!insert_key) - mdbx_node_del(mc, 0); - goto new_sub; - } - current: - /* MDBX passes F_SUBDATA in 'flags' to write a DB record */ - if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) - 
return MDBX_INCOMPATIBLE; /* overflow page overwrites need special handling */ - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDBX_page *omp; - pgno_t pg; + if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { int level, ovpages, - dpages = (LEAFSIZE(key, data) > env->me_nodemax) - ? OVPAGES(env, data->iov_len) + dpages = (node_size(key, data) > + /* See note inside leaf_size() */ env->me_branch_nodemax) + ? number_of_ovpages(env, data->iov_len) : 0; - memcpy(&pg, olddata.iov_base, sizeof(pg)); + const pgno_t pg = node_largedata_pgno(node); + MDBX_page *omp; if (unlikely((rc2 = mdbx_page_get(mc, pg, &omp, &level)) != 0)) return rc2; ovpages = omp->mp_pages; @@ -8828,7 +10909,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(!np)) return MDBX_ENOMEM; /* Note - this page is already counted in parent's dirtyroom */ - rc2 = mdbx_dpl_insert(mc->mc_txn->mt_rw_dirtylist, pg, np); + rc2 = mdbx_dpl_append(mc->mc_txn->tw.dirtylist, pg, np); if (unlikely(rc2 != MDBX_SUCCESS)) { rc = rc2; mdbx_dpage_free(env, np, ovpages); @@ -8849,11 +10930,11 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, memcpy(np, omp, PAGEHDRSZ); /* Copy header of page */ omp = np; } - SETDSZ(leaf, data->iov_len); + node_set_ds(node, data->iov_len); if (F_ISSET(flags, MDBX_RESERVE)) - data->iov_base = PAGEDATA(omp); + data->iov_base = page_data(omp); else - memcpy(PAGEDATA(omp), data->iov_base, data->iov_len); + memcpy(page_data(omp), data->iov_base, data->iov_len); if (mdbx_audit_enabled()) { int err = mdbx_cursor_check(mc, false); @@ -8863,36 +10944,196 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_SUCCESS; } } - if ((rc2 = mdbx_ovpage_free(mc, omp)) != MDBX_SUCCESS) + if ((rc2 = mdbx_page_retire(mc, omp)) != MDBX_SUCCESS) return rc2; - } else if (data->iov_len == olddata.iov_len) { - mdbx_cassert(mc, EVEN(key->iov_len) == EVEN(leaf->mn_ksize)); - /* same size, just replace it. 
Note that we could - * also reuse this node if the new data is smaller, - * but instead we opt to shrink the node in that case. */ - if (F_ISSET(flags, MDBX_RESERVE)) - data->iov_base = olddata.iov_base; - else if (!(mc->mc_flags & C_SUB)) - memcpy(olddata.iov_base, data->iov_base, data->iov_len); - else { - mdbx_cassert(mc, NUMKEYS(mc->mc_pg[mc->mc_top]) == 1); - mdbx_cassert(mc, PAGETYPE(mc->mc_pg[mc->mc_top]) == P_LEAF); - mdbx_cassert(mc, NODEDSZ(leaf) == 0); - mdbx_cassert(mc, leaf->mn_flags == 0); - mdbx_cassert(mc, key->iov_len < UINT16_MAX); - leaf->mn_ksize = (uint16_t)key->iov_len; - memcpy(NODEKEY(leaf), key->iov_base, key->iov_len); - mdbx_cassert(mc, (char *)NODEKEY(leaf) + NODEDSZ(leaf) < - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); - goto fix_parent; + } else { + olddata.iov_len = node_ds(node); + olddata.iov_base = node_data(node); + mdbx_cassert(mc, (char *)olddata.iov_base + olddata.iov_len <= + (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + + /* DB has dups? */ + if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { + /* Prepare (sub-)page/sub-DB to accept the new item, if needed. + * fp: old sub-page or a header faking it. + * mp: new (sub-)page. offset: growth in page size. + * xdata: node data with new page or DB. */ + unsigned i; + size_t offset = 0; + MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf; + mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + + /* Was a single item before, must convert now */ + if (!F_ISSET(node_flags(node), F_DUPDATA)) { + + /* Just overwrite the current item */ + if (flags & MDBX_CURRENT) { + mdbx_cassert( + mc, + node_size(key, data) <= + /* See note inside leaf_size() */ env->me_branch_nodemax); + goto current; + } + + /* does data match? 
*/ + if (!mc->mc_dbx->md_dcmp(data, &olddata)) { + if (unlikely(flags & (MDBX_NODUPDATA | MDBX_APPENDDUP))) + return MDBX_KEYEXIST; + /* overwrite it */ + mdbx_cassert( + mc, + node_size(key, data) <= + /* See note inside leaf_size() */ env->me_branch_nodemax); + goto current; + } + + /* Back up original data item */ + dupdata_flag = 1; + dkey.iov_len = olddata.iov_len; + dkey.iov_base = memcpy(fp + 1, olddata.iov_base, olddata.iov_len); + + /* Make sub-page header for the dup items, with dummy body */ + fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP; + fp->mp_lower = 0; + xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; + if (mc->mc_db->md_flags & MDBX_DUPFIXED) { + fp->mp_flags |= P_LEAF2; + fp->mp_leaf2_ksize = (uint16_t)data->iov_len; + xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ + mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + } else { + xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + + (dkey.iov_len & 1) + (data->iov_len & 1); + mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + } + fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); + olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ + } else if (node_flags(node) & F_SUBDATA) { + /* Data is on sub-DB, just store it */ + flags |= F_DUPDATA | F_SUBDATA; + goto put_sub; + } else { + /* Data is on sub-page */ + fp = olddata.iov_base; + switch (flags) { + default: + if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) { + offset = node_size(data, nullptr) + sizeof(indx_t); + break; + } + offset = fp->mp_leaf2_ksize; + if (page_room(fp) < offset) { + offset *= 4; /* space for 4 more */ + break; + } + /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */ + __fallthrough; + case MDBX_CURRENT | MDBX_NODUPDATA: + case MDBX_CURRENT: + fp->mp_flags |= P_DIRTY; + fp->mp_pgno = mp->mp_pgno; + mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; + flags |= F_DUPDATA; + goto put_sub; + } + xdata.iov_len = olddata.iov_len + offset; + } + + fp_flags = fp->mp_flags; + if (NODESIZE + node_ks(node) + 
xdata.iov_len > + /* See note inside leaf_size() */ env->me_branch_nodemax) { + /* Too big for a sub-page, convert to sub-DB */ + fp_flags &= ~P_SUBP; + prep_subDB: + nested_dupdb.md_xsize = 0; + nested_dupdb.md_flags = 0; + if (mc->mc_db->md_flags & MDBX_DUPFIXED) { + fp_flags |= P_LEAF2; + nested_dupdb.md_xsize = fp->mp_leaf2_ksize; + if (mc->mc_db->md_flags & MDBX_INTEGERDUP) + nested_dupdb.md_flags = MDBX_INTEGERKEY; + } + nested_dupdb.md_depth = 1; + nested_dupdb.md_branch_pages = 0; + nested_dupdb.md_leaf_pages = 1; + nested_dupdb.md_overflow_pages = 0; + nested_dupdb.md_entries = page_numkeys(fp); + xdata.iov_len = sizeof(nested_dupdb); + xdata.iov_base = &nested_dupdb; + if ((rc = mdbx_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) + return rc; + mc->mc_db->md_leaf_pages += 1; + mdbx_cassert(mc, env->me_psize > olddata.iov_len); + offset = env->me_psize - (unsigned)olddata.iov_len; + flags |= F_DUPDATA | F_SUBDATA; + nested_dupdb.md_root = mp->mp_pgno; + nested_dupdb.md_seq = nested_dupdb.md_mod_txnid = 0; + sub_root = mp; + } + if (mp != fp) { + mp->mp_flags = fp_flags | P_DIRTY; + mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; + mp->mp_lower = fp->mp_lower; + mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); + mp->mp_upper = (indx_t)(fp->mp_upper + offset); + if (unlikely(fp_flags & P_LEAF2)) { + memcpy(page_data(mp), page_data(fp), + page_numkeys(fp) * fp->mp_leaf2_ksize); + } else { + memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, + (char *)fp + fp->mp_upper + PAGEHDRSZ, + olddata.iov_len - fp->mp_upper - PAGEHDRSZ); + memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), + page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); + for (i = 0; i < page_numkeys(fp); i++) { + mdbx_cassert(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); + mp->mp_ptrs[i] += (indx_t)offset; + } + } + } + + rdata = &xdata; + flags |= F_DUPDATA; + do_sub = true; + if (!insert_key) + mdbx_node_del(mc, 0); + goto new_sub; } - if (mdbx_audit_enabled()) { - int err = mdbx_cursor_check(mc, false); - if 
(unlikely(err != MDBX_SUCCESS)) - return err; + /* MDBX passes F_SUBDATA in 'flags' to write a DB record */ + if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) + return MDBX_INCOMPATIBLE; + + current: + if (data->iov_len == olddata.iov_len) { + mdbx_cassert(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); + /* same size, just replace it. Note that we could + * also reuse this node if the new data is smaller, + * but instead we opt to shrink the node in that case. */ + if (F_ISSET(flags, MDBX_RESERVE)) + data->iov_base = olddata.iov_base; + else if (!(mc->mc_flags & C_SUB)) + memcpy(olddata.iov_base, data->iov_base, data->iov_len); + else { + mdbx_cassert(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); + mdbx_cassert(mc, PAGETYPE(mc->mc_pg[mc->mc_top]) == P_LEAF); + mdbx_cassert(mc, node_ds(node) == 0); + mdbx_cassert(mc, node_flags(node) == 0); + mdbx_cassert(mc, key->iov_len < UINT16_MAX); + node_set_ks(node, key->iov_len); + memcpy(node_key(node), key->iov_base, key->iov_len); + mdbx_cassert(mc, (char *)node_key(node) + node_ds(node) < + (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + goto fix_parent; + } + + if (mdbx_audit_enabled()) { + int err = mdbx_cursor_check(mc, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + return MDBX_SUCCESS; } - return MDBX_SUCCESS; } mdbx_node_del(mc, 0); } @@ -8902,8 +11143,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, new_sub: nflags = flags & NODE_ADD_FLAGS; nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? 
key->iov_len - : mdbx_leaf_size(env, key, rdata); - if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { + : leaf_size(env, key, rdata); + if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { if ((flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) nflags &= ~MDBX_APPEND; /* sub-page may need room to grow */ if (!insert_key) @@ -8949,14 +11190,14 @@ new_sub: size_t ecount; put_sub: xdata.iov_len = 0; - xdata.iov_base = ""; - MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + xdata.iov_base = nullptr; + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (flags & MDBX_CURRENT) { xflags = (flags & MDBX_NODUPDATA) ? MDBX_CURRENT | MDBX_NOOVERWRITE | MDBX_NOSPILL : MDBX_CURRENT | MDBX_NOSPILL; } else { - rc2 = mdbx_xcursor_init1(mc, leaf); + rc2 = mdbx_xcursor_init1(mc, node); if (unlikely(rc2 != MDBX_SUCCESS)) return rc2; xflags = (flags & MDBX_NODUPDATA) ? MDBX_NOOVERWRITE | MDBX_NOSPILL @@ -8972,13 +11213,13 @@ new_sub: /* we've done our job */ dkey.iov_len = 0; } - if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { + if (!(node_flags(node) & F_SUBDATA) || sub_root) { /* Adjust other cursors pointing to mp */ MDBX_cursor *m2; MDBX_xcursor *mx = mc->mc_xcursor; unsigned i = mc->mc_top; MDBX_page *mp = mc->mc_pg[i]; - const int nkeys = NUMKEYS(mp); + const int nkeys = page_numkeys(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) @@ -9002,7 +11243,7 @@ new_sub: xflags |= MDBX_APPEND; rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); if (flags & F_SUBDATA) { - void *db = NODEDATA(leaf); + void *db = node_data(node); memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } insert_data = (ecount != (size_t)mc->mc_xcursor->mx_db.md_entries); @@ -9045,24 +11286,20 @@ fail: } int mdbx_cursor_del(MDBX_cursor *mc, unsigned flags) { - MDBX_node *leaf; - MDBX_page *mp; - int rc; - if (unlikely(!mc)) return MDBX_EINVAL; if (unlikely(mc->mc_signature != 
MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS - : MDBX_BAD_TXN; + int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; - if (unlikely(mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))) + if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) return MDBX_NOTFOUND; if (unlikely(!(flags & MDBX_NOSPILL) && @@ -9073,35 +11310,35 @@ int mdbx_cursor_del(MDBX_cursor *mc, unsigned flags) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - mp = mc->mc_pg[mc->mc_top]; + MDBX_page *mp = mc->mc_pg[mc->mc_top]; if (IS_LEAF2(mp)) goto del_key; - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { if (flags & MDBX_NODUPDATA) { /* mdbx_cursor_del0() will subtract the final entry */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { - if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + if (!F_ISSET(node_flags(node), F_SUBDATA)) { + mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); } rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); if (unlikely(rc)) return rc; /* If sub-DB still has entries, we're done */ if (mc->mc_xcursor->mx_db.md_entries) { - if (leaf->mn_flags & F_SUBDATA) { + if (node_flags(node) & F_SUBDATA) { /* update subDB info */ - void *db = NODEDATA(leaf); + void *db = node_data(node); memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } else { MDBX_cursor *m2; /* shrink fake page */ mdbx_node_shrink(mp, mc->mc_ki[mc->mc_top]); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - mc->mc_xcursor->mx_cursor.mc_pg[0] = 
NODEDATA(leaf); + node = page_node(mp, mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); /* fix other sub-DB cursors pointed at fake pages on this page */ for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) @@ -9109,15 +11346,15 @@ int mdbx_cursor_del(MDBX_cursor *mc, unsigned flags) { if (!(m2->mc_flags & C_INITIALIZED)) continue; if (m2->mc_pg[mc->mc_top] == mp) { - MDBX_node *n2 = leaf; - if (m2->mc_ki[mc->mc_top] >= NUMKEYS(mp)) + MDBX_node *inner = node; + if (m2->mc_ki[mc->mc_top] >= page_numkeys(mp)) continue; if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { - n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); - if (n2->mn_flags & F_SUBDATA) + inner = page_node(mp, m2->mc_ki[mc->mc_top]); + if (node_flags(inner) & F_SUBDATA) continue; } - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + m2->mc_xcursor->mx_cursor.mc_pg[0] = node_data(inner); } } } @@ -9131,30 +11368,25 @@ int mdbx_cursor_del(MDBX_cursor *mc, unsigned flags) { /* otherwise fall thru and delete the sub-DB */ } - if (leaf->mn_flags & F_SUBDATA) { + if (node_flags(node) & F_SUBDATA) { /* add all the child DB's pages to the free list */ - mc->mc_db->md_branch_pages -= mc->mc_xcursor->mx_db.md_branch_pages; - mc->mc_db->md_leaf_pages -= mc->mc_xcursor->mx_db.md_leaf_pages; - mc->mc_db->md_overflow_pages -= mc->mc_xcursor->mx_db.md_overflow_pages; rc = mdbx_drop0(&mc->mc_xcursor->mx_cursor, 0); if (unlikely(rc)) goto fail; } } /* MDBX passes F_SUBDATA in 'flags' to delete a DB record */ - else if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) { + else if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) { rc = MDBX_INCOMPATIBLE; goto fail; } /* add overflow pages to free list */ - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + if (F_ISSET(node_flags(node), F_BIGDATA)) { MDBX_page *omp; - pgno_t pg; - - memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if (unlikely((rc = mdbx_page_get(mc, pg, &omp, NULL)) || - (rc = 
mdbx_ovpage_free(mc, omp)))) + if (unlikely( + (rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, NULL)) || + (rc = mdbx_page_retire(mc, omp)))) goto fail; } @@ -9183,87 +11415,28 @@ static int mdbx_page_new(MDBX_cursor *mc, unsigned flags, unsigned num, if (unlikely((rc = mdbx_page_alloc(mc, num, &np, MDBX_ALLOC_ALL)))) return rc; + *mp = np; mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); np->mp_flags = (uint16_t)(flags | P_DIRTY); np->mp_lower = 0; np->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); - if (IS_BRANCH(np)) - mc->mc_db->md_branch_pages++; - else if (IS_LEAF(np)) - mc->mc_db->md_leaf_pages++; - else { - mdbx_cassert(mc, IS_OVERFLOW(np)); + mc->mc_db->md_branch_pages += IS_BRANCH(np); + mc->mc_db->md_leaf_pages += IS_LEAF(np); + if (unlikely(IS_OVERFLOW(np))) { mc->mc_db->md_overflow_pages += num; np->mp_pages = num; - } - - if (unlikely(mc->mc_flags & C_SUB)) { + mdbx_cassert(mc, !(mc->mc_flags & C_SUB)); + } else if (unlikely(mc->mc_flags & C_SUB)) { MDBX_db *outer = mdbx_outer_db(mc); - if (IS_BRANCH(np)) - outer->md_branch_pages++; - else if (IS_LEAF(np)) - outer->md_leaf_pages++; - else { - mdbx_cassert(mc, IS_OVERFLOW(np)); - outer->md_overflow_pages += num; - } + outer->md_branch_pages += IS_BRANCH(np); + outer->md_leaf_pages += IS_LEAF(np); } - *mp = np; return MDBX_SUCCESS; } -/* Calculate the size of a leaf node. - * - * The size depends on the environment's page size; if a data item - * is too large it will be put onto an overflow page and the node - * size will only include the key and not the data. Sizes are always - * rounded up to an even number of bytes, to guarantee 2-byte alignment - * of the MDBX_node headers. - * - * [in] env The environment handle. - * [in] key The key for the node. - * [in] data The data for the node. - * - * Returns The number of bytes needed to store the node. 
*/ -static __inline size_t mdbx_leaf_size(MDBX_env *env, const MDBX_val *key, - const MDBX_val *data) { - size_t sz = LEAFSIZE(key, data); - if (sz > env->me_nodemax) { - /* put on overflow page */ - sz = sz - data->iov_len + sizeof(pgno_t); - } - - return EVEN(sz) + sizeof(indx_t); -} - -/* Calculate the size of a branch node. - * - * The size should depend on the environment's page size but since - * we currently don't support spilling large keys onto overflow - * pages, it's simply the size of the MDBX_node header plus the - * size of the key. Sizes are always rounded up to an even number - * of bytes, to guarantee 2-byte alignment of the MDBX_node headers. - * - * [in] env The environment handle. - * [in] key The key for the node. - * - * Returns The number of bytes needed to store the node. */ -static __inline size_t mdbx_branch_size(MDBX_env *env, const MDBX_val *key) { - size_t sz = INDXSIZE(key); - if (unlikely(sz > env->me_nodemax)) { - /* put on overflow page */ - /* not implemented */ - mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __FUNCTION__, - __LINE__); - sz = sz - key->iov_len + sizeof(pgno_t); - } - - return EVEN(sz) + sizeof(indx_t); -} - static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, unsigned indx, const MDBX_val *key) { @@ -9278,36 +11451,26 @@ static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, mdbx_cassert(mc, PAGETYPE(mp) == (P_LEAF | P_LEAF2)); const unsigned ksize = mc->mc_db->md_xsize; mdbx_cassert(mc, ksize == key->iov_len); + const unsigned nkeys = page_numkeys(mp); - const int room = SIZELEFT(mp); - mdbx_cassert(mc, room >= (int)ksize); - if (unlikely(room < (int)ksize)) { - bailout: + /* Just using these for counting */ + const intptr_t lower = mp->mp_lower + sizeof(indx_t); + const intptr_t upper = mp->mp_upper - (ksize - sizeof(indx_t)); + if (unlikely(lower > upper)) { mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_PAGE_FULL; } + mp->mp_lower = (indx_t)lower; + mp->mp_upper = 
(indx_t)upper; - char *const ptr = LEAF2KEY(mp, indx, ksize); - mdbx_cassert(mc, NUMKEYS(mp) >= indx); - if (unlikely(NUMKEYS(mp) < indx)) - goto bailout; - - const unsigned diff = NUMKEYS(mp) - indx; + char *const ptr = page_leaf2key(mp, indx, ksize); + mdbx_cassert(mc, nkeys >= indx); + const unsigned diff = nkeys - indx; if (likely(diff > 0)) /* Move higher keys up one slot. */ memmove(ptr + ksize, ptr, diff * ksize); /* insert new key */ memcpy(ptr, key->iov_base, ksize); - - /* Just using these for counting */ - mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); - mp->mp_lower += sizeof(indx_t); - mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t)); - mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); - - mdbx_cassert(mc, - mp->mp_upper >= mp->mp_lower && - PAGEHDRSZ + mp->mp_upper <= mc->mc_txn->mt_env->me_psize); return MDBX_SUCCESS; } @@ -9325,48 +11488,33 @@ static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, mdbx_cassert(mc, PAGETYPE(mp) == P_BRANCH); STATIC_ASSERT(NODESIZE % 2 == 0); - const size_t room = SIZELEFT(mp); - const size_t node_size = - likely(key != NULL) ? NODESIZE + EVEN(key->iov_len) : NODESIZE; - mdbx_cassert(mc, mdbx_branch_size(mc->mc_txn->mt_env, key) == - node_size + sizeof(indx_t)); - mdbx_cassert(mc, room >= node_size + sizeof(indx_t)); - if (unlikely(room < node_size + sizeof(indx_t))) { - bailout: - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PAGE_FULL; - } - - const unsigned numkeys = NUMKEYS(mp); - mdbx_cassert(mc, numkeys >= indx); - if (unlikely(numkeys < indx)) - goto bailout; - /* Move higher pointers up one slot. */ - for (unsigned i = numkeys; i > indx; --i) + const unsigned nkeys = page_numkeys(mp); + mdbx_cassert(mc, nkeys >= indx); + for (unsigned i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; /* Adjust free space offsets. 
*/ - const size_t ofs = mp->mp_upper - node_size; - mdbx_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); - mdbx_cassert(mc, ofs <= UINT16_MAX); - mp->mp_ptrs[indx] = (uint16_t)ofs; - mp->mp_upper = (uint16_t)ofs; - mp->mp_lower += sizeof(indx_t); + const size_t branch_bytes = branch_size(mc->mc_txn->mt_env, key); + const intptr_t lower = mp->mp_lower + sizeof(indx_t); + const intptr_t upper = mp->mp_upper - (branch_bytes - sizeof(indx_t)); + if (unlikely(lower > upper)) { + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_FULL; + } + mp->mp_lower = (indx_t)lower; + mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper; /* Write the node data. */ - MDBX_node *node = NODEPTR(mp, indx); - SETPGNO(node, pgno); - node->mn_ksize = 0; - node->mn_flags = 0; + MDBX_node *node = page_node(mp, indx); + node_set_pgno(node, pgno); + node_set_flags(node, 0); + UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0); + node_set_ks(node, 0); if (likely(key != NULL)) { - node->mn_ksize = (uint16_t)key->iov_len; - memcpy(NODEKEY(node), key->iov_base, key->iov_len); + node_set_ks(node, key->iov_len); + memcpy(node_key(node), key->iov_base, key->iov_len); } - - mdbx_cassert(mc, - mp->mp_upper >= mp->mp_lower && - PAGEHDRSZ + mp->mp_upper <= mc->mc_txn->mt_env->me_psize); return MDBX_SUCCESS; } @@ -9385,66 +11533,55 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, mdbx_cassert(mc, PAGETYPE(mp) == P_LEAF); MDBX_page *largepage = NULL; - const size_t room = SIZELEFT(mp); - size_t node_size = NODESIZE + key->iov_len; + size_t leaf_bytes = 0; if (unlikely(flags & F_BIGDATA)) { /* Data already on overflow page. 
*/ STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); - node_size += sizeof(pgno_t); - } else if (unlikely(node_size + data->iov_len > - mc->mc_txn->mt_env->me_nodemax)) { - const pgno_t ovpages = OVPAGES(mc->mc_txn->mt_env, data->iov_len); + leaf_bytes = node_size(key, nullptr) + sizeof(pgno_t) + sizeof(indx_t); + } else if (unlikely(node_size(key, data) > + /* See note inside leaf_size() */ + mc->mc_txn->mt_env->me_branch_nodemax)) { /* Put data on overflow page. */ - mdbx_debug("data size is %" PRIuPTR ", node would be %" PRIuPTR - ", put data on %u-overflow page(s)", - data->iov_len, node_size + data->iov_len, ovpages); + mdbx_cassert(mc, !F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)); + const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); int rc = mdbx_page_new(mc, P_OVERFLOW, ovpages, &largepage); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_debug("allocated overflow page %" PRIaPGNO, largepage->mp_pgno); + mdbx_debug("allocated %u overflow page(s) %" PRIaPGNO "for %" PRIuPTR + " data bytes", + largepage->mp_pages, largepage->mp_pgno, data->iov_len); flags |= F_BIGDATA; - node_size += sizeof(pgno_t); - mdbx_cassert(mc, mdbx_leaf_size(mc->mc_txn->mt_env, key, data) == - EVEN(node_size) + sizeof(indx_t)); + leaf_bytes = node_size(key, nullptr) + sizeof(pgno_t) + sizeof(indx_t); } else { - node_size += data->iov_len; - mdbx_cassert(mc, mdbx_leaf_size(mc->mc_txn->mt_env, key, data) == - EVEN(node_size) + sizeof(indx_t)); - } - - node_size = EVEN(node_size); - mdbx_cassert(mc, room >= node_size + sizeof(indx_t)); - if (unlikely(room < node_size + sizeof(indx_t))) { - bailout: - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PAGE_FULL; + leaf_bytes = node_size(key, data) + sizeof(indx_t); } - - const unsigned numkeys = NUMKEYS(mp); - mdbx_cassert(mc, numkeys >= indx); - if (unlikely(numkeys < indx)) - goto bailout; + mdbx_cassert(mc, leaf_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); /* Move higher pointers up one slot. 
*/ - for (unsigned i = numkeys; i > indx; --i) + const unsigned nkeys = page_numkeys(mp); + mdbx_cassert(mc, nkeys >= indx); + for (unsigned i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; /* Adjust free space offsets. */ - const size_t ofs = mp->mp_upper - node_size; - mdbx_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); - mdbx_cassert(mc, ofs <= UINT16_MAX); - mp->mp_ptrs[indx] = (uint16_t)ofs; - mp->mp_upper = (uint16_t)ofs; - mp->mp_lower += sizeof(indx_t); + const intptr_t lower = mp->mp_lower + sizeof(indx_t); + const intptr_t upper = mp->mp_upper - (leaf_bytes - sizeof(indx_t)); + if (unlikely(lower > upper)) { + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_FULL; + } + mp->mp_lower = (indx_t)lower; + mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper; /* Write the node data. */ - MDBX_node *node = NODEPTR(mp, indx); - node->mn_ksize = (uint16_t)key->iov_len; - node->mn_flags = (uint16_t)flags; - SETDSZ(node, data->iov_len); - memcpy(NODEKEY(node), key->iov_base, key->iov_len); - - void *nodedata = NODEDATA(node); + MDBX_node *node = page_node(mp, indx); + node_set_ks(node, key->iov_len); + node_set_flags(node, (uint8_t)flags); + UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0); + node_set_ds(node, data->iov_len); + memcpy(node_key(node), key->iov_base, key->iov_len); + + void *nodedata = node_data(node); if (likely(largepage == NULL)) { if (unlikely(flags & F_BIGDATA)) memcpy(nodedata, data->iov_base, sizeof(pgno_t)); @@ -9453,17 +11590,13 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, else if (likely(nodedata != data->iov_base)) memcpy(nodedata, data->iov_base, data->iov_len); } else { - memcpy(nodedata, &largepage->mp_pgno, sizeof(pgno_t)); - nodedata = PAGEDATA(largepage); + poke_pgno(nodedata, largepage->mp_pgno); + nodedata = page_data(largepage); if (unlikely(flags & MDBX_RESERVE)) data->iov_base = nodedata; else if (likely(nodedata != data->iov_base)) memcpy(nodedata, data->iov_base, data->iov_len); } 
- - mdbx_cassert(mc, - mp->mp_upper >= mp->mp_lower && - PAGEHDRSZ + mp->mp_upper <= mc->mc_txn->mt_env->me_psize); return MDBX_SUCCESS; } @@ -9473,20 +11606,20 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, * part of a MDBX_DUPFIXED database. */ static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - indx_t indx = mc->mc_ki[mc->mc_top]; - indx_t i, j, numkeys, ptr; + int indx = mc->mc_ki[mc->mc_top]; + int i, j, nkeys, ptr; MDBX_node *node; char *base; mdbx_debug("delete node %u on %s page %" PRIaPGNO, indx, IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); - numkeys = NUMKEYS(mp); - mdbx_cassert(mc, indx < numkeys); + nkeys = page_numkeys(mp); + mdbx_cassert(mc, indx < nkeys); if (IS_LEAF2(mp)) { mdbx_cassert(mc, ksize >= sizeof(indx_t)); - unsigned diff = numkeys - 1 - indx; - base = LEAF2KEY(mp, indx, ksize); + unsigned diff = nkeys - 1 - indx; + base = page_leaf2key(mp, indx, ksize); if (diff) memmove(base, base + ksize, diff * ksize); mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); @@ -9497,18 +11630,18 @@ static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { return; } - node = NODEPTR(mp, indx); - size_t sz = NODESIZE + node->mn_ksize; + node = page_node(mp, indx); + size_t sz = NODESIZE + node_ks(node); if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) + if (F_ISSET(node_flags(node), F_BIGDATA)) sz += sizeof(pgno_t); else - sz += NODEDSZ(node); + sz += node_ds(node); } sz = EVEN(sz); ptr = mp->mp_ptrs[indx]; - for (i = j = 0; i < numkeys; i++) { + for (i = j = 0; i < nkeys; i++) { if (i != indx) { mp->mp_ptrs[j] = mp->mp_ptrs[i]; if (mp->mp_ptrs[i] < ptr) { @@ -9538,9 +11671,9 @@ static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { size_t nsize, delta, len, ptr; int i; - node = NODEPTR(mp, indx); - sp = (MDBX_page *)NODEDATA(node); - delta = SIZELEFT(sp); + node = page_node(mp, indx); + sp = (MDBX_page *)node_data(node); + delta = page_room(sp); assert(delta > 0); /* 
Prepare to shift upward, set len = length(subpage part to shift) */ @@ -9548,28 +11681,28 @@ static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { delta &= /* do not make the node uneven-sized */ ~1u; if (unlikely(delta) == 0) return; - nsize = NODEDSZ(node) - delta; + nsize = node_ds(node) - delta; assert(nsize % 1 == 0); len = nsize; } else { xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */ - for (i = NUMKEYS(sp); --i >= 0;) { + for (i = page_numkeys(sp); --i >= 0;) { assert(sp->mp_ptrs[i] >= delta); xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta); } - nsize = NODEDSZ(node) - delta; + nsize = node_ds(node) - delta; len = PAGEHDRSZ; } sp->mp_upper = sp->mp_lower; sp->mp_pgno = mp->mp_pgno; - SETDSZ(node, nsize); + node_set_ds(node, nsize); /* Shift <lower nodes...initial part of subpage> upward */ base = (char *)mp + mp->mp_upper + PAGEHDRSZ; memmove(base + delta, base, (char *)sp + len - base); ptr = mp->mp_ptrs[indx]; - for (i = NUMKEYS(mp); --i >= 0;) { + for (i = page_numkeys(mp); --i >= 0;) { if (mp->mp_ptrs[i] <= ptr) { assert((size_t)UINT16_MAX - mp->mp_ptrs[i] >= delta); mp->mp_ptrs[i] += (indx_t)delta; @@ -9620,21 +11753,25 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node) { if (unlikely(mx == nullptr)) return MDBX_CORRUPTED; - if (node->mn_flags & F_SUBDATA) { - memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDBX_db)); + if (node_flags(node) & F_SUBDATA) { + if (unlikely(node_ds(node) != sizeof(MDBX_db))) + return MDBX_CORRUPTED; + memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_SUB; } else { - MDBX_page *fp = NODEDATA(node); + if (unlikely(node_ds(node) <= PAGEHDRSZ)) + return MDBX_CORRUPTED; + MDBX_page *fp = node_data(node); mx->mx_db.md_xsize = 0; mx->mx_db.md_flags = 0; mx->mx_db.md_depth = 1; mx->mx_db.md_branch_pages = 0; mx->mx_db.md_leaf_pages = 1; mx->mx_db.md_overflow_pages = 0; - 
mx->mx_db.md_entries = NUMKEYS(fp); + mx->mx_db.md_entries = page_numkeys(fp); mx->mx_db.md_root = fp->mp_pgno; mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; @@ -9642,21 +11779,14 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node) { mx->mx_cursor.mc_pg[0] = fp; mx->mx_cursor.mc_ki[0] = 0; if (mc->mc_db->md_flags & MDBX_DUPFIXED) { - mx->mx_db.md_flags = MDBX_DUPFIXED; mx->mx_db.md_xsize = fp->mp_leaf2_ksize; if (mc->mc_db->md_flags & MDBX_INTEGERDUP) - mx->mx_db.md_flags |= MDBX_INTEGERKEY; + mx->mx_db.md_flags = MDBX_INTEGERKEY; } } mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, mx->mx_db.md_root); mx->mx_dbflag = DB_VALID | DB_USRVALID | DB_DUPDATA; - /* FIXME: #if UINT_MAX < SIZE_MAX - if (mx->mx_dbx.md_cmp == mdbx_cmp_int && mx->mx_db.md_pad == - sizeof(size_t)) - mx->mx_dbx.md_cmp = mdbx_cmp_clong; - #endif */ - return MDBX_SUCCESS; } @@ -9727,23 +11857,19 @@ static int mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { } int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { - if (unlikely(!ret || !txn)) + if (unlikely(!ret)) return MDBX_EINVAL; + *ret = NULL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; - - if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) - return MDBX_EINVAL; + if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_RDONLY))) + return MDBX_EACCESS; const size_t size = (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) ? 
sizeof(MDBX_cursor_couple) @@ -9751,7 +11877,7 @@ int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { MDBX_cursor *mc; if (likely((mc = mdbx_malloc(size)) != NULL)) { - int rc = mdbx_cursor_init(mc, txn, dbi); + rc = mdbx_cursor_init(mc, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_free(mc); return rc; @@ -9770,22 +11896,17 @@ int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { } int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { - if (unlikely(!mc || !txn)) + if (unlikely(!mc)) return MDBX_EINVAL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return txn->mt_owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; - - if (unlikely(txn->mt_flags & (MDBX_TXN_FINISHED | MDBX_TXN_ERROR))) - return MDBX_BAD_TXN; - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE && mc->mc_signature != MDBX_MC_READY4CLOSE)) return MDBX_EINVAL; + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (unlikely(!TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))) return MDBX_EINVAL; @@ -9809,19 +11930,17 @@ int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { /* Return the count of duplicate data items for the current key */ int mdbx_cursor_count(MDBX_cursor *mc, size_t *countp) { - if (unlikely(mc == NULL || countp == NULL)) + if (unlikely(mc == NULL)) return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(mc->mc_txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; - - if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; if (!mc->mc_snum) { @@ -9830,15 +11949,15 @@ int 
mdbx_cursor_count(MDBX_cursor *mc, size_t *countp) { } MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) { + if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) { *countp = 0; return MDBX_NOTFOUND; } *countp = 1; if (mc->mc_xcursor != NULL) { - MDBX_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) @@ -9902,34 +12021,34 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { char *base; size_t len; int delta, ksize, oksize; - indx_t ptr, i, numkeys, indx; + int ptr, i, nkeys, indx; DKBUF; indx = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; - node = NODEPTR(mp, indx); + node = page_node(mp, indx); ptr = mp->mp_ptrs[indx]; if (MDBX_DEBUG) { MDBX_val k2; char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; - k2.iov_base = NODEKEY(node); - k2.iov_len = node->mn_ksize; + k2.iov_base = node_key(node); + k2.iov_len = node_ks(node); mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %" PRIaPGNO, indx, - ptr, mdbx_dkey(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), + ptr, mdbx_dump_val(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), mp->mp_pgno); } /* Sizes must be 2-byte aligned. */ ksize = EVEN(key->iov_len); - oksize = EVEN(node->mn_ksize); + oksize = EVEN(node_ks(node)); delta = ksize - oksize; /* Shift node contents if EVEN(key length) changed. 
*/ if (delta) { - if (SIZELEFT(mp) < delta) { + if (delta > (int)page_room(mp)) { /* not enough space left, do a delete and split */ mdbx_debug("Not enough room, delta = %d, splitting...", delta); - pgno_t pgno = NODEPGNO(node); + pgno_t pgno = node_pgno(node); mdbx_node_del(mc, 0); int rc = mdbx_page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) @@ -9937,8 +12056,8 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { return rc; } - numkeys = NUMKEYS(mp); - for (i = 0; i < numkeys; i++) { + nkeys = page_numkeys(mp); + for (i = 0; i < nkeys; i++) { if (mp->mp_ptrs[i] <= ptr) { mdbx_cassert(mc, mp->mp_ptrs[i] >= delta); mp->mp_ptrs[i] -= (indx_t)delta; @@ -9951,30 +12070,24 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { mdbx_cassert(mc, mp->mp_upper >= delta); mp->mp_upper -= (indx_t)delta; - node = NODEPTR(mp, indx); + node = page_node(mp, indx); } /* But even if no shift was needed, update ksize */ - if (node->mn_ksize != key->iov_len) - node->mn_ksize = (uint16_t)key->iov_len; + node_set_ks(node, key->iov_len); - memcpy(NODEKEY(node), key->iov_base, key->iov_len); + if (key->iov_len) + memcpy(node_key(node), key->iov_base, key->iov_len); return MDBX_SUCCESS; } -static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); - /* Move a node from csrc to cdst. */ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { int rc; DKBUF; - /* Mark src and dst as dirty. 
*/ - if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) - return rc; - - MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; - MDBX_page *const pdst = cdst->mc_pg[cdst->mc_top]; + MDBX_page *psrc = csrc->mc_pg[csrc->mc_top]; + MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); mdbx_cassert(csrc, csrc->mc_dbi == cdst->mc_dbi); mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); @@ -9987,33 +12100,37 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { MDBX_val key4move; switch (PAGETYPE(psrc)) { case P_BRANCH: { - const MDBX_node *srcnode = NODEPTR(psrc, csrc->mc_ki[csrc->mc_top]); - mdbx_cassert(csrc, srcnode->mn_flags == 0); - const pgno_t srcpg = NODEPGNO(srcnode); - key4move.iov_len = NODEKSZ(srcnode); - key4move.iov_base = NODEKEY(srcnode); + const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); + mdbx_cassert(csrc, node_flags(srcnode) == 0); + const pgno_t srcpg = node_pgno(srcnode); + key4move.iov_len = node_ks(srcnode); + key4move.iov_base = node_key(srcnode); + if (csrc->mc_ki[csrc->mc_top] == 0) { const uint16_t snum = csrc->mc_snum; mdbx_cassert(csrc, snum > 0); /* must find the lowest key below src */ rc = mdbx_page_search_lowest(csrc); - MDBX_page *psrc2 = csrc->mc_pg[csrc->mc_top]; + MDBX_page *lowest_page = csrc->mc_pg[csrc->mc_top]; if (unlikely(rc)) return rc; - mdbx_cassert(csrc, IS_LEAF(psrc2)); - if (unlikely(!IS_LEAF(psrc2))) + mdbx_cassert(csrc, IS_LEAF(lowest_page)); + if (unlikely(!IS_LEAF(lowest_page))) goto bailout; - if (IS_LEAF2(psrc2)) { + if (IS_LEAF2(lowest_page)) { key4move.iov_len = csrc->mc_db->md_xsize; - key4move.iov_base = LEAF2KEY(psrc2, 0, key4move.iov_len); + key4move.iov_base = page_leaf2key(lowest_page, 0, key4move.iov_len); } else { - const MDBX_node *s2 = NODEPTR(psrc2, 0); - key4move.iov_len = NODEKSZ(s2); - key4move.iov_base = NODEKEY(s2); + const MDBX_node *lowest_node = page_node(lowest_page, 0); + 
key4move.iov_len = node_ks(lowest_node); + key4move.iov_base = node_key(lowest_node); } + + /* restore cursor after mdbx_page_search_lowest() */ csrc->mc_snum = snum; csrc->mc_top = snum - 1; csrc->mc_ki[csrc->mc_top] = 0; + /* paranoia */ mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); mdbx_cassert(csrc, IS_BRANCH(psrc)); @@ -10031,25 +12148,53 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { rc = mdbx_page_search_lowest(&mn); if (unlikely(rc)) return rc; - MDBX_page *const pdst2 = mn.mc_pg[mn.mc_top]; - mdbx_cassert(cdst, IS_LEAF(pdst2)); - if (unlikely(!IS_LEAF(pdst2))) + MDBX_page *const lowest_page = mn.mc_pg[mn.mc_top]; + mdbx_cassert(cdst, IS_LEAF(lowest_page)); + if (unlikely(!IS_LEAF(lowest_page))) goto bailout; MDBX_val key; - if (IS_LEAF2(pdst2)) { + if (IS_LEAF2(lowest_page)) { key.iov_len = mn.mc_db->md_xsize; - key.iov_base = LEAF2KEY(pdst2, 0, key.iov_len); + key.iov_base = page_leaf2key(lowest_page, 0, key.iov_len); } else { - MDBX_node *s2 = NODEPTR(pdst2, 0); - key.iov_len = NODEKSZ(s2); - key.iov_base = NODEKEY(s2); + MDBX_node *lowest_node = page_node(lowest_page, 0); + key.iov_len = node_ks(lowest_node); + key.iov_base = node_key(lowest_node); } + + /* restore cursor after mdbx_page_search_lowest() */ mn.mc_snum = snum; mn.mc_top = snum - 1; mn.mc_ki[mn.mc_top] = 0; + + const intptr_t delta = + EVEN(key.iov_len) - EVEN(node_ks(page_node(mn.mc_pg[mn.mc_top], 0))); + const intptr_t needed = + branch_size(cdst->mc_txn->mt_env, &key4move) + delta; + const intptr_t have = page_room(pdst); + if (unlikely(needed > have)) + return MDBX_RESULT_TRUE; + + if (unlikely((rc = mdbx_page_touch(csrc)) || + (rc = mdbx_page_touch(cdst)))) + return rc; + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + rc = mdbx_update_key(&mn, &key); if (unlikely(rc)) return rc; + } else { + const size_t needed = branch_size(cdst->mc_txn->mt_env, &key4move); + const size_t have = page_room(pdst); + if 
(unlikely(needed > have)) + return MDBX_RESULT_TRUE; + + if (unlikely((rc = mdbx_page_touch(csrc)) || + (rc = mdbx_page_touch(cdst)))) + return rc; + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; } mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO @@ -10062,25 +12207,35 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { } break; case P_LEAF: { - const MDBX_node *srcnode = NODEPTR(psrc, csrc->mc_ki[csrc->mc_top]); + /* Mark src and dst as dirty. */ + if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + return rc; + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); MDBX_val data; - data.iov_len = NODEDSZ(srcnode); - data.iov_base = NODEDATA(srcnode); - key4move.iov_len = NODEKSZ(srcnode); - key4move.iov_base = NODEKEY(srcnode); + data.iov_len = node_ds(srcnode); + data.iov_base = node_data(srcnode); + key4move.iov_len = node_ks(srcnode); + key4move.iov_base = node_key(srcnode); mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO " to node %u on page %" PRIaPGNO, "leaf", csrc->mc_ki[csrc->mc_top], DKEY(&key4move), psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ rc = mdbx_node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, - srcnode->mn_flags); + node_flags(srcnode)); } break; case P_LEAF | P_LEAF2: { + /* Mark src and dst as dirty. 
*/ + if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + return rc; + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; key4move.iov_len = csrc->mc_db->md_xsize; key4move.iov_base = - LEAF2KEY(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len); + page_leaf2key(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len); mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO " to node %u on page %" PRIaPGNO, "leaf2", csrc->mc_ki[csrc->mc_top], DKEY(&key4move), @@ -10160,11 +12315,11 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { MDBX_val key; if (IS_LEAF2(psrc)) { key.iov_len = psrc->mp_leaf2_ksize; - key.iov_base = LEAF2KEY(psrc, 0, key.iov_len); + key.iov_base = page_leaf2key(psrc, 0, key.iov_len); } else { - MDBX_node *srcnode = NODEPTR(psrc, 0); - key.iov_len = NODEKSZ(srcnode); - key.iov_base = NODEKEY(srcnode); + MDBX_node *srcnode = page_node(psrc, 0); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); } mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]", psrc->mp_pgno, DKEY(&key)); @@ -10195,11 +12350,11 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { MDBX_val key; if (IS_LEAF2(pdst)) { key.iov_len = pdst->mp_leaf2_ksize; - key.iov_base = LEAF2KEY(pdst, 0, key.iov_len); + key.iov_base = page_leaf2key(pdst, 0, key.iov_len); } else { - MDBX_node *srcnode = NODEPTR(pdst, 0); - key.iov_len = NODEKSZ(srcnode); - key.iov_base = NODEKEY(srcnode); + MDBX_node *srcnode = page_node(pdst, 0); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); } mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]", pdst->mp_pgno, DKEY(&key)); @@ -10237,19 +12392,12 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { * * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { - MDBX_node *srcnode; MDBX_val key; int rc; mdbx_cassert(csrc, csrc != cdst); - - /* Mark dst as dirty. */ - if (unlikely(rc = mdbx_page_touch(cdst))) - return rc; - - MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; - MDBX_page *const pdst = cdst->mc_pg[cdst->mc_top]; - + const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; + MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, pdst->mp_pgno); @@ -10262,24 +12410,35 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); mdbx_cassert(csrc, csrc->mc_snum < csrc->mc_db->md_depth || IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); + mdbx_cassert(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); const int pagetype = PAGETYPE(psrc); /* Move all nodes from src to dst */ - const unsigned nkeys = NUMKEYS(pdst); - unsigned j = nkeys; - if (unlikely(pagetype & P_LEAF2)) { - key.iov_len = csrc->mc_db->md_xsize; - key.iov_base = PAGEDATA(psrc); - for (unsigned i = 0; i < NUMKEYS(psrc); i++, j++) { - rc = mdbx_node_add_leaf2(cdst, j, &key); - if (unlikely(rc != MDBX_SUCCESS)) + const unsigned dst_nkeys = page_numkeys(pdst); + const unsigned src_nkeys = page_numkeys(psrc); + mdbx_cassert(cdst, dst_nkeys + src_nkeys >= + (unsigned)(IS_LEAF(psrc) ? 1 : MDBX_MINKEYS)); + if (likely(src_nkeys)) { + unsigned j = dst_nkeys; + if (unlikely(pagetype & P_LEAF2)) { + /* Mark dst as dirty. 
*/ + if (unlikely(rc = mdbx_page_touch(cdst))) return rc; - key.iov_base = (char *)key.iov_base + key.iov_len; - } - } else { - for (unsigned i = 0; i < NUMKEYS(psrc); i++, j++) { - srcnode = NODEPTR(psrc, i); - if (i == 0 && (pagetype & P_BRANCH)) { + + key.iov_len = csrc->mc_db->md_xsize; + key.iov_base = page_data(psrc); + unsigned i = 0; + do { + rc = mdbx_node_add_leaf2(cdst, j++, &key); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + key.iov_base = (char *)key.iov_base + key.iov_len; + } while (++i != src_nkeys); + } else { + MDBX_node *srcnode = page_node(psrc, 0); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); + if (pagetype & P_BRANCH) { MDBX_cursor mn; mdbx_cursor_copy(csrc, &mn); mn.mc_xcursor = NULL; @@ -10287,39 +12446,51 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { rc = mdbx_page_search_lowest(&mn); if (unlikely(rc)) return rc; - if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { - key.iov_len = mn.mc_db->md_xsize; - key.iov_base = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.iov_len); + MDBX_node *lowest = page_node(mn.mc_pg[mn.mc_top], 0); + key.iov_len = node_ks(lowest); + key.iov_base = node_key(lowest); + + const size_t dst_room = page_room(pdst); + const size_t src_used = page_used(cdst->mc_txn->mt_env, psrc); + const size_t space_needed = src_used - node_ks(srcnode) + key.iov_len; + if (unlikely(space_needed > dst_room)) + return MDBX_RESULT_TRUE; + } + + /* Mark dst as dirty. 
*/ + if (unlikely(rc = mdbx_page_touch(cdst))) + return rc; + + unsigned i = 0; + while (true) { + if (pagetype & P_LEAF) { + MDBX_val data; + data.iov_len = node_ds(srcnode); + data.iov_base = node_data(srcnode); + rc = mdbx_node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); } else { - MDBX_node *s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); - key.iov_len = NODEKSZ(s2); - key.iov_base = NODEKEY(s2); + mdbx_cassert(csrc, node_flags(srcnode) == 0); + rc = mdbx_node_add_branch(cdst, j++, &key, node_pgno(srcnode)); } - } else { - key.iov_len = srcnode->mn_ksize; - key.iov_base = NODEKEY(srcnode); - } + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (pagetype & P_LEAF) { - MDBX_val data; - data.iov_len = NODEDSZ(srcnode); - data.iov_base = NODEDATA(srcnode); - rc = mdbx_node_add_leaf(cdst, j, &key, &data, srcnode->mn_flags); - } else { - mdbx_cassert(csrc, srcnode->mn_flags == 0); - rc = mdbx_node_add_branch(cdst, j, &key, NODEPGNO(srcnode)); + if (++i == src_nkeys) + break; + srcnode = page_node(psrc, i); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); } - if (unlikely(rc != MDBX_SUCCESS)) - return rc; } - } - mdbx_debug("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", - pdst->mp_pgno, NUMKEYS(pdst), - PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10.24); + pdst = cdst->mc_pg[cdst->mc_top]; + mdbx_debug("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", + pdst->mp_pgno, page_numkeys(pdst), + page_fill(cdst->mc_txn->mt_env, pdst)); - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + } /* Unlink the src page from parent and add to free list. 
*/ csrc->mc_top--; @@ -10349,8 +12520,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { continue; if (m3->mc_pg[top] == psrc) { m3->mc_pg[top] = pdst; - mdbx_cassert(m3, nkeys + m3->mc_ki[top] <= UINT16_MAX); - m3->mc_ki[top] += (indx_t)nkeys; + mdbx_cassert(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); + m3->mc_ki[top] += (indx_t)dst_nkeys; m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && m3->mc_ki[top - 1] > csrc->mc_ki[top - 1]) { @@ -10361,9 +12532,9 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } } - /* If not operating on FreeDB, allow this page to be reused + /* If not operating on GC, allow this page to be reused * in this txn. Otherwise just add to free list. */ - rc = mdbx_page_loose(csrc, psrc); + rc = mdbx_page_retire(csrc, (MDBX_page *)psrc); if (unlikely(rc)) return rc; @@ -10391,6 +12562,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { return MDBX_SUCCESS; } + mdbx_cassert(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); + if (pagetype != PAGETYPE(top_page)) { /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */ goto bailout; @@ -10453,8 +12626,6 @@ bailout: * [in] csrc The cursor to copy from. * [out] cdst The cursor to copy to. */ static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { - unsigned i; - mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= *csrc->mc_txn->mt_env->me_oldest); cdst->mc_txn = csrc->mc_txn; @@ -10465,7 +12636,7 @@ static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; - for (i = 0; i < csrc->mc_snum; i++) { + for (unsigned i = 0; i < csrc->mc_snum; i++) { cdst->mc_pg[i] = csrc->mc_pg[i]; cdst->mc_ki[i] = csrc->mc_ki[i]; } @@ -10475,28 +12646,34 @@ static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { * [in] mc Cursor pointing to the page where rebalancing should begin. 
* Returns 0 on success, non-zero on failure. */ static int mdbx_rebalance(MDBX_cursor *mc) { - MDBX_node *node; int rc; - unsigned minkeys, thresh; mdbx_cassert(mc, mc->mc_snum > 0); mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); const int pagetype = PAGETYPE(mc->mc_pg[mc->mc_top]); - if (pagetype == P_BRANCH) { - minkeys = 2; - thresh = 1; - } else { - minkeys = 1; - thresh = FILL_THRESHOLD; - } + + const unsigned minkeys = (P_BRANCH == 1) ? (pagetype & P_BRANCH) + 1 + : (pagetype & P_BRANCH) ? 2 : 1; + + /* The threshold of minimum page fill factor, in form of a negative binary + * exponent, i.e. 2 means 1/(2**3) == 1/4 == 25%. Pages emptier than this + * are candidates for merging. */ + const unsigned threshold_fill_exp2 = 2; + + /* The threshold of minimum page fill factor, as a number of free bytes on a + * page. Pages emptier than this are candidates for merging. */ + const unsigned spaceleft_threshold = + page_space(mc->mc_txn->mt_env) - + (page_space(mc->mc_txn->mt_env) >> threshold_fill_exp2); + mdbx_debug("rebalancing %s page %" PRIaPGNO " (has %u keys, %.1f%% full)", (pagetype & P_LEAF) ? 
"leaf" : "branch", - mc->mc_pg[mc->mc_top]->mp_pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), - PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10.24); + mc->mc_pg[mc->mc_top]->mp_pgno, + page_numkeys(mc->mc_pg[mc->mc_top]), + page_fill(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top])); - if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && - NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { + if (page_fill_enough(mc->mc_pg[mc->mc_top], spaceleft_threshold, minkeys)) { mdbx_debug("no need to rebalance page %" PRIaPGNO ", above fill threshold", mc->mc_pg[mc->mc_top]->mp_pgno); mdbx_cassert(mc, mc->mc_db->md_entries > 0); @@ -10505,27 +12682,21 @@ static int mdbx_rebalance(MDBX_cursor *mc) { if (mc->mc_snum < 2) { MDBX_page *const mp = mc->mc_pg[0]; - const unsigned nkeys = NUMKEYS(mp); + const unsigned nkeys = page_numkeys(mp); mdbx_cassert(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); if (IS_SUBP(mp)) { - mdbx_debug("Can't rebalance a subpage, ignoring"); + mdbx_debug("%s", "Can't rebalance a subpage, ignoring"); mdbx_cassert(mc, pagetype & P_LEAF); return MDBX_SUCCESS; } if (nkeys == 0) { mdbx_cassert(mc, IS_LEAF(mp)); - mdbx_debug("tree is completely empty"); + mdbx_debug("%s", "tree is completely empty"); mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; mdbx_cassert(mc, mc->mc_db->md_branch_pages == 0 && mc->mc_db->md_overflow_pages == 0 && mc->mc_db->md_leaf_pages == 1); - mc->mc_db->md_leaf_pages = 0; - if (mc->mc_flags & C_SUB) - mdbx_outer_db(mc)->md_leaf_pages -= 1; - rc = mdbx_pnl_append(&mc->mc_txn->mt_befree_pages, mp->mp_pgno); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; /* Adjust cursors pointing to mp */ const MDBX_dbi dbi = mc->mc_dbi; for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; @@ -10543,19 +12714,17 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mc->mc_snum = 0; mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; - } else if (IS_BRANCH(mp) && nkeys == 1) { - mdbx_debug("collapsing root page!"); - rc = 
mdbx_pnl_append(&mc->mc_txn->mt_befree_pages, mp->mp_pgno); - if (unlikely(rc)) + + rc = mdbx_page_retire(mc, mp); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); + } else if (IS_BRANCH(mp) && nkeys == 1) { + mdbx_debug("%s", "collapsing root page!"); + mc->mc_db->md_root = node_pgno(page_node(mp, 0)); rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_db->md_depth--; - mc->mc_db->md_branch_pages--; - if (mc->mc_flags & C_SUB) - mdbx_outer_db(mc)->md_branch_pages -= 1; mc->mc_ki[0] = mc->mc_ki[1]; for (int i = 1; i < mc->mc_db->md_depth; i++) { mc->mc_pg[i] = mc->mc_pg[i + 1]; @@ -10565,7 +12734,6 @@ static int mdbx_rebalance(MDBX_cursor *mc) { /* Adjust other cursors pointing to mp */ MDBX_cursor *m2, *m3; MDBX_dbi dbi = mc->mc_dbi; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) @@ -10579,11 +12747,14 @@ static int mdbx_rebalance(MDBX_cursor *mc) { m3->mc_top--; } } - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype); mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + + rc = mdbx_page_retire(mc, mp); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } else { mdbx_debug("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", @@ -10597,7 +12768,7 @@ static int mdbx_rebalance(MDBX_cursor *mc) { const unsigned pre_top = mc->mc_top - 1; mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[pre_top])); mdbx_cassert(mc, !IS_SUBP(mc->mc_pg[0])); - mdbx_cassert(mc, NUMKEYS(mc->mc_pg[pre_top]) > 1); + mdbx_cassert(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); /* Leaf page fill factor is below the threshold. 
* Try to move keys from left or right neighbor, or @@ -10608,82 +12779,258 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mdbx_cursor_copy(mc, &mn); mn.mc_xcursor = NULL; - indx_t oldki = mc->mc_ki[mc->mc_top]; - bool fromleft; - if (mc->mc_ki[pre_top] == 0) { - /* We're the leftmost leaf in our parent. */ - mdbx_debug("reading right neighbor"); - mn.mc_ki[pre_top]++; - node = NODEPTR(mc->mc_pg[pre_top], mn.mc_ki[pre_top]); - rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + MDBX_page *left = nullptr, *right = nullptr; + if (mn.mc_ki[pre_top] > 0) { + rc = mdbx_page_get( + &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)), + &left, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, PAGETYPE(mn.mc_pg[mn.mc_top]) == - PAGETYPE(mc->mc_pg[mc->mc_top])); - mn.mc_ki[mn.mc_top] = 0; - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); - fromleft = false; - } else { - /* There is at least one neighbor to the left. */ - mdbx_debug("reading left neighbor"); - mn.mc_ki[pre_top]--; - node = NODEPTR(mc->mc_pg[pre_top], mn.mc_ki[pre_top]); - rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + mdbx_cassert(mc, PAGETYPE(left) == PAGETYPE(mc->mc_pg[mc->mc_top])); + } + if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { + rc = mdbx_page_get( + &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)), + &right, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, PAGETYPE(mn.mc_pg[mn.mc_top]) == - PAGETYPE(mc->mc_pg[mc->mc_top])); - mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; + mdbx_cassert(mc, PAGETYPE(right) == PAGETYPE(mc->mc_pg[mc->mc_top])); + } + + const indx_t ki_top = mc->mc_ki[mc->mc_top]; + const indx_t ki_pre_top = mn.mc_ki[pre_top]; + const indx_t nkeys = (indx_t)page_numkeys(mn.mc_pg[mn.mc_top]); + if (left && page_room(left) > spaceleft_threshold && + (!right || page_room(right) < page_room(left))) { + /* try merge with left */ + 
mdbx_cassert(mc, page_numkeys(left) >= minkeys); + mn.mc_pg[mn.mc_top] = left; + mn.mc_ki[mn.mc_top - 1] = ki_pre_top - 1; + mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1); mc->mc_ki[mc->mc_top] = 0; - fromleft = true; + const indx_t new_ki = (indx_t)(ki_top + page_numkeys(left)); + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); + if (likely(rc != MDBX_RESULT_TRUE)) { + mdbx_cursor_copy(&mn, mc); + mc->mc_ki[mc->mc_top] = new_ki; + mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + return rc; + } + } + if (right && page_room(right) > spaceleft_threshold) { + /* try merge with right */ + mdbx_cassert(mc, page_numkeys(right) >= minkeys); + mn.mc_pg[mn.mc_top] = right; + mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = nkeys; + rc = mdbx_page_merge(&mn, mc); + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->mc_ki[mc->mc_top] = ki_top; + mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + return rc; + } + } + if (left && page_numkeys(left) > minkeys && + (!right || page_numkeys(right) <= minkeys || + page_room(right) > page_room(left))) { + /* try move from left */ + mn.mc_pg[mn.mc_top] = left; + mn.mc_ki[mn.mc_top - 1] = ki_pre_top - 1; + mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1); + mc->mc_ki[mc->mc_top] = 0; + rc = mdbx_node_move(&mn, mc, true); + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->mc_ki[mc->mc_top] = ki_top + 1; + mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + return rc; + } + } + if (right && page_numkeys(right) > minkeys) { + /* try move from right */ + mn.mc_pg[mn.mc_top] = right; + mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = nkeys; + rc = mdbx_node_move(&mn, mc, false); + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->mc_ki[mc->mc_top] = ki_top; + 
mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + return rc; + } } - mdbx_debug("found neighbor page %" PRIaPGNO " (%u keys, %.1f%% full)", - mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), - PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10.24); + if (nkeys >= minkeys) + return MDBX_SUCCESS; - /* If the neighbor page is above threshold and has enough keys, - * move one key from it. Otherwise we should try to merge them. - * (A branch page must never have less than 2 keys.) */ - if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && - NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { - rc = mdbx_node_move(&mn, mc, fromleft); - if (unlikely(rc != MDBX_SUCCESS)) + if (left && (!right || page_room(left) > page_room(right))) { + /* try merge with left */ + mdbx_cassert(mc, page_numkeys(left) >= minkeys); + mn.mc_pg[mn.mc_top] = left; + mn.mc_ki[mn.mc_top - 1] = ki_pre_top - 1; + mn.mc_ki[mn.mc_top] = (indx_t)(page_numkeys(left) - 1); + mc->mc_ki[mc->mc_top] = 0; + const indx_t new_ki = (indx_t)(ki_top + page_numkeys(left)); + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); + if (likely(rc != MDBX_RESULT_TRUE)) { + mdbx_cursor_copy(&mn, mc); + mc->mc_ki[mc->mc_top] = new_ki; + mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; - oldki += fromleft /* if we inserted on left, bump position up */; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || - PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype); - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - } else { - if (!fromleft) { - rc = mdbx_page_merge(&mn, mc); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || - PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype); - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || - 
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + } + } else if (likely(right)) { + /* try merge with right */ + mdbx_cassert(mc, page_numkeys(right) >= minkeys); + mn.mc_pg[mn.mc_top] = right; + mn.mc_ki[mn.mc_top - 1] = ki_pre_top + 1; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = nkeys; + rc = mdbx_page_merge(&mn, mc); + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->mc_ki[mc->mc_top] = ki_top; + mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + return rc; + } + } + return MDBX_PROBLEM; +} + +static __cold int mdbx_page_check(MDBX_env *env, const MDBX_page *const mp, + bool maybe_unfinished) { + const unsigned nkeys = page_numkeys(mp); + char *const end_of_page = (char *)mp + env->me_psize; + mdbx_assert(env, mp->mp_pgno >= MIN_PAGENO && mp->mp_pgno <= MAX_PAGENO); + if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) + return MDBX_CORRUPTED; + if (IS_OVERFLOW(mp)) { + mdbx_assert(env, mp->mp_pages >= 1 && mp->mp_pages < MAX_PAGENO / 2); + if (unlikely(mp->mp_pages < 1 && mp->mp_pages >= MAX_PAGENO / 2)) + return MDBX_CORRUPTED; + mdbx_assert(env, mp->mp_pgno <= MAX_PAGENO - mp->mp_pages); + if (unlikely(mp->mp_pgno > MAX_PAGENO - mp->mp_pages)) + return MDBX_CORRUPTED; + return MDBX_SUCCESS; + } + if (!(IS_DIRTY(mp) && maybe_unfinished)) { + mdbx_assert(env, nkeys >= 2 || !IS_BRANCH(mp)); + if (unlikely(nkeys < 2 && IS_BRANCH(mp))) + return MDBX_CORRUPTED; + } + + for (unsigned i = IS_LEAF(mp) ? 
0 : 1; i < nkeys; ++i) { + if (IS_LEAF2(mp)) { + const size_t ksize = mp->mp_leaf2_ksize; + const char *const key = page_leaf2key(mp, i, ksize); + mdbx_assert(env, key + ksize <= end_of_page); + if (unlikely(end_of_page < key + ksize)) + return MDBX_CORRUPTED; } else { - oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); - mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mdbx_cursor_copy(&mn, mc); - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || - PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype); - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + const MDBX_node *const node = page_node(mp, i); + const char *node_end = (char *)node + NODESIZE; + mdbx_assert(env, node_end <= end_of_page); + if (unlikely(node_end > end_of_page)) + return MDBX_CORRUPTED; + if (IS_LEAF(mp) || i > 0) { + size_t ksize = node_ks(node); + char *key = node_key(node); + mdbx_assert(env, key + ksize <= end_of_page); + if (unlikely(end_of_page < key + ksize)) + return MDBX_CORRUPTED; + } + if (IS_BRANCH(mp)) + continue; + if (node_flags(node) == F_BIGDATA /* data on large-page */) { + continue; + } + const size_t dsize = node_ds(node); + const char *const data = node_data(node); + mdbx_assert(env, data + dsize <= end_of_page); + if (unlikely(end_of_page < data + dsize)) + return MDBX_CORRUPTED; + + switch (node_flags(node)) { + default: + mdbx_assert(env, false); + return MDBX_CORRUPTED; + case 0 /* usual */: + break; + case F_SUBDATA /* sub-db */: + mdbx_assert(env, dsize >= sizeof(MDBX_db)); + if (unlikely(dsize < sizeof(MDBX_db))) + return MDBX_CORRUPTED; + break; + case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: + mdbx_assert(env, dsize == sizeof(MDBX_db)); + if (unlikely(dsize != sizeof(MDBX_db))) + return MDBX_CORRUPTED; + break; + case F_DUPDATA /* short sub-page */: + 
mdbx_assert(env, dsize > PAGEHDRSZ); + if (unlikely(dsize <= PAGEHDRSZ)) + return MDBX_CORRUPTED; + else { + const MDBX_page *const sp = (MDBX_page *)data; + const char *const end_of_subpage = data + dsize; + const int nsubkeys = page_numkeys(sp); + switch (sp->mp_flags & ~P_DIRTY /* ignore for sub-pages */) { + case P_LEAF | P_SUBP: + case P_LEAF | P_LEAF2 | P_SUBP: + break; + default: + mdbx_assert(env, false); + return MDBX_CORRUPTED; + } + + for (int j = 0; j < nsubkeys; j++) { + if (IS_LEAF2(sp)) { + /* LEAF2 pages have no mp_ptrs[] or node headers */ + size_t sub_ksize = sp->mp_leaf2_ksize; + char *sub_key = page_leaf2key(sp, j, sub_ksize); + mdbx_assert(env, sub_key + sub_ksize <= end_of_subpage); + if (unlikely(end_of_subpage < sub_key + sub_ksize)) + return MDBX_CORRUPTED; + } else { + mdbx_assert(env, IS_LEAF(sp)); + if (unlikely(!IS_LEAF(sp))) + return MDBX_CORRUPTED; + const MDBX_node *const sub_node = page_node(sp, j); + const char *sub_node_end = (char *)sub_node + NODESIZE; + mdbx_assert(env, sub_node_end <= end_of_subpage); + if (unlikely(sub_node_end > end_of_subpage)) + return MDBX_CORRUPTED; + mdbx_assert(env, node_flags(sub_node) == 0); + if (unlikely(node_flags(sub_node) != 0)) + return MDBX_CORRUPTED; + + size_t sub_ksize = node_ks(sub_node); + char *sub_key = node_key(sub_node); + size_t sub_dsize = node_ds(sub_node); + char *sub_data = node_data(sub_node); + mdbx_assert(env, sub_key + sub_ksize <= end_of_subpage); + if (unlikely(end_of_subpage < sub_key + sub_ksize)) + return MDBX_CORRUPTED; + mdbx_assert(env, sub_data + sub_dsize <= end_of_subpage); + if (unlikely(end_of_subpage < sub_data + sub_dsize)) + return MDBX_CORRUPTED; + } + } + } + break; + } } - mc->mc_flags &= ~C_EOF; } - mc->mc_ki[mc->mc_top] = oldki; return MDBX_SUCCESS; } static __cold int mdbx_cursor_check(MDBX_cursor *mc, bool pending) { + mdbx_tassert(mc->mc_txn, mc->mc_txn->mt_parent || + mc->mc_txn->tw.dirtyroom + + mc->mc_txn->tw.dirtylist->length == + 
MDBX_DPL_TXNFULL); mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1); if (unlikely(mc->mc_top != mc->mc_snum - 1)) return MDBX_CURSOR_FULL; @@ -10695,7 +13042,7 @@ static __cold int mdbx_cursor_check(MDBX_cursor *mc, bool pending) { for (int n = 0; n < mc->mc_snum; ++n) { MDBX_page *mp = mc->mc_pg[n]; - const unsigned numkeys = NUMKEYS(mp); + const unsigned nkeys = page_numkeys(mp); const bool expect_branch = (n < mc->mc_db->md_depth - 1) ? true : false; const bool expect_nested_leaf = (n + 1 == mc->mc_db->md_depth - 1) ? true : false; @@ -10704,26 +13051,30 @@ static __cold int mdbx_cursor_check(MDBX_cursor *mc, bool pending) { if (unlikely(branch != expect_branch)) return MDBX_CURSOR_FULL; if (!pending) { - mdbx_cassert(mc, numkeys > mc->mc_ki[n] || - (!branch && numkeys == mc->mc_ki[n] && - (mc->mc_flags & C_EOF) != 0)); - if (unlikely(numkeys <= mc->mc_ki[n] && - !(!branch && numkeys == mc->mc_ki[n] && + mdbx_cassert(mc, + nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && + (mc->mc_flags & C_EOF) != 0)); + if (unlikely(nkeys <= mc->mc_ki[n] && + !(!branch && nkeys == mc->mc_ki[n] && (mc->mc_flags & C_EOF) != 0))) return MDBX_CURSOR_FULL; } else { - mdbx_cassert(mc, numkeys + 1 >= mc->mc_ki[n]); - if (unlikely(numkeys + 1 < mc->mc_ki[n])) + mdbx_cassert(mc, nkeys + 1 >= mc->mc_ki[n]); + if (unlikely(nkeys + 1 < mc->mc_ki[n])) return MDBX_CURSOR_FULL; } - for (unsigned i = 0; i < numkeys; ++i) { - MDBX_node *node = NODEPTR(mp, i); + int err = mdbx_page_check(mc->mc_txn->mt_env, mp, pending); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + for (unsigned i = 0; i < nkeys; ++i) { if (branch) { - mdbx_cassert(mc, node->mn_flags == 0); - if (unlikely(node->mn_flags != 0)) + MDBX_node *node = page_node(mp, i); + mdbx_cassert(mc, node_flags(node) == 0); + if (unlikely(node_flags(node) != 0)) return MDBX_CURSOR_FULL; - pgno_t pgno = NODEPGNO(node); + pgno_t pgno = node_pgno(node); MDBX_page *np; int rc = mdbx_page_get(mc, pgno, &np, NULL); 
mdbx_cassert(mc, rc == MDBX_SUCCESS); @@ -10733,6 +13084,9 @@ static __cold int mdbx_cursor_check(MDBX_cursor *mc, bool pending) { mdbx_cassert(mc, nested_leaf == expect_nested_leaf); if (unlikely(nested_leaf != expect_nested_leaf)) return MDBX_CURSOR_FULL; + err = mdbx_page_check(mc->mc_txn->mt_env, np, pending); + if (unlikely(err != MDBX_SUCCESS)) + return err; } } } @@ -10793,7 +13147,7 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - nkeys = NUMKEYS(mp); + nkeys = page_numkeys(mp); mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && nkeys == 0)); @@ -10820,15 +13174,15 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && (m3->mc_flags & C_EOF) == 0) { MDBX_node *node = - NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); /* If this node has dupdata, it may need to be reinited * because its data has moved. * If the xcursor was not initd it must be reinited. * Else if node points to a subDB, nothing is needed. 
*/ - if (node->mn_flags & F_DUPDATA) { + if (node_flags(node) & F_DUPDATA) { if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node->mn_flags & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + if (!(node_flags(node) & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); } else { rc = mdbx_xcursor_init1(m3, node); if (unlikely(rc != MDBX_SUCCESS)) @@ -10850,15 +13204,15 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { } if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && (mc->mc_flags & C_EOF) == 0) { - MDBX_node *node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); /* If this node has dupdata, it may need to be reinited * because its data has moved. * If the xcursor was not initd it must be reinited. * Else if node points to a subDB, nothing is needed. */ - if (node->mn_flags & F_DUPDATA) { + if (node_flags(node) & F_DUPDATA) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node->mn_flags & F_SUBDATA)) - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + if (!(node_flags(node) & F_SUBDATA)) + mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); } else { rc = mdbx_xcursor_init1(mc, node); if (likely(rc != MDBX_SUCCESS)) @@ -10878,20 +13232,18 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { } int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { - if (unlikely(!key || !txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + if (unlikely(!key)) + return MDBX_EINVAL; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS : MDBX_BAD_TXN; + if (unlikely(txn->mt_flags & (MDBX_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; return mdbx_del0(txn, dbi, key, data, 0); } @@ -10953,7 +13305,6 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, pgno_t pgno = 0; unsigned i, ptop; MDBX_env *env = mc->mc_txn->mt_env; - MDBX_node *node; MDBX_val sepkey, rkey, xdata; MDBX_page *copy = NULL; MDBX_page *rp, *pp; @@ -10962,13 +13313,15 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, MDBX_page *mp = mc->mc_pg[mc->mc_top]; unsigned newindx = mc->mc_ki[mc->mc_top]; - unsigned nkeys = NUMKEYS(mp); + unsigned nkeys = page_numkeys(mp); if (mdbx_audit_enabled()) { - int err = mdbx_cursor_check(mc, true); - if (unlikely(err != MDBX_SUCCESS)) - return err; + rc = mdbx_cursor_check(mc, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } + mdbx_cassert(mc, + nkeys >= (unsigned)(IS_BRANCH(mp) ? MDBX_MINKEYS * 2 - 1 : 1)); mdbx_debug("-----> splitting %s page %" PRIaPGNO " and adding [%s] at index %i/%i", IS_LEAF(mp) ? 
"leaf" : "branch", mp->mp_pgno, DKEY(newkey), @@ -11033,12 +13386,11 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, split_indx = (nkeys + 1) / 2; if (IS_LEAF2(rp)) { char *split, *ins; - int x; unsigned lsize, rsize, ksize; /* Move half of the keys to the right sibling */ - x = mc->mc_ki[mc->mc_top] - split_indx; + const int x = mc->mc_ki[mc->mc_top] - split_indx; ksize = mc->mc_db->md_xsize; - split = LEAF2KEY(mp, split_indx, ksize); + split = page_leaf2key(mp, split_indx, ksize); rsize = (nkeys - split_indx) * ksize; lsize = (nkeys - split_indx) * sizeof(indx_t); mdbx_cassert(mc, mp->mp_lower >= lsize); @@ -11057,7 +13409,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, } if (x < 0) { mdbx_cassert(mc, ksize >= sizeof(indx_t)); - ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); + ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); memcpy(rp->mp_ptrs, split, rsize); sepkey.iov_base = rp->mp_ptrs; memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); @@ -11069,22 +13421,21 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, } else { if (x) memcpy(rp->mp_ptrs, split, x * ksize); - ins = LEAF2KEY(rp, x, ksize); + ins = page_leaf2key(rp, x, ksize); memcpy(ins, newkey->iov_base, ksize); memcpy(ins + ksize, split + x * ksize, rsize - x * ksize); mdbx_cassert(mc, UINT16_MAX - rp->mp_lower >= (int)sizeof(indx_t)); rp->mp_lower += sizeof(indx_t); mdbx_cassert(mc, rp->mp_upper >= ksize - sizeof(indx_t)); rp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); - mdbx_cassert(mc, x <= UINT16_MAX); + mdbx_cassert(mc, x <= (int)UINT16_MAX); mc->mc_ki[mc->mc_top] = (indx_t)x; } } else { - size_t psize, nsize, k; /* Maximum free space in an empty page */ - const unsigned pmax = env->me_psize - PAGEHDRSZ; - nsize = IS_LEAF(mp) ? mdbx_leaf_size(env, newkey, newdata) - : mdbx_branch_size(env, newkey); + const unsigned pmax = page_space(env); + const size_t nsize = IS_LEAF(mp) ? 
leaf_size(env, newkey, newdata) + : branch_size(env, newkey); /* grab a page to hold a temporary copy */ copy = mdbx_page_malloc(mc->mc_txn, 1); @@ -11095,8 +13446,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, copy->mp_pgno = mp->mp_pgno; copy->mp_flags = mp->mp_flags; copy->mp_lower = 0; - mdbx_cassert(mc, env->me_psize - PAGEHDRSZ <= UINT16_MAX); - copy->mp_upper = (indx_t)(env->me_psize - PAGEHDRSZ); + copy->mp_upper = (indx_t)page_space(env); /* prepare to insert */ for (unsigned j = i = 0; i < nkeys; i++) { @@ -11118,66 +13468,67 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, * As a final tweak, if the new item goes on the last * spot on the page (and thus, onto the new page), bias * the split so the new page is emptier than the old page. - * This yields better packing during sequential inserts. - */ - int dir; + * This yields better packing during sequential inserts. */ if (nkeys < 32 || nsize > pmax / 16 || newindx >= nkeys) { /* Find split point */ - psize = 0; + int dir; + size_t psize = 0; + unsigned k; if (newindx <= split_indx || newindx >= nkeys) { i = 0; dir = 1; k = (newindx >= nkeys) ? nkeys : split_indx + 1 + IS_LEAF(mp); + split_indx = k - 1; } else { i = nkeys; dir = -1; k = split_indx - 1; + split_indx += 1; } - for (; i != k; i += dir) { + do { if (i == newindx) { psize += nsize; - node = NULL; } else { - node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); - psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - psize += sizeof(pgno_t); - else - psize += NODEDSZ(node); - } + MDBX_node *node = + (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); + psize += NODESIZE + node_ks(node) + sizeof(indx_t); + if (IS_LEAF(mp)) + psize += F_ISSET(node_flags(node), F_BIGDATA) ? 
sizeof(pgno_t) + : node_ds(node); psize = EVEN(psize); } - if (psize > pmax || i == k - dir) { + if (psize > pmax) { split_indx = i + (dir < 0); break; } - } + i += dir; + } while (i != k); } + if (split_indx == newindx) { sepkey.iov_len = newkey->iov_len; sepkey.iov_base = newkey->iov_base; } else { - node = + MDBX_node *node = (MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEHDRSZ); - sepkey.iov_len = node->mn_ksize; - sepkey.iov_base = NODEKEY(node); + sepkey.iov_len = node_ks(node); + sepkey.iov_base = node_key(node); } } } mdbx_debug("separator is %d [%s]", split_indx, DKEY(&sepkey)); if (mdbx_audit_enabled()) { - int err = mdbx_cursor_check(mc, true); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = mdbx_cursor_check(&mn, true); - if (unlikely(err != MDBX_SUCCESS)) - return err; + rc = mdbx_cursor_check(mc, true); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + rc = mdbx_cursor_check(&mn, true); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; } /* Copy separator key to the parent. */ - if (SIZELEFT(mn.mc_pg[ptop]) < mdbx_branch_size(env, &sepkey)) { + if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) { const int snum = mc->mc_snum; const int depth = mc->mc_db->md_depth; mn.mc_snum--; @@ -11190,9 +13541,9 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, goto done; mdbx_cassert(mc, mc->mc_snum - snum == mc->mc_db->md_depth - depth); if (mdbx_audit_enabled()) { - int err = mdbx_cursor_check(mc, true); - if (unlikely(err != MDBX_SUCCESS)) - return err; + rc = mdbx_cursor_check(mc, true); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; } /* root split? */ @@ -11201,7 +13552,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, /* Right page might now have changed parent. * Check if left page also changed parent. 
*/ if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { for (i = 0; i < ptop; i++) { mc->mc_pg[i] = mn.mc_pg[i]; mc->mc_ki[i] = mn.mc_ki[i]; @@ -11271,16 +13622,17 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, /* Update index for the new key. */ mc->mc_ki[mc->mc_top] = n; } else { - node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); - rkey.iov_base = NODEKEY(node); - rkey.iov_len = node->mn_ksize; + MDBX_node *node = + (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); + rkey.iov_base = node_key(node); + rkey.iov_len = node_ks(node); if (IS_LEAF(mp)) { - xdata.iov_base = NODEDATA(node); - xdata.iov_len = NODEDSZ(node); + xdata.iov_base = node_data(node); + xdata.iov_len = node_ds(node); rdata = &xdata; } else - pgno = NODEPGNO(node); - flags = node->mn_flags; + pgno = node_pgno(node); + flags = node_flags(node); } switch (PAGETYPE(rp)) { @@ -11318,12 +13670,12 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, } } while (i != split_indx); - nkeys = NUMKEYS(copy); + nkeys = page_numkeys(copy); for (i = 0; i < nkeys; i++) mp->mp_ptrs[i] = copy->mp_ptrs[i]; mp->mp_lower = copy->mp_lower; mp->mp_upper = copy->mp_upper; - memcpy(NODEPTR(mp, nkeys - 1), NODEPTR(copy, nkeys - 1), + memcpy(page_node(mp, nkeys - 1), page_node(copy, nkeys - 1), env->me_psize - copy->mp_upper - PAGEHDRSZ); /* reset back to original page */ @@ -11334,7 +13686,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, mc->mc_ki[ptop]++; /* Make sure mc_ki is still valid. 
*/ if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { for (i = 0; i <= ptop; i++) { mc->mc_pg[i] = mn.mc_pg[i]; mc->mc_ki[i] = mn.mc_ki[i]; @@ -11342,9 +13694,9 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, } } if (nflags & MDBX_RESERVE) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->iov_base = NODEDATA(node); + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!(node_flags(node) & F_BIGDATA)) + newdata->iov_base = node_data(node); } } else { if (newindx >= split_indx) { @@ -11352,7 +13704,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, mc->mc_ki[ptop]++; /* Make sure mc_ki is still valid. */ if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { for (i = 0; i <= ptop; i++) { mc->mc_pg[i] = mn.mc_pg[i]; mc->mc_ki[i] = mn.mc_ki[i]; @@ -11365,7 +13717,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, /* Adjust other cursors pointing to mp */ MDBX_cursor *m2, *m3; MDBX_dbi dbi = mc->mc_dbi; - nkeys = NUMKEYS(mp); + nkeys = page_numkeys(mp); for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; @@ -11409,7 +13761,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } } - mdbx_debug("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)); + mdbx_debug("mp left: %d, rp left: %d", page_room(mp), page_room(rp)); done: if (copy) /* tmp page */ @@ -11421,16 +13773,13 @@ done: int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(!key || !data || !txn)) + if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -11438,11 +13787,11 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; + if (unlikely(txn->mt_flags & (MDBX_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_RDONLY) ? 
MDBX_EACCESS : MDBX_BAD_TXN; MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = mdbx_cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; cx.outer.mc_next = txn->mt_cursors[dbi]; @@ -11454,9 +13803,9 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (likely(rc == MDBX_SUCCESS) && (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT)) { /* LY: allows update (explicit overwrite) only for unique keys */ - MDBX_node *leaf = NODEPTR(cx.outer.mc_pg[cx.outer.mc_top], - cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], + cx.outer.mc_ki[cx.outer.mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) && cx.outer.mc_xcursor->mx_db.md_entries > 1); rc = MDBX_EMULTIVAL; @@ -11471,6 +13820,8 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, return rc; } +/**** COPYING *****************************************************************/ + #ifndef MDBX_WBUF #define MDBX_WBUF ((size_t)1024 * 1024) #endif @@ -11500,7 +13851,6 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { uint8_t *ptr; int toggle = 0; int rc; - size_t offset = pgno2bytes(my->mc_env, NUM_METAS); mdbx_condmutex_lock(&my->mc_condmutex); while (!my->mc_error) { @@ -11512,12 +13862,11 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { ptr = my->mc_wbuf[toggle]; again: if (wsize > 0 && !my->mc_error) { - rc = mdbx_pwrite(my->mc_fd, ptr, wsize, offset); + rc = mdbx_write(my->mc_fd, ptr, wsize); if (rc != MDBX_SUCCESS) { my->mc_error = rc; break; } - offset += wsize; } /* If there's an overflow page tail, write it too */ @@ -11561,7 +13910,6 @@ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. 
*/ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { MDBX_cursor mc; - MDBX_node *ni; MDBX_page *mo, *mp, *leaf; char *buf, *ptr; int rc, toggle; @@ -11600,13 +13948,13 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { while (mc.mc_snum > 0) { unsigned n; mp = mc.mc_pg[mc.mc_top]; - n = NUMKEYS(mp); + n = page_numkeys(mp); if (IS_LEAF(mp)) { if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { for (i = 0; i < n; i++) { - ni = NODEPTR(mp, i); - if (ni->mn_flags & F_BIGDATA) { + MDBX_node *node = page_node(mp, i); + if (node_flags(node) & F_BIGDATA) { MDBX_page *omp; /* Need writable leaf */ @@ -11614,12 +13962,11 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { mc.mc_pg[mc.mc_top] = leaf; mdbx_page_copy(leaf, mp, my->mc_env->me_psize); mp = leaf; - ni = NODEPTR(mp, i); + node = page_node(mp, i); } - pgno_t pgno; - memcpy(&pgno, NODEDATA(ni), sizeof(pgno)); - memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); + const pgno_t pgno = node_largedata_pgno(node); + poke_pgno(node_data(node), my->mc_next_pgno); rc = mdbx_page_get(&mc, pgno, &omp, NULL); if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -11642,35 +13989,37 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { goto done; toggle = my->mc_toggle; } - } else if (ni->mn_flags & F_SUBDATA) { - MDBX_db db; + } else if (node_flags(node) & F_SUBDATA) { + if (node_ds(node) < sizeof(MDBX_db)) { + rc = MDBX_CORRUPTED; + goto done; + } /* Need writable leaf */ if (mp != leaf) { mc.mc_pg[mc.mc_top] = leaf; mdbx_page_copy(leaf, mp, my->mc_env->me_psize); mp = leaf; - ni = NODEPTR(mp, i); + node = page_node(mp, i); } - memcpy(&db, NODEDATA(ni), sizeof(db)); + MDBX_db db; + memcpy(&db, node_data(node), sizeof(MDBX_db)); my->mc_toggle = (short)toggle; - rc = mdbx_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); + rc = mdbx_env_cwalk(my, &db.md_root, node_flags(node) & F_DUPDATA); if (rc) goto done; toggle = my->mc_toggle; - 
memcpy(NODEDATA(ni), &db, sizeof(db)); + memcpy(node_data(node), &db, sizeof(MDBX_db)); } } } } else { mc.mc_ki[mc.mc_top]++; if (mc.mc_ki[mc.mc_top] < n) { - pgno_t pgno; again: - ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); - pgno = NODEPGNO(ni); - rc = mdbx_page_get(&mc, pgno, &mp, NULL); + rc = mdbx_page_get(&mc, node_pgno(page_node(mp, mc.mc_ki[mc.mc_top])), + &mp, NULL); if (unlikely(rc != MDBX_SUCCESS)) goto done; mc.mc_top++; @@ -11698,8 +14047,8 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { my->mc_wlen[toggle] += my->mc_env->me_psize; if (mc.mc_top) { /* Update parent if there is one */ - ni = NODEPTR(mc.mc_pg[mc.mc_top - 1], mc.mc_ki[mc.mc_top - 1]); - SETPGNO(ni, mo->mp_pgno); + node_set_pgno(page_node(mc.mc_pg[mc.mc_top - 1], mc.mc_ki[mc.mc_top - 1]), + mo->mp_pgno); mdbx_cursor_pop(&mc); } else { /* Otherwise we're done */ @@ -11712,14 +14061,45 @@ done: return rc; } +static __cold void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) { + /* Calculate filesize taking in account shrink/growing thresholds */ + if (meta->mm_geo.next > meta->mm_geo.now) { + const pgno_t aligned = pgno_align2os_pgno( + env, + pgno_add(meta->mm_geo.next, + meta->mm_geo.grow - meta->mm_geo.next % meta->mm_geo.grow)); + meta->mm_geo.now = aligned; + } else if (meta->mm_geo.next < meta->mm_geo.now) { + meta->mm_geo.now = meta->mm_geo.next; + const pgno_t aligner = + meta->mm_geo.grow ? meta->mm_geo.grow : meta->mm_geo.shrink; + const pgno_t aligned = pgno_align2os_pgno( + env, meta->mm_geo.next + aligner - meta->mm_geo.next % aligner); + meta->mm_geo.now = aligned; + } + + if (meta->mm_geo.now < meta->mm_geo.lower) + meta->mm_geo.now = meta->mm_geo.lower; + if (meta->mm_geo.now > meta->mm_geo.upper) + meta->mm_geo.now = meta->mm_geo.upper; + + /* Update signature */ + assert(meta->mm_geo.now >= meta->mm_geo.next); + meta->mm_datasync_sign = mdbx_meta_sign(meta); +} + /* Copy environment with compaction. 
*/ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer) { - MDBX_page *const meta = mdbx_init_metas(env, buffer); + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe) { + const size_t meta_bytes = pgno2bytes(env, NUM_METAS); + uint8_t *const data_buffer = + buffer + roundup_powerof2(meta_bytes, env->me_os_psize); + MDBX_meta *const meta = mdbx_init_metas(env, buffer); /* copy canary sequenses if present */ if (read_txn->mt_canary.v) { - meta->mp_meta.mm_canary = read_txn->mt_canary; - meta->mp_meta.mm_canary.v = mdbx_meta_txnid_stable(env, &meta->mp_meta); + meta->mm_canary = read_txn->mt_canary; + meta->mm_canary.v = mdbx_meta_txnid_stable(env, meta); } /* Set metapage 1 with current main DB */ @@ -11727,10 +14107,15 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, if ((new_root = root) == P_INVALID) { /* When the DB is empty, handle it specially to * fix any breakage like page leaks from ITS#8174. */ - meta->mp_meta.mm_dbs[MAIN_DBI].md_flags = - read_txn->mt_dbs[MAIN_DBI].md_flags; + meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags; + compact_fixup_meta(env, meta); + if (dest_is_pipe) { + int rc = mdbx_write(fd, buffer, meta_bytes); + if (rc != MDBX_SUCCESS) + return rc; + } } else { - /* Count free pages + freeDB pages. Subtract from last_pg + /* Count free pages + GC pages. Subtract from last_pg * to find the new last_pg, which also becomes the new root. 
*/ pgno_t freecount = 0; MDBX_cursor mc; @@ -11749,9 +14134,9 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, read_txn->mt_dbs[FREE_DBI].md_overflow_pages; new_root = read_txn->mt_next_pgno - 1 - freecount; - meta->mp_meta.mm_geo.next = new_root + 1; - meta->mp_meta.mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI]; - meta->mp_meta.mm_dbs[MAIN_DBI].md_root = new_root; + meta->mm_geo.next = new_root + 1; + meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI]; + meta->mm_dbs[MAIN_DBI].md_root = new_root; mdbx_copy ctx; memset(&ctx, 0, sizeof(ctx)); @@ -11759,9 +14144,9 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, if (unlikely(rc != MDBX_SUCCESS)) return rc; - ctx.mc_wbuf[0] = buffer + pgno2bytes(env, NUM_METAS); - memset(ctx.mc_wbuf[0], 0, MDBX_WBUF * 2); - ctx.mc_wbuf[1] = ctx.mc_wbuf[0] + MDBX_WBUF; + memset(data_buffer, 0, MDBX_WBUF * 2); + ctx.mc_wbuf[0] = data_buffer; + ctx.mc_wbuf[1] = data_buffer + MDBX_WBUF; ctx.mc_next_pgno = NUM_METAS; ctx.mc_env = env; ctx.mc_fd = fd; @@ -11770,7 +14155,12 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, mdbx_thread_t thread; int thread_err = mdbx_thread_create(&thread, mdbx_env_copythr, &ctx); if (likely(thread_err == MDBX_SUCCESS)) { - rc = mdbx_env_cwalk(&ctx, &root, 0); + if (dest_is_pipe) { + compact_fixup_meta(env, meta); + rc = mdbx_write(fd, buffer, meta_bytes); + } + if (rc == MDBX_SUCCESS) + rc = mdbx_env_cwalk(&ctx, &root, 0); mdbx_env_cthr_toggle(&ctx, 1 | MDBX_EOF); thread_err = mdbx_thread_join(thread); mdbx_condmutex_destroy(&ctx.mc_condmutex); @@ -11782,59 +14172,58 @@ static int __cold mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, if (unlikely(ctx.mc_error != MDBX_SUCCESS)) return ctx.mc_error; - if (root > new_root) { - mdbx_error("post-compactification root %" PRIaPGNO - " GT expected %" PRIaPGNO " (source DB corrupted)", - root, new_root); - return MDBX_CORRUPTED; /* page leak or corrupt DB */ - } - if (root < new_root) { - 
mdbx_notice("post-compactification root %" PRIaPGNO - " LT expected %" PRIaPGNO " (page leak(s) in source DB)", - root, new_root); - /* fixup meta */ - meta->mp_meta.mm_dbs[MAIN_DBI].md_root = root; - meta->mp_meta.mm_geo.next = root + 1; + if (dest_is_pipe) { + if (root != new_root) { + mdbx_error("post-compactification root %" PRIaPGNO + " NE expected %" PRIaPGNO + " (source DB corrupted or has a page leak(s))", + root, new_root); + return MDBX_CORRUPTED; /* page leak or corrupt DB */ + } + } else { + if (root > new_root) { + mdbx_error("post-compactification root %" PRIaPGNO + " GT expected %" PRIaPGNO " (source DB corrupted)", + root, new_root); + return MDBX_CORRUPTED; /* page leak or corrupt DB */ + } + if (root < new_root) { + mdbx_notice("post-compactification root %" PRIaPGNO + " LT expected %" PRIaPGNO " (page leak(s) in source DB)", + root, new_root); + /* fixup meta */ + meta->mm_dbs[MAIN_DBI].md_root = root; + meta->mm_geo.next = root + 1; + } + compact_fixup_meta(env, meta); } } - /* Calculate filesize taking in account shrink/growing thresholds */ - if (meta->mp_meta.mm_geo.next > meta->mp_meta.mm_geo.now) { - const pgno_t aligned = - pgno_align2os_pgno(env, pgno_add(meta->mp_meta.mm_geo.next, - meta->mp_meta.mm_geo.grow - - meta->mp_meta.mm_geo.next % - meta->mp_meta.mm_geo.grow)); - meta->mp_meta.mm_geo.now = aligned; - } else if (meta->mp_meta.mm_geo.next < meta->mp_meta.mm_geo.now) { - meta->mp_meta.mm_geo.now = meta->mp_meta.mm_geo.next; - const pgno_t aligner = meta->mp_meta.mm_geo.grow - ? 
meta->mp_meta.mm_geo.grow - : meta->mp_meta.mm_geo.shrink; - const pgno_t aligned = - pgno_align2os_pgno(env, meta->mp_meta.mm_geo.next + aligner - - meta->mp_meta.mm_geo.next % aligner); - meta->mp_meta.mm_geo.now = aligned; - } - - if (meta->mp_meta.mm_geo.now < meta->mp_meta.mm_geo.lower) - meta->mp_meta.mm_geo.now = meta->mp_meta.mm_geo.lower; - if (meta->mp_meta.mm_geo.now > meta->mp_meta.mm_geo.upper) - meta->mp_meta.mm_geo.now = meta->mp_meta.mm_geo.upper; - - /* Update signature */ - assert(meta->mp_meta.mm_geo.now >= meta->mp_meta.mm_geo.next); - meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); - /* Extend file if required */ - return (meta->mp_meta.mm_geo.now != meta->mp_meta.mm_geo.next) - ? mdbx_ftruncate(fd, pgno2bytes(env, meta->mp_meta.mm_geo.now)) - : MDBX_SUCCESS; + if (meta->mm_geo.now != meta->mm_geo.next) { + const size_t whole_size = pgno2bytes(env, meta->mm_geo.now); + if (!dest_is_pipe) + return mdbx_ftruncate(fd, whole_size); + + const size_t used_size = pgno2bytes(env, meta->mm_geo.next); + memset(data_buffer, 0, MDBX_WBUF); + for (size_t offset = used_size; offset < whole_size;) { + const size_t chunk = + (MDBX_WBUF < whole_size - offset) ? MDBX_WBUF : whole_size - offset; + /* copy to avoit EFAULT in case swapped-out */ + int rc = mdbx_write(fd, data_buffer, chunk); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + offset += chunk; + } + } + return MDBX_SUCCESS; } /* Copy environment as-is. 
*/ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer) { + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe) { /* We must start the actual read txn after blocking writers */ int rc = mdbx_txn_end(read_txn, MDBX_END_RESET_TMP); if (unlikely(rc != MDBX_SUCCESS)) @@ -11863,35 +14252,67 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, mdbx_txn_unlock(env); /* Copy the data */ - const uint64_t whole_size = - mdbx_roundup2(pgno2bytes(env, read_txn->mt_end_pgno), env->me_os_psize); + const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno); const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno); mdbx_jitter4testing(false); + + if (dest_is_pipe) + rc = mdbx_write(fd, buffer, meta_bytes); + + uint8_t *const data_buffer = + buffer + roundup_powerof2(meta_bytes, env->me_os_psize); + for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) { + if (dest_is_pipe) { +#if defined(__linux__) || defined(__gnu_linux__) + off_t in_offset = offset; + const intptr_t written = + sendfile(fd, env->me_fd, &in_offset, used_size - offset); + if (unlikely(written <= 0)) { + rc = written ? errno : MDBX_ENODATA; + break; + } + offset = in_offset; + continue; +#endif + } else { #if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE) - for (off_t in_offset = meta_bytes; in_offset < (off_t)used_size;) { - off_t out_offset = in_offset; - ssize_t bytes_copied = copy_file_range( - env->me_fd, &in_offset, fd, &out_offset, used_size - in_offset, 0); - if (unlikely(bytes_copied <= 0)) { - rc = bytes_copied ? errno : MDBX_ENODATA; - break; + off_t in_offset = offset, out_offset = offset; + ssize_t bytes_copied = copy_file_range( + env->me_fd, &in_offset, fd, &out_offset, used_size - offset, 0); + if (unlikely(bytes_copied <= 0)) { + rc = bytes_copied ? 
errno : MDBX_ENODATA; + break; + } + offset = in_offset; + continue; +#endif } - } -#else - uint8_t *data_buffer = buffer + meta_bytes; - for (size_t offset = meta_bytes; offset < used_size;) { + + /* fallback to portable */ const size_t chunk = (MDBX_WBUF < used_size - offset) ? MDBX_WBUF : used_size - offset; + /* copy to avoit EFAULT in case swapped-out */ memcpy(data_buffer, env->me_map + offset, chunk); - rc = mdbx_pwrite(fd, data_buffer, chunk, offset); - if (unlikely(rc != MDBX_SUCCESS)) - break; + rc = mdbx_write(fd, data_buffer, chunk); offset += chunk; } -#endif - if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) - rc = mdbx_ftruncate(fd, whole_size); + /* Extend file if required */ + if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) { + if (!dest_is_pipe) + rc = mdbx_ftruncate(fd, whole_size); + else { + memset(data_buffer, 0, MDBX_WBUF); + for (size_t offset = used_size; + rc == MDBX_SUCCESS && offset < whole_size;) { + const size_t chunk = + (MDBX_WBUF < whole_size - offset) ? MDBX_WBUF : whole_size - offset; + /* copy to avoit EFAULT in case swapped-out */ + rc = mdbx_write(fd, data_buffer, chunk); + offset += chunk; + } + } + } return rc; } @@ -11904,49 +14325,61 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - int rc = mdbx_fseek(fd, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + const int dest_is_pipe = mdbx_is_pipe(fd); + if (MDBX_IS_ERROR(dest_is_pipe)) + return dest_is_pipe; + + if (!dest_is_pipe) { + int rc = mdbx_fseek(fd, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } const size_t buffer_size = - pgno2bytes(env, NUM_METAS) + - ((flags & MDBX_CP_COMPACT) ? MDBX_WBUF * 2 : MDBX_WBUF); + pgno_align2os_bytes(env, NUM_METAS) + + roundup_powerof2(((flags & MDBX_CP_COMPACT) ? 
MDBX_WBUF * 2 : MDBX_WBUF), + env->me_os_psize); uint8_t *buffer = NULL; - rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); + int rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_txn *read_txn = NULL; /* Do the lock/unlock of the reader mutex before starting the - * write txn. Otherwise other read txns could block writers. */ + * write txn. Otherwise other read txns could block writers. */ rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &read_txn); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_memalign_free(buffer); return rc; } - /* Firstly write a stub to meta-pages. - * Now we sure to incomplete copy will not be used. */ - memset(buffer, -1, pgno2bytes(env, NUM_METAS)); - rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + if (!dest_is_pipe) { + /* Firstly write a stub to meta-pages. + * Now we sure to incomplete copy will not be used. */ + memset(buffer, -1, pgno2bytes(env, NUM_METAS)); + rc = mdbx_write(fd, buffer, pgno2bytes(env, NUM_METAS)); + } + if (likely(rc == MDBX_SUCCESS)) { memset(buffer, 0, pgno2bytes(env, NUM_METAS)); rc = (flags & MDBX_CP_COMPACT) - ? mdbx_env_compact(env, read_txn, fd, buffer) - : mdbx_env_copy_asis(env, read_txn, fd, buffer); + ? 
mdbx_env_compact(env, read_txn, fd, buffer, dest_is_pipe) + : mdbx_env_copy_asis(env, read_txn, fd, buffer, dest_is_pipe); } mdbx_txn_abort(read_txn); - if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); + if (!dest_is_pipe) { + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); - /* Write actual meta */ - if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + /* Write actual meta */ + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); - if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_filesync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } mdbx_memalign_free(buffer); return rc; @@ -12005,6 +14438,8 @@ int __cold mdbx_env_copy(MDBX_env *env, const char *dest_path, unsigned flags) { return rc; } +/******************************************************************************/ + int __cold mdbx_env_set_flags(MDBX_env *env, unsigned flags, int onoff) { if (unlikely(!env)) return MDBX_EINVAL; @@ -12013,7 +14448,13 @@ int __cold mdbx_env_set_flags(MDBX_env *env, unsigned flags, int onoff) { return MDBX_EBADSIGN; if (unlikely(flags & ~CHANGEABLE)) - return MDBX_EINVAL; + return MDBX_EPERM; + + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + + if (unlikely(env->me_txn0->mt_owner == mdbx_thread_self())) + return MDBX_BUSY; int rc = mdbx_txn_lock(env, false); if (unlikely(rc)) @@ -12097,31 +14538,32 @@ int __cold mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *arg) { * [in] db the MDBX_db record containing the stats to return. * [out] arg the address of an MDBX_stat structure to receive the stats. * Returns 0, this function always succeeds. 
*/ -static int __cold mdbx_stat0(const MDBX_env *env, const MDBX_db *db, - MDBX_stat *arg) { - arg->ms_psize = env->me_psize; - arg->ms_depth = db->md_depth; - arg->ms_branch_pages = db->md_branch_pages; - arg->ms_leaf_pages = db->md_leaf_pages; - arg->ms_overflow_pages = db->md_overflow_pages; - arg->ms_entries = db->md_entries; - return MDBX_SUCCESS; -} - -int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *arg, size_t bytes) { - return mdbx_env_stat2(env, NULL, arg, bytes); -} - -int __cold mdbx_env_stat2(const MDBX_env *env, const MDBX_txn *txn, - MDBX_stat *arg, size_t bytes) { - if (unlikely((env == NULL && txn == NULL) || arg == NULL)) +static void mdbx_stat0(const MDBX_env *env, const MDBX_db *db, MDBX_stat *dest, + size_t bytes) { + dest->ms_psize = env->me_psize; + dest->ms_depth = db->md_depth; + dest->ms_branch_pages = db->md_branch_pages; + dest->ms_leaf_pages = db->md_leaf_pages; + dest->ms_overflow_pages = db->md_overflow_pages; + dest->ms_entries = db->md_entries; + if (likely(bytes >= + offsetof(MDBX_stat, ms_mod_txnid) + sizeof(dest->ms_mod_txnid))) + dest->ms_mod_txnid = db->md_mod_txnid; +} + +int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *dest, size_t bytes) { + return mdbx_env_stat_ex(env, NULL, dest, bytes); +} + +int __cold mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, + MDBX_stat *dest, size_t bytes) { + if (unlikely((env == NULL && txn == NULL) || dest == NULL)) return MDBX_EINVAL; if (txn) { - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int err = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(err != MDBX_SUCCESS)) + return err; } if (env) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) @@ -12130,28 +14572,39 @@ int __cold mdbx_env_stat2(const MDBX_env *env, const MDBX_txn *txn, return MDBX_EINVAL; } - if (unlikely(bytes != sizeof(MDBX_stat))) + const size_t size_before_modtxnid = 
offsetof(MDBX_stat, ms_mod_txnid); + if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) return MDBX_EINVAL; - const MDBX_db *db = - txn ? &txn->mt_dbs[MAIN_DBI] : &mdbx_meta_head(env)->mm_dbs[MAIN_DBI]; - return mdbx_stat0(txn ? txn->mt_env : env, db, arg); + if (txn) { + mdbx_stat0(txn->mt_env, &txn->mt_dbs[MAIN_DBI], dest, bytes); + return MDBX_SUCCESS; + } + + while (1) { + const MDBX_meta *const recent_meta = mdbx_meta_head(env); + const txnid_t txnid = mdbx_meta_txnid_fluid(env, recent_meta); + mdbx_stat0(env, &recent_meta->mm_dbs[MAIN_DBI], dest, bytes); + mdbx_compiler_barrier(); + if (likely(txnid == mdbx_meta_txnid_fluid(env, recent_meta) && + recent_meta == mdbx_meta_head(env))) + return MDBX_SUCCESS; + } } int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { - return mdbx_env_info2(env, NULL, arg, bytes); + return mdbx_env_info_ex(env, NULL, arg, bytes); } -int __cold mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, - MDBX_envinfo *arg, size_t bytes) { +int __cold mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *arg, size_t bytes) { if (unlikely((env == NULL && txn == NULL) || arg == NULL)) return MDBX_EINVAL; if (txn) { - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int err = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(err != MDBX_SUCCESS)) + return err; } if (env) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) @@ -12162,12 +14615,14 @@ int __cold mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, env = txn->mt_env; } - if (unlikely(bytes != sizeof(MDBX_envinfo))) + const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); + if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid) return MDBX_EINVAL; const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const 
MDBX_meta *const meta2 = METAPAGE(env, 2); + pgno_t unsynced_pages; while (1) { if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; @@ -12180,6 +14635,14 @@ int __cold mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, arg->mi_meta1_sign = meta1->mm_datasync_sign; arg->mi_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2); arg->mi_meta2_sign = meta2->mm_datasync_sign; + if (likely(bytes > size_before_bootid)) { + arg->mi_bootid.meta0.l = meta0->mm_bootid.x; + arg->mi_bootid.meta1.l = meta0->mm_bootid.x; + arg->mi_bootid.meta2.l = meta0->mm_bootid.x; + arg->mi_bootid.meta0.h = meta0->mm_bootid.y; + arg->mi_bootid.meta1.h = meta0->mm_bootid.y; + arg->mi_bootid.meta2.h = meta0->mm_bootid.y; + } const MDBX_meta *txn_meta = recent_meta; arg->mi_last_pgno = txn_meta->mm_geo.next - 1; @@ -12188,8 +14651,9 @@ int __cold mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, arg->mi_last_pgno = txn->mt_next_pgno - 1; arg->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno); - const txnid_t wanna_meta_txnid = - (txn->mt_flags & MDBX_RDONLY) ? txn->mt_txnid : txn->mt_txnid - 1; + const txnid_t wanna_meta_txnid = (txn->mt_flags & MDBX_RDONLY) + ? txn->mt_txnid + : txn->mt_txnid - MDBX_TXNID_STEP; txn_meta = (arg->mi_meta0_txnid == wanna_meta_txnid) ? meta0 : txn_meta; txn_meta = (arg->mi_meta1_txnid == wanna_meta_txnid) ? meta1 : txn_meta; txn_meta = (arg->mi_meta2_txnid == wanna_meta_txnid) ? 
meta2 : txn_meta; @@ -12198,8 +14662,10 @@ int __cold mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); arg->mi_geo.shrink = pgno2bytes(env, txn_meta->mm_geo.shrink); arg->mi_geo.grow = pgno2bytes(env, txn_meta->mm_geo.grow); + unsynced_pages = *env->me_unsynced_pages + + (*env->me_meta_sync_txnid != (uint32_t)arg->mi_last_pgno); - arg->mi_mapsize = env->me_mapsize; + arg->mi_mapsize = env->me_dxb_mmap.limit; mdbx_compiler_barrier(); if (likely(arg->mi_meta0_txnid == mdbx_meta_txnid_fluid(env, meta0) && arg->mi_meta0_sign == meta0->mm_datasync_sign && @@ -12217,15 +14683,32 @@ int __cold mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, arg->mi_dxb_pagesize = env->me_psize; arg->mi_sys_pagesize = env->me_os_psize; + const MDBX_lockinfo *const lck = env->me_lck; + if (likely(bytes > size_before_bootid)) { + arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); + const uint64_t monotime_now = mdbx_osal_monotime(); + arg->mi_since_sync_seconds16dot16 = + mdbx_osal_monotime_to_16dot16(monotime_now - *env->me_sync_timestamp); + arg->mi_since_reader_check_seconds16dot16 = + lck ? mdbx_osal_monotime_to_16dot16(monotime_now - + lck->mti_reader_check_timestamp) + : 0; + arg->mi_autosync_threshold = pgno2bytes(env, *env->me_autosync_threshold); + arg->mi_autosync_period_seconds16dot16 = + mdbx_osal_monotime_to_16dot16(*env->me_autosync_period); + arg->mi_bootid.current.l = bootid.x; + arg->mi_bootid.current.h = bootid.y; + arg->mi_mode = lck ? 
lck->mti_envmode : env->me_flags; + } + arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = 0; - if (env->me_lck) { - MDBX_reader *r = env->me_lck->mti_readers; + if (lck) { arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = arg->mi_recent_txnid; for (unsigned i = 0; i < arg->mi_numreaders; ++i) { - const mdbx_pid_t pid = r[i].mr_pid; + const uint32_t pid = lck->mti_readers[i].mr_pid; if (pid) { - const txnid_t txnid = r[i].mr_txnid; + const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid); if (arg->mi_latter_reader_txnid > txnid) arg->mi_latter_reader_txnid = txnid; if (pid == env->me_pid && arg->mi_self_latter_reader_txnid > txnid) @@ -12240,14 +14723,14 @@ int __cold mdbx_env_info2(const MDBX_env *env, const MDBX_txn *txn, static MDBX_cmp_func *mdbx_default_keycmp(unsigned flags) { return (flags & MDBX_REVERSEKEY) ? mdbx_cmp_memnr - : (flags & MDBX_INTEGERKEY) ? mdbx_cmp_int_a2 : mdbx_cmp_memn; + : (flags & MDBX_INTEGERKEY) ? mdbx_cmp_int_align2 : mdbx_cmp_memn; } static MDBX_cmp_func *mdbx_default_datacmp(unsigned flags) { return !(flags & MDBX_DUPSORT) ? mdbx_cmp_memn : ((flags & MDBX_INTEGERDUP) - ? mdbx_cmp_int_ua + ? mdbx_cmp_int_unaligned : ((flags & MDBX_REVERSEDUP) ? 
mdbx_cmp_memnr : mdbx_cmp_memn)); } @@ -12270,7 +14753,7 @@ static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, * seems that is case #1 above */ user_flags = txn->mt_dbs[dbi].md_flags; } else if ((user_flags & MDBX_CREATE) && txn->mt_dbs[dbi].md_entries == 0) { - if (txn->mt_flags & MDBX_TXN_RDONLY) + if (txn->mt_flags & MDBX_RDONLY) return /* FIXME: return extended info */ MDBX_EACCESS; /* make sure flags changes get committed */ txn->mt_dbs[dbi].md_flags = user_flags & PERSISTENT_FLAGS; @@ -12302,7 +14785,11 @@ static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - if (unlikely(!txn || !dbi || (user_flags & ~VALID_FLAGS) != 0)) + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!dbi || (user_flags & ~VALID_FLAGS) != 0)) return MDBX_EINVAL; switch (user_flags & @@ -12319,15 +14806,6 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, break; } - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; - - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; - /* main table? 
*/ if (!table_name) { *dbi = MAIN_DBI; @@ -12373,7 +14851,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, key.iov_len = len; key.iov_base = (void *)table_name; MDBX_cursor mc; - int rc = mdbx_cursor_init(&mc, txn, MAIN_DBI); + rc = mdbx_cursor_init(&mc, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_cursor_set(&mc, &key, &data, MDBX_SET, &exact); @@ -12382,12 +14860,14 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, return rc; } else { /* make sure this is actually a table */ - MDBX_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (unlikely((node->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) + MDBX_node *node = page_node(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) return MDBX_INCOMPATIBLE; + if (unlikely(data.iov_len < sizeof(MDBX_db))) + return MDBX_CORRUPTED; } - if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) + if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_RDONLY)) return MDBX_EACCESS; /* Done here so we cannot fail after creating a new DB */ @@ -12488,34 +14968,32 @@ int mdbx_dbi_open(MDBX_txn *txn, const char *table_name, unsigned table_flags, return mdbx_dbi_open_ex(txn, table_name, table_flags, dbi, nullptr, nullptr); } -int __cold mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *arg, +int __cold mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, size_t bytes) { - if (unlikely(!arg || !txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + if (unlikely(!dest)) + return MDBX_EINVAL; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return MDBX_EINVAL; - if (unlikely(bytes != 
sizeof(MDBX_stat))) + const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); + if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) return MDBX_EINVAL; if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; if (unlikely(txn->mt_dbflags[dbi] & DB_STALE)) { - MDBX_cursor_couple cx; - /* Stale, must read the DB's root. cursor_init does it for us. */ - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = mdbx_fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - return mdbx_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); + mdbx_stat0(txn->mt_env, &txn->mt_dbs[dbi], dest, bytes); + return MDBX_SUCCESS; } static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { @@ -12556,14 +15034,12 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, unsigned *state) { - if (unlikely(!txn || !flags || !state)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + if (unlikely(!flags || !state)) + return MDBX_EINVAL; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return MDBX_EINVAL; @@ -12587,7 +15063,6 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { int rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); if (likely(rc == MDBX_SUCCESS)) { MDBX_txn *txn = mc->mc_txn; - MDBX_node *ni; MDBX_cursor mx; unsigned i; @@ -12595,38 +15070,35 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { * This also avoids any P_LEAF2 pages, which have no nodes. * Also if the DB doesn't have sub-DBs and has no overflow * pages, omit scanning leaves. 
*/ - - if (mc->mc_flags & C_SUB) { - MDBX_db *outer = mdbx_outer_db(mc); - outer->md_branch_pages -= mc->mc_db->md_branch_pages; - outer->md_leaf_pages -= mc->mc_db->md_leaf_pages; - outer->md_overflow_pages -= mc->mc_db->md_overflow_pages; - mdbx_cursor_pop(mc); - } else if (!subs && !mc->mc_db->md_overflow_pages) + if ((mc->mc_flags & C_SUB) || (subs | mc->mc_db->md_overflow_pages) == 0) mdbx_cursor_pop(mc); + rc = mdbx_pnl_need(&txn->tw.retired_pages, + mc->mc_db->md_branch_pages + mc->mc_db->md_leaf_pages + + mc->mc_db->md_overflow_pages); + if (unlikely(rc)) + goto done; + mdbx_cursor_copy(mc, &mx); while (mc->mc_snum > 0) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - unsigned n = NUMKEYS(mp); + unsigned n = page_numkeys(mp); if (IS_LEAF(mp)) { for (i = 0; i < n; i++) { - ni = NODEPTR(mp, i); - if (ni->mn_flags & F_BIGDATA) { + MDBX_node *node = page_node(mp, i); + if (node_flags(node) & F_BIGDATA) { MDBX_page *omp; - pgno_t pg; - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdbx_page_get(mc, pg, &omp, NULL); + rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, NULL); if (unlikely(rc)) goto done; mdbx_cassert(mc, IS_OVERFLOW(omp)); - rc = mdbx_page_befree(mc, omp); + rc = mdbx_page_retire(mc, omp); if (unlikely(rc)) goto done; if (!mc->mc_db->md_overflow_pages && !subs) break; - } else if (subs && (ni->mn_flags & F_SUBDATA)) { - rc = mdbx_xcursor_init1(mc, ni); + } else if (subs && (node_flags(node) & F_SUBDATA)) { + rc = mdbx_xcursor_init1(mc, node); if (unlikely(rc != MDBX_SUCCESS)) goto done; rc = mdbx_drop0(&mc->mc_xcursor->mx_cursor, 0); @@ -12637,14 +15109,11 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { if (!subs && !mc->mc_db->md_overflow_pages) goto pop; } else { - if (unlikely((rc = mdbx_pnl_need(&txn->mt_befree_pages, n)) != 0)) - goto done; for (i = 0; i < n; i++) { - pgno_t pg; - ni = NODEPTR(mp, i); - pg = NODEPGNO(ni); /* free it */ - mdbx_pnl_xappend(txn->mt_befree_pages, pg); + rc = mdbx_retire_pgno(mc, node_pgno(page_node(mp, i))); 
+ if (unlikely(rc)) + goto done; } } if (!mc->mc_top) @@ -12667,7 +15136,7 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { } } /* free it */ - rc = mdbx_pnl_append(&txn->mt_befree_pages, mc->mc_db->md_root); + rc = mdbx_retire_pgno(mc, mc->mc_db->md_root); done: if (unlikely(rc)) txn->mt_flags |= MDBX_TXN_ERROR; @@ -12679,14 +15148,12 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { } int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { - if (unlikely(1 < (unsigned)del || !txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + if (unlikely(1 < (unsigned)del)) + return MDBX_EINVAL; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -12694,21 +15161,11 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { if (unlikely(TXN_DBI_CHANGED(txn, dbi))) return MDBX_BAD_DBI; - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) - return MDBX_EACCESS; - MDBX_cursor *mc; - int rc = mdbx_cursor_open(txn, dbi, &mc); + rc = mdbx_cursor_open(txn, dbi, &mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDBX_env *env = txn->mt_env; - rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); - if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_cursor_close(mc); - return rc; - } - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) { rc = MDBX_EINVAL; goto bailout; @@ -12729,9 +15186,17 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { /* Can't delete the main DB */ if (del && dbi >= CORE_DBS) { rc = mdbx_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); - if (likely(!rc)) { + if (likely(rc == MDBX_SUCCESS)) { txn->mt_dbflags[dbi] = DB_STALE; + MDBX_env *env = txn->mt_env; + rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + if (unlikely(rc != MDBX_SUCCESS)) { + txn->mt_flags |= MDBX_TXN_ERROR; + goto 
bailout; + } mdbx_dbi_close_locked(env, dbi); + mdbx_ensure(env, + mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } else { txn->mt_flags |= MDBX_TXN_ERROR; } @@ -12750,19 +15215,13 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { bailout: mdbx_cursor_close(mc); - mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); return rc; } int mdbx_set_compare(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { - if (unlikely(!txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -12772,14 +15231,9 @@ int mdbx_set_compare(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { } int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { - if (unlikely(!txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -12788,53 +15242,73 @@ int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { return MDBX_SUCCESS; } -int __cold mdbx_reader_list(MDBX_env *env, MDBX_msg_func *func, void *ctx) { - char buf[64]; - int rc = 0, first = 1; - +int __cold mdbx_reader_list(MDBX_env *env, MDBX_reader_list_func *func, + void *ctx) { if (unlikely(!env || !func)) - return (MDBX_EINVAL > 0) ? 
-MDBX_EINVAL : MDBX_EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - const MDBX_lockinfo *const lck = env->me_lck; - if (likely(lck)) { - const unsigned snap_nreaders = lck->mti_numreaders; + int rc = MDBX_RESULT_TRUE; + int serial = 0; + if (likely(env->me_lck)) { + const unsigned snap_nreaders = env->me_lck->mti_numreaders; for (unsigned i = 0; i < snap_nreaders; i++) { - if (lck->mti_readers[i].mr_pid) { - const txnid_t txnid = lck->mti_readers[i].mr_txnid; - if (txnid == ~(txnid_t)0) - snprintf(buf, sizeof(buf), "%10" PRIuPTR " %" PRIxPTR " -\n", - (uintptr_t)lck->mti_readers[i].mr_pid, - (uintptr_t)lck->mti_readers[i].mr_tid); - else - snprintf(buf, sizeof(buf), - "%10" PRIuPTR " %" PRIxPTR " %" PRIaTXN "\n", - (uintptr_t)lck->mti_readers[i].mr_pid, - (uintptr_t)lck->mti_readers[i].mr_tid, txnid); - - if (first) { - first = 0; - rc = func(" pid thread txnid\n", ctx); - if (rc < 0) - break; - } - rc = func(buf, ctx); - if (rc < 0) - break; + const MDBX_reader *r = env->me_lck->mti_readers + i; + retry_reader:; + const uint32_t pid = r->mr_pid; + if (!pid) + continue; + txnid_t txnid = safe64_read(&r->mr_txnid); + const size_t tid = r->mr_tid; + const pgno_t pages_used = r->mr_snapshot_pages_used; + const uint64_t reader_pages_retired = r->mr_snapshot_pages_retired; + mdbx_compiler_barrier(); + if (unlikely(tid != r->mr_tid || + pages_used != r->mr_snapshot_pages_used || + reader_pages_retired != r->mr_snapshot_pages_retired || + txnid != safe64_read(&r->mr_txnid) || pid != r->mr_pid)) + goto retry_reader; + + mdbx_assert(env, txnid > 0); + if (txnid >= SAFE64_INVALID_THRESHOLD) + txnid = 0; + + size_t bytes_used = 0; + size_t bytes_retained = 0; + uint64_t lag = 0; + if (txnid) { + retry_header:; + const MDBX_meta *const recent_meta = mdbx_meta_head(env); + const uint64_t head_pages_retired = recent_meta->mm_pages_retired; + const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, recent_meta); + 
mdbx_compiler_barrier(); + if (unlikely(recent_meta != mdbx_meta_head(env) || + head_pages_retired != recent_meta->mm_pages_retired) || + head_txnid != mdbx_meta_txnid_fluid(env, recent_meta)) + goto retry_header; + + lag = (head_txnid - txnid) / MDBX_TXNID_STEP; + bytes_used = pgno2bytes(env, pages_used); + bytes_retained = (head_pages_retired > reader_pages_retired) + ? pgno2bytes(env, (pgno_t)(head_pages_retired - + reader_pages_retired)) + : 0; } + rc = func(ctx, ++serial, i, pid, (mdbx_tid_t)tid, txnid, lag, bytes_used, + bytes_retained); + if (unlikely(rc != MDBX_SUCCESS)) + break; } } - if (first) - rc = func("(no active readers)\n", ctx); return rc; } /* Insert pid into list if not already present. * return -1 if already present. */ -static int __cold mdbx_pid_insert(mdbx_pid_t *ids, mdbx_pid_t pid) { +static int __cold mdbx_pid_insert(uint32_t *ids, uint32_t pid) { /* binary search of pid in list */ unsigned base = 0; unsigned cursor = 1; @@ -12901,12 +15375,13 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { return MDBX_SUCCESS; } + lck->mti_reader_check_timestamp = mdbx_osal_monotime(); const unsigned snap_nreaders = lck->mti_numreaders; - mdbx_pid_t pidsbuf_onstask[142]; - mdbx_pid_t *const pids = + uint32_t pidsbuf_onstask[142]; + uint32_t *const pids = (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask)) ? 
pidsbuf_onstask - : mdbx_malloc((snap_nreaders + 1) * sizeof(mdbx_pid_t)); + : mdbx_malloc((snap_nreaders + 1) * sizeof(uint32_t)); if (unlikely(!pids)) return MDBX_ENOMEM; @@ -12914,7 +15389,7 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { int rc = MDBX_SUCCESS, count = 0; for (unsigned i = 0; i < snap_nreaders; i++) { - const mdbx_pid_t pid = lck->mti_readers[i].mr_pid; + const uint32_t pid = lck->mti_readers[i].mr_pid; if (pid == 0) continue /* skip empty */; if (pid == env->me_pid) @@ -12941,7 +15416,7 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { rdt_locked = -1; if (err == MDBX_RESULT_TRUE) { - /* mutex recovered, the mdbx_mutex_failed() checked all readers */ + /* mutex recovered, the mdbx_ipclock_failed() checked all readers */ rc = MDBX_RESULT_TRUE; break; } @@ -12964,7 +15439,7 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { for (unsigned j = i; j < snap_nreaders; j++) { if (lck->mti_readers[j].mr_pid == pid) { mdbx_debug("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, - (size_t)pid, lck->mti_readers[j].mr_txnid); + (size_t)pid, lck->mti_readers[j].mr_txnid.inconsistent); lck->mti_readers[j].mr_pid = 0; lck->mti_readers_refresh_flag = true; count++; @@ -12983,15 +15458,22 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { return rc; } -int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger) { - const int rc = mdbx_runtime_flags; +int __cold mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { + const int rc = mdbx_runtime_flags | (mdbx_loglevel << 16); + +#if !MDBX_DEBUG + (void)loglevel; +#else + if (loglevel != -1) + mdbx_loglevel = (uint8_t)loglevel; +#endif + if (flags != -1) { #if !MDBX_DEBUG flags &= MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN; #else - flags &= MDBX_DBG_ASSERT | MDBX_DBG_PRINT | MDBX_DBG_TRACE | - MDBX_DBG_EXTRA | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | MDBX_DBG_DUMP | - MDBX_DBG_LEGACY_MULTIOPEN; 
+ flags &= MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | + MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN; #endif #if defined(__linux__) || defined(__gnu_linux__) if ((mdbx_runtime_flags ^ flags) & MDBX_DBG_DUMP) { @@ -13023,7 +15505,7 @@ int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger) { } } #endif /* Linux */ - mdbx_runtime_flags = flags; + mdbx_runtime_flags = (uint8_t)flags; } if (-1 != (intptr_t)logger) @@ -13032,11 +15514,11 @@ int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger) { } static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) { - mdbx_debug("DB size maxed out"); + mdbx_debug("%s", "DB size maxed out"); int retry; for (retry = 0; retry < INT_MAX; ++retry) { - txnid_t oldest = mdbx_reclaiming_detent(env); + txnid_t oldest = mdbx_recent_steady_txnid(env); mdbx_assert(env, oldest < env->me_txn0->mt_txnid); mdbx_assert(env, oldest >= laggard); mdbx_assert(env, oldest >= *env->me_oldest); @@ -13046,15 +15528,27 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) { if (MDBX_IS_ERROR(mdbx_reader_check0(env, false, NULL))) break; - MDBX_reader *const rtbl = env->me_lck->mti_readers; MDBX_reader *asleep = nullptr; - for (int i = env->me_lck->mti_numreaders; --i >= 0;) { - if (rtbl[i].mr_pid) { - mdbx_jitter4testing(true); - const txnid_t snap = rtbl[i].mr_txnid; - if (oldest > snap && laggard <= /* ignore pending updates */ snap) { - oldest = snap; - asleep = &rtbl[i]; + MDBX_lockinfo *const lck = env->me_lck; + uint64_t oldest_retired = UINT64_MAX; + const unsigned snap_nreaders = lck->mti_numreaders; + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (lck->mti_readers[i].mr_pid) { + /* mdbx_jitter4testing(true); */ + const uint64_t snap_retired = + lck->mti_readers[i].mr_snapshot_pages_retired; + const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); + mdbx_memory_barrier(); + if (unlikely(snap_retired != + lck->mti_readers[i].mr_snapshot_pages_retired || + 
snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) + goto retry; + if (oldest > snap_txnid && + laggard <= /* ignore pending updates */ snap_txnid) { + oldest = snap_txnid; + oldest_retired = snap_retired; + asleep = &lck->mti_readers[i]; } } } @@ -13064,7 +15558,8 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) { /* LY: notify end of oom-loop */ const txnid_t gap = oldest - laggard; env->me_oom_func(env, 0, 0, laggard, - (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, -retry); + (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, 0, + -retry); } mdbx_notice("oom-kick: update oldest %" PRIaTXN " -> %" PRIaTXN, *env->me_oldest, oldest); @@ -13072,44 +15567,49 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) { return *env->me_oldest = oldest; } - mdbx_tid_t tid; - mdbx_pid_t pid; - int rc; - if (!env->me_oom_func) break; - pid = asleep->mr_pid; - tid = asleep->mr_tid; - if (asleep->mr_txnid != laggard || pid <= 0) + uint32_t pid = asleep->mr_pid; + size_t tid = asleep->mr_tid; + if (safe64_read(&asleep->mr_txnid) != laggard || pid <= 0) continue; + const MDBX_meta *head_meta = mdbx_meta_head(env); const txnid_t gap = - mdbx_meta_txnid_stable(env, mdbx_meta_head(env)) - laggard; - rc = env->me_oom_func(env, pid, tid, laggard, - (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, retry); + (mdbx_meta_txnid_stable(env, head_meta) - laggard) / MDBX_TXNID_STEP; + const uint64_t head_retired = head_meta->mm_pages_retired; + const size_t space = + (oldest_retired > head_retired) + ? pgno2bytes(env, (pgno_t)(oldest_retired - head_retired)) + : 0; + int rc = env->me_oom_func(env, pid, (mdbx_tid_t)tid, laggard, + (gap < UINT_MAX) ? 
(unsigned)gap : UINT_MAX, + space, retry); if (rc < 0) break; - if (rc) { - asleep->mr_txnid = ~(txnid_t)0; - env->me_lck->mti_readers_refresh_flag = true; - if (rc > 1) { + if (rc > 0) { + if (rc == 1) { + safe64_reset_compare(&asleep->mr_txnid, laggard); + } else { + safe64_reset(&asleep->mr_txnid, true); asleep->mr_tid = 0; asleep->mr_pid = 0; - mdbx_flush_noncoherent_cpu_writeback(); } + lck->mti_readers_refresh_flag = true; + mdbx_flush_incoherent_cpu_writeback(); } } if (retry && env->me_oom_func) { /* LY: notify end of oom-loop */ - env->me_oom_func(env, 0, 0, laggard, 0, -retry); + env->me_oom_func(env, 0, 0, laggard, 0, 0, -retry); } return mdbx_find_oldest(env->me_txn); } -int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes) { +int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { if (unlikely(!env)) return MDBX_EINVAL; @@ -13122,8 +15622,13 @@ int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes) { if (unlikely(!env->me_map)) return MDBX_EPERM; - *env->me_autosync_threshold = bytes2pgno(env, bytes + env->me_psize - 1); - return bytes ? mdbx_env_sync(env, false) : MDBX_SUCCESS; + *env->me_autosync_threshold = bytes2pgno(env, threshold + env->me_psize - 1); + if (threshold) { + int err = mdbx_env_sync_poll(env); + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } + return MDBX_SUCCESS; } int __cold mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { @@ -13140,7 +15645,12 @@ int __cold mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { return MDBX_EPERM; *env->me_autosync_period = mdbx_osal_16dot16_to_monotime(seconds_16dot16); - return seconds_16dot16 ? 
mdbx_env_sync(env, false) : MDBX_SUCCESS; + if (seconds_16dot16) { + int err = mdbx_env_sync_poll(env); + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } + return MDBX_SUCCESS; } int __cold mdbx_env_set_oomfunc(MDBX_env *env, MDBX_oom_func *oomfunc) { @@ -13166,14 +15676,9 @@ __attribute__((__no_sanitize_thread__, __noinline__)) #endif int mdbx_txn_straggler(MDBX_txn *txn, int *percent) { - if (unlikely(!txn)) - return (MDBX_EINVAL > 0) ? -MDBX_EINVAL : MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc > 0) ? -rc : rc; MDBX_env *env = txn->mt_env; if (unlikely((txn->mt_flags & MDBX_RDONLY) == 0)) { @@ -13195,7 +15700,7 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) } } while (unlikely(recent != mdbx_meta_txnid_fluid(env, meta))); - txnid_t lag = recent - txn->mt_ro_reader->mr_txnid; + txnid_t lag = (recent - txn->mt_txnid) / MDBX_TXNID_STEP; return (lag > INT_MAX) ? INT_MAX : (int)lag; } @@ -13216,9 +15721,13 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, if (unlikely(rc != MDBX_SUCCESS)) return rc; - const int nkeys = NUMKEYS(mp); + rc = mdbx_page_check(ctx->mw_cursor.mc_txn->mt_env, mp, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + const int nkeys = page_numkeys(mp); size_t header_size = IS_LEAF2(mp) ? 
PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower; - size_t unused_size = SIZELEFT(mp); + size_t unused_size = page_room(mp); size_t payload_size = 0; size_t align_bytes = 0; MDBX_page_type_t type; @@ -13250,27 +15759,31 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, continue; } - MDBX_node *node = NODEPTR(mp, i); - payload_size += NODESIZE + NODEKSZ(node); + MDBX_node *node = page_node(mp, i); + payload_size += NODESIZE + node_ks(node); - if (type == MDBX_page_branch) + if (type == MDBX_page_branch) { + assert(i > 0 || node_ks(node) == 0); continue; + } assert(type == MDBX_page_leaf); - switch (node->mn_flags) { + switch (node_flags(node)) { case 0 /* usual node */: { - payload_size += NODEDSZ(node); + payload_size += node_ds(node); } break; case F_BIGDATA /* long data on the large/overflow page */: { payload_size += sizeof(pgno_t); + const pgno_t large_pgno = node_largedata_pgno(node); MDBX_page *op; - pgno_t large_pgno; - memcpy(&large_pgno, NODEDATA(node), sizeof(pgno_t)); rc = mdbx_page_get(&ctx->mw_cursor, large_pgno, &op, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; + rc = mdbx_page_check(ctx->mw_cursor.mc_txn->mt_env, op, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; /* LY: Don't use mask here, e.g bitwise * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). 
@@ -13279,7 +15792,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, return MDBX_CORRUPTED; const size_t over_header = PAGEHDRSZ; - const size_t over_payload = NODEDSZ(node); + const size_t over_payload = node_ds(node); const size_t over_unused = pgno2bytes(ctx->mw_cursor.mc_txn->mt_env, op->mp_pages) - over_payload - over_header; @@ -13291,27 +15804,27 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, } break; case F_SUBDATA /* sub-db */: { - const size_t namelen = NODEKSZ(node); - if (unlikely(namelen == 0 || NODEDSZ(node) != sizeof(MDBX_db))) + const size_t namelen = node_ks(node); + if (unlikely(namelen == 0 || node_ds(node) < sizeof(MDBX_db))) return MDBX_CORRUPTED; - payload_size += sizeof(MDBX_db); + payload_size += node_ds(node); } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: { - if (unlikely(NODEDSZ(node) != sizeof(MDBX_db))) + if (unlikely(node_ds(node) != sizeof(MDBX_db))) return MDBX_CORRUPTED; payload_size += sizeof(MDBX_db); } break; case F_DUPDATA /* short sub-page */: { - if (unlikely(NODEDSZ(node) < PAGEHDRSZ)) + if (unlikely(node_ds(node) <= PAGEHDRSZ)) return MDBX_CORRUPTED; - MDBX_page *sp = NODEDATA(node); - const int nsubkeys = NUMKEYS(sp); + MDBX_page *sp = node_data(node); + const int nsubkeys = page_numkeys(sp); size_t subheader_size = IS_LEAF2(sp) ? 
PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower; - size_t subunused_size = SIZELEFT(sp); + size_t subunused_size = page_room(sp); size_t subpayload_size = 0; size_t subalign_bytes = 0; MDBX_page_type_t subtype; @@ -13335,14 +15848,14 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, subpayload_size += sp->mp_leaf2_ksize; } else { assert(subtype == MDBX_subpage_leaf); - MDBX_node *subnode = NODEPTR(sp, j); - subpayload_size += NODESIZE + NODEKSZ(subnode) + NODEDSZ(subnode); - if (unlikely(subnode->mn_flags != 0)) + MDBX_node *subnode = page_node(sp, j); + subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode); + if (unlikely(node_flags(subnode) != 0)) return MDBX_CORRUPTED; } } - rc = ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, dbi, NODEDSZ(node), + rc = ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, dbi, node_ds(node), subtype, nsubkeys, subpayload_size, subheader_size, subunused_size + subalign_bytes); header_size += subheader_size; @@ -13370,9 +15883,9 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, if (type == MDBX_page_dupfixed_leaf) continue; - MDBX_node *node = NODEPTR(mp, i); + MDBX_node *node = page_node(mp, i); if (type == MDBX_page_branch) { - rc = mdbx_env_walk(ctx, dbi, NODEPGNO(node), deep + 1); + rc = mdbx_env_walk(ctx, dbi, node_pgno(node), deep + 1); if (unlikely(rc != MDBX_SUCCESS)) { if (rc != MDBX_RESULT_TRUE) return rc; @@ -13383,13 +15896,13 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, assert(type == MDBX_page_leaf); MDBX_db db; - switch (node->mn_flags) { + switch (node_flags(node)) { default: continue; case F_SUBDATA /* sub-db */: { - const size_t namelen = NODEKSZ(node); - if (unlikely(namelen == 0 || NODEDSZ(node) != sizeof(MDBX_db))) + const size_t namelen = node_ks(node); + if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) return MDBX_CORRUPTED; char namebuf_onstask[142]; @@ -13397,9 +15910,9 @@ static int __cold 
mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, ? namebuf_onstask : mdbx_malloc(namelen + 1); if (name) { - memcpy(name, NODEKEY(node), namelen); + memcpy(name, node_key(node), namelen); name[namelen] = 0; - memcpy(&db, NODEDATA(node), sizeof(db)); + memcpy(&db, node_data(node), sizeof(db)); rc = mdbx_env_walk(ctx, name, db.md_root, deep + 1); if (name != namebuf_onstask) mdbx_free(name); @@ -13409,10 +15922,10 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: - if (unlikely(NODEDSZ(node) != sizeof(MDBX_db))) + if (unlikely(node_ds(node) != sizeof(MDBX_db))) return MDBX_CORRUPTED; - memcpy(&db, NODEDATA(node), sizeof(db)); + memcpy(&db, node_data(node), sizeof(db)); rc = mdbx_env_walk(ctx, dbi, db.md_root, deep + 1); break; } @@ -13426,14 +15939,9 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *user) { - if (unlikely(!txn)) - return MDBX_BAD_TXN; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; mdbx_walk_ctx_t ctx; memset(&ctx, 0, sizeof(ctx)); @@ -13442,11 +15950,11 @@ int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, ctx.mw_user = user; ctx.mw_visitor = visitor; - int rc = visitor( - 0, NUM_METAS, user, 0, MDBX_PGWALK_META, - pgno2bytes(txn->mt_env, NUM_METAS), MDBX_page_meta, NUM_METAS, - sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS, - (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * NUM_METAS); + rc = visitor(0, NUM_METAS, user, 0, MDBX_PGWALK_META, + pgno2bytes(txn->mt_env, NUM_METAS), MDBX_page_meta, NUM_METAS, + sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS, + (txn->mt_env->me_psize - sizeof(MDBX_meta) - 
PAGEHDRSZ) * + NUM_METAS); if (!MDBX_IS_ERROR(rc)) rc = mdbx_env_walk(&ctx, MDBX_PGWALK_GC, txn->mt_dbs[FREE_DBI].md_root, 0); if (!MDBX_IS_ERROR(rc)) @@ -13459,20 +15967,9 @@ int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, } int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary) { - if (unlikely(!txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; - - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; - - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) - return MDBX_EACCESS; + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; if (likely(canary)) { if (txn->mt_canary.x == canary->x && txn->mt_canary.y == canary->y && @@ -13483,23 +15980,18 @@ int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary) { txn->mt_canary.z = canary->z; } txn->mt_canary.v = txn->mt_txnid; + txn->mt_flags |= MDBX_TXN_DIRTY; - if ((txn->mt_flags & MDBX_TXN_DIRTY) == 0) { - txn->mt_flags |= MDBX_TXN_DIRTY; - *txn->mt_env->me_unsynced_pages += 1; - } return MDBX_SUCCESS; } int mdbx_canary_get(MDBX_txn *txn, mdbx_canary *canary) { - if (unlikely(txn == NULL || canary == NULL)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + if (unlikely(canary == NULL)) + return MDBX_EINVAL; *canary = txn->mt_canary; return MDBX_SUCCESS; @@ -13534,7 +16026,7 @@ int mdbx_cursor_on_last(MDBX_cursor *mc) { return MDBX_RESULT_FALSE; for (unsigned i = 0; i < mc->mc_snum; ++i) { - unsigned nkeys = NUMKEYS(mc->mc_pg[i]); + unsigned nkeys = page_numkeys(mc->mc_pg[i]); if (mc->mc_ki[i] < nkeys - 1) return MDBX_RESULT_FALSE; } @@ -13556,7 
+16048,7 @@ int mdbx_cursor_eof(MDBX_cursor *mc) { return MDBX_RESULT_TRUE; if ((mc->mc_flags & C_EOF) && - mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) return MDBX_RESULT_TRUE; return MDBX_RESULT_FALSE; @@ -13582,6 +16074,13 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, x->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; + int rc = check_txn(x->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(x->mc_txn != y->mc_txn)) + return MDBX_BAD_TXN; + if (unlikely(y->mc_dbi != x->mc_dbi)) return MDBX_EINVAL; @@ -13592,7 +16091,7 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) return MDBX_PROBLEM; - int nkeys = NUMKEYS(y->mc_pg[r->level]); + int nkeys = page_numkeys(y->mc_pg[r->level]); assert(nkeys > 0); if (r->level == 0) r->root_nkeys = nkeys; @@ -13616,7 +16115,7 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, * STACK[i ]: | * STACK[+1]: ...y++N|0++x... */ - nkeys = NUMKEYS(y->mc_pg[r->level]); + nkeys = page_numkeys(y->mc_pg[r->level]); r->diff = (nkeys - y->mc_ki[r->level]) + x->mc_ki[r->level]; assert(r->diff > 0); } @@ -13630,7 +16129,7 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, * STACK[i ]: | * STACK[+1]: ...x--N|0--y... 
*/ - nkeys = NUMKEYS(x->mc_pg[r->level]); + nkeys = page_numkeys(x->mc_pg[r->level]); r->diff = -(nkeys - x->mc_ki[r->level]) - y->mc_ki[r->level]; assert(r->diff < 0); } @@ -13638,7 +16137,7 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, return MDBX_SUCCESS; } - r->diff = mdbx_cmp2int(x->mc_flags & C_EOF, y->mc_flags & C_EOF); + r->diff = CMP2INT(x->mc_flags & C_EOF, y->mc_flags & C_EOF); return MDBX_SUCCESS; } @@ -13648,7 +16147,7 @@ __hot static ptrdiff_t estimate(const MDBX_db *db, * level-1: branch-page(s) => scale = leaf-factor * branch-factor^2 * level-2: branch-page(s) => scale = leaf-factor * branch-factor * level-N: branch-page(s) => scale = leaf-factor - * last-level: leaf-page(s) => scale = 1 + * leaf-level: leaf-page(s) => scale = 1 */ ptrdiff_t btree_power = db->md_depth - 2 - dr->level; if (btree_power < 0) @@ -13742,6 +16241,10 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, if (unlikely(cursor->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; + int rc = check_txn(cursor->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (!(cursor->mc_flags & C_INITIALIZED)) return MDBX_ENODATA; @@ -13750,7 +16253,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, next.outer.mc_xcursor = NULL; if (cursor->mc_db->md_flags & MDBX_DUPSORT) { next.outer.mc_xcursor = &next.inner; - int rc = mdbx_xcursor_init0(&next.outer); + rc = mdbx_xcursor_init0(&next.outer); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner; @@ -13775,7 +16278,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, key = &stub; } - int rc = mdbx_cursor_get(&next.outer, key, data, move_op); + rc = mdbx_cursor_get(&next.outer, key, data, move_op); if (unlikely(rc != MDBX_SUCCESS && (rc != MDBX_NOTFOUND || !(next.outer.mc_flags & C_INITIALIZED)))) return rc; @@ -13791,8 
+16294,11 @@ static int mdbx_is_samedata(const MDBX_val *a, const MDBX_val *b) { int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, MDBX_val *begin_data, MDBX_val *end_key, MDBX_val *end_data, ptrdiff_t *size_items) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(!txn || !size_items)) + if (unlikely(!size_items)) return MDBX_EINVAL; if (unlikely(begin_data && (begin_key == NULL || begin_key == MDBX_EPSILON))) @@ -13804,21 +16310,12 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON)) return MDBX_EINVAL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; - MDBX_cursor_couple begin; /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */ - int rc = mdbx_cursor_init(&begin.outer, txn, dbi); + rc = mdbx_cursor_init(&begin.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -13870,9 +16367,9 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, } *size_items = 1; if (begin.outer.mc_xcursor != NULL) { - MDBX_node *leaf = NODEPTR(begin.outer.mc_pg[begin.outer.mc_top], - begin.outer.mc_ki[begin.outer.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDBX_node *node = page_node(begin.outer.mc_pg[begin.outer.mc_top], + begin.outer.mc_ki[begin.outer.mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { /* LY: return the number of duplicates for given key */ mdbx_tassert(txn, begin.outer.mc_xcursor == &begin.inner && @@ -13972,14 +16469,12 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, */ int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, 
MDBX_val *new_data, MDBX_val *old_data, unsigned flags) { - if (unlikely(!key || !old_data || !txn || old_data == new_data)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + if (unlikely(!key || !old_data || old_data == new_data)) + return MDBX_EINVAL; if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) return MDBX_EINVAL; @@ -13994,11 +16489,8 @@ int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *new_data, MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; - MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = mdbx_cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; cx.outer.mc_next = txn->mt_cursors[dbi]; @@ -14046,8 +16538,8 @@ int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *new_data, if (flags & MDBX_CURRENT) { /* для не-уникальных ключей позволяем update/delete только если ключ * один */ - MDBX_node *leaf = NODEPTR(page, cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) && cx.outer.mc_xcursor->mx_db.md_entries > 1); if (cx.outer.mc_xcursor->mx_db.md_entries > 1) { @@ -14060,7 +16552,7 @@ int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *new_data, *old_data = *new_data; goto bailout; } - /* В оригинальной LMDB фладок MDBX_CURRENT здесь приведет + /* В оригинальной LMDB флажок MDBX_CURRENT здесь приведет * к замене данных без учета MDBX_DUPSORT сортировки, * но 
здесь это в любом случае допустимо, так как мы * проверили что для ключа есть только одно значение. */ @@ -14104,58 +16596,6 @@ bailout: return rc; } -int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - size_t *values_count) { - DKBUF; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); - - if (unlikely(!key || !data || !txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return MDBX_EINVAL; - - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; - - MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - int exact = 0; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_KEY, &exact); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND && values_count) - *values_count = 0; - return rc; - } - - if (values_count) { - *values_count = 1; - if (cx.outer.mc_xcursor != NULL) { - MDBX_node *leaf = NODEPTR(cx.outer.mc_pg[cx.outer.mc_top], - cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdbx_tassert(txn, cx.outer.mc_xcursor == &cx.inner && - (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); - *values_count = - (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) || - cx.inner.mx_db.md_entries <= PTRDIFF_MAX) - ? (size_t)cx.inner.mx_db.md_entries - : PTRDIFF_MAX; - } - } - } - return MDBX_SUCCESS; -} - /* Функция сообщает находится ли указанный адрес в "грязной" странице у * заданной пишущей транзакции. В конечном счете это позволяет избавиться от * лишнего копирования данных из НЕ-грязных страниц. 
@@ -14171,7 +16611,7 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, * перед передачей в качестве аргументов для дальнейших модификаций, либо * отвергнуты на стадии проверки корректности аргументов. * - * Таким образом, функция позволяет как избавится от лишнего копирования, + * Таким образом, функция позволяет как избавиться от лишнего копирования, * так и выполнить более полную проверку аргументов. * * ВАЖНО: Передаваемый указатель должен указывать на начало данных. Только @@ -14179,48 +16619,38 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, * расположен в той-же странице памяти, в том числе для многостраничных * P_OVERFLOW страниц с длинными данными. */ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { - if (unlikely(!txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) + if (txn->mt_flags & MDBX_RDONLY) return MDBX_RESULT_FALSE; const MDBX_env *env = txn->mt_env; - const uintptr_t mask = ~(uintptr_t)(env->me_psize - 1); - const MDBX_page *page = (const MDBX_page *)((uintptr_t)ptr & mask); - - /* LY: Тут не всё хорошо с абсолютной достоверностью результата, - * так как флажок P_DIRTY в LMDB может означать не совсем то, - * что было исходно задумано, детали см в логике кода mdbx_page_touch(). - * - * Более того, в режиме БЕЗ WRITEMAP грязные страницы выделяются через - * malloc(), т.е. находятся вне mmap-диапазона и тогда чтобы отличить - * действительно грязную страницу от указателя на данные пользователя - * следует сканировать dirtylist, что накладно. 
- * - * Тем не менее, однозначно страница "не грязная" (не будет переписана - * во время транзакции) если адрес находится внутри mmap-диапазона - * и в заголовке страницы нет флажка P_DIRTY. */ - if (env->me_map < (uint8_t *)page) { - const size_t usedbytes = pgno2bytes(env, txn->mt_next_pgno); - if ((uint8_t *)page < env->me_map + usedbytes) { - /* страница внутри диапазона, смотрим на флажки */ - return (page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP)) - ? MDBX_RESULT_TRUE - : MDBX_RESULT_FALSE; - } - /* Гипотетически здесь возможна ситуация, когда указатель адресует что-то - * в пределах mmap, но за границей распределенных страниц. Это тяжелая - * ошибка, к которой не возможно прийти без каких-то больших нарушений. - * Поэтому не проверяем этот случай кроме как assert-ом, на то что - * страница вне mmap-диаппазона. */ - mdbx_tassert(txn, (uint8_t *)page >= env->me_map + env->me_mapsize); + const ptrdiff_t offset = (uint8_t *)ptr - env->me_map; + if (offset >= 0) { + const pgno_t pgno = bytes2pgno(env, offset); + if (likely(pgno < txn->mt_next_pgno)) { + const MDBX_page *page = pgno2page(env, pgno); + if (unlikely(page->mp_pgno != pgno)) { + /* The ptr pointed into middle of a large page, + * not to the beginning of a data. */ + return MDBX_EINVAL; + } + if (unlikely(page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP))) + return MDBX_RESULT_TRUE; + if (likely(txn->tw.spill_pages == nullptr)) + return MDBX_RESULT_FALSE; + return mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1) ? MDBX_RESULT_TRUE + : MDBX_RESULT_FALSE; + } + if ((size_t)offset < env->me_dxb_mmap.limit) { + /* Указатель адресует что-то в пределах mmap, но за границей + * распределенных страниц. Такое может случится если mdbx_is_dirty() + * вызывает после операции, в ходе которой гразная страница попала + * в loose и затем была возвращена в нераспределенное пространство. */ + return MDBX_RESULT_TRUE; + } } /* Страница вне используемого mmap-диапазона, т.е. 
либо в функцию был @@ -14243,14 +16673,9 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, uint64_t increment) { - if (unlikely(!txn)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(txn->mt_owner != mdbx_thread_self())) - return MDBX_THREAD_MISMATCH; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -14259,9 +16684,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, return MDBX_BAD_DBI; if (unlikely(txn->mt_dbflags[dbi] & DB_STALE)) { - MDBX_cursor_couple cx; - /* Stale, must read the DB's root. cursor_init does it for us. */ - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = mdbx_fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14271,10 +16694,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, *result = dbs->md_seq; if (likely(increment > 0)) { - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; - - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(txn->mt_flags & MDBX_RDONLY)) return MDBX_EACCESS; uint64_t new = dbs->md_seq + increment; @@ -14292,28 +16712,13 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, /*----------------------------------------------------------------------------*/ -__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize) { - if (pagesize < 1) - pagesize = (intptr_t)mdbx_syspagesize(); - else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || - pagesize > (intptr_t)MAX_PAGESIZE || - !mdbx_is_power2((size_t)pagesize))) - return (MDBX_EINVAL > 0) ? 
-MDBX_EINVAL : MDBX_EINVAL; - - return mdbx_maxkey(mdbx_nodemax(pagesize)); -} - -__cold int mdbx_limits_pgsize_min(void) { return MIN_PAGESIZE; } - -__cold int mdbx_limits_pgsize_max(void) { return MAX_PAGESIZE; } - __cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) { if (pagesize < 1) pagesize = (intptr_t)mdbx_syspagesize(); else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || - !mdbx_is_power2((size_t)pagesize))) - return (MDBX_EINVAL > 0) ? -MDBX_EINVAL : MDBX_EINVAL; + !is_powerof2((size_t)pagesize))) + return -1; return MIN_PAGENO * pagesize; } @@ -14323,8 +16728,8 @@ __cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) { pagesize = (intptr_t)mdbx_syspagesize(); else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || - !mdbx_is_power2((size_t)pagesize))) - return (MDBX_EINVAL > 0) ? -MDBX_EINVAL : MDBX_EINVAL; + !is_powerof2((size_t)pagesize))) + return -1; const uint64_t limit = MAX_PAGENO * (uint64_t)pagesize; return (limit < (intptr_t)MAX_MAPSIZE) ? (intptr_t)limit @@ -14336,14 +16741,16 @@ __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { pagesize = (intptr_t)mdbx_syspagesize(); else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || - !mdbx_is_power2((size_t)pagesize))) - return (MDBX_EINVAL > 0) ? -MDBX_EINVAL : MDBX_EINVAL; + !is_powerof2((size_t)pagesize))) + return -1; - return pagesize * (MDBX_DPL_TXNFULL - 1); + const uint64_t limit = pagesize * (uint64_t)(MDBX_DPL_TXNFULL - 1); + return (limit < (intptr_t)MAX_MAPSIZE) ? 
(intptr_t)limit + : (intptr_t)MAX_MAPSIZE; } -/*----------------------------------------------------------------------------*/ -/* attribute support functions for Nexenta */ +/*** Attribute support functions for Nexenta **********************************/ +#ifdef MDBX_NEXENTA_ATTRS static __inline int mdbx_attr_peek(MDBX_val *data, mdbx_attr_t *attrptr) { if (unlikely(data->iov_len < sizeof(mdbx_attr_t))) @@ -14430,8 +16837,8 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; + if (unlikely(txn->mt_flags & (MDBX_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; MDBX_cursor_couple cx; MDBX_val old_data; @@ -14466,8 +16873,220 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, txn->mt_cursors[dbi] = cx.outer.mc_next; return rc; } +#endif /* MDBX_NEXENTA_ATTRS */ -//---------------------------------------------------------------------------- +/******************************************************************************/ +/* *INDENT-OFF* */ +/* clang-format off */ + +__dll_export +#ifdef __attribute_used__ + __attribute_used__ +#elif defined(__GNUC__) || __has_attribute(__used__) + __attribute__((__used__)) +#endif +#ifdef __attribute_externally_visible__ + __attribute_externally_visible__ +#elif (defined(__GNUC__) && !defined(__clang__)) || \ + __has_attribute(__externally_visible__) + __attribute__((__externally_visible__)) +#endif + const mdbx_build_info mdbx_build = { +#ifdef MDBX_BUILD_TIMESTAMP + MDBX_BUILD_TIMESTAMP +#else + __DATE__ " " __TIME__ +#endif /* MDBX_BUILD_TIMESTAMP */ + + , +#ifdef MDBX_BUILD_TARGET + MDBX_BUILD_TARGET +#else + #if defined(__ANDROID__) + "Android" + #elif defined(__linux__) || defined(__gnu_linux__) + "Linux" + 
#elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__) + "webassembly" + #elif defined(__CYGWIN__) + "CYGWIN" + #elif defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) \ + || defined(__WINDOWS__) + "Windows" + #elif defined(__APPLE__) + #if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \ + || (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR) + "iOS" + #else + "MacOS" + #endif + #elif defined(__FreeBSD__) + "FreeBSD" + #elif defined(__DragonFly__) + "DragonFlyBSD" + #elif defined(__NetBSD__) + "NetBSD" + #elif defined(__OpenBSD__) + "OpenBSD" + #elif defined(__bsdi__) + "UnixBSDI" + #elif defined(__MACH__) + "MACH" + #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) + "HPUX" + #elif defined(_AIX) + "AIX" + #elif defined(__sun) && defined(__SVR4) + "Solaris" + #elif defined(__BSD__) || defined(BSD) + "UnixBSD" + #elif defined(__unix__) || defined(UNIX) || defined(__unix) \ + || defined(__UNIX) || defined(__UNIX__) + "UNIX" + #elif defined(_POSIX_VERSION) + "POSIX" STRINGIFY(_POSIX_VERSION) + #else + "UnknownOS" + #endif /* Target OS */ + + "-" + + #if defined(__amd64__) + "AMD64" + #elif defined(__ia32__) + "IA32" + #elif defined(__e2k__) || defined(__elbrus__) + "Elbrus" + #elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) + "Alpha" + #elif defined(__aarch64__) || defined(_M_ARM64) + "ARM64" + #elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \ + || defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \ + || defined(_M_ARMT) || defined(__arm) + "ARM" + #elif defined(__mips64) || defined(__mips64__) || (defined(__mips) && (__mips >= 64)) + "MIPS64" + #elif if defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__) + "MIPS" + #elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64) + "PARISC64" + #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) + "PARISC" + #elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \ + || 
defined(__IA64__) || defined(_M_IA64) || defined(__itanium__) + "Itanium" + #elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \ + || defined(__powerpc64) || defined(_ARCH_PPC64) + "PowerPC64" + #elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \ + || defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__) + "PowerPC" + #elif defined(__sparc64__) || defined(__sparc64) + "SPARC64" + #elif defined(__sparc__) || defined(__sparc) + "SPARC" + #elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch) + "S390" + #else + "UnknownARCH" + #endif +#endif /* MDBX_BUILD_TARGET */ + +#ifdef MDBX_BUILD_CONFIG +# if defined(_MSC_VER) +# pragma message("Configuration-depended MDBX_BUILD_CONFIG: " MDBX_BUILD_CONFIG) +# endif + "-" MDBX_BUILD_CONFIG +#endif /* MDBX_BUILD_CONFIG */ + , + "MDBX_DEBUG=" STRINGIFY(MDBX_DEBUG) +#ifdef MDBX_LOGLEVEL_BUILD + " MDBX_LOGLEVEL_BUILD=" STRINGIFY(MDBX_LOGLEVEL_BUILD) +#endif /* MDBX_LOGLEVEL_BUILD */ + " MDBX_WORDBITS=" STRINGIFY(MDBX_WORDBITS) + " BYTE_ORDER=" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "LITTLE_ENDIAN" +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + "BIG_ENDIAN" +#else + #error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + " MDBX_TXN_CHECKPID=" MDBX_TXN_CHECKPID_CONFIG + " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG + " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG + " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG + " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG +#ifdef __SANITIZE_ADDRESS__ + " SANITIZE_ADDRESS=YES" +#endif /* __SANITIZE_ADDRESS__ */ +#ifdef MDBX_USE_VALGRIND + " MDBX_USE_VALGRIND=YES" +#endif /* MDBX_USE_VALGRIND */ +#ifdef _GNU_SOURCE + " _GNU_SOURCE=YES" +#else + " _GNU_SOURCE=NO" +#endif /* _GNU_SOURCE */ +#ifdef __APPLE__ + " MDBX_OSX_SPEED_INSTEADOF_DURABILITY=" STRINGIFY(MDBX_OSX_SPEED_INSTEADOF_DURABILITY) +#endif /* MacOS */ +#if defined(_WIN32) || defined(_WIN64) + " MDBX_AVOID_CRT=" 
STRINGIFY(MDBX_AVOID_CRT) + " MDBX_CONFIG_MANUAL_TLS_CALLBACK=" STRINGIFY(MDBX_CONFIG_MANUAL_TLS_CALLBACK) + " MDBX_BUILD_SHARED_LIBRARY=" STRINGIFY(MDBX_BUILD_SHARED_LIBRARY) + " WINVER=" STRINGIFY(WINVER) +#else /* Windows */ + " MDBX_LOCKING=" MDBX_LOCKING_CONFIG + " MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG +#endif /* !Windows */ + " MDBX_CACHELINE_SIZE=" STRINGIFY(MDBX_CACHELINE_SIZE) + " MDBX_CPU_WRITEBACK_INCOHERENT=" STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT) + " MDBX_MMAP_INCOHERENT_CPU_CACHE=" STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE) + " MDBX_MMAP_INCOHERENT_FILE_WRITE=" STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE) + " MDBX_UNALIGNED_OK=" STRINGIFY(MDBX_UNALIGNED_OK) + " MDBX_PNL_ASCENDING=" STRINGIFY(MDBX_PNL_ASCENDING) + , +#ifdef MDBX_BUILD_COMPILER + MDBX_BUILD_COMPILER +#else + #ifdef __INTEL_COMPILER + "Intel C/C++ " STRINGIFY(__INTEL_COMPILER) + #elsif defined(__apple_build_version__) + "Apple clang " STRINGIFY(__apple_build_version__) + #elif defined(__ibmxl__) + "IBM clang C " STRINGIFY(__ibmxl_version__) "." STRINGIFY(__ibmxl_release__) + "." STRINGIFY(__ibmxl_modification__) "." STRINGIFY(__ibmxl_ptf_fix_level__) + #elif defined(__clang__) + "clang " STRINGIFY(__clang_version__) + #elif defined(__MINGW64__) + "MINGW-64 " STRINGIFY(__MINGW64_MAJOR_VERSION) "." STRINGIFY(__MINGW64_MINOR_VERSION) + #elif defined(__MINGW32__) + "MINGW-32 " STRINGIFY(__MINGW32_MAJOR_VERSION) "." STRINGIFY(__MINGW32_MINOR_VERSION) + #elif defined(__IBMC__) + "IBM C " STRINGIFY(__IBMC__) + #elif defined(__GNUC__) + "GNU C/C++ " + #ifdef __VERSION__ + __VERSION__ + #else + STRINGIFY(__GNUC__) "." STRINGIFY(__GNUC_MINOR__) "." 
STRINGIFY(__GNUC_PATCHLEVEL__) + #endif + #elif defined(_MSC_VER) + "MSVC " STRINGIFY(_MSC_FULL_VER) "-" STRINGIFY(_MSC_BUILD) + #else + "Unknown compiler" + #endif +#endif /* MDBX_BUILD_COMPILER */ + , +#ifdef MDBX_BUILD_FLAGS + MDBX_BUILD_FLAGS +#endif /* MDBX_BUILD_FLAGS */ +#ifdef MDBX_BUILD_FLAGS_CONFIG + MDBX_BUILD_FLAGS_CONFIG +#endif /* MDBX_BUILD_FLAGS_CONFIG */ +}; #ifdef __SANITIZE_ADDRESS__ LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() { @@ -14490,3 +17109,6 @@ LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() { "abort_on_error=1"; } #endif /* __SANITIZE_ADDRESS__ */ + +/* *INDENT-ON* */ +/* clang-format on */ diff --git a/libs/libmdbx/src/src/elements/debug_begin.h b/libs/libmdbx/src/src/elements/debug_begin.h new file mode 100644 index 0000000000..f3306b5dc0 --- /dev/null +++ b/libs/libmdbx/src/src/elements/debug_begin.h @@ -0,0 +1,34 @@ +#pragma push_macro("mdbx_trace") +#pragma push_macro("mdbx_debug") +#pragma push_macro("mdbx_verbose") +#pragma push_macro("mdbx_notice") +#pragma push_macro("mdbx_warning") +#pragma push_macro("mdbx_error") +#pragma push_macro("mdbx_assert") + +#undef mdbx_trace +#define mdbx_trace(fmt, ...) \ + mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__) + +#undef mdbx_debug +#define mdbx_debug(fmt, ...) \ + mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__) + +#undef mdbx_verbose +#define mdbx_verbose(fmt, ...) \ + mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__) + +#undef mdbx_notice +#define mdbx_notice(fmt, ...) \ + mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__) + +#undef mdbx_warning +#define mdbx_warning(fmt, ...) \ + mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__) + +#undef mdbx_error +#define mdbx_error(fmt, ...) 
\ + mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__) + +#undef mdbx_assert +#define mdbx_assert(env, expr) mdbx_ensure(env, expr) diff --git a/libs/libmdbx/src/src/elements/debug_end.h b/libs/libmdbx/src/src/elements/debug_end.h new file mode 100644 index 0000000000..e361119bc0 --- /dev/null +++ b/libs/libmdbx/src/src/elements/debug_end.h @@ -0,0 +1,7 @@ +#pragma pop_macro("mdbx_trace") +#pragma pop_macro("mdbx_debug") +#pragma pop_macro("mdbx_verbose") +#pragma pop_macro("mdbx_notice") +#pragma pop_macro("mdbx_warning") +#pragma pop_macro("mdbx_error") +#pragma pop_macro("mdbx_assert") diff --git a/libs/libmdbx/src/src/defs.h b/libs/libmdbx/src/src/elements/defs.h index 622f1c13b6..f4265502a9 100644 --- a/libs/libmdbx/src/src/defs.h +++ b/libs/libmdbx/src/src/elements/defs.h @@ -133,33 +133,16 @@ # endif #endif /* __must_check_result */ -#ifndef __deprecated -# if defined(__GNUC__) || __has_attribute(__deprecated__) -# define __deprecated __attribute__((__deprecated__)) -# elif defined(_MSC_VER) -# define __deprecated __declspec(deprecated) +#ifndef __maybe_unused +# if defined(__GNUC__) || __has_attribute(__unused__) +# define __maybe_unused __attribute__((__unused__)) # else -# define __deprecated +# define __maybe_unused # endif -#endif /* __deprecated */ +#endif /* __maybe_unused */ #if !defined(__noop) && !defined(_MSC_VER) -# ifdef __cplusplus - static inline void __noop_consume_args() {} - template <typename First, typename... Rest> - static inline void - __noop_consume_args(const First &first, const Rest &... rest) { - (void) first; __noop_consume_args(rest...); - } -# define __noop(...) __noop_consume_args(__VA_ARGS__) -# elif defined(__GNUC__) && (!defined(__STRICT_ANSI__) || !__STRICT_ANSI__) - static __inline void __noop_consume_args(void* anchor, ...) { - (void) anchor; - } -# define __noop(...) __noop_consume_args(0, ##__VA_ARGS__) -# else # define __noop(...) 
do {} while(0) -# endif #endif /* __noop */ #ifndef __fallthrough @@ -337,25 +320,32 @@ typedef __complex__ float __cfloat128 __attribute__ ((__mode__ (__TC__))); typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__TC__))); #endif /* Workaround for Coverity Scan */ -/* Wrapper around __func__, which is a C99 feature */ -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -# define mdbx_func_ __func__ -#elif (defined(__GNUC__) && __GNUC__ >= 2) || defined(__clang__) || defined(_MSC_VER) -# define mdbx_func_ __FUNCTION__ -#else -# define mdbx_func_ "<mdbx_unknown>" -#endif +#ifndef __printf_args +# if defined(__GNUC__) || __has_attribute(__format__) +# define __printf_args(format_index, first_arg) \ + __attribute__((__format__(printf, format_index, first_arg))) +# else +# define __printf_args(format_index, first_arg) +# endif +#endif /* __printf_args */ -#if defined(__GNUC__) || __has_attribute(__format__) -#define __printf_args(format_index, first_arg) \ - __attribute__((__format__(printf, format_index, first_arg))) -#else -#define __printf_args(format_index, first_arg) -#endif +#ifndef __anonymous_struct_extension__ +# if defined(__GNUC__) +# define __anonymous_struct_extension__ __extension__ +# else +# define __anonymous_struct_extension__ +# endif +#endif /* __anonymous_struct_extension__ */ + +#ifndef __Wpedantic_format_voidptr + static __inline __maybe_unused const void* __pure_function + __Wpedantic_format_voidptr(const void* ptr) {return ptr;} +# define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) +#endif /* __Wpedantic_format_voidptr */ /*----------------------------------------------------------------------------*/ -#if defined(USE_VALGRIND) +#if defined(MDBX_USE_VALGRIND) # include <valgrind/memcheck.h> # ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -377,7 +367,7 @@ typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__TC__))); # define 
VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) # define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) # define RUNNING_ON_VALGRIND (0) -#endif /* USE_VALGRIND */ +#endif /* MDBX_USE_VALGRIND */ #ifdef __SANITIZE_ADDRESS__ # include <sanitizer/asan_interface.h> @@ -409,6 +399,9 @@ typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__TC__))); # define STRINGIFY(x) STRINGIFY_HELPER(x) #endif /* STRINGIFY */ +#define CONCAT(a,b) a##b +#define XCONCAT(a,b) CONCAT(a,b) + #ifndef offsetof # define offsetof(type, member) __builtin_offsetof(type, member) #endif /* offsetof */ diff --git a/libs/libmdbx/src/src/bits.h b/libs/libmdbx/src/src/elements/internals.h index fd36cac918..66c9f9e7cf 100644 --- a/libs/libmdbx/src/src/bits.h +++ b/libs/libmdbx/src/src/elements/internals.h @@ -1,4 +1,4 @@ -/* +/* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -12,16 +12,20 @@ * <http://www.OpenLDAP.org/license.html>. */ #pragma once +#ifdef MDBX_CONFIG_H +#include MDBX_CONFIG_H +#endif + /* *INDENT-OFF* */ /* clang-format off */ /* In case the MDBX_DEBUG is undefined set it corresponding to NDEBUG */ #ifndef MDBX_DEBUG -#ifdef NDEBUG -# define MDBX_DEBUG 0 -#else -# define MDBX_DEBUG 1 -#endif +# ifdef NDEBUG +# define MDBX_DEBUG 0 +# else +# define MDBX_DEBUG 1 +# endif #endif /* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */ @@ -29,10 +33,23 @@ # undef NDEBUG #endif -#define MDBX_OSX_WANNA_DURABILITY 0 /* using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ -#define MDBX_OSX_WANNA_SPEED 1 /* using fsync() with chance of data lost on power failure */ -#ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY -#define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY +#ifdef MDBX_ALLOY +/* Amalgamated build */ +# define MDBX_INTERNAL_FUNC static +# define MDBX_INTERNAL_VAR static +#else +/* Non-amalgamated build */ +# define MDBX_INTERNAL_FUNC +# define MDBX_INTERNAL_VAR extern +#endif /* 
MDBX_ALLOY */ + +#ifndef MDBX_DISABLE_GNU_SOURCE +#define MDBX_DISABLE_GNU_SOURCE 0 +#endif +#if MDBX_DISABLE_GNU_SOURCE +#undef _GNU_SOURCE +#elif defined(__linux__) || defined(__gnu_linux__) +#define _GNU_SOURCE #endif /*----------------------------------------------------------------------------*/ @@ -70,10 +87,11 @@ #pragma warning(disable : 4820) /* bytes padding added after data member for aligment */ #pragma warning(disable : 4548) /* expression before comma has no effect; expected expression with side - effect */ #pragma warning(disable : 4366) /* the result of the unary '&' operator may be unaligned */ +#pragma warning(disable : 4200) /* nonstandard extension used: zero-sized array in struct/union */ #endif /* _MSC_VER (warnings) */ -#include "../mdbx.h" -#include "./defs.h" +#include "../../mdbx.h" +#include "defs.h" #if defined(__GNUC__) && !__GNUC_PREREQ(4,2) /* Actualy libmdbx was not tested with compilers older than GCC from RHEL6. @@ -128,16 +146,17 @@ # endif #endif /* -Walignment-reduction-ignored */ -#include "./osal.h" - /* *INDENT-ON* */ /* clang-format on */ -#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul -#define MDBX_WORDBITS 64 -#else -#define MDBX_WORDBITS 32 -#endif /* MDBX_WORDBITS */ +#include "osal.h" + +#define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY) +#if defined(MDBX_TOOLS) +extern LIBMDBX_API const char *const mdbx_sourcery_anchor; +#endif + +#include "options.h" /*----------------------------------------------------------------------------*/ /* Basic constants and types */ @@ -172,6 +191,9 @@ /* Number of DBs in metapage (free and main) - also hardcoded elsewhere */ #define CORE_DBS 2 #define MAX_DBI (INT16_MAX - CORE_DBS) +#if MAX_DBI != MDBX_MAX_DBI +#error "Opps, MAX_DBI != MDBX_MAX_DBI" +#endif /* Number of meta pages - also hardcoded elsewhere */ #define NUM_METAS 3 @@ -188,11 +210,16 @@ typedef uint32_t pgno_t; /* A transaction ID. 
*/ typedef uint64_t txnid_t; #define PRIaTXN PRIi64 -#if MDBX_DEBUG -#define MIN_TXNID UINT64_C(0x100000000) -#else #define MIN_TXNID UINT64_C(1) -#endif /* MIN_TXNID */ +/* LY: for testing non-atomic 64-bit txnid on 32-bit arches. + * #define MDBX_TXNID_STEP (UINT32_MAX / 3) */ +#ifndef MDBX_TXNID_STEP +#if MDBX_64BIT_CAS +#define MDBX_TXNID_STEP 1u +#else +#define MDBX_TXNID_STEP 2u +#endif +#endif /* MDBX_TXNID_STEP */ /* Used for offsets within a single page. * Since memory pages are typically 4 or 8KB in size, 12-13 bits, @@ -205,6 +232,26 @@ typedef uint16_t indx_t; /* Core structures for database and shared memory (i.e. format definition) */ #pragma pack(push, 1) +typedef union mdbx_safe64 { + volatile uint64_t inconsistent; +#if MDBX_64BIT_ATOMIC + volatile uint64_t atomic; +#endif /* MDBX_64BIT_ATOMIC */ + struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + volatile uint32_t low; + volatile uint32_t high; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + volatile uint32_t high; + volatile uint32_t low; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + }; +} mdbx_safe64_t; + +#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000) + /* Information about a single database in the environment. 
*/ typedef struct MDBX_db { uint16_t md_flags; /* see mdbx_dbi_open */ @@ -216,9 +263,20 @@ typedef struct MDBX_db { pgno_t md_overflow_pages; /* number of overflow pages */ uint64_t md_seq; /* table sequence counter */ uint64_t md_entries; /* number of data items */ - uint64_t md_merkle; /* Merkle tree checksum */ + uint64_t md_mod_txnid; /* txnid of last commited modification */ } MDBX_db; +/* database size-related parameters */ +typedef struct mdbx_geo_t { + uint16_t grow; /* datafile growth step in pages */ + uint16_t shrink; /* datafile shrink threshold in pages */ + pgno_t lower; /* minimal size of datafile in pages */ + pgno_t upper; /* maximal size of datafile in pages */ + pgno_t now; /* current size of datafile in pages */ + pgno_t next; /* first unused page in the datafile, + * but actually the file may be shorter. */ +} mdbx_geo_t; + /* Meta page content. * A meta page is the start point for accessing a database snapshot. * Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */ @@ -228,7 +286,7 @@ typedef struct MDBX_meta { uint64_t mm_magic_and_version; /* txnid that committed this page, the first of a two-phase-update pair */ - volatile txnid_t mm_txnid_a; + mdbx_safe64_t mm_txnid_a; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -236,15 +294,7 @@ typedef struct MDBX_meta { uint8_t mm_extra_pagehdr; /* extra bytes in the page header, * zero (nothing) for now */ - struct { - uint16_t grow; /* datafile growth step in pages */ - uint16_t shrink; /* datafile shrink threshold in pages */ - pgno_t lower; /* minimal size of datafile in pages */ - pgno_t upper; /* maximal size of datafile in pages */ - pgno_t now; /* current size of datafile in pages */ - pgno_t next; /* first unused page in the datafile, - * but actually the file may be shorter. 
*/ - } mm_geo; + mdbx_geo_t mm_geo; /* database size-related parameters */ MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ /* The size of pages used in this DB */ @@ -255,20 +305,26 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u -#define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK) #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) volatile uint64_t mm_datasync_sign; /* txnid that committed this page, the second of a two-phase-update pair */ - volatile txnid_t mm_txnid_b; + mdbx_safe64_t mm_txnid_b; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. * This value in couple with mr_snapshot_pages_retired allows fast estimation * of "how much reader is restraining GC recycling". */ uint64_t mm_pages_retired; + + /* The analogue /proc/sys/kernel/random/boot_id or similar to determine + * whether the system was rebooted after the last use of the database files. + * If there was no reboot, but there is no need to rollback to the last + * steady sync point. Zeros mean that no relevant information is available + * from the system. */ + bin128_t mm_bootid; + } MDBX_meta; /* Common header for all page types. The page type depends on mp_flags. @@ -287,11 +343,10 @@ typedef struct MDBX_meta { * P_META pages contain MDBX_meta, the start point of an MDBX snapshot. * * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once - * in the snapshot: Either used by a database or listed in a freeDB record. */ + * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { union { - struct MDBX_page *mp_next; /* for in-memory list of freed pages, - * must be first field, see NEXT_LOOSE_PAGE */ + struct MDBX_page *mp_next; /* for in-memory list of freed pages */ uint64_t mp_validator; /* checksum of page content or a txnid during * which the page has been updated */ }; @@ -316,48 +371,40 @@ typedef struct MDBX_page { pgno_t mp_pgno; /* page number */ /* dynamic size */ - union { - indx_t mp_ptrs[1]; - MDBX_meta mp_meta; - uint8_t mp_data[1]; - }; + indx_t mp_ptrs[/* C99 */]; } MDBX_page; /* Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data)) +#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) -/* The maximum size of a database page. - * - * It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper. - * - * MDBX will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. - * - * Note that we don't currently support Huge pages. On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. 
*/ -#define MAX_PAGESIZE 0x10000u -#define MIN_PAGESIZE 512u +#pragma pack(pop) -#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) -#if defined(_WIN32) || defined(_WIN64) -#define MAX_MAPSIZE32 UINT32_C(0x38000000) -#else -#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES +#define MDBX_CLOCK_SIGN UINT32_C(0xF10C) +typedef void mdbx_ipclock_t; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + +#define MDBX_CLOCK_SIGN UINT32_C(0xF18D) +typedef mdbx_pid_t mdbx_ipclock_t; +#ifndef EOWNERDEAD +#define EOWNERDEAD MDBX_RESULT_TRUE #endif -#define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) -#if MDBX_WORDBITS >= 64 -#define MAX_MAPSIZE MAX_MAPSIZE64 +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 +#define MDBX_CLOCK_SIGN UINT32_C(0x8017) +typedef pthread_mutex_t mdbx_ipclock_t; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 +#define MDBX_CLOCK_SIGN UINT32_C(0xFC29) +typedef sem_t mdbx_ipclock_t; #else -#define MAX_MAPSIZE MAX_MAPSIZE32 -#endif /* MDBX_WORDBITS */ +#error "FIXME" +#endif /* MDBX_LOCKING */ -#pragma pack(pop) +#if MDBX_LOCKING > MDBX_LOCKING_SYSV +MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +#endif /* MDBX_LOCKING */ /* Reader Lock Table * @@ -405,7 +452,7 @@ typedef struct MDBX_reader { * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - volatile txnid_t mr_txnid; + mdbx_safe64_t /* txnid_t */ mr_txnid; /* The information we store in a single slot of the reader table. * In addition to a transaction ID, we also record the process and @@ -417,15 +464,16 @@ typedef struct MDBX_reader { * opening the lock file. */ /* The thread ID of the thread owning this txn. 
*/ - union { - volatile mdbx_tid_t mr_tid; - volatile uint64_t mr_tid_u64; - }; +#if MDBX_WORDBITS >= 64 + volatile uint64_t mr_tid; +#else + volatile uint32_t mr_tid; + volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch, + unused for now */ +#endif /* The process ID of the process owning this reader txn. */ - union { - volatile mdbx_pid_t mr_pid; - volatile uint32_t mr_pid_u32; - }; + volatile uint32_t mr_pid; + /* The number of pages used in the reader's MVCC snapshot, * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ volatile pgno_t mr_snapshot_pages_used; @@ -450,6 +498,11 @@ typedef struct MDBX_lockinfo { /* Threshold of un-synced-with-disk pages for auto-sync feature, * zero means no-threshold, i.e. auto-sync is disabled. */ volatile pgno_t mti_autosync_threshold; + + /* Low 32-bit of txnid with which meta-pages was synced, + * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ + volatile uint32_t mti_meta_sync_txnid; + /* Period for timed auto-sync feature, i.e. at the every steady checkpoint * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. * The time value is represented in a suitable system-dependent form, for @@ -457,30 +510,38 @@ typedef struct MDBX_lockinfo { * Zero means timed auto-sync is disabled. */ volatile uint64_t mti_autosync_period; + /* Marker to distinguish uniqueness of DB/CLK.*/ + volatile uint64_t mti_bait_uniqueness; + alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ -#ifdef MDBX_OSAL_LOCK - /* Mutex protecting write-txn. */ - MDBX_OSAL_LOCK mti_wmutex; -#endif + + /* Write transaction lock. */ +#if MDBX_LOCKING > 0 + mdbx_ipclock_t mti_wlock; +#endif /* MDBX_LOCKING > 0 */ volatile txnid_t mti_oldest_reader; - /* Timestamp for auto-sync feature, i.e. the steady checkpoint should be - * created at the first commit that will be not early this timestamp. 
- * The time value is represented in a suitable system-dependent form, for - * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). - * Zero means timed auto-sync is not pending. */ - volatile uint64_t mti_unsynced_timeout; + /* Timestamp of the last steady sync. Value is represented in a suitable + * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or + * clock_gettime(CLOCK_MONOTONIC). */ + volatile uint64_t mti_sync_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. */ volatile pgno_t mti_unsynced_pages; + /* Number of page which was discarded last time by madvise(MADV_FREE). */ + volatile pgno_t mti_discarded_tail; + + /* Timestamp of the last readers check. */ + volatile uint64_t mti_reader_check_timestamp; + alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ -#ifdef MDBX_OSAL_LOCK - /* Mutex protecting readers registration access to this table. */ - MDBX_OSAL_LOCK mti_rmutex; -#endif + /* Readers registration lock. */ +#if MDBX_LOCKING > 0 + mdbx_ipclock_t mti_rlock; +#endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. 
* This always records the maximum count, it is not decremented @@ -489,15 +550,16 @@ typedef struct MDBX_lockinfo { volatile unsigned mti_readers_refresh_flag; alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/ - MDBX_reader mti_readers[1]; + MDBX_reader mti_readers[/* C99 */]; } MDBX_lockinfo; /* Lockfile format signature: version, features and field layout */ #define MDBX_LOCK_FORMAT \ - (MDBX_OSAL_LOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 + \ + (MDBX_CLOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 + \ (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 + \ (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 + \ - (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 29) + (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ + (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) #define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) #define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255) @@ -508,6 +570,40 @@ typedef struct MDBX_lockinfo { #define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u) #endif /* MDBX_ASSUME_MALLOC_OVERHEAD */ +/* The maximum size of a database page. + * + * It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper. + * + * MDBX will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. + * + * Note that we don't currently support Huge pages. On Linux, + * regular data files cannot use Huge pages, and in general + * Huge pages aren't actually pageable. We rely on the OS + * demand-pager to read our data and page it out when memory + * pressure from other processes is high. So until OSs have + * actual paging support for Huge pages, they're not viable. 
*/ +#define MAX_PAGESIZE MDBX_MAX_PAGESIZE +#define MIN_PAGESIZE MDBX_MIN_PAGESIZE + +#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) +#if defined(_WIN32) || defined(_WIN64) +#define MAX_MAPSIZE32 UINT32_C(0x38000000) +#else +#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#endif +#define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE) + +#if MDBX_WORDBITS >= 64 +#define MAX_MAPSIZE MAX_MAPSIZE64 +#define MDBX_READERS_LIMIT \ + ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader)) +#else +#define MDBX_READERS_LIMIT 1024 +#define MAX_MAPSIZE MAX_MAPSIZE32 +#endif /* MDBX_WORDBITS */ + /*----------------------------------------------------------------------------*/ /* Two kind lists of pages (aka PNL) */ @@ -526,17 +622,17 @@ typedef pgno_t *MDBX_PNL; #define MDBX_PNL_DISORDERED(first, last) ((first) <= (last)) #endif -/* List of txnid, only for MDBX_env.mt_lifo_reclaimed */ +/* List of txnid, only for MDBX_txn.tw.lifo_reclaimed */ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef union MDBX_DP { struct { pgno_t pgno; - void *ptr; + MDBX_page *ptr; }; struct { - pgno_t unused; + unsigned sorted; unsigned length; }; } MDBX_DP; @@ -546,7 +642,7 @@ typedef union MDBX_DP { * elements are in the array. 
*/ typedef MDBX_DP *MDBX_DPL; -/* PNL sizes - likely should be even bigger */ +/* PNL sizes */ #define MDBX_PNL_GRANULATE 1024 #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -567,6 +663,14 @@ typedef MDBX_DP *MDBX_DPL; #define MDBX_PNL_BEGIN(pl) (&(pl)[1]) #define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1]) +#if MDBX_PNL_ASCENDING +#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) +#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) +#else +#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) +#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) +#endif + #define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t)) #define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0) @@ -590,32 +694,31 @@ struct MDBX_txn { MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ MDBX_txn *mt_child; - pgno_t mt_next_pgno; /* next unallocated page */ - pgno_t mt_end_pgno; /* corresponding to the current size of datafile */ + mdbx_geo_t mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now + + /* Transaction Flags */ + /* mdbx_txn_begin() flags */ +#define MDBX_TXN_BEGIN_FLAGS \ + (MDBX_NOMETASYNC | MDBX_NOSYNC | MDBX_MAPASYNC | MDBX_RDONLY | MDBX_TRYTXN) + /* internal txn flags */ +#define MDBX_TXN_FINISHED 0x01 /* txn is finished or never began */ +#define MDBX_TXN_ERROR 0x02 /* txn is unusable after an error */ +#define MDBX_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */ +#define MDBX_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */ +#define MDBX_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */ + /* most operations on the txn are currently illegal */ +#define MDBX_TXN_BLOCKED \ + (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD) + unsigned mt_flags; /* The ID of this transaction. IDs are integers incrementing from 1. 
* Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; MDBX_env *mt_env; /* the DB environment */ - /* The list of reclaimed txns from freeDB */ - MDBX_TXL mt_lifo_reclaimed; - /* The list of pages that became unused during this transaction. */ - MDBX_PNL mt_befree_pages; - /* The list of loose pages that became unused and may be reused - * in this transaction, linked through NEXT_LOOSE_PAGE(page). */ - MDBX_page *mt_loose_pages; - /* Number of loose pages (mt_loose_pages) */ - unsigned mt_loose_count; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL mt_spill_pages; - union { - /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_DPL mt_rw_dirtylist; - /* For read txns: This thread/txn's reader table slot, or NULL. */ - MDBX_reader *mt_ro_reader; - }; /* Array of records for each DB known in the environment. 
*/ MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ @@ -623,7 +726,7 @@ struct MDBX_txn { /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; -/* Transaction DB Flags */ + /* Transaction DB Flags */ #define DB_DIRTY MDBX_TBL_DIRTY /* DB was written in this txn */ #define DB_STALE MDBX_TBL_STALE /* Named-DB record is older than txnID */ #define DB_FRESH MDBX_TBL_FRESH /* Named-DB handle opened in this txn */ @@ -631,6 +734,7 @@ struct MDBX_txn { #define DB_VALID 0x10 /* DB handle is valid, see also MDBX_VALID */ #define DB_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ #define DB_DUPDATA 0x40 /* DB is MDBX_DUPSORT data */ +#define DB_AUDITED 0x80 /* Internal flag for accounting during audit */ /* In write txns, array of cursors for each DB */ MDBX_cursor **mt_cursors; /* Array of flags for each DB */ @@ -639,39 +743,55 @@ struct MDBX_txn { * This number only ever increments until the txn finishes; we * don't decrement it when individual DB handles are closed. 
*/ MDBX_dbi mt_numdbs; - -/* Transaction Flags */ -/* mdbx_txn_begin() flags */ -#define MDBX_TXN_BEGIN_FLAGS \ - (MDBX_NOMETASYNC | MDBX_NOSYNC | MDBX_RDONLY | MDBX_TRYTXN) -#define MDBX_TXN_NOMETASYNC \ - MDBX_NOMETASYNC /* don't sync meta for this txn on commit */ -#define MDBX_TXN_NOSYNC MDBX_NOSYNC /* don't sync this txn on commit */ -#define MDBX_TXN_RDONLY MDBX_RDONLY /* read-only transaction */ - /* internal txn flags */ -#define MDBX_TXN_WRITEMAP MDBX_WRITEMAP /* copy of MDBX_env flag in writers */ -#define MDBX_TXN_FINISHED 0x01 /* txn is finished or never began */ -#define MDBX_TXN_ERROR 0x02 /* txn is unusable after an error */ -#define MDBX_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */ -#define MDBX_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */ -#define MDBX_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */ -/* most operations on the txn are currently illegal */ -#define MDBX_TXN_BLOCKED \ - (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD) - unsigned mt_flags; - /* dirtylist room: Array size - dirty pages visible to this txn. - * Includes ancestor txns' dirty pages not hidden by other txns' - * dirty/spilled pages. Thus commit(nested txn) has room to merge - * dirtylist into mt_parent after freeing hidden mt_parent pages. */ - unsigned mt_dirtyroom; - mdbx_tid_t mt_owner; /* thread ID that owns this transaction */ + size_t mt_owner; /* thread ID that owns this transaction */ mdbx_canary mt_canary; + + union { + struct { + /* For read txns: This thread/txn's reader table slot, or NULL. */ + MDBX_reader *reader; + } to; + struct { + pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ + pgno_t loose_refund_wl /* FIXME: describe */; + /* dirtylist room: Dirty array size - dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. 
Thus commit(nested txn) has room to merge + * dirtylist into mt_parent after freeing hidden mt_parent pages. */ + unsigned dirtyroom; + /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ + MDBX_DPL dirtylist; + /* The list of reclaimed txns from GC */ + MDBX_TXL lifo_reclaimed; + /* The list of pages that became unused during this transaction. */ + MDBX_PNL retired_pages; + /* The list of loose pages that became unused and may be reused + * in this transaction, linked through `mp_next`. */ + MDBX_page *loose_pages; + /* Number of loose pages (tw.loose_pages) */ + unsigned loose_count; + /* Number of retired to parent pages (tw.retired2parent_pages) */ + unsigned retired2parent_count; + /* The list of parent's txn dirty pages that retired (became unused) + * in this transaction, linked through `mp_next`. */ + MDBX_page *retired2parent_pages; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL spill_pages; + } tw; + }; }; /* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to * raise this on a 64 bit machine. 
*/ -#define CURSOR_STACK 32 +#if MDBX_WORDBITS >= 64 +#define CURSOR_STACK 28 +#else +#define CURSOR_STACK 20 +#endif struct MDBX_xcursor; @@ -711,8 +831,8 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ -#define C_RECLAIMING 0x20 /* FreeDB lookup is prohibited */ -#define C_GCFREEZE 0x40 /* me_reclaimed_pglist must not be updated */ +#define C_RECLAIMING 0x20 /* GC lookup is prohibited */ +#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ @@ -738,28 +858,6 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; -/* Check if there is an inited xcursor, so XCURSOR_REFRESH() is proper */ -#define XCURSOR_INITED(mc) \ - ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - -/* Update sub-page pointer, if any, in mc->mc_xcursor. - * Needed when the node which contains the sub-page may have moved. - * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. */ -#define XCURSOR_REFRESH(mc, mp, ki) \ - do { \ - MDBX_page *xr_pg = (mp); \ - MDBX_node *xr_node = NODEPTR(xr_pg, ki); \ - if ((xr_node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ - (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ - } while (0) - -/* State of FreeDB old pages, stored in the MDBX_env */ -typedef struct MDBX_pgstate { - pgno_t *mf_reclaimed_pglist; /* Reclaimed freeDB pages, or NULL before use */ - txnid_t mf_last_reclaimed; /* ID of last used record, or 0 if - !mf_reclaimed_pglist */ -} MDBX_pgstate; - /* The database environment. 
*/ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) @@ -767,18 +865,17 @@ struct MDBX_env { mdbx_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.dxb #define me_fd me_dxb_mmap.fd -#define me_mapsize me_dxb_mmap.length mdbx_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd #define me_lck me_lck_mmap.lck -/* Failed to update the meta page. Probably an I/O error. */ + /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) -/* Additional flag for mdbx_sync_locked() */ + /* Additional flag for mdbx_sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) -/* Some fields are initialized. */ + /* Some fields are initialized. */ #define MDBX_ENV_ACTIVE UINT32_C(0x20000000) -/* me_txkey is set */ + /* me_txkey is set */ #define MDBX_ENV_TXKEY UINT32_C(0x10000000) uint32_t me_flags; /* see mdbx_env */ unsigned me_psize; /* DB page size, inited from me_os_psize */ @@ -788,63 +885,81 @@ struct MDBX_env { mdbx_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_dbi me_maxdbs; /* size of the DB table */ - mdbx_pid_t me_pid; /* process ID of this env */ + uint32_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ char *me_path; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ -#ifdef MDBX_OSAL_LOCK - MDBX_OSAL_LOCK *me_wmutex; /* write-txn mutex */ -#endif + + /* write-txn lock */ +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + union { + key_t key; + int semid; + } me_sysv_ipc; +#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + +#if MDBX_LOCKING > 0 + mdbx_ipclock_t *me_wlock; +#endif /* MDBX_LOCKING > 0 */ + MDBX_dbx *me_dbxs; /* array of static DB info */ uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence 
numbers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ - MDBX_pgstate me_pgstate; /* state of old pages from freeDB */ -#define me_last_reclaimed me_pgstate.mf_last_reclaimed -#define me_reclaimed_pglist me_pgstate.mf_reclaimed_pglist - MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ - /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_free_pgs; + MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + /* PNL of pages that became unused in a write txn */ + MDBX_PNL me_retired_pages; /* MDBX_DP of pages written during a write txn. Length MDBX_DPL_TXNFULL. */ MDBX_DPL me_dirtylist; /* Number of freelist items that can fit in a single overflow page */ unsigned me_maxgc_ov1page; - /* Max size of a node on a page */ - unsigned me_nodemax; - unsigned me_maxkey_limit; /* max size of a key */ - mdbx_pid_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ - volatile uint64_t *me_unsynced_timeout; + unsigned me_branch_nodemax; /* max size of a branch-node */ + uint16_t me_maxkey_nd, me_maxkey_ds; + unsigned me_maxval_nd, me_maxval_ds; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ + volatile uint64_t *me_sync_timestamp; volatile uint64_t *me_autosync_period; volatile pgno_t *me_unsynced_pages; volatile pgno_t *me_autosync_threshold; + volatile pgno_t *me_discarded_tail; + volatile uint32_t *me_meta_sync_txnid; MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */ struct { -#ifdef MDBX_OSAL_LOCK - MDBX_OSAL_LOCK wmutex; -#endif +#if MDBX_LOCKING > 0 + mdbx_ipclock_t wlock; +#endif /* MDBX_LOCKING > 0 */ txnid_t oldest; - uint64_t unsynced_timeout; + uint64_t sync_timestamp; uint64_t autosync_period; pgno_t autosync_pending; pgno_t autosync_threshold; + pgno_t discarded_tail; + uint32_t meta_sync_txnid; } me_lckless_stub; #if MDBX_DEBUG MDBX_assert_func 
*me_assert_func; /* Callback for assertion failures */ #endif -#ifdef USE_VALGRIND +#ifdef MDBX_USE_VALGRIND int me_valgrind_handle; #endif +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + pgno_t me_poison_edge; +#endif + MDBX_env *me_lcklist_next; + /* struct me_dbgeo used for accepting db-geo params from user for the new + * database creation, i.e. when mdbx_env_set_geometry() was called before + * mdbx_env_open(). */ struct { size_t lower; /* minimal size of datafile */ size_t upper; /* maximal size of datafile */ size_t now; /* current size of datafile */ size_t grow; /* step to grow datafile */ size_t shrink; /* threshold to shrink datafile */ - } me_dbgeo; /* */ + } me_dbgeo; #if defined(_WIN32) || defined(_WIN64) MDBX_srwlock me_remap_guard; @@ -855,25 +970,26 @@ struct MDBX_env { #endif }; -/* Nested transaction */ -typedef struct MDBX_ntxn { - MDBX_txn mnt_txn; /* the transaction */ - MDBX_pgstate mnt_pgstate; /* parent transaction's saved freestate */ -} MDBX_ntxn; - /*----------------------------------------------------------------------------*/ /* Debug and Logging stuff */ -#ifndef mdbx_runtime_flags /* avoid override from tools */ -extern int mdbx_runtime_flags; -#endif -extern MDBX_debug_func *mdbx_debug_logger; -extern txnid_t mdbx_debug_edge; +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +#ifdef MDBX_ALLOY +static uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; +static uint8_t mdbx_loglevel = MDBX_DEBUG; +#else +extern uint8_t mdbx_runtime_flags; +extern uint8_t mdbx_loglevel; +#endif /* MDBX_ALLOY */ +MDBX_INTERNAL_VAR MDBX_debug_func *mdbx_debug_logger; -void mdbx_debug_log(int type, const char *function, int line, const char *fmt, - ...) __printf_args(4, 5); +MDBX_INTERNAL_FUNC void mdbx_debug_log(int type, const char *function, int line, + const char *fmt, ...) + __printf_args(4, 5); -void mdbx_panic(const char *fmt, ...) 
__printf_args(1, 2); +MDBX_INTERNAL_FUNC void mdbx_panic(const char *fmt, ...) __printf_args(1, 2); #if MDBX_DEBUG @@ -881,104 +997,95 @@ void mdbx_panic(const char *fmt, ...) __printf_args(1, 2); #define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT) -#define mdbx_debug_enabled(type) \ - unlikely(mdbx_runtime_flags &(type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) - +#ifdef MDBX_LOGLEVEL_BUILD +#define mdbx_log_enabled(msg) \ + (msg <= MDBX_LOGLEVEL_BUILD && unlikely(msg <= mdbx_loglevel)) #else -#define mdbx_debug_enabled(type) (0) +#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) +#endif /* MDBX_LOGLEVEL_BUILD */ + +#else /* MDBX_DEBUG */ + #define mdbx_audit_enabled() (0) + #if !defined(NDEBUG) || defined(MDBX_FORCE_ASSERT) #define mdbx_assert_enabled() (1) #else #define mdbx_assert_enabled() (0) #endif /* NDEBUG */ -#endif /* MDBX_DEBUG */ -LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, - const char *func, int line); - -#define mdbx_print(fmt, ...) \ - mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) +#ifdef MDBX_LOGLEVEL_BUILD +#define mdbx_log_enabled(msg) (msg <= MDBX_LOGLEVEL_BUILD) +#else +#define mdbx_log_enabled(msg) (0) +#endif /* MDBX_LOGLEVEL_BUILD */ -#define mdbx_trace(fmt, ...) \ - do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ - mdbx_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", \ - ##__VA_ARGS__); \ - } while (0) +#endif /* MDBX_DEBUG */ -#define mdbx_verbose(fmt, ...) \ - do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ - mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ - fmt "\n", ##__VA_ARGS__); \ - } while (0) +MDBX_INTERNAL_FUNC void mdbx_assert_fail(const MDBX_env *env, const char *msg, + const char *func, int line); -#define mdbx_info(fmt, ...) \ +#define mdbx_debug_extra(fmt, ...) 
\ do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ - mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ - fmt "\n", ##__VA_ARGS__); \ + if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ + mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_notice(fmt, ...) \ +#define mdbx_debug_extra_print(fmt, ...) \ do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ - mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ - fmt "\n", ##__VA_ARGS__); \ + if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ + mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ } while (0) -#define mdbx_warning(fmt, ...) \ +#define mdbx_trace(fmt, ...) \ do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ - mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ - fmt "\n", ##__VA_ARGS__); \ + if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ + mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ } while (0) -#define mdbx_error(fmt, ...) \ +#define mdbx_debug(fmt, ...) \ do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ - mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ - fmt "\n", ##__VA_ARGS__); \ + if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ + mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ } while (0) -#define mdbx_fatal(fmt, ...) \ +#define mdbx_verbose(fmt, ...) \ do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ - mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ - fmt "\n", ##__VA_ARGS__); \ + if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ + mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ } while (0) -#define mdbx_debug(fmt, ...) \ +#define mdbx_notice(fmt, ...) 
\ do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ - mdbx_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", \ - ##__VA_ARGS__); \ + if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ + mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ } while (0) -#define mdbx_debug_print(fmt, ...) \ +#define mdbx_warning(fmt, ...) \ do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ - mdbx_debug_log(MDBX_DBG_TRACE, NULL, 0, fmt, ##__VA_ARGS__); \ + if (mdbx_log_enabled(MDBX_LOG_WARN)) \ + mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra(fmt, ...) \ +#define mdbx_error(fmt, ...) \ do { \ - if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ - mdbx_debug_log(MDBX_DBG_EXTRA, __FUNCTION__, __LINE__, fmt, \ - ##__VA_ARGS__); \ + if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ + mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ } while (0) -#define mdbx_debug_extra_print(fmt, ...) \ - do { \ - if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ - mdbx_debug_log(MDBX_DBG_EXTRA, NULL, 0, fmt, ##__VA_ARGS__); \ - } while (0) +#define mdbx_fatal(fmt, ...) 
\ + mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); #define mdbx_ensure_msg(env, expr, msg) \ do { \ if (unlikely(!(expr))) \ - mdbx_assert_fail(env, msg, __FUNCTION__, __LINE__); \ + mdbx_assert_fail(env, msg, __func__, __LINE__); \ } while (0) #define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) @@ -996,20 +1103,61 @@ LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, /* assert(3) variant in transaction context */ #define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) +#ifndef MDBX_TOOLS /* Avoid using internal mdbx_assert() */ #undef assert #define assert(expr) mdbx_assert(NULL, expr) +#endif + +/*----------------------------------------------------------------------------*/ +/* Cache coherence and mmap invalidation */ + +#if MDBX_CPU_WRITEBACK_INCOHERENT +#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#else +#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ + +static __maybe_unused __inline void +mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +#if MDBX_MMAP_INCOHERENT_FILE_WRITE + char *const begin = (char *)(-pagesize & (intptr_t)addr); + char *const end = + (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); + int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0; + mdbx_assert(nullptr, err == 0); + (void)err; +#else + (void)pagesize; +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ + +#if MDBX_MMAP_INCOHERENT_CPU_CACHE +#ifdef DCACHE + /* MIPS has cache coherency issues. + * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ + cacheflush(addr, nbytes, DCACHE); +#else +#error "Oops, cacheflush() not available" +#endif /* DCACHE */ +#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ + +#if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE + (void)addr; + (void)nbytes; +#endif +} /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -int mdbx_reader_check0(MDBX_env *env, int rlocked, int *dead); -int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int mdbx_reader_check0(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, + MDBX_reader *begin, MDBX_reader *end); +MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); -void mdbx_rthc_global_init(void); -void mdbx_rthc_global_dtor(void); -void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); +MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); +MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); #define MDBX_IS_ERROR(rc) \ ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) @@ -1017,7 +1165,7 @@ void mdbx_rthc_thread_dtor(void *ptr); /* Internal error codes, not exposed outside libmdbx */ #define MDBX_NO_ROOT (MDBX_LAST_ERRCODE + 10) -/* Debuging output value of a cursor DBI: Negative in a sub-cursor. */ +/* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ (((mc)->mc_flags & C_SUB) ? 
-(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) @@ -1026,9 +1174,9 @@ void mdbx_rthc_thread_dtor(void *ptr); #if MDBX_DEBUG #define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2] -#define DKEY(x) mdbx_dkey(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1) +#define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1) #define DVAL(x) \ - mdbx_dkey(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1) + mdbx_dump_val(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1) #else #define DKBUF ((void)(0)) #define DKEY(x) ("-") @@ -1048,7 +1196,7 @@ void mdbx_rthc_thread_dtor(void *ptr); /* Default size of memory map. * This is certainly too small for any actual applications. Apps should * always set the size explicitly using mdbx_env_set_mapsize(). */ -#define DEFAULT_MAPSIZE 1048576 +#define DEFAULT_MAPSIZE MEGABYTE /* Number of slots in the reader table. * This value was chosen somewhat arbitrarily. The 61 is a prime number, @@ -1056,23 +1204,6 @@ void mdbx_rthc_thread_dtor(void *ptr); * Applications should set the table size using mdbx_env_set_maxreaders(). */ #define DEFAULT_READERS 61 -/* Address of first usable data byte in a page, after the header */ -#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) - -/* Number of nodes on a page */ -#define NUMKEYS(p) ((unsigned)(p)->mp_lower >> 1) - -/* The amount of space remaining in the page */ -#define SIZELEFT(p) ((indx_t)((p)->mp_upper - (p)->mp_lower)) - -/* The percentage of space used in the page, in tenths of a percent. */ -#define PAGEFILL(env, p) \ - (1024UL * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ - ((env)->me_psize - PAGEHDRSZ)) -/* The minimum page fill factor, in tenths of a percent. - * Pages emptier than this are candidates for merging. 
*/ -#define FILL_THRESHOLD 256 - /* Test if a page is a leaf page */ #define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0) /* Test if a page is a LEAF2 page */ @@ -1088,13 +1219,6 @@ void mdbx_rthc_thread_dtor(void *ptr); #define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) -/* The number of overflow pages needed to store the given size. */ -#define OVPAGES(env, size) (bytes2pgno(env, PAGEHDRSZ - 1 + (size)) + 1) - -/* Link in MDBX_txn.mt_loose_pages list. - * Kept outside the page header, which is needed when reusing the page. */ -#define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDBX_node's. @@ -1109,85 +1233,40 @@ void mdbx_rthc_thread_dtor(void *ptr); * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in * a sub-page/sub-database, and named databases (just F_SUBDATA). */ typedef struct MDBX_node { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ union { struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - union { - struct { - uint16_t mn_lo, mn_hi; /* part of data size or pgno */ - }; - uint32_t mn_dsize; - }; - uint16_t mn_flags; /* see mdbx_node */ - uint16_t mn_ksize; /* key size */ + uint16_t mn_lo, mn_hi; /* part of data size or pgno */ + }; + uint32_t mn_dsize; + uint32_t mn_pgno32; + }; + uint8_t mn_flags; /* see mdbx_node flags */ + uint8_t mn_extra; + uint16_t mn_ksize; /* key size */ #else - uint16_t mn_ksize; /* key size */ - uint16_t mn_flags; /* see mdbx_node */ - union { - struct { - uint16_t mn_hi, mn_lo; /* part of data size or pgno */ - }; - uint32_t mn_dsize; - }; -#endif + uint16_t mn_ksize; /* key size */ + uint8_t mn_extra; + uint8_t mn_flags; /* see mdbx_node flags */ + union { + uint32_t mn_pgno32; + uint32_t mn_dsize; + struct { + uint16_t mn_hi, mn_lo; /* part of data size or pgno */ }; - pgno_t mn_ksize_and_pgno; }; +#endif /* __BYTE_ORDER__ 
*/ -/* mdbx_node Flags */ + /* mdbx_node Flags */ #define F_BIGDATA 0x01 /* data put on overflow page */ #define F_SUBDATA 0x02 /* data is a sub-database */ #define F_DUPDATA 0x04 /* data has duplicates */ -/* valid flags for mdbx_node_add() */ + /* valid flags for mdbx_node_add() */ #define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) - uint8_t mn_data[1]; /* key and data are appended here */ + uint8_t mn_data[/* C99 */]; /* key and data are appended here */ } MDBX_node; -/* Size of the node header, excluding dynamic data at the end */ -#define NODESIZE offsetof(MDBX_node, mn_data) - -/* Bit position of top word in page number, for shifting mn_flags */ -#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) - -/* Size of a node in a branch page with a given key. - * This is just the node header plus the key, there is no data. */ -#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->iov_len)) - -/* Size of a node in a leaf page with a given key and data. - * This is node header plus key plus data size. */ -#define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len) - -/* Address of the key for the node */ -#define NODEKEY(node) (void *)((node)->mn_data) - -/* Address of the data for a node */ -#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) - -/* The size of a key in a node */ -#define NODEKSZ(node) ((node)->mn_ksize) - -/* The address of a key in a LEAF2 page. - * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. - * There are no node headers, keys are stored contiguously. */ -#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) - -/* Set the node's key into keyptr, if requested. */ -#define MDBX_GET_MAYNULL_KEYPTR(node, keyptr) \ - do { \ - if ((keyptr) != NULL) { \ - (keyptr)->iov_len = NODEKSZ(node); \ - (keyptr)->iov_base = NODEKEY(node); \ - } \ - } while (0) - -/* Set the node's key into key. 
*/ -#define MDBX_GET_KEYVALUE(node, key) \ - do { \ - key.iov_len = NODEKSZ(node); \ - key.iov_base = NODEKEY(node); \ - } while (0) - #define MDBX_VALID 0x8000 /* DB handle is valid, for me_dbflags */ #define PERSISTENT_FLAGS (0xffff & ~(MDBX_VALID)) /* mdbx_dbi_open() flags */ @@ -1202,34 +1281,34 @@ typedef struct MDBX_node { #define MDBX_COMMIT_PAGES IOV_MAX #endif -/* LY: fast enough on most systems - * +/* * / * | -1, a < b - * cmp2int(a,b) = < 0, a == b + * CMP2INT(a,b) = < 0, a == b * | 1, a > b * \ */ #if 1 -#define mdbx_cmp2int(a, b) (((b) > (a)) ? -1 : (a) > (b)) +/* LY: fast enough on most systems */ +#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) #else -#define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) +#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) #endif /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDBX_NOSPILL 0x8000 -static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { +static __maybe_unused __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; } -static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { +static __maybe_unused __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { assert(base >= MIN_PAGENO); return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO; } -static __inline void mdbx_jitter4testing(bool tiny) { +static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { #if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); @@ -1237,16 +1316,3 @@ static __inline void mdbx_jitter4testing(bool tiny) { (void)tiny; #endif } - -/* Controls checking PID against reuse DB environment after the fork() */ -#ifndef MDBX_TXN_CHECKPID -#if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64) -/* PID check could be ommited: - * - on Linux when madvise(MADV_DONTFORK) is available. i.e. 
after the fork() - * mapped pages will not be available for child process. - * - in Windows where fork() not available. */ -#define MDBX_TXN_CHECKPID 0 -#else -#define MDBX_TXN_CHECKPID 1 -#endif -#endif /* MDBX_TXN_CHECKPID */ diff --git a/libs/libmdbx/src/src/elements/lck-posix.c b/libs/libmdbx/src/src/elements/lck-posix.c new file mode 100644 index 0000000000..ce2c0c3f23 --- /dev/null +++ b/libs/libmdbx/src/src/elements/lck-posix.c @@ -0,0 +1,727 @@ +/* + * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + */ + +#include "internals.h" +#include <sys/sem.h> + +/*----------------------------------------------------------------------------*/ +/* global constructor/destructor */ + +#if defined(__linux__) || defined(__gnu_linux__) +#include <sys/utsname.h> +#ifndef MDBX_ALLOY +uint32_t mdbx_linux_kernel_version; +#endif /* MDBX_ALLOY */ +#endif /* Linux */ + +static __cold __attribute__((__constructor__)) void +mdbx_global_constructor(void) { +#if defined(__linux__) || defined(__gnu_linux__) + struct utsname buffer; + if (uname(&buffer) == 0) { + int i = 0; + char *p = buffer.release; + while (*p && i < 4) { + if (*p >= '0' && *p <= '9') { + long number = strtol(p, &p, 10); + if (number > 0) { + if (number > 255) + number = 255; + mdbx_linux_kernel_version += number << (24 - i * 8); + } + ++i; + } else { + ++p; + } + } + } +#endif /* Linux */ + + mdbx_rthc_global_init(); +} + +static __cold __attribute__((__destructor__)) void +mdbx_global_destructor(void) { + mdbx_rthc_global_dtor(); +} + 
+/*----------------------------------------------------------------------------*/ +/* lck */ + +/* Описание реализации блокировок для POSIX & Linux: + * + * lck-файл отображается в память, в нём организуется таблица читателей и + * размещаются совместно используемые posix-мьютексы (futex). Посредством + * этих мьютексов (см struct MDBX_lockinfo) реализуются: + * - Блокировка таблицы читателей для регистрации, + * т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock(). + * - Блокировка БД для пишущих транзакций, + * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). + * + * Остальной функционал реализуется отдельно посредством файловых блокировок: + * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод + * в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade(). + * - Проверка присутствие процессов-читателей, + * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). + * + * Для блокировки файлов используется fcntl(F_SETLK), так как: + * - lockf() оперирует только эксклюзивной блокировкой и требует + * открытия файла в RW-режиме. + * - flock() не гарантирует атомарности при смене блокировок + * и оперирует только всем файлом целиком. + * - Для контроля процессов-читателей используются однобайтовые + * range-блокировки lck-файла посредством fcntl(F_SETLK). При этом + * в качестве позиции используется pid процесса-читателя. + * - Для первоначального захвата и shared/exclusive выполняется блокировка + * основного файла БД и при успехе lck-файла. + * + * ---------------------------------------------------------------------------- + * УДЕРЖИВАЕМЫЕ БЛОКИРОВКИ В ЗАВИСИМОСТИ ОТ РЕЖИМА И СОСТОЯНИЯ + * + * Эксклюзивный режим без lck-файла: + * = заблокирован весь dxb-файл посредством F_RDLCK или F_WRLCK, + * в зависимости от MDBX_RDONLY. 
+ * + * Не-операционный режим на время пере-инициализации и разрушении lck-файла: + * = F_WRLCK блокировка первого байта lck-файла, другие процессы ждут её + * снятия при получении F_RDLCK через F_SETLKW. + * - блокировки dxb-файла могут меняться до снятие эксклюзивной блокировки + * lck-файла: + * + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + * + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + * + * ОПЕРАЦИОННЫЙ режим с lck-файлом: + * = F_RDLCK блокировка первого байта lck-файла, другие процессы не могут + * получить F_WRLCK и таким образом видят что БД используется. + * + F_WRLCK блокировка pid-байта в clk-файле после первой транзакции чтения. + * + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + * + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + */ + +#if MDBX_USE_OFDLOCKS +static int op_setlk, op_setlkw, op_getlk; +static void __cold choice_fcntl() { + assert(!op_setlk && !op_setlkw && !op_getlk); + if ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 +#if defined(__linux__) || defined(__gnu_linux__) + && mdbx_linux_kernel_version > + 0x030f0000 /* OFD locks are available since 3.15, but engages here + only for 3.16 and larer kernels (LTS) for reliability reasons */ +#endif /* linux */ + ) { + op_setlk = F_OFD_SETLK; + op_setlkw = F_OFD_SETLKW; + op_getlk = F_OFD_GETLK; + return; + } + op_setlk = F_SETLK; + op_setlkw = F_SETLKW; + op_getlk = F_GETLK; +} +#else +#define op_setlk F_SETLK +#define op_setlkw F_SETLKW +#define op_getlk F_GETLK +#endif /* MDBX_USE_OFDLOCKS */ + +#ifndef OFF_T_MAX +#define OFF_T_MAX \ + ((sizeof(off_t) > 4 ? 
INT64_MAX : INT32_MAX) & ~(size_t)0xffff) +#endif + +static int lck_op(mdbx_filehandle_t fd, int cmd, int lck, off_t offset, + off_t len) { + mdbx_jitter4testing(true); + for (;;) { + struct flock lock_op; + memset(&lock_op, 0, sizeof(lock_op)); + lock_op.l_type = lck; + lock_op.l_whence = SEEK_SET; + lock_op.l_start = offset; + lock_op.l_len = len; + int rc = fcntl(fd, cmd, &lock_op); + mdbx_jitter4testing(true); + if (rc != -1) { + if (cmd == op_getlk) { + /* Checks reader by pid. Returns: + * MDBX_RESULT_TRUE - if pid is live (reader holds a lock). + * MDBX_RESULT_FALSE - if pid is dead (a lock could be placed). */ + return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; + } + return MDBX_SUCCESS; + } + rc = errno; + if (rc != EINTR || cmd == op_setlkw) { + mdbx_assert(nullptr, MDBX_IS_ERROR(rc)); + return rc; + } + } +} + +MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + assert(env->me_pid > 0); + if (unlikely(mdbx_getpid() != env->me_pid)) + return MDBX_PANIC; + return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); +} + +MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + assert(env->me_pid > 0); + return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1); +} + +MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + assert(pid > 0); + return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); +} + +/*---------------------------------------------------------------------------*/ + +#if MDBX_LOCKING > MDBX_LOCKING_SYSV +MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + return sem_init(ipc, false, 1) ? 
errno : 0; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + return pthread_mutex_init(ipc, nullptr); +#else +#error "FIXME" +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + return sem_destroy(ipc) ? errno : 0; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + return pthread_mutex_destroy(ipc); +#else +#error "FIXME" +#endif +} +#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */ + +MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) { + assert(env->me_fd != INVALID_HANDLE_VALUE); + if (unlikely(mdbx_getpid() != env->me_pid)) + return MDBX_PANIC; +#if MDBX_USE_OFDLOCKS + if (unlikely(op_setlk == 0)) + choice_fcntl(); +#endif /* MDBX_USE_OFDLOCKS */ + + int rc; + if (env->me_lfd == INVALID_HANDLE_VALUE) { + /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ + rc = + lck_op(env->me_fd, op_setlk, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); + if (rc != MDBX_SUCCESS) { + mdbx_error("%s(%s) failed: errcode %u", __func__, "without-lck", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; + } + +retry_exclusive: + /* Firstly try to get exclusive locking. */ + rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1); + if (rc == MDBX_SUCCESS) { + continue_dxb_exclusive: + rc = + lck_op(env->me_fd, op_setlk, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); + if (rc == MDBX_SUCCESS) + return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; + + /* the cause may be a collision with POSIX's file-lock recovery. 
*/ + if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || + rc == EDEADLK)) { + mdbx_error("%s(%s) failed: errcode %u", __func__, "dxb-exclusive", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + + /* Fallback to lck-shared */ + } + + /* Here could be one of two:: + * - mdbx_lck_destroy() from the another process was hold the lock + * during a destruction. + * - either mdbx_lck_seize() from the another process was got the exclusive + * lock and doing initialization. + * For distinguish these cases will use size of the lck-file later. */ + + /* Wait for lck-shared now. */ + /* Here may be await during transient processes, for instance until another + * competing process doesn't call lck_downgrade(). */ + rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1); + if (rc != MDBX_SUCCESS) { + mdbx_error("%s(%s) failed: errcode %u", __func__, "try-shared", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + + /* got shared, retry exclusive */ + rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1); + if (rc == MDBX_SUCCESS) + goto continue_dxb_exclusive; + + if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || + rc == EDEADLK)) { + mdbx_error("%s(%s) failed: errcode %u", __func__, "try-exclusive", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + + /* Checking file size for detect the situation when we got the shared lock + * immediately after mdbx_lck_destroy(). 
*/ + struct stat st; + if (fstat(env->me_lfd, &st)) { + rc = errno; + mdbx_error("%s(%s) failed: errcode %u", __func__, "check-filesize", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + if (st.st_size < (unsigned)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) { + mdbx_verbose("lck-file is too short (%u), retry exclusive-lock", + (unsigned)st.st_size); + rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1); + if (rc != MDBX_SUCCESS) { + mdbx_error("%s(%s) failed: errcode %u", __func__, "retry-exclusive", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + goto retry_exclusive; + } + + /* Lock against another process operating in without-lck or exclusive mode. */ + rc = + lck_op(env->me_fd, op_setlk, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1); + if (rc != MDBX_SUCCESS) { + mdbx_error("%s(%s) failed: errcode %u", __func__, + "lock-against-without-lck", rc); + mdbx_assert(env, MDBX_IS_ERROR(rc)); + return rc; + } + + /* Done: return with shared locking. 
*/ + return MDBX_RESULT_FALSE; +} + +MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + if (unlikely(mdbx_getpid() != env->me_pid)) + return MDBX_PANIC; + + int rc = MDBX_SUCCESS; + if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { + rc = lck_op(env->me_fd, op_setlk, F_UNLCK, 0, env->me_pid); + if (rc == MDBX_SUCCESS) + rc = lck_op(env->me_fd, op_setlk, F_UNLCK, env->me_pid + 1, + OFF_T_MAX - env->me_pid - 1); + } + if (rc == MDBX_SUCCESS) + rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1); + if (unlikely(rc != 0)) { + mdbx_error("%s(%s) failed: errcode %u", __func__, "lck", rc); + assert(MDBX_IS_ERROR(rc)); + } + return rc; +} + +MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env, + MDBX_env *inprocess_neighbor) { + if (unlikely(mdbx_getpid() != env->me_pid)) + return MDBX_PANIC; + + int rc = MDBX_SUCCESS; + if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && + env->me_lck && + /* try get exclusive access */ + lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 && + lck_op(env->me_fd, op_setlk, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, + OFF_T_MAX) == 0) { + + mdbx_verbose("%s: got exclusive, drown locks", __func__); +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + if (env->me_sysv_ipc.semid != -1) + rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; +#else + rc = mdbx_ipclock_destroy(&env->me_lck->mti_rlock); + if (rc == 0) + rc = mdbx_ipclock_destroy(&env->me_lck->mti_wlock); +#endif /* MDBX_LOCKING */ + + mdbx_assert(env, rc == 0); + if (rc == 0) { + mdbx_munmap(&env->me_lck_mmap); + rc = ftruncate(env->me_lfd, 0) ? errno : 0; + } + + mdbx_jitter4testing(false); + } + + /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be restored + * after file was closed. + * + * 2) File locks would be released (by kernel) while the file-descriptors will + * be closed. 
But to avoid false-positive EACCESS and EDEADLK from the kernel, + * locks should be released here explicitly with properly order. */ + + /* close dxb and restore lock */ + if (env->me_fd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->me_fd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->me_fd = INVALID_HANDLE_VALUE; + if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { + /* restore file-lock */ + rc = lck_op( + inprocess_neighbor->me_fd, F_SETLKW, + (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) + ? 0 + : inprocess_neighbor->me_pid, + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1); + } + } + + /* close clk and restore locks */ + if (env->me_lfd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->me_lfd = INVALID_HANDLE_VALUE; + if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { + /* restore file-locks */ + rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1); + if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader) + rc = mdbx_rpid_set(inprocess_neighbor); + } + } + + if (inprocess_neighbor && rc != MDBX_SUCCESS) + inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR; + return rc; +} + +/*---------------------------------------------------------------------------*/ + +MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env, + MDBX_env *inprocess_neighbor, + int global_uniqueness_flag) { + if (inprocess_neighbor) + return MDBX_SUCCESS /* currently don't need any initialization + if LCK already opened/used inside current process */ + ; +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + int semid = -1; + if (global_uniqueness_flag) { + struct stat st; + if (fstat(env->me_fd, &st)) + return errno; + sysv_retry_create: + semid = semget(env->me_sysv_ipc.key, 2, + IPC_CREAT | IPC_EXCL | + (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO))); + if (unlikely(semid == -1)) { + int 
err = errno; + if (err != EEXIST) + return err; + + /* remove and re-create semaphore set */ + semid = semget(env->me_sysv_ipc.key, 2, 0); + if (semid == -1) { + err = errno; + if (err != ENOENT) + return err; + goto sysv_retry_create; + } + if (semctl(semid, 2, IPC_RMID)) { + err = errno; + if (err != EIDRM) + return err; + } + goto sysv_retry_create; + } + + unsigned short val_array[2] = {1, 1}; + if (semctl(semid, 2, SETALL, val_array)) + return errno; + } else { + semid = semget(env->me_sysv_ipc.key, 2, 0); + if (semid == -1) + return errno; + + /* check read & write access */ + struct semid_ds data[2]; + if (semctl(semid, 2, IPC_STAT, data) || semctl(semid, 2, IPC_SET, data)) + return errno; + } + + env->me_sysv_ipc.semid = semid; + + return MDBX_SUCCESS; + +#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX +#warning "TODO" +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + + /* don't initialize semaphores twice */ + if (global_uniqueness_flag == MDBX_RESULT_TRUE) { + if (sem_init(&env->me_lck->mti_rlock, true, 1)) + return errno; + if (sem_init(&env->me_lck->mti_wlock, true, 1)) + return errno; + } + return MDBX_SUCCESS; + +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + + /* FIXME: Unfortunately, there is no other reliable way but to long testing + * on each platform. On the other hand, behavior like FreeBSD is incorrect + * and we can expect it to be rare. Moreover, even on FreeBSD without + * additional in-process initialization, the probability of an problem + * occurring is vanishingly small, and the symptom is a return of EINVAL + * while locking a mutex. In other words, in the worst case, the problem + * results in an EINVAL error at the start of the transaction, but NOT data + * loss, nor database corruption, nor other fatal troubles. Thus, the code + * below I am inclined to think the workaround for erroneous platforms (like + * FreeBSD), rather than a defect of libmdbx. 
*/ +#if defined(__FreeBSD__) + /* seems that shared mutexes on FreeBSD required in-process initialization */ + (void)global_uniqueness_flag; +#else + /* shared mutexes on many other platforms (including Darwin and Linux's + * futexes) doesn't need any addition in-process initialization */ + if (global_uniqueness_flag != MDBX_RESULT_TRUE) + return MDBX_SUCCESS; +#endif + + pthread_mutexattr_t ma; + int rc = pthread_mutexattr_init(&ma); + if (rc) + return rc; + + rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (rc) + goto bailout; + +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 +#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutexattr_setrobust) + rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); +#elif defined(PTHREAD_MUTEX_ROBUST_NP) || \ + defined(pthread_mutexattr_setrobust_np) + rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); +#elif _POSIX_THREAD_PROCESS_SHARED < 200809L + rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); +#else + rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); +#endif + if (rc) + goto bailout; +#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */ + +#if defined(_POSIX_THREAD_PRIO_INHERIT) && _POSIX_THREAD_PRIO_INHERIT >= 0 && \ + !defined(MDBX_SAFE4QEMU) + rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT); + if (rc == ENOTSUP) + rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE); + if (rc && rc != ENOTSUP) + goto bailout; +#endif /* PTHREAD_PRIO_INHERIT */ + + rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (rc && rc != ENOTSUP) + goto bailout; + + rc = pthread_mutex_init(&env->me_lck->mti_rlock, &ma); + if (rc) + goto bailout; + rc = pthread_mutex_init(&env->me_lck->mti_wlock, &ma); + +bailout: + pthread_mutexattr_destroy(&ma); + return rc; +#else +#error "FIXME" +#endif /* MDBX_LOCKING > 0 */ +} + +static int __cold mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, + const int err) { + int rc = err; +#if 
MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV + if (err == EOWNERDEAD) { + /* We own the mutex. Clean up after dead previous owner. */ + + const bool rlocked = (env->me_lck && ipc == &env->me_lck->mti_rlock); + rc = MDBX_SUCCESS; + if (!rlocked) { + if (unlikely(env->me_txn)) { + /* env is hosed if the dead thread was ours */ + env->me_flags |= MDBX_FATAL_ERROR; + env->me_txn = NULL; + rc = MDBX_PANIC; + } + } + mdbx_notice("%clock owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); + + int check_rc = mdbx_reader_check0(env, rlocked, NULL); + check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; + +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + rc = (rc == MDBX_SUCCESS) ? check_rc : rc; +#else +#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutex_consistent) + int mreco_rc = pthread_mutex_consistent(ipc); +#elif defined(PTHREAD_MUTEX_ROBUST_NP) || defined(pthread_mutex_consistent_np) + int mreco_rc = pthread_mutex_consistent_np(ipc); +#elif _POSIX_THREAD_PROCESS_SHARED < 200809L + int mreco_rc = pthread_mutex_consistent_np(ipc); +#else + int mreco_rc = pthread_mutex_consistent(ipc); +#endif + check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; + + if (unlikely(mreco_rc)) + mdbx_error("lock recovery failed, %s", mdbx_strerror(mreco_rc)); + + rc = (rc == MDBX_SUCCESS) ? 
check_rc : rc; + if (MDBX_IS_ERROR(rc)) + pthread_mutex_unlock(ipc); +#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */ + return rc; + } +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 + (void)ipc; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + (void)ipc; +#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX +#warning "TODO" + (void)ipc; +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ + + mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); + if (rc != EDEADLK) + env->me_flags |= MDBX_FATAL_ERROR; + return rc; +} + +static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, + const bool dont_wait) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + int rc = dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc); + rc = (rc == EBUSY && dont_wait) ? MDBX_BUSY : rc; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + int rc = MDBX_SUCCESS; + if (dont_wait) { + if (sem_trywait(ipc)) { + rc = errno; + if (rc == EAGAIN) + rc = MDBX_BUSY; + } + } else if (sem_wait(ipc)) + rc = errno; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + struct sembuf op = {.sem_num = (ipc != env->me_wlock), + .sem_op = -1, + .sem_flg = dont_wait ? IPC_NOWAIT | SEM_UNDO : SEM_UNDO}; + int rc; + if (semop(env->me_sysv_ipc.semid, &op, 1)) { + rc = errno; + if (dont_wait && rc == EAGAIN) + rc = MDBX_BUSY; + } else { + rc = *ipc ? EOWNERDEAD : MDBX_SUCCESS; + *ipc = env->me_pid; + } +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ + + if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_BUSY)) + rc = mdbx_ipclock_failed(env, ipc, rc); + return rc; +} + +static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + int rc = pthread_mutex_unlock(ipc); + (void)env; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + int rc = sem_post(ipc) ? 
errno : MDBX_SUCCESS; + (void)env; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + if (unlikely(*ipc != (pid_t)env->me_pid)) + return EPERM; + *ipc = 0; + struct sembuf op = { + .sem_num = (ipc != env->me_wlock), .sem_op = 1, .sem_flg = SEM_UNDO}; + int rc = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS; +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ + return rc; +} + +MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { + mdbx_trace("%s", ">>"); + mdbx_jitter4testing(true); + int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false); + mdbx_trace("<< rc %d", rc); + return rc; +} + +MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { + mdbx_trace("%s", ">>"); + int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); + mdbx_trace("<< rc %d", rc); + if (unlikely(rc != MDBX_SUCCESS)) + mdbx_panic("%s() failed: errcode %d\n", __func__, rc); + mdbx_jitter4testing(true); +} + +int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { + mdbx_trace("%swait %s", dont_wait ? "dont-" : "", ">>"); + mdbx_jitter4testing(true); + int rc = mdbx_ipclock_lock(env, env->me_wlock, dont_wait); + mdbx_trace("<< rc %d", rc); + return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS; +} + +void mdbx_txn_unlock(MDBX_env *env) { + mdbx_trace("%s", ">>"); + int rc = mdbx_ipclock_unlock(env, env->me_wlock); + mdbx_trace("<< rc %d", rc); + if (unlikely(rc != MDBX_SUCCESS)) + mdbx_panic("%s() failed: errcode %d\n", __func__, rc); + mdbx_jitter4testing(true); +} diff --git a/libs/libmdbx/src/src/lck-windows.c b/libs/libmdbx/src/src/elements/lck-windows.c index 34a6e5a5dc..5d74bb8ccd 100644 --- a/libs/libmdbx/src/src/lck-windows.c +++ b/libs/libmdbx/src/src/elements/lck-windows.c @@ -1,4 +1,4 @@ -/* +/* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> * and other libmdbx authors: please see AUTHORS file. * All rights reserved. @@ -12,7 +12,7 @@ * <http://www.OpenLDAP.org/license.html>. 
*/ -#include "./bits.h" +#include "internals.h" /* PREAMBLE FOR WINDOWS: * @@ -26,19 +26,16 @@ static void mdbx_winnt_import(void); -#ifdef MDBX_BUILD_DLL +#if MDBX_BUILD_SHARED_LIBRARY +#if MDBX_AVOID_CRT && defined(NDEBUG) /* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks. * - * Therefore we don't define dll's entry point for debug/checked builds by MSVC. - * In this case MSVC's will automatically use DllMainCRTStartup() from CRT - * library, which also automatically call DllMain() from our mdbx.dll - * - * On the other side, for RELEASE builds - * we explicitly define DllMain() as the entry point and don't linking with - * any CRT libraries (IgnoreAllDefaultLibraries = Yes). */ -#if !defined(_MSC_VER) || defined(NDEBUG) + * Define dll's entry point only for Release build when NDEBUG is defined and + * MDBX_AVOID_CRT=ON. if the entry point isn't defined then MSVC's will + * automatically use DllMainCRTStartup() from CRT library, which also + * automatically call DllMain() from our mdbx.dll */ #pragma comment(linker, "/ENTRY:DllMain") -#endif +#endif /* MDBX_AVOID_CRT */ BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved) #else @@ -46,8 +43,8 @@ BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved) static #endif /* !MDBX_CONFIG_MANUAL_TLS_CALLBACK */ void NTAPI - mdbx_dll_callback(PVOID module, DWORD reason, PVOID reserved) -#endif /* MDBX_BUILD_DLL */ + mdbx_dll_handler(PVOID module, DWORD reason, PVOID reserved) +#endif /* MDBX_BUILD_SHARED_LIBRARY */ { (void)reserved; switch (reason) { @@ -65,12 +62,12 @@ static mdbx_rthc_thread_dtor(module); break; } -#ifdef MDBX_BUILD_DLL +#if MDBX_BUILD_SHARED_LIBRARY return TRUE; #endif } -#if !defined(MDBX_BUILD_DLL) && !MDBX_CONFIG_MANUAL_TLS_CALLBACK +#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_CONFIG_MANUAL_TLS_CALLBACK /* *INDENT-OFF* */ /* clang-format off */ #if defined(_MSC_VER) @@ -94,7 +91,7 @@ static # pragma data_seg(".CRT$XLB") # endif - 
__declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_dll_callback; + __declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_dll_handler; # pragma data_seg(pop) # pragma const_seg(pop) @@ -102,13 +99,13 @@ static # ifdef _WIN64 const # endif - PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_dll_callback; + PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_dll_handler; #else # error FIXME #endif /* *INDENT-ON* */ /* clang-format on */ -#endif /* !defined(MDBX_BUILD_DLL) && !MDBX_CONFIG_MANUAL_TLS_CALLBACK */ +#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_CONFIG_MANUAL_TLS_CALLBACK */ /*----------------------------------------------------------------------------*/ @@ -169,7 +166,7 @@ void mdbx_txn_unlock(MDBX_env *env) { (env->me_flags & MDBX_EXCLUSIVE) ? TRUE : funlock(env->me_fd, LCK_BODY); LeaveCriticalSection(&env->me_windowsbug_lock); if (!rc) - mdbx_panic("%s failed: errcode %u", mdbx_func_, GetLastError()); + mdbx_panic("%s failed: errcode %u", __func__, GetLastError()); } /*----------------------------------------------------------------------------*/ @@ -183,7 +180,7 @@ void mdbx_txn_unlock(MDBX_env *env) { #define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN #define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN -int mdbx_rdt_lock(MDBX_env *env) { +MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { mdbx_srwlock_AcquireShared(&env->me_remap_guard); if (env->me_lfd == INVALID_HANDLE_VALUE) return MDBX_SUCCESS; /* readonly database in readonly filesystem */ @@ -198,12 +195,12 @@ int mdbx_rdt_lock(MDBX_env *env) { return rc; } -void mdbx_rdt_unlock(MDBX_env *env) { +MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* transite from S-E (locked) to S-? (used), e.g. 
unlock upper-part */ if ((env->me_flags & MDBX_EXCLUSIVE) == 0 && !funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s failed: errcode %u", mdbx_func_, GetLastError()); + mdbx_panic("%s failed: errcode %u", __func__, GetLastError()); } mdbx_srwlock_ReleaseShared(&env->me_remap_guard); } @@ -245,15 +242,15 @@ static int suspend_and_append(mdbx_handle_array_t **array, return MDBX_SUCCESS; } -int mdbx_suspend_threads_before_remap(MDBX_env *env, - mdbx_handle_array_t **array) { - const mdbx_pid_t CurrentTid = GetCurrentThreadId(); +MDBX_INTERNAL_FUNC int +mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { + const size_t CurrentTid = GetCurrentThreadId(); int rc; if (env->me_lck) { /* Scan LCK for threads of the current process */ const MDBX_reader *const begin = env->me_lck->mti_readers; const MDBX_reader *const end = begin + env->me_lck->mti_numreaders; - const mdbx_tid_t WriteTxnOwner = env->me_txn0 ? env->me_txn0->mt_owner : 0; + const size_t WriteTxnOwner = env->me_txn0 ? 
env->me_txn0->mt_owner : 0; for (const MDBX_reader *reader = begin; reader < end; ++reader) { if (reader->mr_pid != env->me_pid || !reader->mr_tid) { skip_lck: @@ -268,7 +265,7 @@ int mdbx_suspend_threads_before_remap(MDBX_env *env, goto skip_lck; } - rc = suspend_and_append(array, reader->mr_tid); + rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid); if (rc != MDBX_SUCCESS) { bailout_lck: (void)mdbx_resume_threads_after_remap(*array); @@ -276,7 +273,7 @@ int mdbx_suspend_threads_before_remap(MDBX_env *env, } } if (WriteTxnOwner && WriteTxnOwner != CurrentTid) { - rc = suspend_and_append(array, WriteTxnOwner); + rc = suspend_and_append(array, (mdbx_tid_t)WriteTxnOwner); if (rc != MDBX_SUCCESS) goto bailout_lck; } @@ -320,7 +317,8 @@ int mdbx_suspend_threads_before_remap(MDBX_env *env, return MDBX_SUCCESS; } -int mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { +MDBX_INTERNAL_FUNC int +mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { int rc = MDBX_SUCCESS; for (unsigned i = 0; i < array->count; ++i) { const HANDLE hThread = array->handles[i]; @@ -340,26 +338,106 @@ int mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { /* global `initial` lock for lockfile initialization, * exclusive/shared locking first cacheline */ -/* FIXME: locking schema/algo descritpion. - ?-? = free - S-? = used - E-? = exclusive-read - ?-S - ?-E = middle - S-S - S-E = locked - E-S - E-E = exclusive-write -*/ - -int mdbx_lck_init(MDBX_env *env) { +/* Briefly descritpion of locking schema/algorithm: + * - Windows does not support upgrading or downgrading for file locking. + * - Therefore upgrading/downgrading is emulated by shared and exclusive + * locking of upper and lower halves. + * - In other words, we have FSM with possible 9 states, + * i.e. free/shared/exclusive x free/shared/exclusive == 9. + * Only 6 states of FSM are used, which 2 of ones are transitive. 
+ * + * The mdbx_lck_seize() moves the locking-FSM from the initial free/unlocked + * state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible, + * or to the "used" (and returns MDBX_RESULT_FALSE). + * + * The mdbx_lck_downgrade() moves the locking-FSM from "exclusive write" + * state to the "used" (i.e. shared) state. + * + * States: + * ?-? = free, i.e. unlocked + * S-? = used, i.e. shared lock + * E-? = exclusive-read, i.e. operational exclusive + * ?-S + * ?-E = middle (transitive state) + * S-S + * S-E = locked (transitive state) + * E-S + * E-E = exclusive-write, i.e. exclusive due (re)initialization + */ + +static void lck_unlock(MDBX_env *env) { + int rc; + + if (env->me_lfd != INVALID_HANDLE_VALUE) { + /* double `unlock` for robustly remove overlapped shared/exclusive locks */ + while (funlock(env->me_lfd, LCK_LOWER)) + ; + rc = GetLastError(); + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + + while (funlock(env->me_lfd, LCK_UPPER)) + ; + rc = GetLastError(); + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + } + + if (env->me_fd != INVALID_HANDLE_VALUE) { + /* explicitly unlock to avoid latency for other processes (windows kernel + * releases such locks via deferred queues) */ + while (funlock(env->me_fd, LCK_BODY)) + ; + rc = GetLastError(); + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + + while (funlock(env->me_fd, LCK_META)) + ; + rc = GetLastError(); + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + + while (funlock(env->me_fd, LCK_WHOLE)) + ; + rc = GetLastError(); + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + } +} + +MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, + MDBX_env *inprocess_neighbor, + int global_uniqueness_flag) { (void)env; + (void)inprocess_neighbor; + (void)global_uniqueness_flag; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, 
+ MDBX_env *inprocess_neighbor) { + (void)inprocess_neighbor; + + /* LY: should unmap before releasing the locks to avoid race condition and + * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ + if (env->me_map) + mdbx_munmap(&env->me_dxb_mmap); + if (env->me_lck) + mdbx_munmap(&env->me_lck_mmap); + + lck_unlock(env); return MDBX_SUCCESS; } /* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE) - * or as 'used' (S-? and returns MDBX_RESULT_FALSE), otherwise returns an error - */ + * or as 'used' (S-? and returns MDBX_RESULT_FALSE). + * Oherwise returns an error. */ static int internal_seize_lck(HANDLE lfd) { int rc; assert(lfd != INVALID_HANDLE_VALUE); @@ -368,7 +446,7 @@ static int internal_seize_lck(HANDLE lfd) { mdbx_jitter4testing(false); if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { rc = GetLastError() /* 2) something went wrong, give up */; - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_error("%s(%s) failed: errcode %u", __func__, "?-?(free) >> ?-E(middle)", rc); return rc; } @@ -384,7 +462,7 @@ static int internal_seize_lck(HANDLE lfd) { if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ if (!funlock(lfd, LCK_UPPER)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_panic("%s(%s) failed: errcode %u", __func__, "?-E(middle) >> ?-?(free)", GetLastError()); return rc; } @@ -396,20 +474,20 @@ static int internal_seize_lck(HANDLE lfd) { mdbx_jitter4testing(false); if (rc != MDBX_RESULT_FALSE) - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_error("%s(%s) failed: errcode %u", __func__, "?-E(middle) >> S-E(locked)", rc); /* 8) now on S-E (locked) or still on ?-E (middle), * transite to S-? (used) or ?-? (free) */ if (!funlock(lfd, LCK_UPPER)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_panic("%s(%s) failed: errcode %u", __func__, "X-E(locked/middle) >> X-?(used/free)", GetLastError()); /* 9) now on S-? (used, DONE) or ?-? 
(free, FAILURE) */ return rc; } -int mdbx_lck_seize(MDBX_env *env) { +MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { int rc; assert(env->me_fd != INVALID_HANDLE_VALUE); @@ -423,7 +501,7 @@ int mdbx_lck_seize(MDBX_env *env) { mdbx_jitter4testing(false); if (!flock(env->me_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { rc = GetLastError(); - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); + mdbx_error("%s(%s) failed: errcode %u", __func__, "without-lck", rc); return rc; } return MDBX_RESULT_FALSE; @@ -440,14 +518,14 @@ int mdbx_lck_seize(MDBX_env *env) { * while opening db in valid (non-conflict) mode. */ if (!flock(env->me_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { rc = GetLastError(); - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_error("%s(%s) failed: errcode %u", __func__, "lock-against-without-lck", rc); mdbx_jitter4testing(false); - mdbx_lck_destroy(env); + lck_unlock(env); } else { mdbx_jitter4testing(false); if (!funlock(env->me_fd, LCK_BODY)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_panic("%s(%s) failed: errcode %u", __func__, "unlock-against-without-lck", GetLastError()); } } @@ -455,100 +533,56 @@ int mdbx_lck_seize(MDBX_env *env) { return rc; } -int mdbx_lck_downgrade(MDBX_env *env, bool complete) { +MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { /* Transite from exclusive state (E-?) to used (S-?) */ assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); +#if 1 if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ ; - +#else /* 1) must be at E-E (exclusive-write) */ - if (!complete) { + if (env->me_flags & MDBX_EXCLUSIVE) { /* transite from E-E to E_? 
(exclusive-read) */ if (!funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_panic("%s(%s) failed: errcode %u", __func__, "E-E(exclusive-write) >> E-?(exclusive-read)", GetLastError()); return MDBX_SUCCESS /* 2) now at E-? (exclusive-read), done */; } +#endif /* 3) now at E-E (exclusive-write), transite to ?_E (middle) */ if (!funlock(env->me_lfd, LCK_LOWER)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_panic("%s(%s) failed: errcode %u", __func__, "E-E(exclusive-write) >> ?-E(middle)", GetLastError()); /* 4) now at ?-E (middle), transite to S-E (locked) */ if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { int rc = GetLastError() /* 5) something went wrong, give up */; - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_error("%s(%s) failed: errcode %u", __func__, "?-E(middle) >> S-E(locked)", rc); return rc; } /* 6) got S-E (locked), continue transition to S-? (used) */ if (!funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + mdbx_panic("%s(%s) failed: errcode %u", __func__, "S-E(locked) >> S-?(used)", GetLastError()); return MDBX_SUCCESS /* 7) now at S-? 
(used), done */; } -void mdbx_lck_destroy(MDBX_env *env) { - int rc; - - if (env->me_lfd != INVALID_HANDLE_VALUE) { - /* double `unlock` for robustly remove overlapped shared/exclusive locks */ - while (funlock(env->me_lfd, LCK_LOWER)) - ; - rc = GetLastError(); - assert(rc == ERROR_NOT_LOCKED); - (void)rc; - SetLastError(ERROR_SUCCESS); - - while (funlock(env->me_lfd, LCK_UPPER)) - ; - rc = GetLastError(); - assert(rc == ERROR_NOT_LOCKED); - (void)rc; - SetLastError(ERROR_SUCCESS); - } - - if (env->me_fd != INVALID_HANDLE_VALUE) { - /* explicitly unlock to avoid latency for other processes (windows kernel - * releases such locks via deferred queues) */ - while (funlock(env->me_fd, LCK_BODY)) - ; - rc = GetLastError(); - assert(rc == ERROR_NOT_LOCKED); - (void)rc; - SetLastError(ERROR_SUCCESS); - - while (funlock(env->me_fd, LCK_META)) - ; - rc = GetLastError(); - assert(rc == ERROR_NOT_LOCKED); - (void)rc; - SetLastError(ERROR_SUCCESS); - - while (funlock(env->me_fd, LCK_WHOLE)) - ; - rc = GetLastError(); - assert(rc == ERROR_NOT_LOCKED); - (void)rc; - SetLastError(ERROR_SUCCESS); - } -} - /*----------------------------------------------------------------------------*/ /* reader checking (by pid) */ -int mdbx_rpid_set(MDBX_env *env) { +MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { (void)env; return MDBX_SUCCESS; } -int mdbx_rpid_clear(MDBX_env *env) { +MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { (void)env; return MDBX_SUCCESS; } @@ -559,7 +593,7 @@ int mdbx_rpid_clear(MDBX_env *env) { * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) * MDBX_RESULT_FALSE, if pid is dead (lock acquired) * or otherwise the errcode. 
*/ -int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { +MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { (void)env; HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid); int rc; @@ -574,7 +608,7 @@ int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { switch (rc) { case ERROR_INVALID_PARAMETER: - /* pid seem invalid */ + /* pid seems invalid */ return MDBX_RESULT_FALSE; case WAIT_OBJECT_0: /* process just exited */ @@ -663,12 +697,38 @@ MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared, /*----------------------------------------------------------------------------*/ +#if 0 /* LY: unused for now */ +static DWORD WINAPI stub_DiscardVirtualMemory(PVOID VirtualAddress, + SIZE_T Size) { + return VirtualAlloc(VirtualAddress, Size, MEM_RESET, PAGE_NOACCESS) + ? ERROR_SUCCESS + : GetLastError(); +} +#endif /* unused for now */ + +static uint64_t WINAPI stub_GetTickCount64(void) { + LARGE_INTEGER Counter, Frequency; + return (QueryPerformanceFrequency(&Frequency) && + QueryPerformanceCounter(&Counter)) + ? 
Counter.QuadPart * 1000ul / Frequency.QuadPart + : 0; +} + +/*----------------------------------------------------------------------------*/ +#ifndef MDBX_ALLOY MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; -MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_GetTickCount64 mdbx_GetTickCount64; +#if 0 /* LY: unused for now */ +MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory; +MDBX_OfferVirtualMemory mdbx_OfferVirtualMemory; +MDBX_ReclaimVirtualMemory mdbx_ReclaimVirtualMemory; +#endif /* unused for now */ +#endif /* MDBX_ALLOY */ static void mdbx_winnt_import(void) { const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll"); @@ -694,12 +754,22 @@ static void mdbx_winnt_import(void) { #define GET_KERNEL32_PROC(ENTRY) \ mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(hKernel32dll, #ENTRY) - GET_KERNEL32_PROC(GetFileInformationByHandleEx); GET_KERNEL32_PROC(GetVolumeInformationByHandleW); GET_KERNEL32_PROC(GetFinalPathNameByHandleW); GET_KERNEL32_PROC(SetFileInformationByHandle); GET_KERNEL32_PROC(PrefetchVirtualMemory); + GET_KERNEL32_PROC(GetTickCount64); + if (!mdbx_GetTickCount64) + mdbx_GetTickCount64 = stub_GetTickCount64; +#if 0 /* LY: unused for now */ + GET_KERNEL32_PROC(DiscardVirtualMemory); + if (!mdbx_DiscardVirtualMemory) + mdbx_DiscardVirtualMemory = stub_DiscardVirtualMemory; + GET_KERNEL32_PROC(OfferVirtualMemory); + GET_KERNEL32_PROC(ReclaimVirtualMemory); +#endif /* unused for now */ +#undef GET_KERNEL32_PROC const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); mdbx_NtFsControlFile = diff --git a/libs/libmdbx/src/src/ntdll.def b/libs/libmdbx/src/src/elements/ntdll.def index e3a6e33c94..e3a6e33c94 100644 --- 
a/libs/libmdbx/src/src/ntdll.def +++ b/libs/libmdbx/src/src/elements/ntdll.def diff --git a/libs/libmdbx/src/src/elements/options.h b/libs/libmdbx/src/src/elements/options.h new file mode 100644 index 0000000000..2694e92011 --- /dev/null +++ b/libs/libmdbx/src/src/elements/options.h @@ -0,0 +1,211 @@ +/******************************************************************************* + ******************************************************************************* + ******************************************************************************* + * + * + * #### ##### ##### # #### # # #### + * # # # # # # # # ## # # + * # # # # # # # # # # # #### + * # # ##### # # # # # # # # + * # # # # # # # # ## # # + * #### # # # #### # # #### + * + * + */ + +/* using fcntl(F_FULLFSYNC) with 5-10 times slowdown */ +#define MDBX_OSX_WANNA_DURABILITY 0 +/* using fsync() with chance of data lost on power failure */ +#define MDBX_OSX_WANNA_SPEED 1 + +#ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY +#define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY +#endif /* MDBX_OSX_SPEED_INSTEADOF_DURABILITY */ + +/* Controls checking PID against reuse DB environment after the fork() */ +#ifndef MDBX_TXN_CHECKPID +#if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64) +/* PID check could be ommited: + * - on Linux when madvise(MADV_DONTFORK) is available. i.e. after the fork() + * mapped pages will not be available for child process. + * - in Windows where fork() not available. */ +#define MDBX_TXN_CHECKPID 0 +#else +#define MDBX_TXN_CHECKPID 1 +#endif +#define MDBX_TXN_CHECKPID_CONFIG "AUTO=" STRINGIFY(MDBX_TXN_CHECKPID) +#else +#define MDBX_TXN_CHECKPID_CONFIG STRINGIFY(MDBX_TXN_CHECKPID) +#endif /* MDBX_TXN_CHECKPID */ + +/* Controls checking transaction owner thread against misuse transactions from + * other threads. 
*/ +#ifndef MDBX_TXN_CHECKOWNER +#define MDBX_TXN_CHECKOWNER 1 +#define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" STRINGIFY(MDBX_TXN_CHECKOWNER) +#else +#define MDBX_TXN_CHECKOWNER_CONFIG STRINGIFY(MDBX_TXN_CHECKOWNER) +#endif /* MDBX_TXN_CHECKOWNER */ + +/* Does a system have battery-backed Real-Time Clock or just a fake. */ +#ifndef MDBX_TRUST_RTC +#if defined(__linux__) || defined(__gnu_linux__) || defined(__NetBSD__) || \ + defined(__OpenBSD__) +#define MDBX_TRUST_RTC 0 /* a lot of embedded systems have a fake RTC */ +#else +#define MDBX_TRUST_RTC 1 +#endif +#define MDBX_TRUST_RTC_CONFIG "AUTO=" STRINGIFY(MDBX_TRUST_RTC) +#else +#define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC) +#endif /* MDBX_TRUST_RTC */ + +//------------------------------------------------------------------------------ + +#define MDBX_LOCKING_WIN32FILES -1 /* Win32 File Locking API */ +#define MDBX_LOCKING_SYSV 5 /* SystemV IPC semaphores */ +#define MDBX_LOCKING_POSIX1988 1988 /* POSIX-1 Shared anonymous semaphores */ +#define MDBX_LOCKING_POSIX2001 2001 /* POSIX-2001 Shared Mutexes */ +#define MDBX_LOCKING_POSIX2008 2008 /* POSIX-2008 Robust Mutexes */ +#define MDBX_LOCKING_BENAPHORE 1995 /* BeOS Benaphores, aka Futexes */ + +#if defined(_WIN32) || defined(_WIN64) +#define MDBX_LOCKING MDBX_LOCKING_WIN32FILES +#else +#ifndef MDBX_LOCKING +#if defined(_POSIX_THREAD_PROCESS_SHARED) && \ + _POSIX_THREAD_PROCESS_SHARED >= 200112L && !defined(__FreeBSD__) + +/* Some platforms define the EOWNERDEAD error code even though they + * don't support Robust Mutexes. If doubt compile with -MDBX_LOCKING=2001. 
*/ +#if defined(EOWNERDEAD) && _POSIX_THREAD_PROCESS_SHARED >= 200809L && \ + (defined(_POSIX_THREAD_ROBUST_PRIO_INHERIT) || \ + defined(_POSIX_THREAD_ROBUST_PRIO_PROTECT) || \ + defined(PTHREAD_MUTEX_ROBUST) || defined(PTHREAD_MUTEX_ROBUST_NP)) && \ + (!defined(__GLIBC__) || \ + __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */) +#define MDBX_LOCKING MDBX_LOCKING_POSIX2008 +#else +#define MDBX_LOCKING MDBX_LOCKING_POSIX2001 +#endif +#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) +#define MDBX_LOCKING MDBX_LOCKING_POSIX1988 +#else +#define MDBX_LOCKING MDBX_LOCKING_SYSV +#endif +#define MDBX_LOCKING_CONFIG "AUTO=" STRINGIFY(MDBX_LOCKING) +#else +#define MDBX_LOCKING_CONFIG STRINGIFY(MDBX_LOCKING) +#endif /* MDBX_LOCKING */ +#endif /* !Windows */ + +#ifndef MDBX_USE_OFDLOCKS +#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \ + !defined(MDBX_SAFE4QEMU) && \ + !defined(__sun) /* OFD-lock are broken on Solaris */ +#define MDBX_USE_OFDLOCKS 1 +#else +#define MDBX_USE_OFDLOCKS 0 +#endif +#define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" STRINGIFY(MDBX_USE_OFDLOCKS) +#else +#define MDBX_USE_OFDLOCKS_CONFIG STRINGIFY(MDBX_USE_OFDLOCKS) +#endif /* MDBX_USE_OFDLOCKS */ + +//------------------------------------------------------------------------------ + +#ifndef MDBX_CPU_WRITEBACK_INCOHERENT +#if defined(__ia32__) || defined(__e2k__) || defined(__hppa) || \ + defined(__hppa__) +#define MDBX_CPU_WRITEBACK_INCOHERENT 0 +#else +#define MDBX_CPU_WRITEBACK_INCOHERENT 1 +#endif +#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ + +#ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE +#ifdef __OpenBSD__ +#define MDBX_MMAP_INCOHERENT_FILE_WRITE 1 +#else +#define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 +#endif +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ + +#ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE +#if defined(__mips) || defined(__mips__) || defined(__mips64) || \ + defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ + defined(__MWERKS__) || 
defined(__sgi) +/* MIPS has cache coherency issues. */ +#define MDBX_MMAP_INCOHERENT_CPU_CACHE 1 +#else +/* LY: assume no relevant mmap/dcache issues. */ +#define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 +#endif +#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ + +#ifndef MDBX_64BIT_ATOMIC +#if MDBX_WORDBITS >= 64 +#define MDBX_64BIT_ATOMIC 1 +#else +#define MDBX_64BIT_ATOMIC 0 +#endif +#define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" STRINGIFY(MDBX_64BIT_ATOMIC) +#else +#define MDBX_64BIT_ATOMIC_CONFIG STRINGIFY(MDBX_64BIT_ATOMIC) +#endif /* MDBX_64BIT_ATOMIC */ + +#ifndef MDBX_64BIT_CAS +#if defined(ATOMIC_LLONG_LOCK_FREE) +#if ATOMIC_LLONG_LOCK_FREE > 1 +#define MDBX_64BIT_CAS 1 +#else +#define MDBX_64BIT_CAS 0 +#endif +#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE) +#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1 +#define MDBX_64BIT_CAS 1 +#else +#define MDBX_64BIT_CAS 0 +#endif +#elif defined(__CLANG_ATOMIC_LLONG_LOCK_FREE) +#if __CLANG_ATOMIC_LLONG_LOCK_FREE > 1 +#define MDBX_64BIT_CAS 1 +#else +#define MDBX_64BIT_CAS 0 +#endif +#elif defined(_MSC_VER) || defined(__APPLE__) +#define MDBX_64BIT_CAS 1 +#else +#define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC +#endif +#define MDBX_64BIT_CAS_CONFIG "AUTO=" STRINGIFY(MDBX_64BIT_CAS) +#else +#define MDBX_64BIT_CAS_CONFIG STRINGIFY(MDBX_64BIT_CAS) +#endif /* MDBX_64BIT_CAS */ + +#if !defined(MDBX_UNALIGNED_OK) +#if defined(_MSC_VER) +#define MDBX_UNALIGNED_OK 1 /* avoid MSVC misoptimization */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +#define MDBX_UNALIGNED_OK 0 /* expecting optimization is well done */ +#elif (defined(__ia32__) || defined(__ARM_FEATURE_UNALIGNED)) && \ + !defined(__ALIGNED__) +#define MDBX_UNALIGNED_OK 1 +#else +#define MDBX_UNALIGNED_OK 0 +#endif +#endif /* MDBX_UNALIGNED_OK */ + +#ifndef MDBX_CACHELINE_SIZE +#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) +#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE +#elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64) +#define MDBX_CACHELINE_SIZE 128 +#else +#define 
MDBX_CACHELINE_SIZE 64 +#endif +#endif /* MDBX_CACHELINE_SIZE */ + +/******************************************************************************* + ******************************************************************************* + ******************************************************************************/ diff --git a/libs/libmdbx/src/src/osal.c b/libs/libmdbx/src/src/elements/osal.c index 7d0ff083ed..a4fec8e4df 100644 --- a/libs/libmdbx/src/src/osal.c +++ b/libs/libmdbx/src/src/elements/osal.c @@ -1,4 +1,4 @@ -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> @@ -14,10 +14,12 @@ * <http://www.OpenLDAP.org/license.html>. */ -#include "./bits.h" +#include "internals.h" #if defined(_WIN32) || defined(_WIN64) +#include <winioctl.h> + static int waitstatus2errcode(DWORD result) { switch (result) { case WAIT_OBJECT_0: @@ -52,10 +54,6 @@ static int ntstatus2errcode(NTSTATUS status) { * conflict with the regular user-level headers, so we explicitly * declare them here. Using these APIs also means we must link to * ntdll.dll, which is not linked by default in user code. 
*/ -#pragma comment(lib, "ntdll.lib") -#ifdef MDBX_AVOID_CRT -#pragma comment(lib, "mdbx_ntdll_extra.lib") -#endif extern NTSTATUS NTAPI NtCreateSection( OUT PHANDLE SectionHandle, IN ACCESS_MASK DesiredAccess, @@ -181,8 +179,24 @@ __extern_C void __assert_rtn(const char *function, const char *file, int line, #define __assert_fail(assertion, file, line, function) \ __assert_rtn(function, file, line, assertion) -#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__BSD__) || defined(__NETBSD__) || defined(__bsdi__) || \ +#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) +__extern_C void __assert_c99(const char *assection, const char *file, int line, + const char *function) __noreturn; +#define __assert_fail(assertion, file, line, function) \ + __assert_c99(assertion, file, line, function) +#elif defined(__OpenBSD__) +__extern_C __dead void __assert2(const char *file, int line, + const char *function, + const char *assertion) /* __nothrow */; +#define __assert_fail(assertion, file, line, function) \ + __assert2(file, line, function, assertion) +#elif defined(__NetBSD__) +__extern_C __dead void __assert13(const char *file, int line, + const char *function, + const char *assertion) /* __nothrow */; +#define __assert_fail(assertion, file, line, function) \ + __assert13(file, line, function, assertion) +#elif defined(__FreeBSD__) || defined(__BSD__) || defined(__bsdi__) || \ defined(__DragonFly__) __extern_C void __assert(const char *function, const char *file, int line, const char *assertion) /* __nothrow */ @@ -200,8 +214,9 @@ __extern_C void __assert(const char *function, const char *file, int line, #endif /* __assert_fail */ -void __cold mdbx_assert_fail(const MDBX_env *env, const char *msg, - const char *func, int line) { +MDBX_INTERNAL_FUNC void __cold mdbx_assert_fail(const MDBX_env *env, + const char *msg, + const char *func, int line) { #if MDBX_DEBUG if (env && env->me_assert_func) { env->me_assert_func(env, msg, 
func, line); @@ -212,7 +227,7 @@ void __cold mdbx_assert_fail(const MDBX_env *env, const char *msg, #endif /* MDBX_DEBUG */ if (mdbx_debug_logger) - mdbx_debug_log(MDBX_DBG_ASSERT, func, line, "assert: %s\n", msg); + mdbx_debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); else { #if defined(_WIN32) || defined(_WIN64) char *message = nullptr; @@ -235,24 +250,25 @@ void __cold mdbx_assert_fail(const MDBX_env *env, const char *msg, #endif } -__cold void mdbx_panic(const char *fmt, ...) { +MDBX_INTERNAL_FUNC __cold void mdbx_panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); char *message = nullptr; const int num = mdbx_vasprintf(&message, fmt, ap); va_end(ap); - if (num < 1 || !message) - message = "<troubles with panic-message preparation>"; + const char *const const_message = + (num < 1 || !message) ? "<troubles with panic-message preparation>" + : message; #if defined(_WIN32) || defined(_WIN64) OutputDebugStringA("\r\nMDBX-PANIC: "); - OutputDebugStringA(message); + OutputDebugStringA(const_message); if (IsDebuggerPresent()) DebugBreak(); FatalExit(ERROR_UNHANDLED_ERROR); #else - __assert_fail(message, "mdbx", 0, "panic"); + __assert_fail(const_message, "mdbx", 0, "panic"); abort(); #endif } @@ -260,7 +276,8 @@ __cold void mdbx_panic(const char *fmt, ...) { /*----------------------------------------------------------------------------*/ #ifndef mdbx_vasprintf -int mdbx_vasprintf(char **strp, const char *fmt, va_list ap) { +MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, + va_list ap) { va_list ones; va_copy(ones, ap); int needed = vsnprintf(nullptr, 0, fmt, ap); @@ -295,7 +312,7 @@ int mdbx_vasprintf(char **strp, const char *fmt, va_list ap) { #endif /* mdbx_vasprintf */ #ifndef mdbx_asprintf -int mdbx_asprintf(char **strp, const char *fmt, ...) { +MDBX_INTERNAL_FUNC int mdbx_asprintf(char **strp, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); int rc = mdbx_vasprintf(strp, fmt, ap); @@ -305,7 +322,8 @@ int mdbx_asprintf(char **strp, const char *fmt, ...) { #endif /* mdbx_asprintf */ #ifndef mdbx_memalign_alloc -int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result) { +MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, + void **result) { #if defined(_WIN32) || defined(_WIN64) (void)alignment; *result = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); @@ -326,7 +344,7 @@ int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result) { #endif /* mdbx_memalign_alloc */ #ifndef mdbx_memalign_free -void mdbx_memalign_free(void *ptr) { +MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr) { #if defined(_WIN32) || defined(_WIN64) VirtualFree(ptr, 0, MEM_RELEASE); #else @@ -349,7 +367,7 @@ char *mdbx_strdup(const char *str) { /*----------------------------------------------------------------------------*/ -int mdbx_condmutex_init(mdbx_condmutex_t *condmutex) { +MDBX_INTERNAL_FUNC int mdbx_condmutex_init(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) int rc = MDBX_SUCCESS; condmutex->event = NULL; @@ -384,7 +402,7 @@ static bool is_allzeros(const void *ptr, size_t bytes) { return true; } -int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex) { +MDBX_INTERNAL_FUNC int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex) { int rc = MDBX_EINVAL; #if defined(_WIN32) || defined(_WIN64) if (condmutex->event) { @@ -412,7 +430,7 @@ int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex) { return rc; } -int mdbx_condmutex_lock(mdbx_condmutex_t *condmutex) { +MDBX_INTERNAL_FUNC int mdbx_condmutex_lock(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) DWORD code = WaitForSingleObject(condmutex->mutex, INFINITE); return waitstatus2errcode(code); @@ -421,7 +439,7 @@ int mdbx_condmutex_lock(mdbx_condmutex_t *condmutex) { #endif } -int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex) 
{ +MDBX_INTERNAL_FUNC int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) return ReleaseMutex(condmutex->mutex) ? MDBX_SUCCESS : GetLastError(); #else @@ -429,7 +447,7 @@ int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex) { #endif } -int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex) { +MDBX_INTERNAL_FUNC int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) return SetEvent(condmutex->event) ? MDBX_SUCCESS : GetLastError(); #else @@ -437,7 +455,7 @@ int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex) { #endif } -int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex) { +MDBX_INTERNAL_FUNC int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) DWORD code = SignalObjectAndWait(condmutex->mutex, condmutex->event, INFINITE, FALSE); @@ -451,7 +469,7 @@ int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex) { /*----------------------------------------------------------------------------*/ -int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -460,7 +478,7 @@ int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { #endif } -int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) DeleteCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -469,7 +487,7 @@ int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { #endif } -int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) EnterCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -478,7 +496,7 @@ int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { #endif } -int 
mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -489,18 +507,26 @@ int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { /*----------------------------------------------------------------------------*/ -int mdbx_removefile(const char *pathname) { +MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { #if defined(_WIN32) || defined(_WIN64) return DeleteFileA(pathname) ? MDBX_SUCCESS : GetLastError(); #else return unlink(pathname) ? errno : MDBX_SUCCESS; #endif } -int mdbx_openfile(const char *pathname, int flags, mode_t mode, - mdbx_filehandle_t *fd, bool exclusive) { + +MDBX_INTERNAL_FUNC int mdbx_openfile(const char *pathname, int flags, + mode_t mode, mdbx_filehandle_t *fd, + bool exclusive) { *fd = INVALID_HANDLE_VALUE; #if defined(_WIN32) || defined(_WIN64) (void)mode; + size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; DWORD DesiredAccess, ShareMode; DWORD FlagsAndAttributes = FILE_ATTRIBUTE_NORMAL; @@ -540,7 +566,7 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, break; } - *fd = CreateFileA(pathname, DesiredAccess, ShareMode, NULL, + *fd = CreateFileW(pathnameW, DesiredAccess, ShareMode, NULL, CreationDisposition, FlagsAndAttributes, NULL); if (*fd == INVALID_HANDLE_VALUE) @@ -588,7 +614,7 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, return MDBX_SUCCESS; } -int mdbx_closefile(mdbx_filehandle_t fd) { +MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) return CloseHandle(fd) ? 
MDBX_SUCCESS : GetLastError(); #else @@ -596,7 +622,8 @@ int mdbx_closefile(mdbx_filehandle_t fd) { #endif } -int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { +MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, + uint64_t offset) { if (bytes > MAX_WRITE) return MDBX_EINVAL; #if defined(_WIN32) || defined(_WIN64) @@ -622,8 +649,8 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { return (bytes == (size_t)read) ? MDBX_SUCCESS : MDBX_ENODATA; } -int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, - uint64_t offset) { +MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, + size_t bytes, uint64_t offset) { while (true) { #if defined(_WIN32) || defined(_WIN64) OVERLAPPED ov; @@ -632,9 +659,9 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, ov.OffsetHigh = HIGH_DWORD(offset); DWORD written; - if (unlikely(!WriteFile(fd, buf, - (bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, - &written, &ov))) + if (unlikely(!WriteFile( + fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, + &written, &ov))) return GetLastError(); if (likely(bytes == written)) return MDBX_SUCCESS; @@ -642,7 +669,7 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), "libmdbx requires 64-bit file I/O on 64-bit systems"); const intptr_t written = - pwrite(fd, buf, (bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset); + pwrite(fd, buf, likely(bytes <= MAX_WRITE) ? 
bytes : MAX_WRITE, offset); if (likely(bytes == (size_t)written)) return MDBX_SUCCESS; if (written < 0) { @@ -658,6 +685,36 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, } } +MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, + size_t bytes) { + while (true) { +#if defined(_WIN32) || defined(_WIN64) + DWORD written; + if (unlikely(!WriteFile( + fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, + &written, nullptr))) + return GetLastError(); + if (likely(bytes == written)) + return MDBX_SUCCESS; +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + const intptr_t written = + write(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE); + if (likely(bytes == (size_t)written)) + return MDBX_SUCCESS; + if (written < 0) { + const int rc = errno; + if (rc != EINTR) + return rc; + continue; + } +#endif + bytes -= written; + buf = (char *)buf + written; + } +} + int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, uint64_t offset, size_t expected_written) { #if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) @@ -686,7 +743,8 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, #endif } -int mdbx_filesync(mdbx_filehandle_t fd, enum mdbx_syncmode_bits mode_bits) { +MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd, + enum mdbx_syncmode_bits mode_bits) { #if defined(_WIN32) || defined(_WIN64) return ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) == 0 || FlushFileBuffers(fd)) @@ -745,7 +803,38 @@ int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { return MDBX_SUCCESS; } -int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { +MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd) { +#if defined(_WIN32) || defined(_WIN64) + switch (GetFileType(fd)) { + case FILE_TYPE_DISK: + return MDBX_RESULT_FALSE; + case FILE_TYPE_CHAR: + case FILE_TYPE_PIPE: + return MDBX_RESULT_TRUE; + 
default: + return GetLastError(); + } +#else + struct stat info; + if (fstat(fd, &info)) + return errno; + switch (info.st_mode & S_IFMT) { + case S_IFBLK: + case S_IFREG: + return MDBX_RESULT_FALSE; + case S_IFCHR: + case S_IFIFO: + case S_IFSOCK: + return MDBX_RESULT_TRUE; + case S_IFDIR: + case S_IFLNK: + default: + return MDBX_INCOMPATIBLE; + } +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #if defined(_WIN32) || defined(_WIN64) if (mdbx_SetFileInformationByHandle) { FILE_END_OF_FILE_INFO EndOfFileInfo; @@ -769,7 +858,7 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #endif } -int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { +MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER li; li.QuadPart = pos; @@ -784,9 +873,10 @@ int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { /*----------------------------------------------------------------------------*/ -int mdbx_thread_create(mdbx_thread_t *thread, - THREAD_RESULT(THREAD_CALL *start_routine)(void *), - void *arg) { +MDBX_INTERNAL_FUNC int +mdbx_thread_create(mdbx_thread_t *thread, + THREAD_RESULT(THREAD_CALL *start_routine)(void *), + void *arg) { #if defined(_WIN32) || defined(_WIN64) *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL); return *thread ? 
MDBX_SUCCESS : GetLastError(); @@ -795,7 +885,7 @@ int mdbx_thread_create(mdbx_thread_t *thread, #endif } -int mdbx_thread_join(mdbx_thread_t thread) { +MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) { #if defined(_WIN32) || defined(_WIN64) DWORD code = WaitForSingleObject(thread, INFINITE); return waitstatus2errcode(code); @@ -807,14 +897,15 @@ int mdbx_thread_join(mdbx_thread_t thread) { /*----------------------------------------------------------------------------*/ -int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async) { +MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, + size_t length, int async) { uint8_t *ptr = (uint8_t *)map->address + offset; #if defined(_WIN32) || defined(_WIN64) if (FlushViewOfFile(ptr, length) && (async || FlushFileBuffers(map->fd))) return MDBX_SUCCESS; return GetLastError(); #else -#ifdef __linux__ +#if defined(__linux__) || defined(__gnu_linux__) if (async && mdbx_linux_kernel_version > 0x02061300) /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op, since the kernel properly tracks dirty pages and flushes them to storage @@ -832,7 +923,39 @@ int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async) { #endif } -int mdbx_check4nonlocal(mdbx_filehandle_t handle, int flags) { +MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, + const char *pathname, int err) { +#if defined(_WIN32) || defined(_WIN64) + (void)pathname; + (void)err; + if (!mdbx_GetVolumeInformationByHandleW) + return MDBX_ENOSYS; + DWORD unused, flags; + if (!mdbx_GetVolumeInformationByHandleW(handle, nullptr, 0, nullptr, &unused, + &flags, nullptr, 0)) + return GetLastError(); + if ((flags & FILE_READ_ONLY_VOLUME) == 0) + return MDBX_EACCESS; +#else + struct statvfs info; + if (err != MDBX_ENOFILE) { + if (statvfs(pathname, &info)) + return errno; + if ((info.f_flag & ST_RDONLY) == 0) + return err; + } + if (fstatvfs(handle, &info)) + return errno; + if ((info.f_flag & ST_RDONLY) == 0) 
+ return (err == MDBX_ENOFILE) ? MDBX_EACCESS : err; +#endif /* !Windows */ + return MDBX_SUCCESS; +} + +static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { + if (flags & MDBX_EXCLUSIVE) + return 0; + #if defined(_WIN32) || defined(_WIN64) if (GetFileType(handle) != FILE_TYPE_DISK) return ERROR_FILE_OFFLINE; @@ -842,7 +965,6 @@ int mdbx_check4nonlocal(mdbx_filehandle_t handle, int flags) { if (mdbx_GetFileInformationByHandleEx(handle, FileRemoteProtocolInfo, &RemoteProtocolInfo, sizeof(RemoteProtocolInfo))) { - if ((RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_OFFLINE) && !(flags & MDBX_RDONLY)) return ERROR_FILE_OFFLINE; @@ -943,41 +1065,197 @@ int mdbx_check4nonlocal(mdbx_filehandle_t handle, int flags) { return rc; } #else - (void)handle; - /* TODO: check for NFS handle ? */ - (void)flags; + + struct statvfs statvfs_info; + if (fstatvfs(handle, &statvfs_info)) + return errno; +#if defined(ST_LOCAL) || defined(ST_EXPORTED) + const unsigned long st_flags = statvfs_info.f_flag; +#endif /* ST_LOCAL || ST_EXPORTED */ + +#if defined(__NetBSD__) + const unsigned type = 0; + const char *const name = statvfs_info.f_fstypename; + const size_t name_len = VFS_NAMELEN; +#elif defined(_AIX) || defined(__OS400__) + const char *const name = statvfs_info.f_basetype; + const size_t name_len = sizeof(statvfs_info.f_basetype); + struct stat st; + if (fstat(handle, &st)) + return errno; + const unsigned type = st.st_vfstype; + if ((st.st_flag & FS_REMOTE) != 0 && !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; +#elif defined(FSTYPSZ) || defined(_FSTYPSZ) + const unsigned type = 0; + const char *const name = statvfs_info.f_basetype; + const size_t name_len = sizeof(statvfs_info.f_basetype); +#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) || \ + defined(ST_FSTYPSZ) || defined(_ST_FSTYPSZ) + const unsigned type = 0; + struct stat st; + if (fstat(handle, &st)) + return errno; + const char *const name = st.st_fstype; + const size_t name_len = 
strlen(name); +#else + struct statfs statfs_info; + if (fstatfs(handle, &statfs_info)) + return errno; +#if defined(__OpenBSD__) + const unsigned type = 0; +#else + const unsigned type = statfs_info.f_type; #endif +#if defined(MNT_LOCAL) || defined(MNT_EXPORTED) + const unsigned long mnt_flags = statfs_info.f_flags; +#endif /* MNT_LOCAL || MNT_EXPORTED */ +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ + defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ + defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) + const char *const name = statfs_info.f_fstypename; + const size_t name_len = sizeof(statfs_info.f_fstypename); +#else + + const char *name = ""; + unsigned name_len = 0; + + struct stat st; + if (fstat(handle, &st)) + return errno; + + char pathbuf[PATH_MAX]; + FILE *mounted = nullptr; +#if defined(__linux__) || defined(__gnu_linux__) + mounted = setmntent("/proc/mounts", "r"); +#endif /* Linux */ + if (!mounted) + mounted = setmntent("/etc/mtab", "r"); + if (mounted) { + const struct mntent *ent; +#if defined(_BSD_SOURCE) || defined(_SVID_SOURCE) || \ + (defined(_DEFAULT_SOURCE) && __GLIBC_PREREQ(2, 19)) + struct mntent entbuf; + const bool should_copy = false; + while (nullptr != + (ent = getmntent_r(mounted, &entbuf, pathbuf, sizeof(pathbuf)))) +#else + const bool should_copy = true; + while (nullptr != (ent = getmntent(mounted)))) +#endif + { + struct stat mnt; + if (!stat(ent->mnt_dir, &mnt) && mnt.st_dev == st.st_dev) { + if (should_copy) { + name = + strncpy(pathbuf, ent->mnt_fsname, name_len = sizeof(pathbuf) - 1); + pathbuf[name_len] = 0; + } else { + name = ent->mnt_fsname; + name_len = strlen(name); + } + break; + } + } + endmntent(mounted); + } +#endif /* !xBSD */ +#endif + + if (name_len) { + if (((name_len > 2 && strncasecmp("nfs", name, 3) == 0) || + strncasecmp("cifs", name, name_len) == 0 || + strncasecmp("ncpfs", name, name_len) 
== 0 || + strncasecmp("smbfs", name, name_len) == 0 || + ((name_len > 3 && strncasecmp("fuse", name, 4) == 0) && + strncasecmp("fuseblk", name, name_len) != 0)) && + !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; + if (strcasecmp("ftp", name) == 0 || strcasecmp("http", name) == 0 || + strcasecmp("sshfs", name) == 0) + return MDBX_EREMOTE; + } + +#ifdef ST_LOCAL + if ((st_flags & ST_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; +#elif defined(MNT_LOCAL) + if ((mnt_flags & MNT_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; +#endif /* ST/MNT_LOCAL */ + +#ifdef ST_EXPORTED + if ((st_flags & ST_EXPORTED) != 0 && !(flags & MDBX_RDONLY)) + return MDBX_EREMOTE; +#elif defined(MNT_EXPORTED) + if ((mnt_flags & MNT_EXPORTED) != 0 && !(flags & MDBX_RDONLY)) + return MDBX_EREMOTE; +#endif /* ST/MNT_EXPORTED */ + + switch (type) { + case 0xFF534D42 /* CIFS_MAGIC_NUMBER */: + case 0x6969 /* NFS_SUPER_MAGIC */: + case 0x564c /* NCP_SUPER_MAGIC */: + case 0x517B /* SMB_SUPER_MAGIC */: +#if defined(__digital__) || defined(__osf__) || defined(__osf) + case 0x0E /* Tru64 NFS */: +#endif +#ifdef ST_FST_NFS + case ST_FST_NFS: +#endif + if ((flags & MDBX_EXCLUSIVE) == 0) + return MDBX_EREMOTE; + case 0: + default: + break; + } +#endif /* Unix */ + return MDBX_SUCCESS; } -int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t size, size_t limit) { +MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, + const size_t size, const size_t limit, + const unsigned options) { assert(size <= limit); -#if defined(_WIN32) || defined(_WIN64) - map->length = 0; + map->limit = 0; map->current = 0; - map->section = NULL; map->address = nullptr; +#if defined(_WIN32) || defined(_WIN64) + map->section = NULL; + map->filesize = 0; +#endif /* Windows */ - if (!(flags & MDBX_EXCLUSIVE)) { - NTSTATUS rc = mdbx_check4nonlocal(map->fd, flags); - if (rc != MDBX_SUCCESS) - return rc; - } + int err = mdbx_check_fs_local(map->fd, flags); + if (unlikely(err != 
MDBX_SUCCESS)) + return err; - NTSTATUS rc = mdbx_filesize(map->fd, &map->filesize); - if (rc != MDBX_SUCCESS) - return rc; - if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { - rc = mdbx_ftruncate(map->fd, size); - if (rc == MDBX_SUCCESS) - map->filesize = size; - /* ignore error, because Windows unable shrink file - * that already mapped (by another process) */ + if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { + err = mdbx_ftruncate(map->fd, size); + if (err != MDBX_SUCCESS) + return err; +#if defined(_WIN32) || defined(_WIN64) + map->filesize = size; +#else + map->current = size; +#endif + } else { + uint64_t filesize = 0; + err = mdbx_filesize(map->fd, &filesize); + if (err != MDBX_SUCCESS) + return err; +#if defined(_WIN32) || defined(_WIN64) + map->filesize = filesize; +#else + map->current = (filesize > limit) ? limit : (size_t)filesize; +#endif } +#if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER SectionSize; SectionSize.QuadPart = size; - rc = NtCreateSection( + err = NtCreateSection( &map->section, /* DesiredAccess */ (flags & MDBX_WRITEMAP) @@ -988,11 +1266,11 @@ int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t size, size_t limit) { /* SectionPageProtection */ (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, /* AllocationAttributes */ SEC_RESERVE, map->fd); - if (!NT_SUCCESS(rc)) - return ntstatus2errcode(rc); + if (!NT_SUCCESS(err)) + return ntstatus2errcode(err); SIZE_T ViewSize = (flags & MDBX_RDONLY) ? 0 : limit; - rc = NtMapViewOfSection( + err = NtMapViewOfSection( map->section, GetCurrentProcess(), &map->address, /* ZeroBits */ 0, /* CommitSize */ 0, @@ -1001,62 +1279,93 @@ int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t size, size_t limit) { /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, /* Win32Protect */ (flags & MDBX_WRITEMAP) ? 
PAGE_READWRITE : PAGE_READONLY); - if (!NT_SUCCESS(rc)) { + if (!NT_SUCCESS(err)) { NtClose(map->section); map->section = 0; map->address = nullptr; - return ntstatus2errcode(rc); + return ntstatus2errcode(err); } assert(map->address != MAP_FAILED); map->current = (size_t)SectionSize.QuadPart; - map->length = ViewSize; - return MDBX_SUCCESS; + map->limit = ViewSize; + #else - (void)size; + +#ifndef MAP_TRYFIXED +#define MAP_TRYFIXED 0 +#endif + +#ifndef MAP_HASSEMAPHORE +#define MAP_HASSEMAPHORE 0 +#endif + +#ifndef MAP_CONCEAL +#define MAP_CONCEAL 0 +#endif + +#ifndef MAP_NOSYNC +#define MAP_NOSYNC 0 +#endif + map->address = mmap( NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, - MAP_SHARED, map->fd, 0); - if (likely(map->address != MAP_FAILED)) { - map->length = limit; - return MDBX_SUCCESS; + MAP_SHARED | MAP_FILE | + (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) | + ((options & MMAP_OPTION_SEMAPHORE) ? MAP_HASSEMAPHORE | MAP_NOSYNC + : MAP_CONCEAL), + map->fd, 0); + + if (unlikely(map->address == MAP_FAILED)) { + map->limit = 0; + map->current = 0; + map->address = nullptr; + return errno; } - map->length = 0; - map->address = nullptr; - return errno; + map->limit = limit; + +#ifdef MADV_DONTFORK + if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) + return errno; +#endif +#ifdef MADV_NOHUGEPAGE + (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); +#endif + #endif + + return MDBX_SUCCESS; } -int mdbx_munmap(mdbx_mmap_t *map) { +MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { #if defined(_WIN32) || defined(_WIN64) if (map->section) NtClose(map->section); NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address); if (!NT_SUCCESS(rc)) ntstatus2errcode(rc); - - map->length = 0; - map->current = 0; - map->address = nullptr; #else - if (unlikely(munmap(map->address, map->length))) + if (unlikely(munmap(map->address, map->limit))) return errno; - map->length = 0; - map->address = nullptr; 
#endif + + map->limit = 0; + map->current = 0; + map->address = nullptr; return MDBX_SUCCESS; } -int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, size_t limit) { +MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, + size_t limit) { assert(size <= limit); #if defined(_WIN32) || defined(_WIN64) - assert(size != map->current || limit != map->length || size < map->filesize); + assert(size != map->current || limit != map->limit || size < map->filesize); NTSTATUS status; LARGE_INTEGER SectionSize; int err, rc = MDBX_SUCCESS; - if (!(flags & MDBX_RDONLY) && limit == map->length && size > map->current) { + if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current) { /* growth rw-section */ SectionSize.QuadPart = size; status = NtExtendSection(map->section, &SectionSize); @@ -1068,10 +1377,10 @@ int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, size_t limit) { return ntstatus2errcode(status); } - if (limit > map->length) { + if (limit > map->limit) { /* check ability of address space for growth before umnap */ - PVOID BaseAddress = (PBYTE)map->address + map->length; - SIZE_T RegionSize = limit - map->length; + PVOID BaseAddress = (PBYTE)map->address + map->limit; + SIZE_T RegionSize = limit - map->limit; status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0, &RegionSize, MEM_RESERVE, PAGE_NOACCESS); if (!NT_SUCCESS(status)) @@ -1101,7 +1410,7 @@ int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, size_t limit) { err = ntstatus2errcode(status); bailout: map->address = NULL; - map->current = map->length = 0; + map->current = map->limit = 0; if (ReservedAddress) (void)NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, &ReservedSize, MEM_RELEASE); @@ -1184,12 +1493,12 @@ retry_mapview:; NtClose(map->section); map->section = NULL; - if (map->address && (size != map->current || limit != map->length)) { + if (map->address && (size != map->current || limit != map->limit)) { /* try remap with 
previously size and limit, * but will return MDBX_RESULT_TRUE on success */ rc = MDBX_RESULT_TRUE; size = map->current; - limit = map->length; + limit = map->limit; goto retry_file_and_section; } @@ -1199,36 +1508,59 @@ retry_mapview:; assert(map->address != MAP_FAILED); map->current = (size_t)SectionSize.QuadPart; - map->length = ViewSize; - return rc; + map->limit = ViewSize; #else - if (limit != map->length) { -#if defined(_GNU_SOURCE) && \ - !(defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__BSD__) || defined(__NETBSD__) || defined(__bsdi__) || \ - defined(__DragonFly__) || defined(__APPLE__) || defined(__MACH__)) - void *ptr = mremap(map->address, map->length, limit, + + uint64_t filesize = 0; + int rc = mdbx_filesize(map->fd, &filesize); + if (rc != MDBX_SUCCESS) + return rc; + + if (flags & MDBX_RDONLY) { + map->current = (filesize > limit) ? limit : (size_t)filesize; + if (map->current != size) + rc = MDBX_RESULT_TRUE; + } else if (filesize != size) { + rc = mdbx_ftruncate(map->fd, size); + if (rc != MDBX_SUCCESS) + return rc; + map->current = size; + } + + if (limit != map->limit) { +#if defined(_GNU_SOURCE) && (defined(__linux__) || defined(__gnu_linux__)) + void *ptr = mremap(map->address, map->limit, limit, /* LY: in case changing the mapping size calling code - must guarantees the absence of competing threads, and - a willingness to another base address */ + must guarantees the absence of competing threads, + and a willingness to another base address */ MREMAP_MAYMOVE); if (ptr == MAP_FAILED) { - int err = errno; - return (err == EAGAIN || err == ENOMEM) ? MDBX_RESULT_TRUE : err; + rc = errno; + return (rc == EAGAIN || rc == ENOMEM) ? 
MDBX_RESULT_TRUE : rc; } map->address = ptr; - map->length = limit; + map->limit = limit; + +#ifdef MADV_DONTFORK + if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) + return errno; +#endif + +#ifdef MADV_NOHUGEPAGE + (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); +#endif + #else - return MDBX_RESULT_TRUE; -#endif /* mremap() <= _GNU_SOURCE && !__FreeBSD__ */ + rc = MDBX_RESULT_TRUE; +#endif /* _GNU_SOURCE && __linux__ */ } - return (flags & MDBX_RDONLY) ? MDBX_SUCCESS : mdbx_ftruncate(map->fd, size); #endif + return rc; } /*----------------------------------------------------------------------------*/ -__cold void mdbx_osal_jitter(bool tiny) { +MDBX_INTERNAL_FUNC __cold void mdbx_osal_jitter(bool tiny) { for (;;) { #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__) @@ -1256,7 +1588,7 @@ __cold void mdbx_osal_jitter(bool tiny) { #elif defined(__APPLE__) || defined(__MACH__) #include <mach/mach_time.h> #elif defined(__linux__) || defined(__gnu_linux__) -static __cold clockid_t choice_monoclock() { +static __cold clockid_t choice_monoclock(void) { struct timespec probe; #if defined(CLOCK_BOOTTIME) if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0) @@ -1272,26 +1604,52 @@ static __cold clockid_t choice_monoclock() { } #endif -uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { +/*----------------------------------------------------------------------------*/ + +#if defined(_WIN32) || defined(_WIN64) +static LARGE_INTEGER performance_frequency; +#elif defined(__APPLE__) || defined(__MACH__) +static uint64_t ratio_16dot16_to_monotine; +#endif + +MDBX_INTERNAL_FUNC uint64_t +mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { #if defined(_WIN32) || defined(_WIN64) - static LARGE_INTEGER performance_frequency; - if (performance_frequency.QuadPart == 0) + if (unlikely(performance_frequency.QuadPart == 0)) QueryPerformanceFrequency(&performance_frequency); const uint64_t ratio = 
performance_frequency.QuadPart; #elif defined(__APPLE__) || defined(__MACH__) - static uint64_t ratio; - if (!ratio) { + if (unlikely(ratio_16dot16_to_monotine == 0)) { mach_timebase_info_data_t ti; mach_timebase_info(&ti); - ratio = UINT64_C(1000000000) * ti.denom / ti.numer; + ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer; } + const uint64_t ratio = ratio_16dot16_to_monotine; #else const uint64_t ratio = UINT64_C(1000000000); #endif return (ratio * seconds_16dot16 + 32768) >> 16; } -uint64_t mdbx_osal_monotime(void) { +MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { + static uint64_t limit; + if (unlikely(monotime > limit)) { + if (limit != 0) + return UINT32_MAX; + limit = mdbx_osal_16dot16_to_monotime(UINT32_MAX - 1); + if (monotime > limit) + return UINT32_MAX; + } +#if defined(_WIN32) || defined(_WIN64) + return (uint32_t)((monotime << 16) / performance_frequency.QuadPart); +#elif defined(__APPLE__) || defined(__MACH__) + return (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine); +#else + return (uint32_t)(monotime * 128 / 1953125); +#endif +} + +MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER counter; counter.QuadPart = 0; @@ -1319,3 +1677,432 @@ uint64_t mdbx_osal_monotime(void) { return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; #endif } + +/*----------------------------------------------------------------------------*/ + +static void bootid_shake(bin128_t *p) { + /* Bob Jenkins's PRNG: https://burtleburtle.net/bob/rand/smallprng.html */ + const uint32_t e = p->a - (p->b << 23 | p->b >> 9); + p->a = p->b ^ (p->c << 16 | p->c >> 16); + p->b = p->c + (p->d << 11 | p->d >> 21); + p->c = p->d + e; + p->d = e + p->a; +} + +static void bootid_collect(bin128_t *p, const void *s, size_t n) { + p->y += UINT64_C(64526882297375213); + bootid_shake(p); + for (size_t i = 0; i < n; ++i) { + bootid_shake(p); + p->y ^= UINT64_C(48797879452804441) * 
((const uint8_t *)s)[i]; + bootid_shake(p); + p->y += 14621231; + } + bootid_shake(p); + + /* minor non-linear tomfoolery */ + const unsigned z = p->x % 61; + p->y = p->y << z | p->y >> (64 - z); + bootid_shake(p); + bootid_shake(p); + const unsigned q = p->x % 59; + p->y = p->y << q | p->y >> (64 - q); + bootid_shake(p); + bootid_shake(p); + bootid_shake(p); +} + +#if defined(_WIN32) || defined(_WIN64) + +static uint64_t windows_systemtime_ms() { + FILETIME ft; + GetSystemTimeAsFileTime(&ft); + return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul; +} + +static uint64_t windows_bootime(void) { + unsigned confirmed = 0; + uint64_t boottime = 0; + uint64_t up0 = mdbx_GetTickCount64(); + uint64_t st0 = windows_systemtime_ms(); + for (uint64_t fuse = st0; up0 && st0 < fuse + 1000 * 1000u / 42;) { + YieldProcessor(); + const uint64_t up1 = mdbx_GetTickCount64(); + const uint64_t st1 = windows_systemtime_ms(); + if (st1 > fuse && st1 == st0 && up1 == up0) { + uint64_t diff = st1 - up1; + if (boottime == diff) { + if (++confirmed > 4) + return boottime; + } else { + confirmed = 0; + boottime = diff; + } + fuse = st1; + Sleep(1); + } + st0 = st1; + up0 = up1; + } + return 0; +} + +static LSTATUS mdbx_RegGetValue(HKEY hkey, LPCWSTR lpSubKey, LPCWSTR lpValue, + DWORD dwFlags, LPDWORD pdwType, PVOID pvData, + LPDWORD pcbData) { + LSTATUS rc = + RegGetValueW(hkey, lpSubKey, lpValue, dwFlags, pdwType, pvData, pcbData); + if (rc != ERROR_FILE_NOT_FOUND) + return rc; + + rc = RegGetValueW(hkey, lpSubKey, lpValue, + dwFlags | 0x00010000 /* RRF_SUBKEY_WOW6464KEY */, pdwType, + pvData, pcbData); + if (rc != ERROR_FILE_NOT_FOUND) + return rc; + return RegGetValueW(hkey, lpSubKey, lpValue, + dwFlags | 0x00020000 /* RRF_SUBKEY_WOW6432KEY */, pdwType, + pvData, pcbData); +} +#endif + +static __cold __maybe_unused bool bootid_parse_uuid(bin128_t *s, const void *p, + const size_t n) { + if (n > 31) { + unsigned bits = 0; + for (unsigned i = 0; i < n; ++i) /* try parse 
an UUID in text form */ { + uint8_t c = ((const uint8_t *)p)[i]; + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'f') + c -= 'a' - 10; + else if (c >= 'A' && c <= 'F') + c -= 'A' - 10; + else + continue; + assert(c <= 15); + c ^= s->y >> 60; + s->y = s->y << 4 | s->x >> 60; + s->x = s->x << 4 | c; + bits += 4; + } + if (bits > 42 * 3) + /* UUID parsed successfully */ + return true; + } + + if (n > 15) /* is enough handle it as a binary? */ { + if (n == sizeof(bin128_t)) { + bin128_t aligned; + memcpy(&aligned, p, sizeof(bin128_t)); + s->x += aligned.x; + s->y += aligned.y; + } else + bootid_collect(s, p, n); + return true; + } + + if (n) + bootid_collect(s, p, n); + return false; +} + +__cold MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void) { + bin128_t bin = {{0, 0}}; + bool got_machineid = false, got_boottime = false, got_bootseq = false; + +#if defined(__linux__) || defined(__gnu_linux__) + { + const int fd = + open("/proc/sys/kernel/random/boot_id", O_RDONLY | O_NOFOLLOW); + if (fd != -1) { + struct statfs fs; + char buf[42]; + const ssize_t len = + (fstatfs(fd, &fs) == 0 && fs.f_type == /* procfs */ 0x9FA0) + ? 
read(fd, buf, sizeof(buf)) + : -1; + close(fd); + if (len > 0 && bootid_parse_uuid(&bin, buf, len)) + return bin; + } + } +#endif /* Linux */ + +#if defined(__APPLE__) || defined(__MACH__) + { + char buf[42]; + size_t len = sizeof(buf); + if (!sysctlbyname("kern.bootsessionuuid", buf, &len, nullptr, 0) && + bootid_parse_uuid(&bin, buf, len)) + return bin; + +#if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \ + __MAC_OS_X_VERSION_MIN_REQUIRED > 1050 + uuid_t uuid; + struct timespec wait = {0, 1000000000u / 42}; + if (!gethostuuid(uuid, &wait) && + bootid_parse_uuid(&bin, uuid, sizeof(uuid))) + got_machineid = true; +#endif /* > 10.5 */ + + struct timeval boottime; + len = sizeof(boottime); + if (!sysctlbyname("kern.boottime", &boottime, &len, nullptr, 0) && + len == sizeof(boottime) && boottime.tv_sec) + got_boottime = true; + } +#endif /* Apple/Darwin */ + +#if defined(_WIN32) || defined(_WIN64) + { + union buf { + DWORD BootId; + DWORD BaseTime; + SYSTEM_TIMEOFDAY_INFORMATION SysTimeOfDayInfo; + struct { + LARGE_INTEGER BootTime; + LARGE_INTEGER CurrentTime; + LARGE_INTEGER TimeZoneBias; + ULONG TimeZoneId; + ULONG Reserved; + ULONGLONG BootTimeBias; + ULONGLONG SleepTimeBias; + } SysTimeOfDayInfoHacked; + wchar_t MachineGuid[42]; + char DigitalProductId[248]; + } buf; + + static const wchar_t HKLM_MicrosoftCryptography[] = + L"SOFTWARE\\Microsoft\\Cryptography"; + DWORD len = sizeof(buf); + /* Windows is madness and must die */ + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_MicrosoftCryptography, + L"MachineGuid", RRF_RT_ANY, NULL, &buf.MachineGuid, + &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) + got_machineid = bootid_parse_uuid(&bin, &buf.MachineGuid, len); + + if (!got_machineid) { + /* again, Windows is madness */ + static const wchar_t HKLM_WindowsNT[] = + L"SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion"; + static const wchar_t HKLM_WindowsNT_DPK[] = + L"SOFTWARE\\Microsoft\\Windows " + L"NT\\CurrentVersion\\DefaultProductKey"; + static 
const wchar_t HKLM_WindowsNT_DPK2[] = + L"SOFTWARE\\Microsoft\\Windows " + L"NT\\CurrentVersion\\DefaultProductKey2"; + + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT, + L"DigitalProductId", RRF_RT_ANY, NULL, + &buf.DigitalProductId, &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.DigitalProductId, len); + got_machineid = true; + } + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK, + L"DigitalProductId", RRF_RT_ANY, NULL, + &buf.DigitalProductId, &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.DigitalProductId, len); + got_machineid = true; + } + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK2, + L"DigitalProductId", RRF_RT_ANY, NULL, + &buf.DigitalProductId, &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.DigitalProductId, len); + got_machineid = true; + } + } + + static const wchar_t HKLM_PrefetcherParams[] = + L"SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Memory " + L"Management\\PrefetchParameters"; + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, L"BootId", + RRF_RT_DWORD, NULL, &buf.BootId, + &len) == ERROR_SUCCESS && + len > 1 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.BootId, len); + got_bootseq = true; + } + + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, L"BaseTime", + RRF_RT_DWORD, NULL, &buf.BaseTime, + &len) == ERROR_SUCCESS && + len >= sizeof(buf.BaseTime) && buf.BaseTime) { + bootid_collect(&bin, &buf.BaseTime, len); + got_boottime = true; + } + + /* BootTime from SYSTEM_TIMEOFDAY_INFORMATION */ + NTSTATUS status = NtQuerySystemInformation( + 0x03 /* SystemTimeOfDayInformation */, &buf.SysTimeOfDayInfo, + sizeof(buf.SysTimeOfDayInfo), &len); + if (NT_SUCCESS(status) && + len >= offsetof(union buf, SysTimeOfDayInfoHacked.BootTime) + 
sizeof(buf.SysTimeOfDayInfoHacked.BootTime) && + buf.SysTimeOfDayInfoHacked.BootTime.QuadPart) { + bootid_collect(&bin, &buf.SysTimeOfDayInfoHacked.BootTime, + sizeof(buf.SysTimeOfDayInfoHacked.BootTime)); + got_boottime = true; + } + + if (!got_boottime) { + uint64_t boottime = windows_bootime(); + if (boottime) { + bootid_collect(&bin, &boottime, sizeof(boottime)); + got_boottime = true; + } + } + } +#endif /* Windows */ + +#if defined(CTL_HW) && defined(HW_UUID) + if (!got_machineid) { + static const int mib[] = {CTL_HW, HW_UUID}; + char buf[42]; + size_t len = sizeof(buf); + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0) + got_machineid = bootid_parse_uuid(&bin, buf, len); + } +#endif /* CTL_HW && HW_UUID */ + +#if defined(CTL_KERN) && defined(KERN_HOSTUUID) + if (!got_machineid) { + static const int mib[] = {CTL_KERN, KERN_HOSTUUID}; + char buf[42]; + size_t len = sizeof(buf); + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0) + got_machineid = bootid_parse_uuid(&bin, buf, len); + } +#endif /* CTL_KERN && KERN_HOSTUUID */ + +#if defined(__NetBSD__) + if (!got_machineid) { + char buf[42]; + size_t len = sizeof(buf); + if (sysctlbyname("machdep.dmi.system-uuid", buf, &len, NULL, 0) == 0) + got_machineid = bootid_parse_uuid(&bin, buf, len); + } +#endif /* __NetBSD__ */ + +#if _XOPEN_SOURCE_EXTENDED + if (!got_machineid) { + const int hostid = gethostid(); + if (hostid > 0) { + bootid_collect(&bin, &hostid, sizeof(hostid)); + got_machineid = true; + } + } +#endif /* _XOPEN_SOURCE_EXTENDED */ + + if (!got_machineid) { + lack: + bin.x = bin.y = 0; + return bin; + } + + /*--------------------------------------------------------------------------*/ + +#if defined(CTL_KERN) && defined(KERN_BOOTTIME) + if (!got_boottime) { + static const int mib[] = {CTL_KERN, KERN_BOOTTIME}; + struct timeval boottime; + size_t len = 
sizeof(boottime); + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &boottime, &len, NULL, 0) == 0 && + len == sizeof(boottime) && boottime.tv_sec) { + bootid_collect(&bin, &boottime, len); + got_boottime = true; + } + } +#endif /* CTL_KERN && KERN_BOOTTIME */ + +#if defined(__sun) || defined(__SVR4) || defined(__svr4__) + if (!got_boottime) { + kstat_ctl_t *kc = kstat_open(); + if (kc) { + kstat_t *kp = kstat_lookup(kc, "unix", 0, "system_misc"); + if (kp && kstat_read(kc, kp, 0) != -1) { + kstat_named_t *kn = (kstat_named_t *)kstat_data_lookup(kp, "boot_time"); + if (kn) { + /* Hash exactly the width reported by data_type: each case group + * must end with a break, otherwise a 32-bit value falls through + * and is collected a second time as a 64-bit one. */ + switch (kn->data_type) { + case KSTAT_DATA_INT32: + case KSTAT_DATA_UINT32: + bootid_collect(&bin, &kn->value, sizeof(int32_t)); + got_boottime = true; + break; + case KSTAT_DATA_INT64: + case KSTAT_DATA_UINT64: + bootid_collect(&bin, &kn->value, sizeof(int64_t)); + got_boottime = true; + break; + } + } + } + kstat_close(kc); + } + } +#endif /* SunOS / Solaris */ + +#if _XOPEN_SOURCE_EXTENDED && defined(BOOT_TIME) + if (!got_boottime) { + setutxent(); + const struct utmpx id = {.ut_type = BOOT_TIME}; + const struct utmpx *entry = getutxid(&id); + if (entry) { + bootid_collect(&bin, entry, sizeof(*entry)); + got_boottime = true; + while (unlikely((entry = getutxid(&id)) != nullptr)) { + /* have multiple reboot records, assuming we can distinguish next + * bootsession even if RTC is wrong or absent */ + bootid_collect(&bin, entry, sizeof(*entry)); + got_bootseq = true; + } + } + endutxent(); + } +#endif /* _XOPEN_SOURCE_EXTENDED && BOOT_TIME */ + + if (!got_bootseq) { + if (!got_boottime || !MDBX_TRUST_RTC) + goto lack; + +#if defined(_WIN32) || defined(_WIN64) + FILETIME now; + GetSystemTimeAsFileTime(&now); + if (0x1CCCCCC > now.dwHighDateTime) +#else + struct timespec mono, real; + if (clock_gettime(CLOCK_MONOTONIC, &mono) || + clock_gettime(CLOCK_REALTIME, &real) || + /* wrong time, RTC is mad or absent */ + 1555555555l > real.tv_sec || + /* seems no adjustment by 
RTC/NTP, i.e. a fake time */ + real.tv_sec < mono.tv_sec || 1234567890l > real.tv_sec - mono.tv_sec || + (real.tv_sec - mono.tv_sec) % 900u == 0) +#endif + goto lack; + } + + return bin; +} diff --git a/libs/libmdbx/src/src/osal.h b/libs/libmdbx/src/src/elements/osal.h index fca75568ae..0f57cdaab2 100644 --- a/libs/libmdbx/src/src/osal.h +++ b/libs/libmdbx/src/src/elements/osal.h @@ -1,4 +1,4 @@ -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> @@ -34,9 +34,12 @@ #if !defined(_CRT_SECURE_NO_WARNINGS) #define _CRT_SECURE_NO_WARNINGS #endif -#if !defined(_NO_CRT_STDIO_INLINE) && defined(MDBX_BUILD_DLL) +#if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \ + !defined(MDBX_TOOLS) #define _NO_CRT_STDIO_INLINE #endif +#elif !defined(_POSIX_C_SOURCE) +#define _POSIX_C_SOURCE 200809L #endif /* Windows */ /*----------------------------------------------------------------------------*/ @@ -70,26 +73,72 @@ /* Systems includes */ #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__BSD__) || defined(__NETBSD__) || defined(__bsdi__) || \ - defined(__DragonFly__) || defined(__APPLE__) || defined(__MACH__) + defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ + defined(__APPLE__) || defined(__MACH__) #include <sys/cdefs.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/types.h> +#if defined(__FreeBSD__) || defined(__DragonFly__) +#include <vm/vm_param.h> +#elif defined(__OpenBSD__) || defined(__NetBSD__) +#include <uvm/uvm_param.h> #else -#include <malloc.h> -#ifndef _POSIX_C_SOURCE -#ifdef _POSIX_SOURCE -#define _POSIX_C_SOURCE 1 +#define SYSCTL_LEGACY_NONCONST_MIB +#endif +#include <sys/vmmeter.h> #else -#define _POSIX_C_SOURCE 0 +#include <malloc.h> +#if !(defined(__sun) || defined(__SVR4) || defined(__svr4__) || \ + defined(_WIN32) || 
defined(_WIN64)) +#include <mntent.h> +#endif /* !Solaris */ +#endif /* !xBSD */ + +#if defined(__FreeBSD__) || __has_include(<malloc_np.h>) +#include <malloc_np.h> #endif + +#if defined(__APPLE__) || defined(__MACH__) || __has_include(<malloc/malloc.h>) +#include <malloc/malloc.h> +#endif /* MacOS */ + +#if defined(__MACH__) +#include <mach/host_info.h> +#include <mach/mach_host.h> +#include <mach/mach_port.h> +#include <uuid/uuid.h> +#undef P_DIRTY #endif -#endif /* !xBSD */ + +#if defined(__linux__) || defined(__gnu_linux__) +#include <linux/sysctl.h> +#include <sys/sendfile.h> +#include <sys/statfs.h> +#endif /* Linux */ #ifndef _XOPEN_SOURCE #define _XOPEN_SOURCE 0 #endif +#ifndef _XOPEN_SOURCE_EXTENDED +#define _XOPEN_SOURCE_EXTENDED 0 +#else +#include <utmpx.h> +#endif /* _XOPEN_SOURCE_EXTENDED */ + +#if defined(__sun) || defined(__SVR4) || defined(__svr4__) +#include <kstat.h> +#include <sys/mnttab.h> +/* On Solaris, it's easier to add a missing prototype rather than find a + * combination of #defines that break nothing. 
*/ +__extern_C key_t ftok(const char *, int); +#endif /* SunOS/Solaris */ + #if defined(_WIN32) || defined(_WIN64) +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include <tlhelp32.h> #include <windows.h> #include <winnt.h> @@ -109,7 +158,7 @@ typedef struct { } mdbx_condmutex_t; typedef CRITICAL_SECTION mdbx_fastmutex_t; -#ifdef MDBX_AVOID_CRT +#if MDBX_AVOID_CRT #ifndef mdbx_malloc static inline void *mdbx_malloc(size_t bytes) { return LocalAlloc(LMEM_FIXED, bytes); @@ -150,11 +199,14 @@ static inline void *mdbx_realloc(void *ptr, size_t bytes) { #else /*----------------------------------------------------------------------*/ #include <pthread.h> +#include <semaphore.h> #include <signal.h> #include <sys/file.h> +#include <sys/ipc.h> #include <sys/mman.h> #include <sys/param.h> #include <sys/stat.h> +#include <sys/statvfs.h> #include <sys/uio.h> #include <unistd.h> typedef pthread_t mdbx_thread_t; @@ -174,6 +226,14 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #define mdbx_strdup strdup #endif /* Platform */ +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) +/* malloc_usable_size() already provided */ +#elif defined(__APPLE__) +#define malloc_usable_size(ptr) malloc_size(ptr) +#elif defined(_MSC_VER) && !MDBX_AVOID_CRT +#define malloc_usable_size(ptr) _msize(ptr) +#endif /* malloc_usable_size */ + /* *INDENT-OFF* */ /* clang-format off */ #if defined(HAVE_SYS_STAT_H) || __has_include(<sys/stat.h>) @@ -200,14 +260,6 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -#ifndef MADV_REMOVE_OR_FREE -#ifdef MADV_REMOVE -#define MADV_REMOVE_OR_FREE MADV_REMOVE -#elif defined(MADV_FREE) -#define MADV_REMOVE_OR_FREE MADV_FREE -#endif -#endif /* MADV_REMOVE_OR_FREE */ - #if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ defined(i486) || defined(__i486) || defined(__i486__) || \ defined(i586) | defined(__i586) || 
defined(__i586__) || defined(i686) || \ @@ -227,21 +279,17 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #endif /* __amd64__ */ #endif /* all x86 */ -#if !defined(UNALIGNED_OK) -#if (defined(__ia32__) || defined(__e2k__) || \ - defined(__ARM_FEATURE_UNALIGNED)) && \ - !defined(__ALIGNED__) -#define UNALIGNED_OK 1 -#else -#define UNALIGNED_OK 0 -#endif -#endif /* UNALIGNED_OK */ - #if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF #error \ "Sanity checking failed: Two's complement, reasonably sized integer types" #endif +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul +#define MDBX_WORDBITS 64 +#else +#define MDBX_WORDBITS 32 +#endif /* MDBX_WORDBITS */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ @@ -299,7 +347,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #include <sys/endian.h> #include <sys/types.h> #elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ - defined(__NETBSD__) || defined(__NetBSD__) || \ + defined(__NetBSD__) || \ defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>) #include <sys/param.h> #endif /* OS */ @@ -351,7 +399,16 @@ typedef pthread_mutex_t mdbx_fastmutex_t; /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ -static __inline void mdbx_compiler_barrier(void) { +#if __has_include(<sys/cachectl.h>) +#include <sys/cachectl.h> +#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ + defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ + defined(__MWERKS__) || defined(__sgi) +/* MIPS should have explicit cache control */ +#include <sys/cachectl.h> +#endif + +static __maybe_unused __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -379,7 +436,7 @@ static __inline void 
mdbx_compiler_barrier(void) { #endif } -static __inline void mdbx_memory_barrier(void) { +static __maybe_unused __inline void mdbx_memory_barrier(void) { #if __has_extension(c_atomic) || __has_extension(cxx_atomic) __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); #elif defined(__ATOMIC_SEQ_CST) @@ -410,71 +467,6 @@ static __inline void mdbx_memory_barrier(void) { } /*----------------------------------------------------------------------------*/ -/* Cache coherence and invalidation */ - -#ifndef MDBX_CPU_WRITEBACK_IS_COHERENT -#if defined(__ia32__) || defined(__e2k__) || defined(__hppa) || \ - defined(__hppa__) -#define MDBX_CPU_WRITEBACK_IS_COHERENT 1 -#else -#define MDBX_CPU_WRITEBACK_IS_COHERENT 0 -#endif -#endif /* MDBX_CPU_WRITEBACK_IS_COHERENT */ - -#ifndef MDBX_CACHELINE_SIZE -#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) -#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE -#elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64) -#define MDBX_CACHELINE_SIZE 128 -#else -#define MDBX_CACHELINE_SIZE 64 -#endif -#endif /* MDBX_CACHELINE_SIZE */ - -#if MDBX_CPU_WRITEBACK_IS_COHERENT -#define mdbx_flush_noncoherent_cpu_writeback() mdbx_compiler_barrier() -#else -#define mdbx_flush_noncoherent_cpu_writeback() mdbx_memory_barrier() -#endif - -#if __has_include(<sys/cachectl.h>) -#include <sys/cachectl.h> -#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ - defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ - defined(__MWERKS__) || defined(__sgi) -/* MIPS should have explicit cache control */ -#include <sys/cachectl.h> -#endif - -#ifndef MDBX_CPU_CACHE_MMAP_NONCOHERENT -#if defined(__mips) || defined(__mips__) || defined(__mips64) || \ - defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ - defined(__MWERKS__) || defined(__sgi) -/* MIPS has cache coherency issues. */ -#define MDBX_CPU_CACHE_MMAP_NONCOHERENT 1 -#else -/* LY: assume no relevant mmap/dcache issues. 
*/ -#define MDBX_CPU_CACHE_MMAP_NONCOHERENT 0 -#endif -#endif /* ndef MDBX_CPU_CACHE_MMAP_NONCOHERENT */ - -static __inline void mdbx_invalidate_mmap_noncoherent_cache(void *addr, - size_t nbytes) { -#if MDBX_CPU_CACHE_MMAP_NONCOHERENT -#ifdef DCACHE - /* MIPS has cache coherency issues. - * Note: for any nbytes >= on-chip cache size, entire is flushed. */ - cacheflush(addr, nbytes, DCACHE); -#else -#error "Oops, cacheflush() not available" -#endif /* DCACHE */ -#else /* MDBX_CPU_CACHE_MMAP_NONCOHERENT */ - (void)addr; - (void)nbytes; -#endif /* MDBX_CPU_CACHE_MMAP_NONCOHERENT */ -} - -/*----------------------------------------------------------------------------*/ /* libc compatibility stuff */ #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ @@ -482,24 +474,29 @@ static __inline void mdbx_invalidate_mmap_noncoherent_cache(void *addr, #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else -__printf_args(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); +MDBX_INTERNAL_FUNC __printf_args(2, 3) int __maybe_unused + mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #endif /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ /* max bytes to write in one call */ +#if defined(_WIN32) || defined(_WIN64) +#define MAX_WRITE UINT32_C(0x01000000) +#else #define MAX_WRITE UINT32_C(0x3fff0000) +#endif #if defined(__linux__) || defined(__gnu_linux__) -extern uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; #endif /* Linux */ /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ -static __inline size_t mdbx_syspagesize(void) { +static __maybe_unused __inline size_t mdbx_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); @@ -513,7 +510,7 @@ static __inline size_t mdbx_syspagesize(void) { LIBMDBX_API char *mdbx_strdup(const char *str); #endif -static __inline int mdbx_get_errno(void) { +static __maybe_unused __inline int mdbx_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -523,34 +520,40 @@ static __inline int mdbx_get_errno(void) { } #ifndef mdbx_memalign_alloc -int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result); +MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, + void **result); #endif #ifndef mdbx_memalign_free -void mdbx_memalign_free(void *ptr); -#endif - -int mdbx_condmutex_init(mdbx_condmutex_t *condmutex); -int mdbx_condmutex_lock(mdbx_condmutex_t *condmutex); -int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex); -int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex); -int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex); -int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex); - -int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); - -int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, - uint64_t offset, size_t expected_written); -int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, - uint64_t offset); - -int mdbx_thread_create(mdbx_thread_t *thread, - THREAD_RESULT(THREAD_CALL *start_routine)(void *), - void *arg); -int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#endif + +MDBX_INTERNAL_FUNC int mdbx_condmutex_init(mdbx_condmutex_t *condmutex); 
+MDBX_INTERNAL_FUNC int mdbx_condmutex_lock(mdbx_condmutex_t *condmutex); +MDBX_INTERNAL_FUNC int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex); +MDBX_INTERNAL_FUNC int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex); +MDBX_INTERNAL_FUNC int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex); +MDBX_INTERNAL_FUNC int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex); + +MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); + +MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, + int iovcnt, uint64_t offset, + size_t expected_written); +MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, + uint64_t offset); +MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, + size_t count, uint64_t offset); +MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, + size_t count); + +MDBX_INTERNAL_FUNC int +mdbx_thread_create(mdbx_thread_t *thread, + THREAD_RESULT(THREAD_CALL *start_routine)(void *), + void *arg); +MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); enum mdbx_syncmode_bits { MDBX_SYNC_DATA = 1, @@ -558,15 +561,17 @@ enum mdbx_syncmode_bits { MDBX_SYNC_IODQ = 4 }; -int mdbx_filesync(mdbx_filehandle_t fd, enum mdbx_syncmode_bits mode_bits); -int mdbx_filesize_sync(mdbx_filehandle_t fd); -int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); -int mdbx_openfile(const char *pathname, int flags, mode_t mode, - mdbx_filehandle_t *fd, bool exclusive); -int mdbx_closefile(mdbx_filehandle_t fd); -int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd, + enum 
mdbx_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL_FUNC int mdbx_openfile(const char *pathname, int flags, + mode_t mode, mdbx_filehandle_t *fd, + bool exclusive); +MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); typedef struct mdbx_mmap_param { union { @@ -575,32 +580,41 @@ typedef struct mdbx_mmap_param { struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; - size_t length; /* mapping length, but NOT a size of file or DB */ + size_t limit; /* mapping length, but NOT a size of file nor DB */ + size_t current; /* mapped region size, i.e. the size of file and DB */ #if defined(_WIN32) || defined(_WIN64) - size_t current; /* mapped region size, e.g. file and DB */ - uint64_t filesize; + uint64_t filesize /* in-process cache of a file size. 
*/; #endif #ifdef MDBX_OSAL_SECTION MDBX_OSAL_SECTION section; #endif } mdbx_mmap_t; -int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t must, size_t limit); -int mdbx_munmap(mdbx_mmap_t *map); -int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna); +#define MMAP_OPTION_TRUNCATE 1 +#define MMAP_OPTION_SEMAPHORE 2 +MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, + const size_t must, const size_t limit, + const unsigned options); +MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, + size_t wanna); #if defined(_WIN32) || defined(_WIN64) typedef struct { unsigned limit, count; HANDLE handles[31]; } mdbx_handle_array_t; -int mdbx_suspend_threads_before_remap(MDBX_env *env, - mdbx_handle_array_t **array); -int mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +MDBX_INTERNAL_FUNC int +mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +MDBX_INTERNAL_FUNC int +mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async); -int mdbx_check4nonlocal(mdbx_filehandle_t handle, int flags); +MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, + size_t length, int async); +MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, + const char *pathname, int err); -static __inline mdbx_pid_t mdbx_getpid(void) { +static __maybe_unused __inline uint32_t mdbx_getpid(void) { + STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); #else @@ -608,104 +622,132 @@ static __inline mdbx_pid_t mdbx_getpid(void) { #endif } -static __inline mdbx_tid_t mdbx_thread_self(void) { +static __maybe_unused __inline size_t mdbx_thread_self(void) { + STATIC_ASSERT(sizeof(mdbx_tid_t) <= sizeof(size_t)); #if defined(_WIN32) || defined(_WIN64) return 
GetCurrentThreadId(); #else - return pthread_self(); + return (size_t)pthread_self(); #endif } -void mdbx_osal_jitter(bool tiny); -uint64_t mdbx_osal_monotime(void); -uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC void __maybe_unused mdbx_osal_jitter(bool tiny); +MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t +mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); + +typedef union bin128 { + __anonymous_struct_extension__ struct { uint64_t x, y; }; + __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; +} bin128_t; +MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ -#if defined(_WIN32) || defined(_WIN64) -#undef MDBX_OSAL_LOCK -#define MDBX_OSAL_LOCK_SIGN UINT32_C(0xF10C) -#else -#define MDBX_OSAL_LOCK pthread_mutex_t -#define MDBX_OSAL_LOCK_SIGN UINT32_C(0x8017) -#endif /* MDBX_OSAL_LOCK */ - -/// \brief Инициализация объектов синхронизации внутри текущего процесса -/// связанных с экземпляром MDBX_env. -/// \return Код ошибки или 0 в случае успеха. -int mdbx_lck_init(MDBX_env *env); - -/// \brief Отключение от общих межпроцесных объектов и разрушение объектов -/// синхронизации внутри текущего процесса связанных с экземпляром MDBX_env. -void mdbx_lck_destroy(MDBX_env *env); - -/// \brief Подключение к общим межпроцесным объектам блокировки с попыткой -/// захвата блокировки максимального уровня (разделяемой при недоступности -/// эксклюзивной). -/// В зависимости от реализации и/или платформы (Windows) может -/// захватывать блокировку не-операционного супер-уровня (например, для -/// инициализации разделяемых объектов синхронизации), которая затем будет -/// понижена до операционно-эксклюзивной или разделяемой посредством -/// явного вызова mdbx_lck_downgrade(). 
-/// \return -/// MDBX_RESULT_TRUE (-1) - если удалось захватить эксклюзивную блокировку и, -/// следовательно, текущий процесс является первым и единственным -/// после предыдущего использования БД. -/// MDBX_RESULT_FALSE (0) - если удалось захватить только разделяемую -/// блокировку и, следовательно, БД уже открыта и используется другими -/// процессами. -/// Иначе (не 0 и не -1) - код ошибки. -int mdbx_lck_seize(MDBX_env *env); - -/// \brief Снижает уровень первоначальной захваченной блокировки до -/// операционного уровня определяемого аргументом. +/// \brief Initialization of synchronization primitives linked with MDBX_env +/// instance both in LCK-file and within the current process. /// \param -/// complete = TRUE - понижение до разделяемой блокировки. -/// complete = FALSE - понижение до эксклюзивной операционной блокировки. -/// \return Код ошибки или 0 в случае успеха. -int mdbx_lck_downgrade(MDBX_env *env, bool complete); - -/// \brief Блокирует lck-файл и/или таблицу читателей для (де)регистрации. -/// \return Код ошибки или 0 в случае успеха. -int mdbx_rdt_lock(MDBX_env *env); - -/// \brief Разблокирует lck-файл и/или таблицу читателей после (де)регистрации. -void mdbx_rdt_unlock(MDBX_env *env); +/// global_uniqueness_flag = true - denotes that there are no other processes +/// working with DB and LCK-file. Thus the function MUST initialize +/// shared synchronization objects in memory-mapped LCK-file. +/// global_uniqueness_flag = false - denotes that at least one process is +/// already working with DB and LCK-file, including the case when DB +/// has already been opened in the current process. Thus the function +/// MUST NOT initialize shared synchronization objects in memory-mapped +/// LCK-file that are already in use. +/// \return Error code or zero on success. 
+MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, + MDBX_env *inprocess_neighbor, + int global_uniqueness_flag); + +/// \brief Disconnects from shared interprocess objects and destructs +/// synchronization objects linked with MDBX_env instance +/// within the current process. +/// \param +/// inprocess_neighbor = NULL - if the current process does not have other +/// instances of MDBX_env linked with the DB being closed. +/// Thus the function MUST check for other processes working with DB or +/// LCK-file, and keep or destroy shared synchronization objects in +/// memory-mapped LCK-file depending on the result. +/// inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env +/// (any one of them, if there are several) working with DB or LCK-file within the +/// current process. Thus the function MUST NOT try to acquire exclusive +/// lock and/or try to destruct shared synchronization objects linked with +/// DB or LCK-file. Moreover, the implementation MUST ensure correct work +/// of other instances of MDBX_env within the current process, e.g. +/// restore POSIX-fcntl locks after the closing of file descriptors. +/// \return Error code (MDBX_PANIC) or zero on success. +MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, + MDBX_env *inprocess_neighbor); + +/// \brief Connects to shared interprocess locking objects and tries to acquire +/// the maximum lock level (shared if exclusive is not available). +/// Depending on implementation or/and platform (Windows) this function may +/// acquire the non-OS super-level lock (e.g. for shared synchronization +/// objects initialization), which will be downgraded to OS-exclusive or +/// shared via explicit calling of mdbx_lck_downgrade(). +/// \return +/// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus +/// the current process is the first and only after the last use of DB. 
+/// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus +/// DB has already been opened and now is used by other processes. +/// Otherwise (not 0 and not -1) - error code. +MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); + +/// \brief Downgrades the level of initially acquired lock to +/// operational level specified by argument. The reason for such downgrade: +/// - unblocking of other processes that are waiting for access, i.e. +/// if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes +/// should be made aware that access is unavailable rather than +/// wait for it. +/// - freeing locks that interfere with file operations (especially on Windows) +/// (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock. +/// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive +/// operational lock. +/// \return Error code or zero on success +MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); + +/// \brief Locks LCK-file or/and table of readers for (de)registering. +/// \return Error code or zero on success +MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); + +/// \brief Unlocks LCK-file or/and table of readers after (de)registering. +MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); + +/// \brief Acquires lock for DB change (on writing transaction start) +/// Reading transactions will not be blocked. +/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \return Error code or zero on success +LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); + +/// \brief Releases lock once DB changes are made (after writing transaction +/// has finished). +/// Declared as LIBMDBX_API because it is used in mdbx_chk. +LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); -/// \brief Захватывает блокировку для изменения БД (при старте пишущей -/// транзакции). Транзакции чтения при этом никак не блокируются. -/// \return Код ошибки или 0 в случае успеха. 
-LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dontwait); +/// \brief Sets alive-flag of reader presence (indicative lock) for PID of +/// the current process. The function does no more than needed for +/// the correct working of mdbx_rpid_check() in other processes. +/// \return Error code or zero on success +MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); -/// \brief Освобождает блокировку по окончанию изменения БД (после завершения -/// пишущей транзакции). -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Resets alive-flag of reader presence (indicative lock) +/// for PID of the current process. The function does no more than needed +/// for the correct working of mdbx_rpid_check() in other processes. +/// \return Error code or zero on success +MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); -/// \brief Устанавливает alive-флажок присутствия (индицирующую блокировку) -/// читателя для pid текущего процесса. Функции может выполнить не более -/// необходимого минимума для корректной работы mdbx_rpid_check() в других -/// процессах. -/// \return Код ошибки или 0 в случае успеха. -int mdbx_rpid_set(MDBX_env *env); - -/// \brief Снимает alive-флажок присутствия (индицирующую блокировку) -/// читателя для pid текущего процесса. Функции может выполнить не более -/// необходимого минимума для корректной работы mdbx_rpid_check() в других -/// процессах. -/// \return Код ошибки или 0 в случае успеха. -int mdbx_rpid_clear(MDBX_env *env); - -/// \brief Проверяет жив ли процесс-читатель с заданным pid -/// по alive-флажку присутствия (индицирующей блокировку), -/// либо любым другим способом. +/// \brief Checks for reading process status with the given pid with help of +/// alive-flag of presence (indicative lock) or using another way. /// \return -/// MDBX_RESULT_TRUE (-1) - если процесс-читатель с соответствующим pid жив -/// и работает с БД (индицирующая блокировка присутствует). 
-/// MDBX_RESULT_FALSE (0) - если процесс-читатель с соответствующим pid -/// отсутствует или не работает с БД (индицирующая блокировка отсутствует). -/// Иначе (не 0 и не -1) - код ошибки. -int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); +/// MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive +/// and working with DB (indicative lock is present). +/// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent +/// or not working with DB (indicative lock is not present). +/// Otherwise (not 0 and not -1) - error code. +MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) typedef union MDBX_srwlock { @@ -717,14 +759,15 @@ typedef union MDBX_srwlock { } MDBX_srwlock; typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -extern MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared, - mdbx_srwlock_ReleaseShared, mdbx_srwlock_AcquireExclusive, - mdbx_srwlock_ReleaseExclusive; +MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, + mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, + mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -extern MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; +MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx + mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer, @@ -732,18 +775,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -extern MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; +MDBX_INTERNAL_VAR 
MDBX_GetVolumeInformationByHandleW + mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -extern MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -extern MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; +MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle + mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( IN HANDLE FileHandle, IN OUT HANDLE Event, @@ -751,19 +796,48 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -extern MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; + +typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); +MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; -#if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 +#if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { PVOID VirtualAddress; SIZE_T NumberOfBytes; } WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY; -#endif +#endif /* Windows 8.x */ typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -extern MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; + +#if 0 /* LY: unused for now */ +#if !defined(_WIN32_WINNT_WIN81) || _WIN32_WINNT < _WIN32_WINNT_WIN81 +typedef 
enum OFFER_PRIORITY { + VmOfferPriorityVeryLow = 1, + VmOfferPriorityLow, + VmOfferPriorityBelowNormal, + VmOfferPriorityNormal +} OFFER_PRIORITY; +#endif /* Windows 8.1 */ + +typedef DWORD(WINAPI *MDBX_DiscardVirtualMemory)(PVOID VirtualAddress, + SIZE_T Size); +MDBX_INTERNAL_VAR MDBX_DiscardVirtualMemory mdbx_DiscardVirtualMemory; + +typedef DWORD(WINAPI *MDBX_ReclaimVirtualMemory)(PVOID VirtualAddress, + SIZE_T Size); +MDBX_INTERNAL_VAR MDBX_ReclaimVirtualMemory mdbx_ReclaimVirtualMemory; + +typedef DWORD(WINAPI *MDBX_OfferVirtualMemory( + PVOID VirtualAddress, + SIZE_T Size, + OFFER_PRIORITY Priority +); +MDBX_INTERNAL_VAR MDBX_OfferVirtualMemory mdbx_OfferVirtualMemory; +#endif /* unused for now */ #endif /* Windows */ @@ -793,81 +867,6 @@ extern MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; #error FIXME atomic-ops #endif -static __inline uint32_t mdbx_atomic_add32(volatile uint32_t *p, uint32_t v) { -#if !defined(__cplusplus) && defined(ATOMIC_VAR_INIT) - assert(atomic_is_lock_free(p)); - return atomic_fetch_add((_Atomic uint32_t *)p, v); -#elif defined(__GNUC__) || defined(__clang__) - return __sync_fetch_and_add(p, v); -#else -#ifdef _MSC_VER - STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); - return _InterlockedExchangeAdd((volatile long *)p, v); -#endif -#ifdef __APPLE__ - return OSAtomicAdd32(v, (volatile int32_t *)p); -#endif -#endif -} - -static __inline uint64_t mdbx_atomic_add64(volatile uint64_t *p, uint64_t v) { -#if !defined(__cplusplus) && defined(ATOMIC_VAR_INIT) - assert(atomic_is_lock_free(p)); - return atomic_fetch_add((_Atomic uint64_t *)p, v); -#elif defined(__GNUC__) || defined(__clang__) - return __sync_fetch_and_add(p, v); -#else -#ifdef _MSC_VER -#ifdef _WIN64 - return _InterlockedExchangeAdd64((volatile int64_t *)p, v); -#else - return InterlockedExchangeAdd64((volatile int64_t *)p, v); -#endif -#endif /* _MSC_VER */ -#ifdef __APPLE__ - return OSAtomicAdd64(v, (volatile int64_t *)p); -#endif -#endif -} - 
-#define mdbx_atomic_sub32(p, v) mdbx_atomic_add32(p, 0 - (v)) -#define mdbx_atomic_sub64(p, v) mdbx_atomic_add64(p, 0 - (v)) - -static __inline bool mdbx_atomic_compare_and_swap32(volatile uint32_t *p, - uint32_t c, uint32_t v) { -#if !defined(__cplusplus) && defined(ATOMIC_VAR_INIT) - assert(atomic_is_lock_free(p)); - return atomic_compare_exchange_strong((_Atomic uint32_t *)p, &c, v); -#elif defined(__GNUC__) || defined(__clang__) - return __sync_bool_compare_and_swap(p, c, v); -#else -#ifdef _MSC_VER - STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); - return c == _InterlockedCompareExchange((volatile long *)p, v, c); -#endif -#ifdef __APPLE__ - return c == OSAtomicCompareAndSwap32Barrier(c, v, (volatile int32_t *)p); -#endif -#endif -} - -static __inline bool mdbx_atomic_compare_and_swap64(volatile uint64_t *p, - uint64_t c, uint64_t v) { -#if !defined(__cplusplus) && defined(ATOMIC_VAR_INIT) - assert(atomic_is_lock_free(p)); - return atomic_compare_exchange_strong((_Atomic uint64_t *)p, &c, v); -#elif defined(__GNUC__) || defined(__clang__) - return __sync_bool_compare_and_swap(p, c, v); -#else -#ifdef _MSC_VER - return c == _InterlockedCompareExchange64((volatile int64_t *)p, v, c); -#endif -#ifdef __APPLE__ - return c == OSAtomicCompareAndSwap64Barrier(c, v, (volatile uint64_t *)p); -#endif -#endif -} - /*----------------------------------------------------------------------------*/ #if defined(_MSC_VER) && _MSC_VER >= 1900 diff --git a/libs/libmdbx/src/src/elements/version.c.in b/libs/libmdbx/src/src/elements/version.c.in new file mode 100644 index 0000000000..2854bd5d9d --- /dev/null +++ b/libs/libmdbx/src/src/elements/version.c.in @@ -0,0 +1,46 @@ +/* This is CMake-template for libmdbx's version.c + ******************************************************************************/ + +#include "internals.h" + +#if MDBX_VERSION_MAJOR != ${MDBX_VERSION_MAJOR} || \ + MDBX_VERSION_MINOR != ${MDBX_VERSION_MINOR} +#error "API version mismatch! 
Had `git fetch --tags` done?" +#endif + +static const char sourcery[] = STRINGIFY(MDBX_BUILD_SOURCERY); + +__dll_export +#ifdef __attribute_used__ + __attribute_used__ +#elif defined(__GNUC__) || __has_attribute(__used__) + __attribute__((__used__)) +#endif +#ifdef __attribute_externally_visible__ + __attribute_externally_visible__ +#elif (defined(__GNUC__) && !defined(__clang__)) || \ + __has_attribute(__externally_visible__) + __attribute__((__externally_visible__)) +#endif + const mdbx_version_info mdbx_version = { + ${MDBX_VERSION_MAJOR}, + ${MDBX_VERSION_MINOR}, + ${MDBX_VERSION_RELEASE}, + ${MDBX_VERSION_REVISION}, + {"@MDBX_GIT_TIMESTAMP@", "@MDBX_GIT_TREE@", "@MDBX_GIT_COMMIT@", + "@MDBX_GIT_DESCRIBE@"}, + sourcery}; + +__dll_export +#ifdef __attribute_used__ + __attribute_used__ +#elif defined(__GNUC__) || __has_attribute(__used__) + __attribute__((__used__)) +#endif +#ifdef __attribute_externally_visible__ + __attribute_externally_visible__ +#elif (defined(__GNUC__) && !defined(__clang__)) || \ + __has_attribute(__externally_visible__) + __attribute__((__externally_visible__)) +#endif + const char *const mdbx_sourcery_anchor = sourcery; diff --git a/libs/libmdbx/src/src/lck-linux.c b/libs/libmdbx/src/src/lck-linux.c deleted file mode 100644 index 925941b229..0000000000 --- a/libs/libmdbx/src/src/lck-linux.c +++ /dev/null @@ -1,428 +0,0 @@ -/* - * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * <http://www.OpenLDAP.org/license.html>. 
- */ - -#if !(defined(__linux__) || defined(__gnu_linux__)) -#error "This implementation of locking only supports Linux,\ - where is no interaction between the types of lock placed\ - by flock() and fcntl()." -#endif - -#include "./bits.h" -#include <sys/utsname.h> - -/* Some platforms define the EOWNERDEAD error code - * even though they don't support Robust Mutexes. - * Compile with -DMDBX_USE_ROBUST=0. */ -#ifndef MDBX_USE_ROBUST -/* Howard Chu: Android currently lacks Robust Mutex support */ -#if defined(EOWNERDEAD) && \ - !defined(__ANDROID__) /* LY: glibc before 2.10 has a troubles \ - with Robust Mutex too. */ \ - && (!defined(__GLIBC__) || __GLIBC_PREREQ(2, 10) || \ - _POSIX_C_SOURCE >= 200809L) -#define MDBX_USE_ROBUST 1 -#else -#define MDBX_USE_ROBUST 0 -#endif -#endif /* MDBX_USE_ROBUST */ - -/*----------------------------------------------------------------------------*/ -/* global constructor/destructor */ - -uint32_t mdbx_linux_kernel_version; -static __cold __attribute__((__constructor__)) void -mdbx_global_constructor(void) { - struct utsname buffer; - if (uname(&buffer) == 0) { - int i = 0; - char *p = buffer.release; - while (*p && i < 4) { - if (*p >= '0' && *p <= '9') { - long number = strtol(p, &p, 10); - if (number > 0) { - if (number > 255) - number = 255; - mdbx_linux_kernel_version += number << (24 - i * 8); - } - ++i; - } else { - ++p; - } - } - } - - mdbx_rthc_global_init(); -} - -static __cold __attribute__((__destructor__)) void -mdbx_global_destructor(void) { - mdbx_rthc_global_dtor(); -} - -/*----------------------------------------------------------------------------*/ -/* lck */ - -/* Описание реализации блокировок для Linux: - * - * lck-файл отображается в память, в нём организуется таблица читателей и - * размещаются совместно используемые posix-мьютексы (futex). Посредством - * этих мьютексов (см struct MDBX_lockinfo) реализуются: - * - Блокировка таблицы читателей для регистрации, - * т.е. 
функции mdbx_rdt_lock() и mdbx_rdt_unlock(). - * - Блокировка БД для пишущих транзакций, - * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). - * - * Остальной функционал реализуется отдельно посредством файловых блокировок: - * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод - * в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade(). - * - Проверка присутствие процессов-читателей, - * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). - * - * Используется два вида файловых блокировок flock() и fcntl(F_SETLK), - * как для lck-файла, так и для основного файла БД: - * - Для контроля процессов-читателей используются однобайтовые - * range-блокировки lck-файла посредством fcntl(F_SETLK). При этом - * в качестве позиции используется pid процесса-читателя. - * - Для первоначального захвата и shared/exclusive блокировок используется - * комбинация flock() и fcntl(F_SETLK) блокировки одного байта lck-файла - * в нулевой позиции (нулевая позиция не используется механизмом контроля - * процессов-читателей, так как pid пользовательского процесса в Linux - * всегда больше 0). - * - Кроме этого, flock() блокировка основного файла БД используется при работе - * в режимах без lck-файла, как в в read-only, так и в эксклюзивном. - * - Блокировки flock() и fcntl(F_SETLK) в Linux работают независимо. Поэтому - * их комбинирование позволяет предотвратить совместное использование БД - * через NFS, что позволяет fcntl(F_SETLK), одновременно защитившись - * от проблем не-аторманости flock() при переходе между эксклюзивным - * и атомарным режимами блокировок. 
- */ - -static int op_setlk, op_setlkw, op_getlk; -static void __cold choice_fcntl() { - assert(!op_setlk && !op_setlkw && !op_getlk); -#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) - if (mdbx_linux_kernel_version > - 0x030f0000 /* OFD locks are available since 3.15, but engages here - only for 3.16 and larer kernels (LTS) for reliability reasons */ - && (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0) { - op_setlk = F_OFD_SETLK; - op_setlkw = F_OFD_SETLKW; - op_getlk = F_OFD_GETLK; - return; - } -#endif /* OFD locks */ - op_setlk = F_SETLK; - op_setlkw = F_SETLKW; - op_getlk = F_GETLK; -} - -#ifndef OFF_T_MAX -#define OFF_T_MAX \ - ((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff) -#endif -#define LCK_WHOLE OFF_T_MAX - -static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset, - off_t len) { - for (;;) { - struct flock lock_op; - memset(&lock_op, 0, sizeof(lock_op)); - lock_op.l_type = lck; - lock_op.l_whence = SEEK_SET; - lock_op.l_start = offset; - lock_op.l_len = len; - if (fcntl(fd, cmd, &lock_op) == 0) { - if (cmd == op_getlk) { - /* Checks reader by pid. Returns: - * MDBX_RESULT_TRUE - if pid is live (unable to acquire lock) - * MDBX_RESULT_FALSE - if pid is dead (lock acquired). */ - return (lock_op.l_type == F_UNLCK) ? 
MDBX_RESULT_FALSE - : MDBX_RESULT_TRUE; - } - return 0; - } - int rc = errno; - if (rc != EINTR || cmd == op_setlkw) - return rc; - } -} - -static __inline int mdbx_lck_exclusive(int lfd, bool fallback2shared) { - assert(lfd != INVALID_HANDLE_VALUE); - if (flock(lfd, LOCK_EX | LOCK_NB)) - return errno; - int rc = mdbx_lck_op(lfd, op_setlk, F_WRLCK, 0, 1); - if (rc != 0 && fallback2shared) { - while (flock(lfd, LOCK_SH)) { - int rc = errno; - if (rc != EINTR) - return rc; - } - } - return rc; -} - -static __inline int mdbx_lck_shared(int lfd) { - assert(lfd != INVALID_HANDLE_VALUE); - while (flock(lfd, LOCK_SH)) { - int rc = errno; - if (rc != EINTR) - return rc; - } - return mdbx_lck_op(lfd, op_setlkw, F_RDLCK, 0, 1); -} - -int mdbx_lck_downgrade(MDBX_env *env, bool complete) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - return complete ? mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS; -} - -int mdbx_rpid_set(MDBX_env *env) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0); - return mdbx_lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); -} - -int mdbx_rpid_clear(MDBX_env *env) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0); - return mdbx_lck_op(env->me_lfd, op_setlkw, F_UNLCK, env->me_pid, 1); -} - -int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(pid > 0); - return mdbx_lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); -} - -/*---------------------------------------------------------------------------*/ - -static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, - const int rc); - -int __cold mdbx_lck_init(MDBX_env *env) { - pthread_mutexattr_t ma; - int rc = pthread_mutexattr_init(&ma); - if (rc) - return rc; - - rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); - if (rc) - goto bailout; - -#if MDBX_USE_ROBUST -#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \ - !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 
200809L - rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); -#else - rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); -#endif - if (rc) - goto bailout; -#endif /* MDBX_USE_ROBUST */ - -#if _POSIX_C_SOURCE >= 199506L && !defined(MDBX_SAFE4QEMU) - rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT); - if (rc == ENOTSUP) - rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE); - if (rc) - goto bailout; -#endif /* PTHREAD_PRIO_INHERIT */ - - rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); - if (rc) - goto bailout; - - rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma); - if (rc) - goto bailout; - rc = pthread_mutex_init(&env->me_lck->mti_wmutex, &ma); - -bailout: - pthread_mutexattr_destroy(&ma); - return rc; -} - -void __cold mdbx_lck_destroy(MDBX_env *env) { - if (env->me_lfd != INVALID_HANDLE_VALUE) { - /* try get exclusive access */ - if (env->me_lck && mdbx_lck_exclusive(env->me_lfd, false) == 0) { - mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_); - int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); - if (rc == 0) - rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); - assert(rc == 0); - (void)rc; - /* file locks would be released (by kernel) - * while the me_lfd will be closed */ - } - } -} - -static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_lock(mutex); - if (unlikely(rc != 0)) - rc = mdbx_mutex_failed(env, mutex, rc); - return rc; -} - -static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_trylock(mutex); - if (unlikely(rc != 0 && rc != EBUSY)) - rc = mdbx_mutex_failed(env, mutex, rc); - return (rc != EBUSY) ? 
rc : MDBX_BUSY; -} - -static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_unlock(mutex); - if (unlikely(rc != 0)) - rc = mdbx_mutex_failed(env, mutex, rc); - return rc; -} - -int mdbx_rdt_lock(MDBX_env *env) { - mdbx_trace(">>"); - int rc = mdbx_robust_lock(env, &env->me_lck->mti_rmutex); - mdbx_trace("<< rc %d", rc); - return rc; -} - -void mdbx_rdt_unlock(MDBX_env *env) { - mdbx_trace(">>"); - int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex); - mdbx_trace("<< rc %d", rc); - if (unlikely(MDBX_IS_ERROR(rc))) - mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); -} - -int mdbx_txn_lock(MDBX_env *env, bool dontwait) { - mdbx_trace(">>"); - int rc = dontwait ? mdbx_robust_trylock(env, env->me_wmutex) - : mdbx_robust_lock(env, env->me_wmutex); - mdbx_trace("<< rc %d", rc); - return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS; -} - -void mdbx_txn_unlock(MDBX_env *env) { - mdbx_trace(">>"); - int rc = mdbx_robust_unlock(env, env->me_wmutex); - mdbx_trace("<< rc %d", rc); - if (unlikely(MDBX_IS_ERROR(rc))) - mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); -} - -static int __cold internal_seize_lck(int lfd) { - assert(lfd != INVALID_HANDLE_VALUE); - - /* try exclusive access */ - int rc = mdbx_lck_exclusive(lfd, false); - if (rc == 0) - /* got exclusive */ - return MDBX_RESULT_TRUE; - if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { - /* get shared access */ - rc = mdbx_lck_shared(lfd); - if (rc == 0) { - /* got shared, try exclusive again */ - rc = mdbx_lck_exclusive(lfd, true); - if (rc == 0) - /* now got exclusive */ - return MDBX_RESULT_TRUE; - if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) - /* unable exclusive, but stay shared */ - return MDBX_RESULT_FALSE; - } - } - assert(MDBX_IS_ERROR(rc)); - return rc; -} - -int __cold mdbx_lck_seize(MDBX_env *env) { - assert(env->me_fd != INVALID_HANDLE_VALUE); - if (unlikely(op_setlk == 0)) - choice_fcntl(); - - if 
(env->me_lfd == INVALID_HANDLE_VALUE) { - /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ - int rc = mdbx_lck_op(env->me_fd, op_setlk, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - LCK_WHOLE); - if (rc != 0) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); - return rc; - } - return MDBX_RESULT_TRUE; - } - - if ((env->me_flags & MDBX_RDONLY) == 0) { - /* Check that another process don't operates in without-lck mode. */ - int rc = mdbx_lck_op(env->me_fd, op_setlk, F_WRLCK, env->me_pid, 1); - if (rc != 0) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, - "lock-against-without-lck", rc); - return rc; - } - } - - return internal_seize_lck(env->me_lfd); -} - -static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, - const int err) { - int rc = err; -#if MDBX_USE_ROBUST - if (err == EOWNERDEAD) { - /* We own the mutex. Clean up after dead previous owner. */ - - int rlocked = (env->me_lck && mutex == &env->me_lck->mti_rmutex); - rc = MDBX_SUCCESS; - if (!rlocked) { - if (unlikely(env->me_txn)) { - /* env is hosed if the dead thread was ours */ - env->me_flags |= MDBX_FATAL_ERROR; - env->me_txn = NULL; - rc = MDBX_PANIC; - } - } - mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? "this process' env is hosed" : "recovering")); - - int check_rc = mdbx_reader_check0(env, rlocked, NULL); - check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; - -#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \ - !defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L - int mreco_rc = pthread_mutex_consistent_np(mutex); -#else - int mreco_rc = pthread_mutex_consistent(mutex); -#endif - check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; - - if (unlikely(mreco_rc)) - mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc)); - - rc = (rc == MDBX_SUCCESS) ? 
check_rc : rc; - if (MDBX_IS_ERROR(rc)) - pthread_mutex_unlock(mutex); - return rc; - } -#else - (void)mutex; -#endif /* MDBX_USE_ROBUST */ - - mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); - if (rc != EDEADLK) - env->me_flags |= MDBX_FATAL_ERROR; - return rc; -} diff --git a/libs/libmdbx/src/src/lck-posix.c b/libs/libmdbx/src/src/lck-posix.c deleted file mode 100644 index 3b93157980..0000000000 --- a/libs/libmdbx/src/src/lck-posix.c +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * <http://www.OpenLDAP.org/license.html>. - */ - -#include "./bits.h" - -/* Some platforms define the EOWNERDEAD error code - * even though they don't support Robust Mutexes. - * Compile with -DMDBX_USE_ROBUST=0. */ -#ifndef MDBX_USE_ROBUST -#if (defined(EOWNERDEAD) || _POSIX_C_SOURCE >= 200809L) && !defined(__APPLE__) -#define MDBX_USE_ROBUST 1 -#else -#define MDBX_USE_ROBUST 0 -#endif -#endif /* MDBX_USE_ROBUST */ - -/*----------------------------------------------------------------------------*/ -/* rthc */ - -static __cold __attribute__((__constructor__)) void -mdbx_global_constructor(void) { - mdbx_rthc_global_init(); -} - -static __cold __attribute__((__destructor__)) void -mdbx_global_destructor(void) { - mdbx_rthc_global_dtor(); -} - -/*----------------------------------------------------------------------------*/ -/* lck */ - -/* Описание реализации блокировок для POSIX: - * - * lck-файл отображается в память, в нём организуется таблица читателей и - * размещаются совместно используемые posix-мьютексы (futex). 
Посредством - * этих мьютексов (см struct MDBX_lockinfo) реализуются: - * - Блокировка таблицы читателей для регистрации, - * т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock(). - * - Блокировка БД для пишущих транзакций, - * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). - * - * Остальной функционал реализуется отдельно посредством файловых блокировок: - * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод - * в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade(). - * - Проверка присутствие процессов-читателей, - * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). - * - * Для блокировки файлов Используется только fcntl(F_SETLK), так как: - * - lockf() оперирует только эксклюзивной блокировкой и требует - * открытия файла в RW-режиме. - * - flock() не гарантирует атомарности при смене блокировок - * и оперирует только всем файлом целиком. - * - Для контроля процессов-читателей используются однобайтовые - * range-блокировки lck-файла посредством fcntl(F_SETLK). При этом - * в качестве позиции используется pid процесса-читателя. - * - Для первоначального захвата и shared/exclusive выполняется блокировка - * основного файла БД и при успехе lck-файла. - */ - -#ifndef OFF_T_MAX -#define OFF_T_MAX \ - ((sizeof(off_t) > 4 ? 
INT64_MAX : INT32_MAX) & ~(size_t)0xffff) -#endif -#ifndef PID_T_MAX -#define PID_T_MAX INT_MAX -#endif - -#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) -#define OP_SETLK F_OFD_SETLK -#define OP_SETLKW F_OFD_SETLKW -#define OP_GETLK F_OFD_GETLK -#else -#define OP_SETLK F_SETLK -#define OP_SETLKW F_SETLKW -#define OP_GETLK F_GETLK -#endif /* OFD locks */ - -static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset, - off_t len) { - for (;;) { - struct flock lock_op; - memset(&lock_op, 0, sizeof(lock_op)); - lock_op.l_type = lck; - lock_op.l_whence = SEEK_SET; - lock_op.l_start = offset; - lock_op.l_len = len; - if (fcntl(fd, cmd, &lock_op) == 0) { - if (cmd == OP_GETLK) { - /* Checks reader by pid. Returns: - * MDBX_RESULT_TRUE - if pid is live (unable to acquire lock) - * MDBX_RESULT_FALSE - if pid is dead (lock acquired). */ - return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE - : MDBX_RESULT_TRUE; - } - return 0; - } - int rc = errno; - if (rc != EINTR || cmd == F_SETLKW) - return rc; - } -} - -int mdbx_rpid_set(MDBX_env *env) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); - return mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, env->me_pid, 1); -} - -int mdbx_rpid_clear(MDBX_env *env) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); - return mdbx_lck_op(env->me_lfd, OP_SETLKW, F_UNLCK, env->me_pid, 1); -} - -int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(pid > 0 && pid <= PID_T_MAX); - assert(PID_T_MAX < OFF_T_MAX); - return mdbx_lck_op(env->me_lfd, OP_GETLK, F_WRLCK, pid, 1); -} - -int __cold mdbx_lck_seize(MDBX_env *env) { - assert(env->me_fd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); - - if (env->me_lfd == INVALID_HANDLE_VALUE) { - /* LY: without-lck mode (e.g. 
exclusive or on read-only filesystem) */ - int rc = mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - OFF_T_MAX); - if (rc != 0) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); - return rc; - } - return MDBX_RESULT_TRUE; - } - - /* try exclusive access */ - int rc = mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - OFF_T_MAX); - if (rc == 0) { - continue_exclusive: - /* got dxb-exclusive, continue lck-exclusive */ - rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_WRLCK, 0, OFF_T_MAX); - if (rc == 0) { - /* got both exclusive */ - return MDBX_RESULT_TRUE; - } - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, - "lck-after-dxb-exclusive", rc); - assert(MDBX_IS_ERROR(rc)); - goto bailout; - } - - if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { - rc = mdbx_lck_op(env->me_fd, OP_SETLKW, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, - env->me_pid, 1); - if (rc == 0) { - /* got dxb-shared, try again dxb-exclusive */ - rc = mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, 0, - OFF_T_MAX); - if (rc == 0) - goto continue_exclusive; - - /* continue lck-shared */ - rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1); - if (rc == 0) { - /* got both dxb and lck shared lock */ - return MDBX_RESULT_FALSE; - } - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck-shared", rc); - } else { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb-shared", rc); - } - assert(MDBX_IS_ERROR(rc)); - } - -bailout: - (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); - (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); - assert(MDBX_IS_ERROR(rc)); - return rc; -} - -int mdbx_lck_downgrade(MDBX_env *env, bool complete) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - int rc = mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 1, OFF_T_MAX - 1); - if (rc == 0) - rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1); - if (unlikely(rc != 0)) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck", rc); - goto bailout; - } - if (complete) { - rc = mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, - env->me_pid, 1); - if (unlikely(rc != 0)) { - mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb", rc); - goto bailout; - } - } - return MDBX_SUCCESS; - -bailout: - (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); - (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); - assert(MDBX_IS_ERROR(rc)); - return rc; -} - -/*---------------------------------------------------------------------------*/ - -static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, - const int rc); - -int __cold mdbx_lck_init(MDBX_env *env) { - pthread_mutexattr_t ma; - int rc = pthread_mutexattr_init(&ma); - if (rc) - return rc; - - rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); - if (rc) - goto bailout; - -#if MDBX_USE_ROBUST - rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); - if (rc) - goto bailout; -#endif /* MDBX_USE_ROBUST */ - -#if _POSIX_C_SOURCE >= 199506L && !defined(MDBX_SAFE4QEMU) - rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT); - if (rc == ENOTSUP) - rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE); - if (rc) - goto bailout; -#endif /* PTHREAD_PRIO_INHERIT */ - - rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); - if (rc) - goto bailout; - - rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma); - if (rc) - goto bailout; - rc = pthread_mutex_init(&env->me_lck->mti_wmutex, &ma); - -bailout: - pthread_mutexattr_destroy(&ma); - return rc; -} - -void __cold mdbx_lck_destroy(MDBX_env *env) { - /* File locks would be released (by kernel) while the file-descriptors - * will be closed. But to avoid false-positive EDEADLK from the kernel, - * locks should be released here explicitly with properly order. */ - if (env->me_lfd != INVALID_HANDLE_VALUE) { - /* try get exclusive access */ - if (env->me_lck && - mdbx_lck_op(env->me_fd, OP_SETLK, - (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, 0, - OFF_T_MAX) == 0 && - mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, 0, OFF_T_MAX) == 0) { - mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_); - int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); - if (rc == 0) - rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); - assert(rc == 0); - (void)rc; - msync(env->me_lck, env->me_os_psize, MS_ASYNC); - } - (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); - } - if (env->me_fd != INVALID_HANDLE_VALUE) - (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); -} - -static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_lock(mutex); - if (unlikely(rc != 0)) - rc = mdbx_mutex_failed(env, mutex, rc); - return rc; -} - -static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_trylock(mutex); - if (unlikely(rc != 0 && rc != EBUSY)) - rc = mdbx_mutex_failed(env, mutex, rc); - return (rc != EBUSY) ? rc : MDBX_BUSY; -} - -static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_unlock(mutex); - if (unlikely(rc != 0)) - rc = mdbx_mutex_failed(env, mutex, rc); - return rc; -} - -int mdbx_rdt_lock(MDBX_env *env) { - mdbx_trace(">>"); - int rc = mdbx_robust_lock(env, &env->me_lck->mti_rmutex); - mdbx_trace("<< rc %d", rc); - return rc; -} - -void mdbx_rdt_unlock(MDBX_env *env) { - mdbx_trace(">>"); - int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex); - mdbx_trace("<< rc %d", rc); - if (unlikely(MDBX_IS_ERROR(rc))) - mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); -} - -int mdbx_txn_lock(MDBX_env *env, bool dontwait) { - mdbx_trace(">>"); - int rc = dontwait ? mdbx_robust_trylock(env, env->me_wmutex) - : mdbx_robust_lock(env, env->me_wmutex); - mdbx_trace("<< rc %d", rc); - return MDBX_IS_ERROR(rc) ? 
rc : MDBX_SUCCESS; -} - -void mdbx_txn_unlock(MDBX_env *env) { - mdbx_trace(">>"); - int rc = mdbx_robust_unlock(env, env->me_wmutex); - mdbx_trace("<< rc %d", rc); - if (unlikely(MDBX_IS_ERROR(rc))) - mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); -} - -static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, - const int err) { - int rc = err; -#if MDBX_USE_ROBUST - if (err == EOWNERDEAD) { - /* We own the mutex. Clean up after dead previous owner. */ - - int rlocked = (env->me_lck && mutex == &env->me_lck->mti_rmutex); - rc = MDBX_SUCCESS; - if (!rlocked) { - if (unlikely(env->me_txn)) { - /* env is hosed if the dead thread was ours */ - env->me_flags |= MDBX_FATAL_ERROR; - env->me_txn = NULL; - rc = MDBX_PANIC; - } - } - mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? "this process' env is hosed" : "recovering")); - - int check_rc = mdbx_reader_check0(env, rlocked, NULL); - check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; - - int mreco_rc = pthread_mutex_consistent(mutex); - check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; - - if (unlikely(mreco_rc)) - mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc)); - - rc = (rc == MDBX_SUCCESS) ? check_rc : rc; - if (MDBX_IS_ERROR(rc)) - pthread_mutex_unlock(mutex); - return rc; - } -#else - (void)mutex; -#endif /* MDBX_USE_ROBUST */ - - mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); - if (rc != EDEADLK) - env->me_flags |= MDBX_FATAL_ERROR; - return rc; -} diff --git a/libs/libmdbx/src/src/man1/mdbx_chk.1 b/libs/libmdbx/src/src/man1/mdbx_chk.1 new file mode 100644 index 0000000000..f9f603e634 --- /dev/null +++ b/libs/libmdbx/src/src/man1/mdbx_chk.1 @@ -0,0 +1,87 @@ +.\" Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>. +.\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
+.TH MDBX_CHK 1 "2019-12-05" "MDBX 0.4.x" +.SH NAME +mdbx_chk \- MDBX checking tool +.SH SYNOPSIS +.B mdbx_chk +[\c +.BR \-V ] +[\c +.BR \-v [ v [ v ]]] +[\c +.BR \-n ] +[\c +.BR \-q ] +[\c +.BR \-c ] +[\c +.BR \-w ] +[\c +.BR \-d ] +[\c +.BR \-i ] +[\c +.BI \-s \ subdb\fR] +.BR \ dbpath +.SH DESCRIPTION +The +.B mdbx_chk +utility is intended to check an MDBX database file. +.SH OPTIONS +.TP +.BR \-V +Write the library version number to the standard output, and exit. +.TP +.BR \-v +Produce verbose output, including summarized space and page usage statistics. +If \fB\-vv\fP is given, be more verbose, show summarized B-tree info +and space allocation. +If \fB\-vvv\fP is given, be more verbose, include summarized statistics +of leaf B-tree pages. +If \fB\-vvvv\fP is given, be even more verbose, show info of each page +during B-tree traversal and basic info of each GC record. +If \fB\-vvvvv\fP is given, turn maximal verbosity, display the full list +of page IDs in the GC records and size of each key-value pair of database(s). +.TP +.BR \-n +Open MDBX environment(s) which do not use subdirectories. +.TP +.BR \-q +Be quiet; do not output anything even if an error was detected. +.TP +.BR \-c +Force using cooperative mode while opening environment, i.e. don't try to open +in exclusive/monopolistic mode. Only exclusive/monopolistic mode allows a complete +check, including full check of all meta-pages and actual size of database file. +.TP +.BR \-w +Open environment in read-write mode and lock for writing while checking. +This could be impossible if the environment is already used by other process(es) +in an incompatible read-write mode. This allows rollback to the last steady commit +(in case environment was not closed properly) and then check transaction IDs +of meta-pages. Otherwise, without \fB\-w\fP option environment will be +opened in read-only mode. +.TP +.BR \-d +Disable page-by-page traversal of B-tree.
In this case, without B-tree +traversal, it is unable to check for lost-unused pages nor for double-used +pages. +.TP +.BR \-i +Ignore wrong order errors, which will likely false-positive if custom +comparator(s) was used. +.TP +.BR \-s \ subdb +Verify and show info only for a specific subdatabase. +.SH DIAGNOSTICS +Exit status is zero if no errors occur. Errors result in a non-zero exit status +and a diagnostic message being written to standard error +if no quiet mode was requested. +.SH "SEE ALSO" +.BR mdbx_stat (1), +.BR mdbx_copy (1), +.BR mdbx_dump (1), +.BR mdbx_load (1) +.SH AUTHOR +Leonid Yuriev <https://github.com/leo-yuriev> diff --git a/libs/libmdbx/src/src/tools/mdbx_copy.1 b/libs/libmdbx/src/src/man1/mdbx_copy.1 index 74d94b6b98..82acdde8fa 100644 --- a/libs/libmdbx/src/src/tools/mdbx_copy.1 +++ b/libs/libmdbx/src/src/man1/mdbx_copy.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_COPY 1 "2014/06/20" "LMDB 0.9.14" +.TH MDBX_COPY 1 "2019-12-05" "MDBX 0.4.x" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS @@ -10,12 +10,14 @@ mdbx_copy \- MDBX environment copy tool [\c .BR \-V ] [\c +.BR \-q ] +[\c .BR \-c ] [\c .BR \-n ] -.B srcpath +.B src_path [\c -.BR dstpath ] +.BR dest_path ] .SH DESCRIPTION The .B mdbx_copy @@ -24,7 +26,7 @@ be copied regardless of whether it is currently in use. No lockfile is created, since it gets recreated at need. If -.I dstpath +.I dest_path is specified it must be the path of an empty directory for storing the backup. Otherwise, the backup will be written to stdout. @@ -34,6 +36,9 @@ written to stdout. .BR \-V Write the library version number to the standard output, and exit. .TP +.BR \-q +Be quiet. +.TP .BR \-c Compact while copying. Only current data pages will be copied; freed or unused pages will be omitted from the copy. 
This option will @@ -41,7 +46,7 @@ slow down the backup process as it is more CPU-intensive. Currently it fails if the environment has suffered a page leak. .TP .BR \-n -Open LDMB environment(s) which do not use subdirectories. +Open MDBX environment(s) which do not use subdirectories. .SH DIAGNOSTICS Exit status is zero if no errors occur. @@ -52,6 +57,10 @@ This utility can trigger significant file size growth if run in parallel with write transactions, because pages which they free during copying cannot be reused until the copy is done. .SH "SEE ALSO" -.BR mdbx_stat (1) +.BR mdbx_dump (1), +.BR mdbx_chk (1), +.BR mdbx_stat (1), +.BR mdbx_load (1) .SH AUTHOR -Howard Chu of Symas Corporation <http://www.symas.com> +Howard Chu of Symas Corporation <http://www.symas.com>, +Leonid Yuriev <https://github.com/leo-yuriev> diff --git a/libs/libmdbx/src/src/tools/mdbx_dump.1 b/libs/libmdbx/src/src/man1/mdbx_dump.1 index 93d29a7cf3..a9481f6bf8 100644 --- a/libs/libmdbx/src/src/tools/mdbx_dump.1 +++ b/libs/libmdbx/src/src/man1/mdbx_dump.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DUMP 1 "2014/06/20" "LMDB 0.9.14" +.TH MDBX_DUMP 1 "2019-12-05" "MDBX 0.4.x" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS @@ -10,17 +10,21 @@ mdbx_dump \- MDBX environment export tool [\c .BR \-V ] [\c +.BR \-q ] +[\c .BI \-f \ file\fR] [\c .BR \-l ] [\c -.BR \-n ] -[\c .BR \-p ] [\c .BR \-a \ | .BI \-s \ subdb\fR] -.BR \ envpath +[\c +.BR \-r ] +[\c +.BR \-n ] +.BR \ dbpath .SH DESCRIPTION The .B mdbx_dump @@ -34,6 +38,9 @@ utility. .BR \-V Write the library version number to the standard output, and exit. .TP +.BR \-q +Be quiet. +.TP .BR \-f \ file Write to the specified file instead of to the standard output. .TP @@ -41,9 +48,6 @@ Write to the specified file instead of to the standard output. 
List the databases stored in the environment. Just the names will be listed, no data will be output. .TP -.BR \-n -Dump an MDBX database which does not use subdirectories. -.TP .BR \-p If characters in either the key or data items are printing characters (as defined by isprint(3)), output them directly. This option permits users to @@ -58,6 +62,12 @@ Dump all of the subdatabases in the environment. .TP .BR \-s \ subdb Dump a specific subdatabase. If no database is specified, only the main database is dumped. +.TP +.BR \-r +Rescue mode. Ignore some errors to dump corrupted DB. +.TP +.BR \-n +Dump an MDBX database which does not use subdirectories. .SH DIAGNOSTICS Exit status is zero if no errors occur. Errors result in a non-zero exit status and @@ -72,6 +82,10 @@ The only available workaround is to modify the source for the .BR mdbx_load (1) utility to load the database using the correct comparison functions. .SH "SEE ALSO" -.BR mdbx_load (1) +.BR mdbx_load (1), +.BR mdbx_copy (1), +.BR mdbx_chk (1), +.BR mdbx_stat (1) .SH AUTHOR -Howard Chu of Symas Corporation <http://www.symas.com> +Howard Chu of Symas Corporation <http://www.symas.com>, +Leonid Yuriev <https://github.com/leo-yuriev> diff --git a/libs/libmdbx/src/src/tools/mdbx_load.1 b/libs/libmdbx/src/src/man1/mdbx_load.1 index e23ec78eee..927969d5cc 100644 --- a/libs/libmdbx/src/src/tools/mdbx_load.1 +++ b/libs/libmdbx/src/src/man1/mdbx_load.1 @@ -2,7 +2,7 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>. .\" Copying restrictions apply. See COPYRIGHT/LICENSE.
-.TH MDBX_LOAD 1 "2014/06/20" "LMDB 0.9.14" +.TH MDBX_LOAD 1 "2019-12-05" "MDBX 0.4.x" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS @@ -10,22 +10,28 @@ mdbx_load \- MDBX environment import tool [\c .BR \-V ] [\c -.BI \-f \ file\fR] +.BR \-q ] [\c -.BR \-n ] +.BR \-a ] +[\c +.BI \-f \ file\fR] [\c .BI \-s \ subdb\fR] [\c .BR \-N ] [\c .BR \-T ] -.BR \ envpath +[\c +.BR \-r ] +[\c +.BR \-n ] +.BR \ dbpath .SH DESCRIPTION The .B mdbx_load utility reads from the standard input and loads it into the MDBX environment -.BR envpath . +.BR dbpath . The input to .B mdbx_load @@ -39,6 +45,9 @@ option below. .BR \-V Write the library version number to the standard output, and exit. .TP +.BR \-q +Be quiet. +.TP .BR \-a Append all records in the order they appear in the input. The input is assumed to already be in correctly sorted order and no sorting or checking for redundant values will be performed. @@ -49,9 +58,6 @@ on a database that uses custom compare functions. .BR \-f \ file Read from the specified file instead of from the standard input. .TP -.BR \-n -Load an MDBX database which does not use subdirectories. -.TP .BR \-s \ subdb Load a specific subdatabase. If no database is specified, data is loaded into the main database. .TP @@ -62,6 +68,12 @@ Don't overwrite existing records when loading into an already existing database; Load data from simple text files. The input must be paired lines of text, where the first line of the pair is the key item, and the second line of the pair is its corresponding data item. +.TP +.BR \-r +Rescue mode. Ignore errors to load corrupted DB dump. +.TP +.BR \-n +Load an MDBX database which does not use subdirectories. A simple escape mechanism, where newline and backslash (\\) characters are special, is applied to the text input. Newline characters are interpreted as record separators. @@ -81,6 +93,10 @@ Errors result in a non-zero exit status and a diagnostic message being written to standard error.
.SH "SEE ALSO" -.BR mdbx_dump (1) +.BR mdbx_dump (1), +.BR mdbx_chk (1), +.BR mdbx_stat (1), +.BR mdbx_copy (1) .SH AUTHOR -Howard Chu of Symas Corporation <http://www.symas.com> +Howard Chu of Symas Corporation <http://www.symas.com>, +Leonid Yuriev <https://github.com/leo-yuriev> diff --git a/libs/libmdbx/src/src/tools/mdbx_stat.1 b/libs/libmdbx/src/src/man1/mdbx_stat.1 index 50a30b4f97..b78df372f6 100644 --- a/libs/libmdbx/src/src/tools/mdbx_stat.1 +++ b/libs/libmdbx/src/src/man1/mdbx_stat.1 @@ -2,7 +2,7 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC <http://billing.ru/>. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_STAT 1 "2014/06/20" "LMDB 0.9.14" +.TH MDBX_STAT 1 "2019-12-05" "MDBX 0.4.x" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS @@ -10,17 +10,19 @@ mdbx_stat \- MDBX environment status tool [\c .BR \-V ] [\c +.BR \-q ] +[\c .BR \-e ] [\c .BR \-f [ f [ f ]]] [\c -.BR \-n ] -[\c .BR \-r [ r ]] [\c .BR \-a \ | .BI \-s \ subdb\fR] -.BR \ envpath +.BR \ dbpath +[\c +.BR \-n ] .SH DESCRIPTION The .B mdbx_stat @@ -30,6 +32,9 @@ utility displays the status of an MDBX environment. .BR \-V Write the library version number to the standard output, and exit. .TP +.BR \-q +Be quiet. +.TP .BR \-e Display information about the database environment. .TP @@ -38,9 +43,6 @@ Display information about the environment freelist. If \fB\-ff\fP is given, summarize each freelist entry. If \fB\-fff\fP is given, display the full list of page IDs in the freelist. .TP -.BR \-n -Display the status of an MDBX database which does not use subdirectories. -.TP .BR \-r Display information about the environment reader table. Shows the process ID, thread ID, and transaction ID for each active @@ -56,11 +58,18 @@ Display the status of all of the subdatabases in the environment. .TP .BR \-s \ subdb Display the status of a specific subdatabase. 
+.TP +.BR \-n +Display the status of an MDBX database which does not use subdirectories. .SH DIAGNOSTICS Exit status is zero if no errors occur. Errors result in a non-zero exit status and a diagnostic message being written to standard error. .SH "SEE ALSO" -.BR mdbx_copy (1) +.BR mdbx_chk (1), +.BR mdbx_copy (1), +.BR mdbx_dump (1), +.BR mdbx_load (1) .SH AUTHOR -Howard Chu of Symas Corporation <http://www.symas.com> +Howard Chu of Symas Corporation <http://www.symas.com>, +Leonid Yuriev <https://github.com/leo-yuriev> diff --git a/libs/libmdbx/src/src/tools/CMakeLists.txt b/libs/libmdbx/src/src/tools/CMakeLists.txt index 3e200098f8..99167fa86f 100644 --- a/libs/libmdbx/src/src/tools/CMakeLists.txt +++ b/libs/libmdbx/src/src/tools/CMakeLists.txt @@ -1,19 +1,60 @@ -project(mdbx_tools) - -set(MDBX_TOOLS - mdbx_chk - mdbx_copy - mdbx_dump - mdbx_load - mdbx_stat - ) - -foreach (TOOL ${MDBX_TOOLS}) - add_executable(${TOOL} ${TOOL}.c) - - target_link_libraries(${TOOL} mdbx) - install(TARGETS ${TOOL} DESTINATION ${CMAKE_INSTALL_PREFIX}/bin COMPONENT mdbx) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TOOL}.1) - install(FILES ${TOOL}.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/man/man1 COMPONENT mdbx) +set(MDBX_TOOLS mdbx_chk mdbx_copy mdbx_dump mdbx_load mdbx_stat) + +# use, i.e. 
don't skip the full RPATH for the build tree +set(CMAKE_SKIP_BUILD_RPATH FALSE) + +# when building, don't use the install RPATH already (but later on when installing) +set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) + +# add the automatically determined parts of the RPATH +# which point to directories outside the build tree to the install RPATH +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + +# the RPATH to be used when installing, but only if it's not a system directory +list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir) +if(isSystemDir EQUAL -1) + if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") + set(CMAKE_INSTALL_RPATH "@executable_path/../lib") + else() + set(CMAKE_INSTALL_RPATH "\$ORIGIN/../lib") + endif() +endif() + +foreach(TOOL ${MDBX_TOOLS}) + if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + add_executable(${TOOL} ${TOOL}.c wingetopt.c wingetopt.h) + else() + add_executable(${TOOL} ${TOOL}.c) endif() + + target_link_libraries(${TOOL} mdbx ${CMAKE_THREAD_LIBS_INIT}) + set_target_properties(${TOOL} PROPERTIES + C_STANDARD ${MDBX_C_STANDARD} C_STANDARD_REQUIRED ON + INTERPROCEDURAL_OPTIMIZATION $<BOOL:${INTERPROCEDURAL_OPTIMIZATION}>) endforeach() + +if(LIB_MATH) + target_link_libraries(mdbx_chk ${LIB_MATH}) + target_link_libraries(mdbx_stat ${LIB_MATH}) +endif() + +install( + TARGETS + mdbx_chk + mdbx_stat + mdbx_copy + mdbx_dump + mdbx_load + RUNTIME + DESTINATION bin + COMPONENT runtime) + +install( + FILES + ../man1/mdbx_chk.1 + ../man1/mdbx_stat.1 + ../man1/mdbx_copy.1 + ../man1/mdbx_dump.1 + ../man1/mdbx_load.1 + DESTINATION man/man1 + COMPONENT doc) diff --git a/libs/libmdbx/src/src/tools/mdbx_chk.c b/libs/libmdbx/src/src/tools/mdbx_chk.c index 9d3b9e063c..ce1a4a0efd 100644 --- a/libs/libmdbx/src/src/tools/mdbx_chk.c +++ b/libs/libmdbx/src/src/tools/mdbx_chk.c @@ -20,22 +20,21 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ -/* Avoid reference to mdbx_runtime_flags from assert() */ -#define mdbx_runtime_flags (~0u) -#include "../bits.h" +#define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#include "../elements/internals.h" typedef struct flagbit { int bit; - char *name; + const char *name; } flagbit; -flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"}, - {MDBX_INTEGERKEY, "integerkey"}, - {MDBX_REVERSEKEY, "reversekey"}, - {MDBX_DUPFIXED, "dupfixed"}, - {MDBX_REVERSEDUP, "reversedup"}, - {MDBX_INTEGERDUP, "integerdup"}, - {0, NULL}}; +const flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"}, + {MDBX_INTEGERKEY, "integerkey"}, + {MDBX_REVERSEKEY, "reversekey"}, + {MDBX_DUPFIXED, "dupfixed"}, + {MDBX_REVERSEDUP, "reversedup"}, + {MDBX_INTEGERDUP, "integerdup"}, + {0, NULL}}; #if defined(_WIN32) || defined(_WIN64) #include "wingetopt.h" @@ -43,7 +42,7 @@ flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"}, static volatile BOOL user_break; static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) { (void)dwCtrlType; - user_break = true; + user_break = 1; return true; } @@ -78,24 +77,23 @@ struct { short *pagemap; uint64_t total_payload_bytes; uint64_t pgcount; - walk_dbi_t dbi[MAX_DBI]; + walk_dbi_t dbi[MAX_DBI + CORE_DBS + /* account pseudo-entry for meta */ 1]; } walk; #define dbi_free walk.dbi[FREE_DBI] #define dbi_main walk.dbi[MAIN_DBI] #define dbi_meta walk.dbi[CORE_DBS] -uint64_t total_unused_bytes; int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE; - MDBX_env *env; MDBX_txn *txn; MDBX_envinfo envinfo; MDBX_stat envstat; -size_t maxkeysize, userdb_count, skipped_subdb; -uint64_t reclaimable_pages, gc_pages, alloc_pages, unused_pages, backed_pages; +size_t userdb_count, skipped_subdb; +uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages, + unused_pages, backed_pages; unsigned verbose; -char ignore_wrong_order, quiet; +bool ignore_wrong_order, quiet; const char *only_subdb; struct problem { @@ -124,7 +122,7 @@ static void 
__printf_args(1, 2) error(const char *msg, ...) { if (!quiet) { va_list args; - fflush(stdout); + fflush(NULL); va_start(args, msg); fputs(" ! ", stderr); vfprintf(stderr, msg, args); @@ -133,8 +131,21 @@ static void __printf_args(1, 2) error(const char *msg, ...) { } } +static int check_user_break(void) { + switch (user_break) { + case 0: + return MDBX_SUCCESS; + case 1: + print(" - interrupted by signal\n"); + fflush(NULL); + user_break = 2; + } + return MDBX_EINTR; +} + static void pagemap_cleanup(void) { - for (int i = CORE_DBS; ++i < MAX_DBI;) { + for (size_t i = CORE_DBS + /* account pseudo-entry for meta */ 1; + i < ARRAY_LENGTH(walk.dbi); ++i) { if (walk.dbi[i].name) { mdbx_free((void *)walk.dbi[i].name); walk.dbi[i].name = NULL; @@ -158,20 +169,21 @@ static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { if (last && strcmp(last->name, dbi_name) == 0) return last; - walk_dbi_t *dbi = walk.dbi + CORE_DBS; - for (dbi = walk.dbi + CORE_DBS; (++dbi)->name;) { + walk_dbi_t *dbi = walk.dbi + CORE_DBS + /* account pseudo-entry for meta */ 1; + for (; dbi < ARRAY_END(walk.dbi) && dbi->name; ++dbi) { if (strcmp(dbi->name, dbi_name) == 0) return last = dbi; - if (dbi == walk.dbi + MAX_DBI) - return NULL; } - dbi->name = mdbx_strdup(dbi_name); if (verbose > 0 && !silent) { print(" - found '%s' area\n", dbi_name); fflush(NULL); } + if (dbi == ARRAY_END(walk.dbi)) + return NULL; + + dbi->name = mdbx_strdup(dbi_name); return last = dbi; } @@ -387,7 +399,7 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, } } - return user_break ? 
MDBX_EINTR : MDBX_SUCCESS; + return check_user_break(); } typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, @@ -400,106 +412,112 @@ static int handle_userdb(const uint64_t record_number, const MDBX_val *key, (void)record_number; (void)key; (void)data; - return MDBX_SUCCESS; + return check_user_break(); } static int handle_freedb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { char *bad = ""; pgno_t *iptr = data->iov_base; - txnid_t txnid = *(txnid_t *)key->iov_base; if (key->iov_len != sizeof(txnid_t)) problem_add("entry", record_number, "wrong txn-id size", "key-size %" PRIiPTR, key->iov_len); - else if (txnid < 1 || txnid > envinfo.mi_recent_txnid) - problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN, txnid); else { - if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) - problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR, data->iov_len); - size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; - if (number < 1 || number > MDBX_PNL_MAX) - problem_add("entry", txnid, "wrong idl length", "%" PRIuPTR, number); - else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { - problem_add("entry", txnid, "trimmed idl", - "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", - (number + 1) * sizeof(pgno_t), data->iov_len); - number = data->iov_len / sizeof(pgno_t) - 1; - } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= - /* LY: allow gap upto one page. it is ok - * and better than shink-and-retry inside mdbx_update_gc() */ - envstat.ms_psize) - problem_add("entry", txnid, "extra idl space", - "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", - (number + 1) * sizeof(pgno_t), data->iov_len); - - gc_pages += number; - if (envinfo.mi_latter_reader_txnid > txnid) - reclaimable_pages += number; - - pgno_t prev = MDBX_PNL_ASCENDING ? 
NUM_METAS - 1 : txn->mt_next_pgno; - pgno_t span = 1; - for (unsigned i = 0; i < number; ++i) { - const pgno_t pgno = iptr[i]; - if (pgno < NUM_METAS) - problem_add("entry", txnid, "wrong idl entry", - "pgno %" PRIaPGNO " < meta-pages %u", pgno, NUM_METAS); - else if (pgno >= backed_pages) - problem_add("entry", txnid, "wrong idl entry", - "pgno %" PRIaPGNO " > backed-pages %" PRIu64, pgno, - backed_pages); - else if (pgno >= alloc_pages) - problem_add("entry", txnid, "wrong idl entry", - "pgno %" PRIaPGNO " > alloc-pages %" PRIu64, pgno, - alloc_pages - 1); - else { - if (MDBX_PNL_DISORDERED(prev, pgno)) { - bad = " [bad sequence]"; - problem_add("entry", txnid, "bad sequence", - "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev, - (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), - i, pgno); - } - if (walk.pagemap) { - int idx = walk.pagemap[pgno]; - if (idx == 0) - walk.pagemap[pgno] = -1; - else if (idx > 0) - problem_add("page", pgno, "already used", "by %s", - walk.dbi[idx - 1].name); - else - problem_add("page", pgno, "already listed in GC", nullptr); + txnid_t txnid; + memcpy(&txnid, key->iov_base, sizeof(txnid)); + if (txnid < 1 || txnid > envinfo.mi_recent_txnid) + problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN, txnid); + else { + if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) + problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR, + data->iov_len); + size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; + if (number < 1 || number > MDBX_PNL_MAX) + problem_add("entry", txnid, "wrong idl length", "%" PRIuPTR, number); + else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { + problem_add("entry", txnid, "trimmed idl", + "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", + (number + 1) * sizeof(pgno_t), data->iov_len); + number = data->iov_len / sizeof(pgno_t) - 1; + } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= + /* LY: allow gap upto one page. 
it is ok + * and better than shink-and-retry inside mdbx_update_gc() */ + envstat.ms_psize) + problem_add("entry", txnid, "extra idl space", + "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", + (number + 1) * sizeof(pgno_t), data->iov_len); + + gc_pages += number; + if (envinfo.mi_latter_reader_txnid > txnid) + reclaimable_pages += number; + + pgno_t prev = MDBX_PNL_ASCENDING ? NUM_METAS - 1 : txn->mt_next_pgno; + pgno_t span = 1; + for (unsigned i = 0; i < number; ++i) { + if (check_user_break()) + return MDBX_EINTR; + const pgno_t pgno = iptr[i]; + if (pgno < NUM_METAS) + problem_add("entry", txnid, "wrong idl entry", + "pgno %" PRIaPGNO " < meta-pages %u", pgno, NUM_METAS); + else if (pgno >= backed_pages) + problem_add("entry", txnid, "wrong idl entry", + "pgno %" PRIaPGNO " > backed-pages %" PRIu64, pgno, + backed_pages); + else if (pgno >= alloc_pages) + problem_add("entry", txnid, "wrong idl entry", + "pgno %" PRIaPGNO " > alloc-pages %" PRIu64, pgno, + alloc_pages - 1); + else { + if (MDBX_PNL_DISORDERED(prev, pgno)) { + bad = " [bad sequence]"; + problem_add("entry", txnid, "bad sequence", + "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev, + (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), + i, pgno); + } + if (walk.pagemap) { + int idx = walk.pagemap[pgno]; + if (idx == 0) + walk.pagemap[pgno] = -1; + else if (idx > 0) + problem_add("page", pgno, "already used", "by %s", + walk.dbi[idx - 1].name); + else + problem_add("page", pgno, "already listed in GC", nullptr); + } } - } - prev = pgno; - while (i + span < number && - iptr[i + span] == (MDBX_PNL_ASCENDING ? 
pgno_add(pgno, span) - : pgno_sub(pgno, span))) - ++span; - } - if (verbose > 3 && !only_subdb) { - print(" transaction %" PRIaTXN ", %" PRIuPTR - " pages, maxspan %" PRIaPGNO "%s\n", - txnid, number, span, bad); - if (verbose > 4) { - for (unsigned i = 0; i < number; i += span) { - const pgno_t pgno = iptr[i]; - for (span = 1; - i + span < number && + prev = pgno; + while (i + span < number && iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) - : pgno_sub(pgno, span)); - ++span) - ; - if (span > 1) { - print(" %9" PRIaPGNO "[%" PRIaPGNO "]\n", pgno, span); - } else - print(" %9" PRIaPGNO "\n", pgno); + : pgno_sub(pgno, span))) + ++span; + } + if (verbose > 3 && !only_subdb) { + print(" transaction %" PRIaTXN ", %" PRIuPTR + " pages, maxspan %" PRIaPGNO "%s\n", + txnid, number, span, bad); + if (verbose > 4) { + for (unsigned i = 0; i < number; i += span) { + const pgno_t pgno = iptr[i]; + for (span = 1; + i + span < number && + iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) + : pgno_sub(pgno, span)); + ++span) + ; + if (span > 1) { + print(" %9" PRIaPGNO "[%" PRIaPGNO "]\n", pgno, span); + } else + print(" %9" PRIaPGNO "\n", pgno); + } } } } } - return MDBX_SUCCESS; + return check_user_break(); } static int handle_maindb(const uint64_t record_number, const MDBX_val *key, @@ -555,7 +573,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, } if (dbi_handle >= CORE_DBS && dbi_name && only_subdb && - strcmp(only_subdb, dbi_name)) { + strcmp(only_subdb, dbi_name) != 0) { if (verbose) { print("Skip processing '%s'...\n", dbi_name); fflush(NULL); @@ -630,6 +648,8 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, return rc; } + const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, flags); + saved_list = problems_push(); prev_key.iov_base = NULL; prev_key.iov_len = 0; @@ -637,12 +657,9 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, prev_data.iov_len = 0; 
rc = mdbx_cursor_get(mc, &key, &data, MDBX_FIRST); while (rc == MDBX_SUCCESS) { - if (user_break) { - print(" - interrupted by signal\n"); - fflush(NULL); - rc = MDBX_EINTR; + rc = check_user_break(); + if (rc) goto bailout; - } bool bad_key = false; if (key.iov_len > maxkeysize) { @@ -742,16 +759,16 @@ bailout: static void usage(char *prog) { fprintf(stderr, - "usage: %s dbpath [-V] [-v] [-n] [-q] [-w] [-c] [-d] [-s subdb]\n" - " -V\t\tshow version\n" + "usage: %s [-V] [-v] [-n] [-q] [-c] [-w] [-d] [-i] [-s subdb] dbpath\n" + " -V\t\tprint version and exit\n" " -v\t\tmore verbose, could be used multiple times\n" " -n\t\tNOSUBDIR mode for open\n" " -q\t\tbe quiet\n" - " -w\t\tlock DB for writing while checking\n" - " -d\t\tdisable page-by-page traversal of b-tree\n" - " -s subdb\tprocess a specific subdatabase only\n" " -c\t\tforce cooperative mode (don't try exclusive)\n" - " -i\t\tignore wrong order errors (for custom comparators case)\n", + " -w\t\tlock DB for writing while checking\n" + " -d\t\tdisable page-by-page traversal of B-tree\n" + " -i\t\tignore wrong order errors (for custom comparators case)\n" + " -s subdb\tprocess a specific subdatabase only\n", prog); exit(EXIT_INTERRUPTED); } @@ -830,9 +847,13 @@ static int meta_steady(void) { return meta_recent(true); } static int meta_head(void) { return meta_recent(false); } -void verbose_meta(int num, txnid_t txnid, uint64_t sign) { +void verbose_meta(int num, txnid_t txnid, uint64_t sign, uint64_t bootid_h, + uint64_t bootid_l) { print(" - meta-%d: %s %" PRIu64, num, meta_synctype(sign), txnid); bool stay = true; + const bool bootid_match = bootid_h == envinfo.mi_bootid.current.h && + bootid_l == envinfo.mi_bootid.current.l && + (bootid_h | bootid_l) != 0; const int steady = meta_steady(); const int head = meta_head(); @@ -843,7 +864,7 @@ void verbose_meta(int num, txnid_t txnid, uint64_t sign) { print(", head-steady"); stay = false; } else if (num == head) { - print(", head-weak"); + print(", head-weak%s", 
bootid_match ? "-intact (same boot-id)" : ""); stay = false; } if (num == meta_tail(head)) { @@ -860,38 +881,19 @@ void verbose_meta(int num, txnid_t txnid, uint64_t sign) { print("\n"); } -static int check_meta_head(bool steady) { - switch (meta_recent(steady)) { +static uint64_t get_meta_txnid(const unsigned meta_id) { + switch (meta_id) { default: assert(false); - error("unexpected internal error (%s)\n", - steady ? "meta_steady_head" : "meta_weak_head"); - __fallthrough; + error("unexpected meta_id %u\n", meta_id); + return 0; case 0: - if (envinfo.mi_meta0_txnid != envinfo.mi_recent_txnid) { - print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 - ")\n", - 0, envinfo.mi_meta0_txnid, envinfo.mi_recent_txnid); - return 1; - } - break; + return envinfo.mi_meta0_txnid; case 1: - if (envinfo.mi_meta1_txnid != envinfo.mi_recent_txnid) { - print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 - ")\n", - 1, envinfo.mi_meta1_txnid, envinfo.mi_recent_txnid); - return 1; - } - break; + return envinfo.mi_meta1_txnid; case 2: - if (envinfo.mi_meta2_txnid != envinfo.mi_recent_txnid) { - print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 - ")\n", - 2, envinfo.mi_meta2_txnid, envinfo.mi_recent_txnid); - return 1; - } + return envinfo.mi_meta2_txnid; } - return 0; } static void print_size(const char *prefix, const uint64_t value, @@ -910,7 +912,7 @@ int main(int argc, char *argv[]) { char *prog = argv[0]; char *envname; int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; - int dont_traversal = 0; + bool dont_traversal = false; bool locked = false; double elapsed; @@ -931,22 +933,30 @@ int main(int argc, char *argv[]) { dbi_main.name = "@MAIN"; atexit(pagemap_cleanup); - if (argc < 2) { + if (argc < 2) usage(prog); - } for (int i; (i = getopt(argc, argv, "Vvqnwcdsi:")) != EOF;) { switch (i) { case 'V': - printf("%s (%s, build %s)\n", mdbx_version.git.describe, - mdbx_version.git.datetime, 
mdbx_build.datetime); - exit(EXIT_SUCCESS); - break; + printf("mdbx_chk version %d.%d.%d.%d\n" + " - source: %s %s, commit %s, tree %s\n" + " - anchor: %s\n" + " - build: %s for %s by %s\n" + " - flags: %s\n" + " - options: %s\n", + mdbx_version.major, mdbx_version.minor, mdbx_version.release, + mdbx_version.revision, mdbx_version.git.describe, + mdbx_version.git.datetime, mdbx_version.git.commit, + mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime, + mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, + mdbx_build.options); + return EXIT_SUCCESS; case 'v': verbose++; break; case 'q': - quiet = 1; + quiet = true; break; case 'n': envflags |= MDBX_NOSUBDIR; @@ -955,10 +965,10 @@ int main(int argc, char *argv[]) { envflags &= ~MDBX_RDONLY; break; case 'c': - envflags &= ~MDBX_EXCLUSIVE; + envflags = (envflags & ~MDBX_EXCLUSIVE) | MDBX_ACCEDE; break; case 'd': - dont_traversal = 1; + dont_traversal = true; break; case 's': if (only_subdb && strcmp(only_subdb, optarg)) @@ -966,7 +976,7 @@ int main(int argc, char *argv[]) { only_subdb = optarg; break; case 'i': - ignore_wrong_order = 1; + ignore_wrong_order = true; break; default: usage(prog); @@ -990,7 +1000,9 @@ int main(int argc, char *argv[]) { #endif /* !WINDOWS */ envname = argv[optind]; - print("Running mdbx_chk for %s in 'read-%s' mode...\n", envname, + print("mdbx_chk %s (%s, T-%s)\nRunning for %s in 'read-%s' mode...\n", + mdbx_version.git.describe, mdbx_version.git.datetime, + mdbx_version.git.tree, envname, (envflags & MDBX_RDONLY) ? 
"only" : "write"); fflush(NULL); @@ -1006,7 +1018,7 @@ int main(int argc, char *argv[]) { goto bailout; } - rc = mdbx_env_open(env, envname, envflags, 0664); + rc = mdbx_env_open(env, envname, envflags, 0); if ((envflags & MDBX_EXCLUSIVE) && (rc == MDBX_BUSY || #if defined(_WIN32) || defined(_WIN64) @@ -1016,7 +1028,7 @@ int main(int argc, char *argv[]) { #endif )) { envflags &= ~MDBX_EXCLUSIVE; - rc = mdbx_env_open(env, envname, envflags, 0664); + rc = mdbx_env_open(env, envname, envflags | MDBX_ACCEDE, 0); } if (rc) { @@ -1044,21 +1056,13 @@ int main(int argc, char *argv[]) { goto bailout; } - rc = mdbx_env_get_maxkeysize(env); - if (rc < 0) { - error("mdbx_env_get_maxkeysize failed, error %d %s\n", rc, - mdbx_strerror(rc)); - goto bailout; - } - maxkeysize = rc; - - rc = mdbx_env_info2(env, txn, &envinfo, sizeof(envinfo)); + rc = mdbx_env_info_ex(env, txn, &envinfo, sizeof(envinfo)); if (rc) { error("mdbx_env_info failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } - rc = mdbx_env_stat2(env, txn, &envstat, sizeof(envstat)); + rc = mdbx_env_stat_ex(env, txn, &envstat, sizeof(envstat)); if (rc) { error("mdbx_env_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; @@ -1100,14 +1104,11 @@ int main(int argc, char *argv[]) { const uint64_t dxbfile_pages = dxb_filesize / envinfo.mi_dxb_pagesize; alloc_pages = txn->mt_next_pgno; backed_pages = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize; -#if !(defined(_WIN32) || defined(_WIN64)) - if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY && - backed_pages != dxbfile_pages) { - print(" ! backed-pages %" PRIu64 " != file-pages %" PRIu64 "\n", + if (backed_pages > dxbfile_pages) { + print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n", backed_pages, dxbfile_pages); ++problems_meta; } -#endif /* !Windows */ if (dxbfile_pages < NUM_METAS) print(" ! 
file-pages %" PRIu64 " < %u\n", dxbfile_pages, NUM_METAS); if (backed_pages < NUM_METAS) @@ -1153,10 +1154,11 @@ int main(int argc, char *argv[]) { } if (verbose) { - print(" - pagesize %u (%u system), max keysize %" PRIuPTR + print(" - pagesize %u (%u system), max keysize %d..%d" ", max readers %u\n", - envinfo.mi_dxb_pagesize, envinfo.mi_sys_pagesize, maxkeysize, - envinfo.mi_maxreaders); + envinfo.mi_dxb_pagesize, envinfo.mi_sys_pagesize, + mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT), + mdbx_env_get_maxkeysize_ex(env, 0), envinfo.mi_maxreaders); print_size(" - mapsize ", envinfo.mi_mapsize, "\n"); if (envinfo.mi_geo.lower == envinfo.mi_geo.upper) print_size(" - fixed datafile: ", envinfo.mi_geo.current, ""); @@ -1174,12 +1176,15 @@ int main(int argc, char *argv[]) { envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid, envinfo.mi_recent_txnid - envinfo.mi_latter_reader_txnid); - verbose_meta(0, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign); - verbose_meta(1, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign); - verbose_meta(2, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign); + verbose_meta(0, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, + envinfo.mi_bootid.meta0.h, envinfo.mi_bootid.meta0.l); + verbose_meta(1, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, + envinfo.mi_bootid.meta1.h, envinfo.mi_bootid.meta1.l); + verbose_meta(2, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, + envinfo.mi_bootid.meta2.h, envinfo.mi_bootid.meta2.l); } - if (verbose) + if (verbose > 1) print(" - performs check for meta-pages clashes\n"); if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) { @@ -1197,19 +1202,34 @@ int main(int argc, char *argv[]) { ++problems_meta; } + const unsigned steady_meta_id = meta_recent(true); + const uint64_t steady_meta_txnid = get_meta_txnid(steady_meta_id); + const unsigned weak_meta_id = meta_recent(false); + const uint64_t weak_meta_txnid = get_meta_txnid(weak_meta_id); if (envflags & 
MDBX_EXCLUSIVE) { - if (verbose) + if (verbose > 1) print(" - performs full check recent-txn-id with meta-pages\n"); - problems_meta += check_meta_head(true); + if (steady_meta_txnid != envinfo.mi_recent_txnid) { + print(" ! steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")\n", + steady_meta_id, steady_meta_txnid, envinfo.mi_recent_txnid); + ++problems_meta; + } } else if (locked) { - if (verbose) + if (verbose > 1) print(" - performs lite check recent-txn-id with meta-pages (not a " "monopolistic mode)\n"); - problems_meta += check_meta_head(false); + if (weak_meta_txnid != envinfo.mi_recent_txnid) { + print(" ! weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")\n", + weak_meta_id, weak_meta_txnid, envinfo.mi_recent_txnid); + ++problems_meta; + } } else if (verbose) { print(" - skip check recent-txn-id with meta-pages (monopolistic or " "read-write mode only)\n"); } + total_problems += problems_meta; if (!dont_traversal) { struct problem *saved_list; @@ -1230,12 +1250,8 @@ int main(int argc, char *argv[]) { traversal_problems = problems_pop(saved_list); if (rc) { - if (rc == MDBX_EINTR && user_break) { - print(" - interrupted by signal\n"); - fflush(NULL); - } else { + if (rc != MDBX_EINTR || !check_user_break()) error("mdbx_env_pgwalk failed, error %d %s\n", rc, mdbx_strerror(rc)); - } goto bailout; } @@ -1244,7 +1260,7 @@ int main(int argc, char *argv[]) { unused_pages += 1; empty_pages = lost_bytes = 0; - for (walk_dbi_t *dbi = &dbi_main; dbi < walk.dbi + MAX_DBI && dbi->name; + for (walk_dbi_t *dbi = &dbi_main; dbi < ARRAY_END(walk.dbi) && dbi->name; ++dbi) { empty_pages += dbi->pages.empty; lost_bytes += dbi->lost_bytes; @@ -1255,7 +1271,7 @@ int main(int argc, char *argv[]) { print(" - pages: total %" PRIu64 ", unused %" PRIu64 "\n", walk.pgcount, unused_pages); if (verbose > 1) { - for (walk_dbi_t *dbi = walk.dbi; dbi < walk.dbi + MAX_DBI && dbi->name; + for (walk_dbi_t *dbi = walk.dbi; dbi < 
ARRAY_END(walk.dbi) && dbi->name; ++dbi) { print(" %s: subtotal %" PRIu64, dbi->name, dbi->pages.total); if (dbi->pages.other && dbi->pages.other != dbi->pages.total) @@ -1289,7 +1305,7 @@ int main(int argc, char *argv[]) { (total_page_bytes - walk.total_payload_bytes) * 100.0 / total_page_bytes); if (verbose > 2) { - for (walk_dbi_t *dbi = walk.dbi; dbi < walk.dbi + MAX_DBI && dbi->name; + for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name; ++dbi) if (dbi->pages.total) { uint64_t dbi_bytes = dbi->pages.total * envstat.ms_psize; @@ -1359,12 +1375,13 @@ int main(int argc, char *argv[]) { if (!dont_traversal && (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { if (walk.pgcount != alloc_pages - gc_pages) { - error("used pages mismatch (%" PRIu64 " != %" PRIu64 ")\n", + error("used pages mismatch (%" PRIu64 "(walked) != %" PRIu64 + "(allocated - GC))\n", walk.pgcount, alloc_pages - gc_pages); } if (unused_pages != gc_pages) { - error("gc pages mismatch (%" PRIu64 " != %" PRIu64 ")\n", unused_pages, - gc_pages); + error("gc pages mismatch (%" PRIu64 "(walked) != %" PRIu64 "(GC))\n", + unused_pages, gc_pages); } } else if (verbose) { print(" - skip check used and gc pages (btree-traversal with " @@ -1377,17 +1394,41 @@ int main(int argc, char *argv[]) { } } + if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal && + (envflags & MDBX_RDONLY) == 0 && !only_subdb && + steady_meta_txnid < envinfo.mi_recent_txnid) { + print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64 + "\n", + envinfo.mi_recent_txnid); + fflush(NULL); + if (locked) { + mdbx_txn_unlock(env); + locked = false; + } + rc = mdbx_env_sync_ex(env, true, false); + if (rc != MDBX_SUCCESS) + error("mdbx_env_pgwalk failed, error %d %s\n", rc, mdbx_strerror(rc)); + else { + total_problems -= 1; + problems_meta -= 1; + } + } + bailout: if (txn) mdbx_txn_abort(txn); - if (locked) + if (locked) { mdbx_txn_unlock(env); - if (env) - 
mdbx_env_close(env); + locked = false; + } + if (env) { + const bool dont_sync = rc != 0 || total_problems; + mdbx_env_close_ex(env, dont_sync); + } fflush(NULL); if (rc) { if (rc < 0) - return (user_break) ? EXIT_INTERRUPTED : EXIT_FAILURE_SYS; + return user_break ? EXIT_INTERRUPTED : EXIT_FAILURE_SYS; return EXIT_FAILURE_MDB; } @@ -1404,8 +1445,7 @@ bailout: (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9; #endif /* !WINDOWS */ - total_problems += problems_meta; - if (total_problems || problems_maindb || problems_freedb) { + if (total_problems) { print("Total %" PRIu64 " error%s detected, elapsed %.3f seconds.\n", total_problems, (total_problems > 1) ? "s are" : " is", elapsed); if (problems_meta || problems_maindb || problems_freedb) diff --git a/libs/libmdbx/src/src/tools/mdbx_chk.vcxproj b/libs/libmdbx/src/src/tools/mdbx_chk.vcxproj deleted file mode 100644 index e6c2686286..0000000000 --- a/libs/libmdbx/src/src/tools/mdbx_chk.vcxproj +++ /dev/null @@ -1,166 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <ItemGroup Label="ProjectConfigurations"> - <ProjectConfiguration Include="Debug|Win32"> - <Configuration>Debug</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|Win32"> - <Configuration>Release</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Debug|x64"> - <Configuration>Debug</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|x64"> - <Configuration>Release</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - </ItemGroup> - <PropertyGroup Label="Globals"> - <ProjectGuid>{15030120-5F7F-48F9-ABE5-DFC814F2A4BE}</ProjectGuid> - <Keyword>Win32Proj</Keyword> - <RootNamespace>mdbx_chk</RootNamespace> - 
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> - <ImportGroup Label="ExtensionSettings"> - </ImportGroup> - <ImportGroup Label="Shared"> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" 
Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <PropertyGroup Label="UserMacros" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - 
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - <WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - <WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - 
<PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemGroup> - <ProjectReference Include="..\..\dll.vcxproj"> - <Project>{6d19209b-ece7-4b9c-941c-0aa2b484f199}</Project> - </ProjectReference> - </ItemGroup> - <ItemGroup> - <ClCompile Include="mdbx_chk.c" /> - <ClCompile Include="wingetopt.c" /> - </ItemGroup> - <ItemGroup> - <ClInclude Include="..\..\mdbx.h" /> - <ClInclude Include="wingetopt.h" /> - </ItemGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> - <ImportGroup Label="ExtensionTargets"> - </ImportGroup> -</Project> diff --git a/libs/libmdbx/src/src/tools/mdbx_copy.c b/libs/libmdbx/src/src/tools/mdbx_copy.c index f700b1785f..b3d0ea58b2 100644 --- a/libs/libmdbx/src/src/tools/mdbx_copy.c +++ b/libs/libmdbx/src/src/tools/mdbx_copy.c @@ -20,9 +20,8 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ -/* Avoid reference to mdbx_runtime_flags from assert() */ -#define mdbx_runtime_flags (~0u) -#include "../bits.h" +#define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#include "../elements/internals.h" #if defined(_WIN32) || defined(_WIN64) #include "wingetopt.h" @@ -44,30 +43,57 @@ static void signal_handler(int sig) { #endif /* !WINDOWS */ +static void usage(const char *prog) { + fprintf(stderr, + "usage: %s [-V] [-q] [-c] [-n] src_path [dest_path]\n" + " -V\t\tprint version and exit\n" + " -q\t\tbe quiet\n" + " -c\t\tenable compactification (skip unused pages)\n" + " -n\t\tNOSUBDIR mode for open\n" + " src_path\tsource database\n" + " dest_path\tdestination (stdout if not specified)\n", + prog); + exit(EXIT_FAILURE); +} + int main(int argc, char *argv[]) { int rc; MDBX_env *env = NULL; const char *progname = argv[0], *act; unsigned flags = MDBX_RDONLY; unsigned cpflags = 0; + bool quiet = false; for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { if (argv[1][1] == 'n' && argv[1][2] == '\0') flags |= MDBX_NOSUBDIR; else if (argv[1][1] == 'c' && argv[1][2] == '\0') cpflags |= MDBX_CP_COMPACT; + else if (argv[1][1] == 'q' && argv[1][2] == '\0') + quiet = true; + else if ((argv[1][1] == 'h' && argv[1][2] == '\0') || + strcmp(argv[1], "--help") == 0) + usage(progname); else if (argv[1][1] == 'V' && argv[1][2] == '\0') { - printf("%s (%s, build %s)\n", mdbx_version.git.describe, - mdbx_version.git.datetime, mdbx_build.datetime); - exit(EXIT_SUCCESS); + printf("mdbx_copy version %d.%d.%d.%d\n" + " - source: %s %s, commit %s, tree %s\n" + " - anchor: %s\n" + " - build: %s for %s by %s\n" + " - flags: %s\n" + " - options: %s\n", + mdbx_version.major, mdbx_version.minor, mdbx_version.release, + mdbx_version.revision, mdbx_version.git.describe, + mdbx_version.git.datetime, mdbx_version.git.commit, + mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime, + mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, 
+ mdbx_build.options); + return EXIT_SUCCESS; } else argc = 0; } - if (argc < 2 || argc > 3) { - fprintf(stderr, "usage: %s [-V] [-c] [-n] srcpath [dstpath]\n", progname); - exit(EXIT_FAILURE); - } + if (argc < 2 || argc > 3) + usage(progname); #if defined(_WIN32) || defined(_WIN64) SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true); @@ -82,6 +108,14 @@ int main(int argc, char *argv[]) { signal(SIGTERM, signal_handler); #endif /* !WINDOWS */ + if (!quiet) { + fprintf((argc == 2) ? stderr : stdout, + "mdbx_copy %s (%s, T-%s)\nRunning for copy %s to %s...\n", + mdbx_version.git.describe, mdbx_version.git.datetime, + mdbx_version.git.tree, argv[1], (argc == 2) ? "stdout" : argv[2]); + fflush(NULL); + } + act = "opening environment"; rc = mdbx_env_create(&env); if (rc == MDBX_SUCCESS) { diff --git a/libs/libmdbx/src/src/tools/mdbx_copy.vcxproj b/libs/libmdbx/src/src/tools/mdbx_copy.vcxproj deleted file mode 100644 index d47513c204..0000000000 --- a/libs/libmdbx/src/src/tools/mdbx_copy.vcxproj +++ /dev/null @@ -1,166 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <ItemGroup Label="ProjectConfigurations"> - <ProjectConfiguration Include="Debug|Win32"> - <Configuration>Debug</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|Win32"> - <Configuration>Release</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Debug|x64"> - <Configuration>Debug</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|x64"> - <Configuration>Release</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - </ItemGroup> - <PropertyGroup Label="Globals"> - <ProjectGuid>{15030120-5F7F-48F9-ABE5-DFC814F2A4BD}</ProjectGuid> - <Keyword>Win32Proj</Keyword> - <RootNamespace>mdbx_copy</RootNamespace> - 
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> - <ImportGroup Label="ExtensionSettings"> - </ImportGroup> - <ImportGroup Label="Shared"> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" 
Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <PropertyGroup Label="UserMacros" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - 
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - <WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - <WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - 
<PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemGroup> - <ProjectReference Include="..\..\dll.vcxproj"> - <Project>{6d19209b-ece7-4b9c-941c-0aa2b484f199}</Project> - </ProjectReference> - </ItemGroup> - <ItemGroup> - <ClCompile Include="mdbx_copy.c" /> - <ClCompile Include="wingetopt.c" /> - </ItemGroup> - <ItemGroup> - <ClInclude Include="..\..\mdbx.h" /> - <ClInclude Include="wingetopt.h" /> - </ItemGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> - <ImportGroup Label="ExtensionTargets"> - </ImportGroup> -</Project> diff --git a/libs/libmdbx/src/src/tools/mdbx_dump.c b/libs/libmdbx/src/src/tools/mdbx_dump.c index 8cb9c86e46..20f47938af 100644 --- a/libs/libmdbx/src/src/tools/mdbx_dump.c +++ b/libs/libmdbx/src/src/tools/mdbx_dump.c @@ -20,9 +20,8 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ -/* Avoid reference to mdbx_runtime_flags from assert() */ -#define mdbx_runtime_flags (~0u) -#include "../bits.h" +#define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#include "../elements/internals.h" #include <ctype.h> @@ -116,7 +115,7 @@ static int dumpit(MDBX_txn *txn, MDBX_dbi dbi, char *name) { if (rc) return rc; - rc = mdbx_env_info(mdbx_txn_env(txn), &info, sizeof(info)); + rc = mdbx_env_info_ex(mdbx_txn_env(txn), txn, &info, sizeof(info)); if (rc) return rc; @@ -125,7 +124,7 @@ static int dumpit(MDBX_txn *txn, MDBX_dbi dbi, char *name) { if (name) printf("database=%s\n", name); printf("type=btree\n"); - printf("mapsize=%" PRIu64 "\n", info.mi_mapsize); + printf("mapsize=%" PRIu64 "\n", info.mi_geo.upper); printf("maxreaders=%u\n", info.mi_maxreaders); for (i = 0; dbflags[i].bit; i++) @@ -161,7 +160,18 @@ static int dumpit(MDBX_txn *txn, MDBX_dbi dbi, char *name) { static void usage(char *prog) { fprintf(stderr, - "usage: %s [-V] [-f output] [-l] [-n] [-p] [-a|-s subdb] dbpath\n", + "usage: %s [-V] [-q] [-f file] [-l] [-p] [-a|-s subdb] [-r] [-n] " + "dbpath\n" + " -V\t\tprint version and exit\n" + " -q\t\tbe quiet\n" + " -f\t\twrite to file instead of stdout\n" + " -l\t\tlist subDBs and exit\n" + " -p\t\tuse printable characters\n" + " -a\t\tdump main DB and all subDBs,\n" + " \t\tby default dump only the main DB\n" + " -s\t\tdump only the named subDB\n" + " -r\t\trescure mode (ignore errors to dump corrupted DB)\n" + " -n\t\tNOSUBDIR mode for open\n", prog); exit(EXIT_FAILURE); } @@ -174,27 +184,27 @@ int main(int argc, char *argv[]) { char *prog = argv[0]; char *envname; char *subname = NULL; - int alldbs = 0, envflags = 0, list = 0; + int alldbs = 0, envflags = 0, list = 0, quiet = 0, rescue = 0; - if (argc < 2) { + if (argc < 2) usage(prog); - } - /* -a: dump main DB and all subDBs - * -s: dump only the named subDB - * -n: use NOSUBDIR flag on env_open - * -p: use printable characters - * -f: write to file 
instead of stdout - * -V: print version and exit - * (default) dump only the main DB - */ - while ((i = getopt(argc, argv, "af:lnps:V")) != EOF) { + while ((i = getopt(argc, argv, "af:lnps:Vrq")) != EOF) { switch (i) { case 'V': - printf("%s (%s, build %s)\n", mdbx_version.git.describe, - mdbx_version.git.datetime, mdbx_build.datetime); - exit(EXIT_SUCCESS); - break; + printf("mdbx_dump version %d.%d.%d.%d\n" + " - source: %s %s, commit %s, tree %s\n" + " - anchor: %s\n" + " - build: %s for %s by %s\n" + " - flags: %s\n" + " - options: %s\n", + mdbx_version.major, mdbx_version.minor, mdbx_version.release, + mdbx_version.revision, mdbx_version.git.describe, + mdbx_version.git.datetime, mdbx_version.git.commit, + mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime, + mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, + mdbx_build.options); + return EXIT_SUCCESS; case 'l': list = 1; /*FALLTHROUGH*/; @@ -222,6 +232,12 @@ int main(int argc, char *argv[]) { usage(prog); subname = optarg; break; + case 'q': + quiet = 1; + break; + case 'r': + rescue = 1; + break; default: usage(prog); } @@ -244,6 +260,13 @@ int main(int argc, char *argv[]) { #endif /* !WINDOWS */ envname = argv[optind]; + if (!quiet) { + fprintf(stderr, "mdbx_dump %s (%s, T-%s)\nRunning for %s...\n", + mdbx_version.git.describe, mdbx_version.git.datetime, + mdbx_version.git.tree, envname); + fflush(NULL); + } + rc = mdbx_env_create(&env); if (rc) { fprintf(stderr, "mdbx_env_create failed, error %d %s\n", rc, @@ -255,7 +278,9 @@ int main(int argc, char *argv[]) { mdbx_env_set_maxdbs(env, 2); } - rc = mdbx_env_open(env, envname, envflags | MDBX_RDONLY, 0664); + rc = mdbx_env_open( + env, envname, + envflags | (rescue ? 
MDBX_RDONLY | MDBX_EXCLUSIVE : MDBX_RDONLY), 0); if (rc) { fprintf(stderr, "mdbx_env_open failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -306,8 +331,32 @@ int main(int argc, char *argv[]) { list++; } else { rc = dumpit(txn, db2, str); - if (rc) - break; + if (rc) { + if (!rescue) + break; + fprintf(stderr, "%s: %s: ignore %s for `%s` and continue\n", prog, + envname, mdbx_strerror(rc), str); + /* Here is a hack for rescue mode, don't do that: + * - we should restart transaction in case error due + * database corruption; + * - but we won't close cursor, reopen and re-positioning it + * for new a transaction; + * - this is possible since DB is opened in read-only exclusive + * mode and transaction is the same, i.e. has the same address + * and so on. */ + rc = mdbx_txn_reset(txn); + if (rc) { + fprintf(stderr, "mdbx_txn_reset failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + rc = mdbx_txn_renew(txn); + if (rc) { + fprintf(stderr, "mdbx_txn_renew failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + } } mdbx_dbi_close(env, db2); } diff --git a/libs/libmdbx/src/src/tools/mdbx_dump.vcxproj b/libs/libmdbx/src/src/tools/mdbx_dump.vcxproj deleted file mode 100644 index 6978a2c22d..0000000000 --- a/libs/libmdbx/src/src/tools/mdbx_dump.vcxproj +++ /dev/null @@ -1,166 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <ItemGroup Label="ProjectConfigurations"> - <ProjectConfiguration Include="Debug|Win32"> - <Configuration>Debug</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|Win32"> - <Configuration>Release</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Debug|x64"> - <Configuration>Debug</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - <ProjectConfiguration 
Include="Release|x64"> - <Configuration>Release</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - </ItemGroup> - <PropertyGroup Label="Globals"> - <ProjectGuid>{15030120-5F7F-48F9-ABE5-DFC814F2A4BC}</ProjectGuid> - <Keyword>Win32Proj</Keyword> - <RootNamespace>mdbx_dump</RootNamespace> - <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> - <ImportGroup Label="ExtensionSettings"> - </ImportGroup> - <ImportGroup Label="Shared"> - </ImportGroup> - <ImportGroup Label="PropertySheets" 
Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <PropertyGroup Label="UserMacros" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - 
</PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - <WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - <WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - 
<ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemGroup> - <ProjectReference Include="..\..\dll.vcxproj"> - <Project>{6d19209b-ece7-4b9c-941c-0aa2b484f199}</Project> - </ProjectReference> - </ItemGroup> - <ItemGroup> - <ClCompile Include="mdbx_dump.c" /> - <ClCompile Include="wingetopt.c" /> - </ItemGroup> - <ItemGroup> - <ClInclude Include="..\..\mdbx.h" /> - <ClInclude Include="wingetopt.h" /> - </ItemGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> - <ImportGroup Label="ExtensionTargets"> - </ImportGroup> -</Project> diff --git a/libs/libmdbx/src/src/tools/mdbx_load.c b/libs/libmdbx/src/src/tools/mdbx_load.c index 7bbc52b87e..c6b2ee8801 100644 --- a/libs/libmdbx/src/src/tools/mdbx_load.c +++ b/libs/libmdbx/src/src/tools/mdbx_load.c @@ -1,4 +1,4 @@ -/* mdbx_load.c - memory-mapped database load tool */ +/* mdbx_load.c - memory-mapped database load tool */ /* * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> @@ -20,9 +20,8 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ -/* Avoid reference to mdbx_runtime_flags from assert() */ -#define mdbx_runtime_flags (~0u) -#include "../bits.h" +#define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#include "../elements/internals.h" #include <ctype.h> @@ -308,7 +307,17 @@ static int readline(MDBX_val *out, MDBX_val *buf) { static void usage(void) { fprintf(stderr, - "usage: %s [-V] [-a] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", + "usage: %s [-V] [-q] [-a] [-f file] [-s name] [-N] [-T] [-r] [-n] " + "dbpath\n" + " -V\t\tprint version and exit\n" + " -q\t\tbe quiet\n" + " -a\t\tappend records in input order\n" + " -f file\tread from file instead of stdin\n" + " -s name\tload into named subDB\n" + " -N\t\tuse NOOVERWRITE on puts\n" + " -T\t\tread plaintext\n" + " -r\t\trescure mode (ignore errors to load corrupted DB dump)\n" + " -n\t\tNOSUBDIR mode for open\n", prog); exit(EXIT_FAILURE); } @@ -328,27 +337,30 @@ int main(int argc, char *argv[]) { char *envname = NULL; int envflags = MDBX_UTTERLY_NOSYNC, putflags = 0; int append = 0; + int quiet = 0; + int rescue = 0; MDBX_val prevk; prog = argv[0]; if (argc < 2) usage(); - /* -a: append records in input order - * -f: load file instead of stdin - * -n: use NOSUBDIR flag on env_open - * -s: load into named subDB - * -N: use NOOVERWRITE on puts - * -T: read plaintext - * -V: print version and exit - */ - while ((i = getopt(argc, argv, "af:ns:NTV")) != EOF) { + while ((i = getopt(argc, argv, "af:ns:NTVrq")) != EOF) { switch (i) { case 'V': - printf("%s (%s, build %s)\n", mdbx_version.git.describe, - mdbx_version.git.datetime, mdbx_build.datetime); - exit(EXIT_SUCCESS); - break; + printf("mdbx_load version %d.%d.%d.%d\n" + " - source: %s %s, commit %s, tree %s\n" + " - anchor: %s\n" + " - build: %s for %s by %s\n" + " - flags: %s\n" + " - options: %s\n", + mdbx_version.major, mdbx_version.minor, mdbx_version.release, + mdbx_version.revision, mdbx_version.git.describe, + mdbx_version.git.datetime, 
mdbx_version.git.commit, + mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime, + mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, + mdbx_build.options); + return EXIT_SUCCESS; case 'a': append = 1; break; @@ -371,6 +383,12 @@ int main(int argc, char *argv[]) { case 'T': mode |= NOHDR | PRINT; break; + case 'q': + quiet = 1; + break; + case 'r': + rescue = 1; + break; default: usage(); } @@ -392,6 +410,13 @@ int main(int argc, char *argv[]) { signal(SIGTERM, signal_handler); #endif /* !WINDOWS */ + envname = argv[optind]; + if (!quiet) + printf("mdbx_load %s (%s, T-%s)\nRunning for %s...\n", + mdbx_version.git.describe, mdbx_version.git.datetime, + mdbx_version.git.tree, envname); + fflush(NULL); + dbuf.iov_len = 4096; dbuf.iov_base = mdbx_malloc(dbuf.iov_len); @@ -399,7 +424,6 @@ int main(int argc, char *argv[]) { if (!(mode & NOHDR)) readhdr(); - envname = argv[optind]; rc = mdbx_env_create(&env); if (rc) { fprintf(stderr, "mdbx_env_create failed, error %d %s\n", rc, @@ -418,7 +442,13 @@ int main(int argc, char *argv[]) { mdbx_strerror(MDBX_TOO_LARGE)); return EXIT_FAILURE; } - mdbx_env_set_mapsize(env, (size_t)envinfo.mi_mapsize); + rc = mdbx_env_set_geometry(env, 0, -1, (size_t)envinfo.mi_mapsize, -1, -1, + -1); + if (rc) { + fprintf(stderr, "mdbx_env_set_geometry failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } } #ifdef MDBX_FIXEDMAP @@ -433,7 +463,7 @@ int main(int argc, char *argv[]) { goto env_close; } - kbuf.iov_len = mdbx_env_get_maxkeysize(env); + kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, MDBX_DUPSORT); if (kbuf.iov_len >= SIZE_MAX / 4) { fprintf(stderr, "mdbx_env_get_maxkeysize failed, returns %zu\n", kbuf.iov_len); @@ -503,6 +533,11 @@ int main(int argc, char *argv[]) { rc = mdbx_cursor_put(mc, &key, &data, putflags | appflag); if (rc == MDBX_KEYEXIST && putflags) continue; + if (rc == MDBX_BAD_VALSIZE && rescue) { + fprintf(stderr, "%s: skip line %" PRIiSIZE ": due %s\n", prog, lineno, + 
mdbx_strerror(rc)); + continue; + } if (rc) { fprintf(stderr, "mdbx_cursor_put failed, error %d %s\n", rc, mdbx_strerror(rc)); diff --git a/libs/libmdbx/src/src/tools/mdbx_load.vcxproj b/libs/libmdbx/src/src/tools/mdbx_load.vcxproj deleted file mode 100644 index 05a100fc64..0000000000 --- a/libs/libmdbx/src/src/tools/mdbx_load.vcxproj +++ /dev/null @@ -1,166 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <ItemGroup Label="ProjectConfigurations"> - <ProjectConfiguration Include="Debug|Win32"> - <Configuration>Debug</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|Win32"> - <Configuration>Release</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Debug|x64"> - <Configuration>Debug</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|x64"> - <Configuration>Release</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - </ItemGroup> - <PropertyGroup Label="Globals"> - <ProjectGuid>{15030120-5F7F-48F9-ABE5-DFC814F2A4BB}</ProjectGuid> - <Keyword>Win32Proj</Keyword> - <RootNamespace>mdbx_load</RootNamespace> - <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - 
<PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> - <ImportGroup Label="ExtensionSettings"> - </ImportGroup> - <ImportGroup Label="Shared"> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <Import 
Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <PropertyGroup Label="UserMacros" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - <WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - 
<WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemGroup> - <ProjectReference Include="..\..\dll.vcxproj"> - <Project>{6d19209b-ece7-4b9c-941c-0aa2b484f199}</Project> - </ProjectReference> - </ItemGroup> - <ItemGroup> - <ClCompile Include="mdbx_load.c" /> - <ClCompile Include="wingetopt.c" /> - </ItemGroup> - <ItemGroup> - <ClInclude Include="..\..\mdbx.h" /> - <ClInclude Include="wingetopt.h" /> - </ItemGroup> - 
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> - <ImportGroup Label="ExtensionTargets"> - </ImportGroup> -</Project> diff --git a/libs/libmdbx/src/src/tools/mdbx_stat.c b/libs/libmdbx/src/src/tools/mdbx_stat.c index d4ebc2e654..bd70eb7006 100644 --- a/libs/libmdbx/src/src/tools/mdbx_stat.c +++ b/libs/libmdbx/src/src/tools/mdbx_stat.c @@ -20,9 +20,8 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -/* Avoid reference to mdbx_runtime_flags from assert() */ -#define mdbx_runtime_flags (~0u) -#include "../bits.h" +#define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#include "../elements/internals.h" #if defined(_WIN32) || defined(_WIN64) #include "wingetopt.h" @@ -55,11 +54,40 @@ static void prstat(MDBX_stat *ms) { static void usage(char *prog) { fprintf(stderr, - "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-a|-s subdb] dbpath\n", + "usage: %s [-V] [-e] [-f[f[f]]] [-r[r]] [-a|-s name] [-n] dbpath\n" + " -V\t\tprint version and exit\n" + " -e\t\tshow whole DB info\n" + " -f\t\tshow GC info\n" + " -r\t\tshow readers\n" + " -a\t\tprint stat of main DB and all subDBs\n" + " \t\t(default) print stat of only the main DB\n" + " -s name\tprint stat of only the named subDB\n" + " -n\t\tNOSUBDIR mode for open\n", prog); exit(EXIT_FAILURE); } +static int reader_list_func(void *ctx, int num, int slot, mdbx_pid_t pid, + mdbx_tid_t thread, uint64_t txnid, uint64_t lag, + size_t bytes_used, size_t bytes_retained) { + (void)ctx; + if (num == 1) + printf("Reader Table Status\n" + " #\tslot\t%6s %*s %20s %10s %13s %13s\n", + "pid", (int)sizeof(size_t) * 2, "thread", "txnid", "lag", "used", + "retained"); + + printf(" %3d)\t[%d]\t%6" PRIdSIZE " %*" PRIxSIZE, num, slot, (size_t)pid, + (int)sizeof(size_t) * 2, (size_t)thread); + if (txnid) + printf(" %20" PRIu64 " %10" PRIu64 " %12.1fM %12.1fM\n", txnid, lag, + bytes_used / 1048576.0, bytes_retained / 1048576.0); + else + printf(" %20s %10s %13s %13s\n", "-", 
"0", "0", "0"); + + return user_break ? MDBX_RESULT_TRUE : MDBX_RESULT_FALSE; +} + int main(int argc, char *argv[]) { int o, rc; MDBX_env *env; @@ -72,26 +100,25 @@ int main(int argc, char *argv[]) { char *subname = NULL; int alldbs = 0, envinfo = 0, envflags = 0, freinfo = 0, rdrinfo = 0; - if (argc < 2) { + if (argc < 2) usage(prog); - } - /* -a: print stat of main DB and all subDBs - * -s: print stat of only the named subDB - * -e: print env info - * -f: print freelist info - * -r: print reader info - * -n: use NOSUBDIR flag on env_open - * -V: print version and exit - * (default) print stat of only the main DB - */ while ((o = getopt(argc, argv, "Vaefnrs:")) != EOF) { switch (o) { case 'V': - printf("%s (%s, build %s)\n", mdbx_version.git.describe, - mdbx_version.git.datetime, mdbx_build.datetime); - exit(EXIT_SUCCESS); - break; + printf("mdbx_stat version %d.%d.%d.%d\n" + " - source: %s %s, commit %s, tree %s\n" + " - anchor: %s\n" + " - build: %s for %s by %s\n" + " - flags: %s\n" + " - options: %s\n", + mdbx_version.major, mdbx_version.minor, mdbx_version.release, + mdbx_version.revision, mdbx_version.git.describe, + mdbx_version.git.datetime, mdbx_version.git.commit, + mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime, + mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, + mdbx_build.options); + return EXIT_SUCCESS; case 'a': if (subname) usage(prog); @@ -136,6 +163,12 @@ int main(int argc, char *argv[]) { #endif /* !WINDOWS */ envname = argv[optind]; + envname = argv[optind]; + printf("mdbx_stat %s (%s, T-%s)\nRunning for %s...\n", + mdbx_version.git.describe, mdbx_version.git.datetime, + mdbx_version.git.tree, envname); + fflush(NULL); + rc = mdbx_env_create(&env); if (rc) { fprintf(stderr, "mdbx_env_create failed, error %d %s\n", rc, @@ -143,9 +176,8 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } - if (alldbs || subname) { + if (alldbs || subname) mdbx_env_set_maxdbs(env, 4); - } rc = mdbx_env_open(env, envname, 
envflags | MDBX_RDONLY, 0664); if (rc) { @@ -154,15 +186,22 @@ int main(int argc, char *argv[]) { goto env_close; } + rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); + if (rc) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + if (envinfo || freinfo) { - (void)mdbx_env_info(env, &mei, sizeof(mei)); + (void)mdbx_env_info_ex(env, txn, &mei, sizeof(mei)); } else { /* LY: zap warnings from gcc */ memset(&mei, 0, sizeof(mei)); } if (envinfo) { - (void)mdbx_env_stat(env, &mst, sizeof(mst)); + (void)mdbx_env_stat_ex(env, txn, &mst, sizeof(mst)); printf("Environment Info\n"); printf(" Pagesize: %u\n", mst.ms_psize); if (mei.mi_geo.lower != mei.mi_geo.upper) { @@ -194,25 +233,29 @@ int main(int argc, char *argv[]) { } if (rdrinfo) { - printf("Reader Table Status\n"); - rc = mdbx_reader_list(env, (MDBX_msg_func *)fputs, stdout); - if (rdrinfo > 1) { + rc = mdbx_reader_list(env, reader_list_func, nullptr); + if (rc == MDBX_RESULT_TRUE) + printf("Reader Table is empty\n"); + else if (rc == MDBX_SUCCESS && rdrinfo > 1) { int dead; - mdbx_reader_check(env, &dead); - printf(" %d stale readers cleared.\n", dead); - rc = mdbx_reader_list(env, (MDBX_msg_func *)fputs, stdout); + rc = mdbx_reader_check(env, &dead); + if (rc == MDBX_RESULT_TRUE) { + printf(" %d stale readers cleared.\n", dead); + rc = mdbx_reader_list(env, reader_list_func, nullptr); + if (rc == MDBX_RESULT_TRUE) + printf(" Now Reader Table is empty\n"); + } else + printf(" No stale readers.\n"); + } + if (MDBX_IS_ERROR(rc)) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; } if (!(subname || alldbs || freinfo)) goto env_close; } - rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); - if (rc) { - fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, - mdbx_strerror(rc)); - goto env_close; - } - if (freinfo) { MDBX_cursor *cursor; MDBX_val key, data; diff --git 
a/libs/libmdbx/src/src/tools/mdbx_stat.vcxproj b/libs/libmdbx/src/src/tools/mdbx_stat.vcxproj deleted file mode 100644 index 4027491d39..0000000000 --- a/libs/libmdbx/src/src/tools/mdbx_stat.vcxproj +++ /dev/null @@ -1,166 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <ItemGroup Label="ProjectConfigurations"> - <ProjectConfiguration Include="Debug|Win32"> - <Configuration>Debug</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|Win32"> - <Configuration>Release</Configuration> - <Platform>Win32</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Debug|x64"> - <Configuration>Debug</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - <ProjectConfiguration Include="Release|x64"> - <Configuration>Release</Configuration> - <Platform>x64</Platform> - </ProjectConfiguration> - </ItemGroup> - <PropertyGroup Label="Globals"> - <ProjectGuid>{15030120-5F7F-48F9-ABE5-DFC814F2A4BF}</ProjectGuid> - <Keyword>Win32Proj</Keyword> - <RootNamespace>mdbx_stat</RootNamespace> - <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> 
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> - <ConfigurationType>Application</ConfigurationType> - <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v140</PlatformToolset> - <WholeProgramOptimization>true</WholeProgramOptimization> - <CharacterSet>MultiByte</CharacterSet> - </PropertyGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> - <ImportGroup Label="ExtensionSettings"> - </ImportGroup> - <ImportGroup Label="Shared"> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> - </ImportGroup> - 
<PropertyGroup Label="UserMacros" /> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <LinkIncremental>true</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <LinkIncremental>false</LinkIncremental> - <OutDir>$(SolutionDir)$(Platform)\$(Configuration)\</OutDir> - <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\</IntDir> - </PropertyGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - <WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - <TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <ClCompile> - <PrecompiledHeader> - </PrecompiledHeader> - <WarningLevel>EnableAllWarnings</WarningLevel> - <Optimization>Disabled</Optimization> - <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - 
<TreatWarningAsError>true</TreatWarningAsError> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <ClCompile> - <WarningLevel>EnableAllWarnings</WarningLevel> - <PrecompiledHeader> - </PrecompiledHeader> - <Optimization>MaxSpeed</Optimization> - <FunctionLevelLinking>true</FunctionLevelLinking> - <IntrinsicFunctions>true</IntrinsicFunctions> - <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1</PreprocessorDefinitions> - </ClCompile> - <Link> - <SubSystem>Console</SubSystem> - <EnableCOMDATFolding>true</EnableCOMDATFolding> - <OptimizeReferences>true</OptimizeReferences> - </Link> - </ItemDefinitionGroup> - <ItemGroup> - <ProjectReference Include="..\..\dll.vcxproj"> - <Project>{6d19209b-ece7-4b9c-941c-0aa2b484f199}</Project> - </ProjectReference> - </ItemGroup> - <ItemGroup> - <ClCompile Include="mdbx_stat.c" /> - <ClCompile Include="wingetopt.c" /> - </ItemGroup> - <ItemGroup> - <ClInclude Include="..\..\mdbx.h" /> - <ClInclude Include="wingetopt.h" /> - </ItemGroup> - <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> - <ImportGroup Label="ExtensionTargets"> - </ImportGroup> -</Project> diff --git a/libs/libmdbx/src/src/tools/wingetopt.h 
b/libs/libmdbx/src/src/tools/wingetopt.h index fdff368374..d328e38ccc 100644 --- a/libs/libmdbx/src/src/tools/wingetopt.h +++ b/libs/libmdbx/src/src/tools/wingetopt.h @@ -9,6 +9,10 @@ #ifndef _WINGETOPT_H_ #define _WINGETOPT_H_ +/* Bit of madness for Windows console */ +#define mdbx_strerror mdbx_strerror_ANSI2OEM +#define mdbx_strerror_r mdbx_strerror_r_ANSI2OEM + #ifdef __cplusplus extern "C" { #endif diff --git a/libs/libmdbx/src/src/version.c b/libs/libmdbx/src/src/version.c deleted file mode 100644 index a0f7b4f58d..0000000000 --- a/libs/libmdbx/src/src/version.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru> - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * <http://www.OpenLDAP.org/license.html>. - */ - -#include "./bits.h" - -#if MDBX_VERSION_MAJOR != 0 || MDBX_VERSION_MINOR != 3 -#error "API version mismatch!" 
-#endif - -#define MDBX_VERSION_RELEASE 0 -#define MDBX_VERSION_REVISION 0 - -/*LIBMDBX_EXPORTS*/ const mdbx_version_info mdbx_version = { - MDBX_VERSION_MAJOR, - MDBX_VERSION_MINOR, - MDBX_VERSION_RELEASE, - MDBX_VERSION_REVISION, - {"@MDBX_GIT_TIMESTAMP@", "@MDBX_GIT_TREE@", "@MDBX_GIT_COMMIT@", - "@MDBX_GIT_DESCRIBE@"}}; - -/*LIBMDBX_EXPORTS*/ const mdbx_build_info mdbx_build = { - "@MDBX_BUILD_TIMESTAMP@", "@MDBX_BUILD_TARGET@", "@MDBX_BUILD_OPTIONS@", - "@MDBX_BUILD_COMPILER@", "@MDBX_BUILD_FLAGS@"}; diff --git a/libs/libmdbx/src/stdafx.cxx b/libs/libmdbx/src/stdafx.cxx index d79ecdb2b4..fd443374cb 100644 --- a/libs/libmdbx/src/stdafx.cxx +++ b/libs/libmdbx/src/stdafx.cxx @@ -5,6 +5,6 @@ BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD reason, LPVOID reserved) { - mdbx_dll_callback(hInstDLL, reason, reserved); + mdbx_dll_handler(hInstDLL, reason, reserved); return TRUE; } diff --git a/libs/libmdbx/src/tutorial/CMakeLists.txt b/libs/libmdbx/src/tutorial/CMakeLists.txt deleted file mode 100644 index 196f1f48d7..0000000000 --- a/libs/libmdbx/src/tutorial/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -set(TARGET mdbx_tutorial) -project(${TARGET}) - -add_executable(${TARGET} sample-mdbx.c) - -target_link_libraries(${TARGET} mdbx) - diff --git a/libs/libmdbx/src/tutorial/README.md b/libs/libmdbx/src/tutorial/README.md deleted file mode 100644 index b5218da304..0000000000 --- a/libs/libmdbx/src/tutorial/README.md +++ /dev/null @@ -1 +0,0 @@ -This directory is just a placeholder for now. Tutorial and examples will be added later. diff --git a/plugins/Dbx_mdbx/src/dbintf.cpp b/plugins/Dbx_mdbx/src/dbintf.cpp index 40b899d1e2..9a94709342 100644 --- a/plugins/Dbx_mdbx/src/dbintf.cpp +++ b/plugins/Dbx_mdbx/src/dbintf.cpp @@ -291,7 +291,7 @@ static VOID CALLBACK DoBufferFlushTimerProc(HWND hwnd, UINT, UINT_PTR idEvent, D void CDbxMDBX::DBFlush(bool bForce)
{
if (bForce) {
- mdbx_env_sync(m_env, true);
+ mdbx_env_sync(m_env);
}
else if (m_safetyMode) {
::KillTimer(m_hwndTimer, 1);
diff --git a/plugins/Dbx_mdbx/src/init.cpp b/plugins/Dbx_mdbx/src/init.cpp index ff0e8eac34..70349fa0b5 100644 --- a/plugins/Dbx_mdbx/src/init.cpp +++ b/plugins/Dbx_mdbx/src/init.cpp @@ -99,7 +99,7 @@ static DATABASELINK dblink = int CMPlugin::Load()
{
- mdbx_setup_debug(MDBX_DBG_ASSERT | MDBX_DBG_PRINT, &logger);
+ mdbx_setup_debug(MDBX_DBG_ASSERT, MDBX_LOG_WARN, &logger);
RegisterDatabasePlugin(&dblink);
return 0;
}
diff --git a/tools/mdbx_dump/src/mdbx_dump.cc b/tools/mdbx_dump/src/mdbx_dump.cc index 7b01e47ca1..af5776d27e 100644 --- a/tools/mdbx_dump/src/mdbx_dump.cc +++ b/tools/mdbx_dump/src/mdbx_dump.cc @@ -20,9 +20,8 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -/* Avoid reference to mdbx_runtime_flags from assert() */ -#define mdbx_runtime_flags (~0u) -#include "../../libs/libmdbx/src/src/bits.h" +#define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#include "../../libs/libmdbx/src/src/elements/internals.h" #include <ctype.h> diff --git a/tools/mdbx_load/src/mdbx_load.cc b/tools/mdbx_load/src/mdbx_load.cc index b4cd1e5c87..deceaca224 100644 --- a/tools/mdbx_load/src/mdbx_load.cc +++ b/tools/mdbx_load/src/mdbx_load.cc @@ -22,9 +22,8 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -/* Avoid reference to mdbx_runtime_flags from assert() */ -#define mdbx_runtime_flags (~0u) -#include "../../libs/libmdbx/src/src/bits.h" +#define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#include "../../libs/libmdbx/src/src/elements/internals.h" #include <ctype.h> diff --git a/tools/mdbx_load/src/stdafx.h b/tools/mdbx_load/src/stdafx.h index 670a09a54d..c31574e6bf 100644 --- a/tools/mdbx_load/src/stdafx.h +++ b/tools/mdbx_load/src/stdafx.h @@ -21,7 +21,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. #include <stdint.h> #include <windows.h> -#include "../../libs/libmdbx/src/src/bits.h" +#define MDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#include "../../libs/libmdbx/src/src/elements/internals.h" #define CMP_UINT(x, y) { if ((x) != (y)) return (x) < (y) ? -1 : 1; } |