diff options
Diffstat (limited to 'plugins/Dbx_mdb/src/mdbx/bits.h')
-rw-r--r-- | plugins/Dbx_mdb/src/mdbx/bits.h | 1224 |
1 files changed, 1224 insertions, 0 deletions
diff --git a/plugins/Dbx_mdb/src/mdbx/bits.h b/plugins/Dbx_mdb/src/mdbx/bits.h new file mode 100644 index 0000000000..74b30c628a --- /dev/null +++ b/plugins/Dbx_mdb/src/mdbx/bits.h @@ -0,0 +1,1224 @@ +/* + * Copyright 2015-2017 Leonid Yuriev <leo@yuriev.ru> + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. */ + +#pragma once +/* *INDENT-OFF* */ +/* clang-format off */ + +#ifndef MDBX_DEBUG +# define MDBX_DEBUG 0 +#endif + +#if MDBX_DEBUG +# undef NDEBUG +#endif + +/* Features under development */ +#ifndef MDBX_DEVEL +# define MDBX_DEVEL 1 +#endif + +/*----------------------------------------------------------------------------*/ + +/* Should be defined before any includes */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif +#ifndef _FILE_OFFSET_BITS +# define _FILE_OFFSET_BITS 64 +#endif + +#ifdef _MSC_VER +# ifndef _CRT_SECURE_NO_WARNINGS +# define _CRT_SECURE_NO_WARNINGS +# endif +#if _MSC_VER > 1800 +# pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif +#pragma warning(disable : 4710) /* 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* function 'xyz' selected for automatic inline expansion */ +#pragma warning(disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4706) /* assignment within conditional expression */ +#pragma warning(disable : 4127) /* conditional expression is constant */ +#pragma warning(disable : 4324) /* 'xyz': structure was padded due to alignment specifier */ +#pragma warning(disable : 4310) /* cast truncates constant value */ +#pragma warning(disable : 4820) /* bytes padding added after data member for aligment */ +#pragma warning(disable : 4548) /* expression before comma has no effect; expected expression with side - effect */ +#endif /* _MSC_VER (warnings) */ + +#include "./mdbx.h" +#include "./defs.h" + +#if defined(__GNUC__) && !__GNUC_PREREQ(4,2) + /* Actualy libmdbx was not tested with compilers older than GCC from RHEL6. + * But you could remove this #error and try to continue at your own risk. + * In such case please don't rise up an issues related ONLY to old compilers. + */ +# warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler." +#endif + +#if defined(__GLIBC__) && !__GLIBC_PREREQ(2,12) + /* Actualy libmdbx was not tested with something older than glibc 2.12 (from RHEL6). + * But you could remove this #error and try to continue at your own risk. + * In such case please don't rise up an issues related ONLY to old systems. + */ +# warning "libmdbx required at least GLIBC 2.12." +#endif + +#ifdef __SANITIZE_THREAD__ +# warning "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues." +#endif /* __SANITIZE_THREAD__ */ + +#include "./osal.h" + +/* *INDENT-ON* */ +/* clang-format on */ + +/*----------------------------------------------------------------------------*/ +/* Basic constants and types */ + +/* The minimum number of keys required in a database page. + * Setting this to a larger value will place a smaller bound on the + * maximum size of a data item. Data items larger than this size will + * be pushed into overflow pages instead of being stored directly in + * the B-tree node. This value used to default to 4. With a page size + * of 4096 bytes that meant that any item larger than 1024 bytes would + * go into an overflow page. That also meant that on average 2-3KB of + * each overflow page was wasted space. The value cannot be lower than + * 2 because then there would no longer be a tree structure. With this + * value, items larger than 2KB will go into overflow pages, and on + * average only 1KB will be wasted. */ +#define MDBX_MINKEYS 2 + +/* A stamp that identifies a file as an MDBX file. + * There's nothing special about this value other than that it is easily + * recognizable, and it will reflect any byte order mismatches. */ +#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11) + +/* The version number for a database's datafile format. */ +#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 255 : 2) +/* The version number for a database's lockfile format. */ +#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 255 : 2) + +/* handle for the DB used to track free pages. */ +#define FREE_DBI 0 +/* handle for the default DB. */ +#define MAIN_DBI 1 +/* Number of DBs in metapage (free and main) - also hardcoded elsewhere */ +#define CORE_DBS 2 +#define MAX_DBI (INT16_MAX - CORE_DBS) + +/* Number of meta pages - also hardcoded elsewhere */ +#define NUM_METAS 3 + +/* A page number in the database. + * + * MDBX uses 32 bit for page numbers. This limits database + * size up to 2^44 bytes, in case of 4K pages. */ +typedef uint32_t pgno_t; +#define PRIaPGNO PRIu32 +#define MAX_PAGENO ((pgno_t)UINT64_C(0xffffFFFFffff)) +#define MIN_PAGENO NUM_METAS + +/* A transaction ID. */ +typedef uint64_t txnid_t; +#define PRIaTXN PRIi64 +#if MDBX_DEVEL +#define MIN_TXNID (UINT64_MAX - UINT32_MAX) +#elif MDBX_DEBUG +#define MIN_TXNID UINT64_C(0x100000000) +#else +#define MIN_TXNID UINT64_C(1) +#endif /* MIN_TXNID */ + +/* Used for offsets within a single page. + * Since memory pages are typically 4 or 8KB in size, 12-13 bits, + * this is plenty. */ +typedef uint16_t indx_t; + +#define MEGABYTE ((size_t)1 << 20) + +/*----------------------------------------------------------------------------*/ +/* Core structures for database and shared memory (i.e. format definition) */ +#pragma pack(push, 1) + +/* Reader Lock Table + * + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. The + * slot's address is saved in thread-specific data so that subsequent + * read transactions started by the same thread need no further locking to + * proceed. + * + * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. + * No reader table is used if the database is on a read-only filesystem. + * + * Since the database uses multi-version concurrency control, readers don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with the + * processor's cache line size. Any slot is only ever used by one thread. + * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the oldest + * outstanding reader transaction. Any freed pages older than this will be + * reclaimed by the writer. The writer doesn't use any locks when scanning + * this table. This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for correct + * operation - all we need is to know the upper bound on the oldest reader, + * we don't care at all about the newest reader. So the only consequence of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages from + * many old transactions together. */ + +/* The actual reader record, with cacheline padding. */ +typedef struct MDBX_reader { + /* Current Transaction ID when this transaction began, or (txnid_t)-1. + * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. */ + volatile txnid_t mr_txnid; + + /* The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * + * NOTE: We currently don't check for stale records. + * We simply re-init the table when we know that we're the only process + * opening the lock file. */ + + /* The process ID of the process owning this reader txn. */ + volatile mdbx_pid_t mr_pid; + /* The thread ID of the thread owning this txn. */ + volatile mdbx_tid_t mr_tid; + + /* cache line alignment */ + uint8_t pad[MDBX_CACHELINE_SIZE - + (sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t)) % + MDBX_CACHELINE_SIZE]; +} __cache_aligned MDBX_reader; + +/* Information about a single database in the environment. */ +typedef struct MDBX_db { + uint16_t md_flags; /* see mdbx_dbi_open */ + uint16_t md_depth; /* depth of this tree */ + uint32_t md_xsize; /* also ksize for LEAF2 pages */ + pgno_t md_root; /* the root page of this tree */ + pgno_t md_branch_pages; /* number of internal pages */ + pgno_t md_leaf_pages; /* number of leaf pages */ + pgno_t md_overflow_pages; /* number of overflow pages */ + uint64_t md_seq; /* table sequence counter */ + uint64_t md_entries; /* number of data items */ + uint64_t md_merkle; /* Merkle tree checksum */ +} MDBX_db; + +/* Meta page content. + * A meta page is the start point for accessing a database snapshot. + * Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */ +typedef struct MDBX_meta { + /* Stamp identifying this as an MDBX file. + * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ + uint64_t mm_magic_and_version; + + /* txnid that committed this page, the first of a two-phase-update pair */ + volatile txnid_t mm_txnid_a; + + uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ + uint8_t mm_validator_id; /* ID of checksum and page validation method, + * zero (nothing) for now */ + uint8_t mm_extra_pagehdr; /* extra bytes in the page header, + * zero (nothing) for now */ + + struct { + uint16_t grow; /* datafile growth step in pages */ + uint16_t shrink; /* datafile shrink threshold in pages */ + pgno_t lower; /* minimal size of datafile in pages */ + pgno_t upper; /* maximal size of datafile in pages */ + pgno_t now; /* current size of datafile in pages */ + pgno_t next; /* first unused page in the datafile, + * but actually the file may be shorter. */ + } mm_geo; + + MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ + /* The size of pages used in this DB */ +#define mm_psize mm_dbs[FREE_DBI].md_xsize +/* Any persistent environment flags, see mdbx_env */ +#define mm_flags mm_dbs[FREE_DBI].md_flags + mdbx_canary mm_canary; + +#define MDBX_DATASIGN_NONE 0u +#define MDBX_DATASIGN_WEAK 1u +#define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK) +#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) +#define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) +#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) + volatile uint64_t mm_datasync_sign; + + /* txnid that committed this page, the second of a two-phase-update pair */ + volatile txnid_t mm_txnid_b; +} MDBX_meta; + +/* Common header for all page types. The page type depends on mp_flags. + * + * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with + * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages + * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header. + * + * P_OVERFLOW records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of F_BIGDATA nodes. + * + * P_SUBP sub-pages are small leaf "pages" with duplicate data. + * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page. + * (Duplicate data can also go in sub-databases, which use normal pages.) + * + * P_META pages contain MDBX_meta, the start point of an MDBX snapshot. + * + * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once + * in the snapshot: Either used by a database or listed in a freeDB record. */ +typedef struct MDBX_page { + union { + struct MDBX_page *mp_next; /* for in-memory list of freed pages, + * must be first field, see NEXT_LOOSE_PAGE */ + uint64_t mp_validator; /* checksum of page content or a txnid during + * which the page has been updated */ + }; + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01 /* branch page */ +#define P_LEAF 0x02 /* leaf page */ +#define P_OVERFLOW 0x04 /* overflow page */ +#define P_META 0x08 /* meta page */ +#define P_DIRTY 0x10 /* dirty page, also set for P_SUBP pages */ +#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ +#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ +#define P_KEEP 0x8000 /* leave this page alone during spill */ + uint16_t mp_flags; + union { + struct { + indx_t mp_lower; /* lower bound of free space */ + indx_t mp_upper; /* upper bound of free space */ + }; + uint32_t mp_pages; /* number of overflow pages */ + }; + pgno_t mp_pgno; /* page number */ + + /* dynamic size */ + union { + indx_t mp_ptrs[1]; + MDBX_meta mp_meta; + uint8_t mp_data[1]; + }; +} MDBX_page; + +/* Size of the page header, excluding dynamic data at the end */ +#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data)) + +/* The maximum size of a database page. +* +* It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper. +* +* MDBX will use database pages < OS pages if needed. +* That causes more I/O in write transactions: The OS must +* know (read) the whole page before writing a partial page. +* +* Note that we don't currently support Huge pages. On Linux, +* regular data files cannot use Huge pages, and in general +* Huge pages aren't actually pageable. We rely on the OS +* demand-pager to read our data and page it out when memory +* pressure from other processes is high. So until OSs have +* actual paging support for Huge pages, they're not viable. */ +#define MAX_PAGESIZE 0x10000u +#define MIN_PAGESIZE 512u + +#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) +#if defined(_WIN32) || defined(_WIN64) +#define MAX_MAPSIZE32 UINT32_C(0x38000000) +#else +#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#endif +#define MAX_MAPSIZE64 \ + ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \ + : MAX_PAGENO * (uint64_t)MAX_PAGESIZE) + +#define MAX_MAPSIZE ((sizeof(size_t) < 8) ? MAX_MAPSIZE32 : MAX_MAPSIZE64) + +/* The header for the reader table (a memory-mapped lock file). */ +typedef struct MDBX_lockinfo { + /* Stamp identifying this as an MDBX file. + * It must be set to MDBX_MAGIC with with MDBX_LOCK_VERSION. */ + uint64_t mti_magic_and_version; + + /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */ + uint32_t mti_os_and_format; + + /* Flags which environment was opened. */ + volatile uint32_t mti_envmode; + + union { +#ifdef MDBX_OSAL_LOCK + MDBX_OSAL_LOCK mti_wmutex; +#endif + uint64_t align_wmutex; + }; + + union { + /* The number of slots that have been used in the reader table. + * This always records the maximum count, it is not decremented + * when readers release their slots. */ + volatile unsigned __cache_aligned mti_numreaders; + uint64_t align_numreaders; + }; + + union { +#ifdef MDBX_OSAL_LOCK + /* Mutex protecting access to this table. */ + MDBX_OSAL_LOCK mti_rmutex; +#endif + uint64_t align_rmutex; + }; + + union { + volatile txnid_t mti_oldest; + uint64_t align_oldest; + }; + + union { + volatile uint32_t mti_reader_finished_flag; + uint64_t align_reader_finished_flag; + }; + + uint8_t pad_align[MDBX_CACHELINE_SIZE - sizeof(uint64_t) * 7]; + + MDBX_reader __cache_aligned mti_readers[1]; +} MDBX_lockinfo; + +#ifdef _MSC_VER +#pragma pack(pop) +#endif /* MSVC: Enable aligment */ + +#define MDBX_LOCKINFO_WHOLE_SIZE \ + ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ + ~((size_t)MDBX_CACHELINE_SIZE - 1)) + +/* Lockfile format signature: version, features and field layout */ +#define MDBX_LOCK_FORMAT \ + ((MDBX_OSAL_LOCK_SIGN << 16) + \ + (uint16_t)(MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1)) + +#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) + +#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) + +/*----------------------------------------------------------------------------*/ +/* Two kind lists of pages (aka PNL) */ + +/* An PNL is an Page Number List, a sorted array of IDs. The first element of + * the array is a counter for how many actual page-numbers are in the list. + * PNLs are sorted in descending order, this allow cut off a page with lowest + * pgno (at the tail) just truncating the list */ +#define MDBX_PNL_ASCENDING 0 +typedef pgno_t *MDBX_PNL; + +#if MDBX_PNL_ASCENDING +#define MDBX_PNL_ORDERED(first, last) ((first) < (last)) +#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last)) +#else +#define MDBX_PNL_ORDERED(first, last) ((first) > (last)) +#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last)) +#endif + +/* List of txnid, only for MDBX_env.mt_lifo_reclaimed */ +typedef txnid_t *MDBX_TXL; + +/* An ID2 is an ID/pointer pair. */ +typedef struct MDBX_ID2 { + pgno_t mid; /* The ID */ + void *mptr; /* The pointer */ +} MDBX_ID2; + +/* An ID2L is an ID2 List, a sorted array of ID2s. + * The first element's mid member is a count of how many actual + * elements are in the array. The mptr member of the first element is + * unused. The array is sorted in ascending order by mid. */ +typedef MDBX_ID2 *MDBX_ID2L; + +/* PNL sizes - likely should be even bigger + * limiting factors: sizeof(pgno_t), thread stack size */ +#define MDBX_PNL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ +#define MDBX_PNL_DB_SIZE (1 << MDBX_PNL_LOGN) +#define MDBX_PNL_UM_SIZE (1 << (MDBX_PNL_LOGN + 1)) + +#define MDBX_PNL_DB_MAX (MDBX_PNL_DB_SIZE - 1) +#define MDBX_PNL_UM_MAX (MDBX_PNL_UM_SIZE - 1) + +#define MDBX_PNL_SIZEOF(pl) (((pl)[0] + 1) * sizeof(pgno_t)) +#define MDBX_PNL_IS_ZERO(pl) ((pl)[0] == 0) +#define MDBX_PNL_CPY(dst, src) (memcpy(dst, src, MDBX_PNL_SIZEOF(src))) +#define MDBX_PNL_FIRST(pl) ((pl)[1]) +#define MDBX_PNL_LAST(pl) ((pl)[(pl)[0]]) + +/* Current max length of an mdbx_pnl_alloc()ed PNL */ +#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) + +/*----------------------------------------------------------------------------*/ +/* Internal structures */ + +/* Auxiliary DB info. + * The information here is mostly static/read-only. There is + * only a single copy of this record in the environment. */ +typedef struct MDBX_dbx { + MDBX_val md_name; /* name of the database */ + MDBX_cmp_func *md_cmp; /* function for comparing keys */ + MDBX_cmp_func *md_dcmp; /* function for comparing data items */ +} MDBX_dbx; + +/* A database transaction. + * Every operation requires a transaction handle. */ +struct MDBX_txn { +#define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) + size_t mt_signature; + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + pgno_t mt_next_pgno; /* next unallocated page */ + pgno_t mt_end_pgno; /* corresponding to the current size of datafile */ + /* The ID of this transaction. IDs are integers incrementing from 1. + * Only committed write transactions increment the ID. If a transaction + * aborts, the ID may be re-used by the next writer. */ + txnid_t mt_txnid; + MDBX_env *mt_env; /* the DB environment */ + /* The list of reclaimed txns from freeDB */ + MDBX_TXL mt_lifo_reclaimed; + /* The list of pages that became unused during this transaction. */ + MDBX_PNL mt_befree_pages; + /* The list of loose pages that became unused and may be reused + * in this transaction, linked through NEXT_LOOSE_PAGE(page). */ + MDBX_page *mt_loose_pages; + /* Number of loose pages (mt_loose_pages) */ + unsigned mt_loose_count; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL mt_spill_pages; + union { + /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ + MDBX_ID2L mt_rw_dirtylist; + /* For read txns: This thread/txn's reader table slot, or NULL. */ + MDBX_reader *mt_ro_reader; + }; + /* Array of records for each DB known in the environment. */ + MDBX_dbx *mt_dbxs; + /* Array of MDBX_db records for each known DB */ + MDBX_db *mt_dbs; + /* Array of sequence numbers for each DB handle */ + unsigned *mt_dbiseqs; + +/* Transaction DB Flags */ +#define DB_DIRTY MDBX_TBL_DIRTY /* DB was written in this txn */ +#define DB_STALE MDBX_TBL_STALE /* Named-DB record is older than txnID */ +#define DB_NEW MDBX_TBL_NEW /* Named-DB handle opened in this txn */ +#define DB_VALID 0x08 /* DB handle is valid, see also MDBX_VALID */ +#define DB_USRVALID 0x10 /* As DB_VALID, but not set for FREE_DBI */ +#define DB_DUPDATA 0x20 /* DB is MDBX_DUPSORT data */ + /* In write txns, array of cursors for each DB */ + MDBX_cursor **mt_cursors; + /* Array of flags for each DB */ + uint8_t *mt_dbflags; + /* Number of DB records in use, or 0 when the txn is finished. + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. */ + MDBX_dbi mt_numdbs; + +/* Transaction Flags */ +/* mdbx_txn_begin() flags */ +#define MDBX_TXN_BEGIN_FLAGS (MDBX_NOMETASYNC | MDBX_NOSYNC | MDBX_RDONLY) +#define MDBX_TXN_NOMETASYNC \ + MDBX_NOMETASYNC /* don't sync meta for this txn on commit */ +#define MDBX_TXN_NOSYNC MDBX_NOSYNC /* don't sync this txn on commit */ +#define MDBX_TXN_RDONLY MDBX_RDONLY /* read-only transaction */ + /* internal txn flags */ +#define MDBX_TXN_WRITEMAP MDBX_WRITEMAP /* copy of MDBX_env flag in writers */ +#define MDBX_TXN_FINISHED 0x01 /* txn is finished or never began */ +#define MDBX_TXN_ERROR 0x02 /* txn is unusable after an error */ +#define MDBX_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */ +#define MDBX_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */ +#define MDBX_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */ +/* most operations on the txn are currently illegal */ +#define MDBX_TXN_BLOCKED \ + (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD) + unsigned mt_flags; + /* dirtylist room: Array size - dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirtylist into mt_parent after freeing hidden mt_parent pages. */ + unsigned mt_dirtyroom; + mdbx_tid_t mt_owner; /* thread ID that owns this transaction */ + mdbx_canary mt_canary; +}; + +/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. + * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to + * raise this on a 64 bit machine. */ +#define CURSOR_STACK 32 + +struct MDBX_xcursor; + +/* Cursors are used for all DB operations. + * A cursor holds a path of (page pointer, key index) from the DB + * root to a position in the DB, plus other state. MDBX_DUPSORT + * cursors include an xcursor to the current data item. Write txns + * track their cursors and keep them up to date when data moves. + * Exception: An xcursor's pointer to a P_SUBP page can be stale. + * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */ +struct MDBX_cursor { +#define MDBX_MC_SIGNATURE UINT32_C(0xFE05D5B1) +#define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047) +#define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7) + uint32_t mc_signature; + /* The database handle this cursor operates on */ + MDBX_dbi mc_dbi; + /* Next cursor on this DB in this txn */ + MDBX_cursor *mc_next; + /* Backup of the original cursor if this cursor is a shadow */ + MDBX_cursor *mc_backup; + /* Context used for databases with MDBX_DUPSORT, otherwise NULL */ + struct MDBX_xcursor *mc_xcursor; + /* The transaction that owns this cursor */ + MDBX_txn *mc_txn; + /* The database record for this cursor */ + MDBX_db *mc_db; + /* The database auxiliary record for this cursor */ + MDBX_dbx *mc_dbx; + /* The mt_dbflag for this database */ + uint8_t *mc_dbflag; + uint16_t mc_snum; /* number of pushed pages */ + uint16_t mc_top; /* index of top page, normally mc_snum-1 */ + /* Cursor state flags. */ +#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ +#define C_EOF 0x02 /* No more data */ +#define C_SUB 0x04 /* Cursor is a sub-cursor */ +#define C_DEL 0x08 /* last op was a cursor_del */ +#define C_UNTRACK 0x40 /* Un-track cursor when closing */ +#define C_RECLAIMING 0x80 /* FreeDB lookup is prohibited */ + unsigned mc_flags; /* see mdbx_cursor */ + MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ +}; + +/* Context for sorted-dup records. + * We could have gone to a fully recursive design, with arbitrarily + * deep nesting of sub-databases. But for now we only handle these + * levels - main DB, optional sub-DB, sorted-duplicate DB. */ +typedef struct MDBX_xcursor { + /* A sub-cursor for traversing the Dup DB */ + MDBX_cursor mx_cursor; + /* The database record for this Dup DB */ + MDBX_db mx_db; + /* The auxiliary DB record for this Dup DB */ + MDBX_dbx mx_dbx; + /* The mt_dbflag for this Dup DB */ + uint8_t mx_dbflag; +} MDBX_xcursor; + +/* Check if there is an inited xcursor, so XCURSOR_REFRESH() is proper */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + +/* Update sub-page pointer, if any, in mc->mc_xcursor. + * Needed when the node which contains the sub-page may have moved. + * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. */ +#define XCURSOR_REFRESH(mc, mp, ki) \ + do { \ + MDBX_page *xr_pg = (mp); \ + MDBX_node *xr_node = NODEPTR(xr_pg, ki); \ + if ((xr_node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ + } while (0) + +/* State of FreeDB old pages, stored in the MDBX_env */ +typedef struct MDBX_pgstate { + pgno_t *mf_reclaimed_pglist; /* Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_last_reclaimed; /* ID of last used record, or 0 if + !mf_reclaimed_pglist */ +} MDBX_pgstate; + +/* The database environment. */ +struct MDBX_env { +#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) + size_t me_signature; + mdbx_mmap_t me_dxb_mmap; /* The main data file */ +#define me_map me_dxb_mmap.dxb +#define me_fd me_dxb_mmap.fd +#define me_mapsize me_dxb_mmap.length + mdbx_mmap_t me_lck_mmap; /* The lock file */ +#define me_lfd me_lck_mmap.fd +#define me_lck me_lck_mmap.lck + +/* Failed to update the meta page. Probably an I/O error. */ +#define MDBX_FATAL_ERROR UINT32_C(0x80000000) +/* Additional flag for mdbx_sync_locked() */ +#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) +/* Some fields are initialized. */ +#define MDBX_ENV_ACTIVE UINT32_C(0x20000000) +/* me_txkey is set */ +#define MDBX_ENV_TXKEY UINT32_C(0x10000000) + uint32_t me_flags; /* see mdbx_env */ + unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize2log; /* log2 of DB page size */ + unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_maxreaders; /* size of the reader table */ + /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */ + unsigned me_close_readers; + mdbx_fastmutex_t me_dbi_lock; + MDBX_dbi me_numdbs; /* number of DBs opened */ + MDBX_dbi me_maxdbs; /* size of the DB table */ + mdbx_pid_t me_pid; /* process ID of this env */ + mdbx_thread_key_t me_txkey; /* thread-key for readers */ + char *me_path; /* path to the DB files */ + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn; /* current write transaction */ + MDBX_txn *me_txn0; /* prealloc'd write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ + MDBX_pgstate me_pgstate; /* state of old pages from freeDB */ +#define me_last_reclaimed me_pgstate.mf_last_reclaimed +#define me_reclaimed_pglist me_pgstate.mf_reclaimed_pglist + MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + /* PNL of pages that became unused in a write txn */ + MDBX_PNL me_free_pgs; + /* ID2L of pages written during a write txn. Length MDBX_PNL_UM_SIZE. */ + MDBX_ID2L me_dirtylist; + /* Max number of freelist items that can fit in a single overflow page */ + unsigned me_maxfree_1pg; + /* Max size of a node on a page */ + unsigned me_nodemax; + unsigned me_maxkey_limit; /* max size of a key */ + mdbx_pid_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ + size_t me_sync_pending; /* Total dirty/non-sync'ed bytes + * since the last mdbx_env_sync() */ + size_t me_sync_threshold; /* Treshold of above to force synchronous flush */ + MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */ + txnid_t me_oldest_stub; +#if MDBX_DEBUG + MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ +#endif +#ifdef USE_VALGRIND + int me_valgrind_handle; +#endif + + struct { + size_t lower; /* minimal size of datafile */ + size_t upper; /* maximal size of datafile */ + size_t now; /* current size of datafile */ + size_t grow; /* step to grow datafile */ + size_t shrink; /* threshold to shrink datafile */ + } me_dbgeo; /* */ +}; + +/* Nested transaction */ +typedef struct MDBX_ntxn { + MDBX_txn mnt_txn; /* the transaction */ + MDBX_pgstate mnt_pgstate; /* parent transaction's saved freestate */ +} MDBX_ntxn; + +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +extern int mdbx_runtime_flags; +extern MDBX_debug_func *mdbx_debug_logger; +extern txnid_t mdbx_debug_edge; + +void mdbx_debug_log(int type, const char *function, int line, const char *fmt, + ...) +#if defined(__GNUC__) || __has_attribute(format) + __attribute__((format(printf, 4, 5))) +#endif + ; + +void mdbx_panic(const char *fmt, ...) +#if defined(__GNUC__) || __has_attribute(format) + __attribute__((format(printf, 1, 2))) +#endif + ; + +#if MDBX_DEBUG + +#define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT) + +#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT) + +#define mdbx_debug_enabled(type) \ + unlikely(mdbx_runtime_flags &(type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) + +#else +#define mdbx_debug_enabled(type) (0) +#define mdbx_audit_enabled() (0) +#ifndef NDEBUG +#define mdbx_assert_enabled() (1) +#else +#define mdbx_assert_enabled() (0) +#endif /* NDEBUG */ +#endif /* MDBX_DEBUG */ + +#define mdbx_print(fmt, ...) \ + mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) + +#define mdbx_trace(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ + mdbx_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", \ + ##__VA_ARGS__); \ + } while (0) + +#define mdbx_verbose(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_info(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_notice(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_warning(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_error(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_fatal(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ + mdbx_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", \ + ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug_print(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ + mdbx_debug_log(MDBX_DBG_TRACE, NULL, 0, fmt, ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug_extra(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ + mdbx_debug_log(MDBX_DBG_EXTRA, __FUNCTION__, __LINE__, fmt, \ + ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug_extra_print(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ + mdbx_debug_log(MDBX_DBG_EXTRA, NULL, 0, fmt, ##__VA_ARGS__); \ + } while (0) + +#define mdbx_ensure_msg(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + mdbx_assert_fail(env, msg, __FUNCTION__, __LINE__); \ + } while (0) + +#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define mdbx_assert(env, expr) \ + do { \ + if (mdbx_assert_enabled()) \ + mdbx_ensure(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) + +static __inline void mdbx_jitter4testing(bool tiny) { +#ifndef NDEBUG + if (MDBX_DBG_JITTER & mdbx_runtime_flags) + mdbx_osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +/*----------------------------------------------------------------------------*/ +/* Internal prototypes and inlines */ + +int mdbx_reader_check0(MDBX_env *env, int rlocked, int *dead); +void mdbx_rthc_dtor(void *rthc); +void mdbx_rthc_lock(void); +void mdbx_rthc_unlock(void); +int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +void mdbx_rthc_remove(mdbx_thread_key_t key); +void mdbx_rthc_cleanup(void); + +static __inline bool mdbx_is_power2(size_t x) { return (x & (x - 1)) == 0; } + +static __inline size_t mdbx_roundup2(size_t value, size_t granularity) { + assert(mdbx_is_power2(granularity)); + return (value + granularity - 1) & ~(granularity - 1); +} + +static __inline unsigned mdbx_log2(size_t value) { + assert(mdbx_is_power2(value)); + + unsigned log = 0; + while (value > 1) { + log += 1; + value >>= 1; + } + return log; +} + +#define MDBX_IS_ERROR(rc) \ + ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) + +/* Internal error codes, not exposed outside libmdbx */ +#define MDBX_NO_ROOT (MDBX_LAST_ERRCODE + 10) + +/* Debuging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) + +/* Key size which fits in a DKBUF. */ +#define DKBUF_MAXKEYSIZE 511 /* FIXME */ + +#if MDBX_DEBUG +#define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2] +#define DKEY(x) mdbx_dkey(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1) +#define DVAL(x) \ + mdbx_dkey(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1) +#else +#define DKBUF ((void)(0)) +#define DKEY(x) ("-") +#define DVAL(x) ("-") +#endif + +/* An invalid page number. + * Mainly used to denote an empty tree. */ +#define P_INVALID (~(pgno_t)0) + +/* Test if the flags f are set in a flag word w. */ +#define F_ISSET(w, f) (((w) & (f)) == (f)) + +/* Round n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + +/* Default size of memory map. + * This is certainly too small for any actual applications. Apps should + * always set the size explicitly using mdbx_env_set_mapsize(). */ +#define DEFAULT_MAPSIZE 1048576 + +/* Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. The 61 is a prime number, + * and such readers plus a couple mutexes fit into single 4KB page. + * Applications should set the table size using mdbx_env_set_maxreaders(). */ +#define DEFAULT_READERS 61 + +/* Address of first usable data byte in a page, after the header */ +#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) + +/* Number of nodes on a page */ +#define NUMKEYS(p) ((unsigned)(p)->mp_lower >> 1) + +/* The amount of space remaining in the page */ +#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) + +/* The percentage of space used in the page, in tenths of a percent. */ +#define PAGEFILL(env, p) \ + (1024UL * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + ((env)->me_psize - PAGEHDRSZ)) +/* The minimum page fill factor, in tenths of a percent. + * Pages emptier than this are candidates for merging. */ +#define FILL_THRESHOLD 256 + +/* Test if a page is a leaf page */ +#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) +/* Test if a page is a LEAF2 page */ +#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) +/* Test if a page is a branch page */ +#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) +/* Test if a page is an overflow page */ +#define IS_OVERFLOW(p) unlikely(F_ISSET((p)->mp_flags, P_OVERFLOW)) +/* Test if a page is a sub page */ +#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) + +/* The number of overflow pages needed to store the given size. */ +#define OVPAGES(env, size) (bytes2pgno(env, PAGEHDRSZ - 1 + (size)) + 1) + +/* Link in MDBX_txn.mt_loose_pages list. + * Kept outside the page header, which is needed when reusing the page. */ +#define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2)) + +/* Header for a single key/data pair within a page. + * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. + * We guarantee 2-byte alignment for 'MDBX_node's. + * + * mn_lo and mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, mn_flags is also used + * for pgno. (Branch nodes have no flags). Lo and hi are in host byte + * order in case some accesses can be optimized to 32-bit word access. + * + * Leaf node flags describe node contents. F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just F_SUBDATA). */ +typedef struct MDBX_node { + union { + struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + union { + struct { + uint16_t mn_lo, mn_hi; /* part of data size or pgno */ + }; + uint32_t mn_dsize; + }; + uint16_t mn_flags; /* see mdbx_node */ + uint16_t mn_ksize; /* key size */ +#else + uint16_t mn_ksize; /* key size */ + uint16_t mn_flags; /* see mdbx_node */ + union { + struct { + uint16_t mn_hi, mn_lo; /* part of data size or pgno */ + }; + uint32_t mn_dsize; + }; +#endif + }; + pgno_t mn_ksize_and_pgno; + }; + +/* mdbx_node Flags */ +#define F_BIGDATA 0x01 /* data put on overflow page */ +#define F_SUBDATA 0x02 /* data is a sub-database */ +#define F_DUPDATA 0x04 /* data has duplicates */ + +/* valid flags for mdbx_node_add() */ +#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) + uint8_t mn_data[1]; /* key and data are appended here */ +} MDBX_node; + +/* Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDBX_node, mn_data) + +/* Bit position of top word in page number, for shifting mn_flags */ +#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) + +/* Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. */ +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->iov_len)) + +/* Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. */ +#define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len) + +/* Address of node i in page p */ +static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { + assert(NUMKEYS(p) > (unsigned)(i)); + return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEHDRSZ); +} + +/* Address of the key for the node */ +#define NODEKEY(node) (void *)((node)->mn_data) + +/* Address of the data for a node */ +#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) + +/* Get the page number pointed to by a branch node */ +static __inline pgno_t NODEPGNO(const MDBX_node *node) { + pgno_t pgno; + if (UNALIGNED_OK) { + pgno = node->mn_ksize_and_pgno; + if (sizeof(pgno_t) > 4) + pgno &= MAX_PAGENO; + } else { + pgno = node->mn_lo | ((pgno_t)node->mn_hi << 16); + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_flags) << 32; + } + return pgno; +} + +/* Set the page number in a branch node */ +static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { + assert(pgno <= MAX_PAGENO); + + if (UNALIGNED_OK) { + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_ksize) << 48; + node->mn_ksize_and_pgno = pgno; + } else { + node->mn_lo = (uint16_t)pgno; + node->mn_hi = (uint16_t)(pgno >> 16); + if (sizeof(pgno_t) > 4) + node->mn_flags = (uint16_t)((uint64_t)pgno >> 32); + } +} + +/* Get the size of the data in a leaf node */ +static __inline size_t NODEDSZ(const MDBX_node *node) { + size_t size; + if (UNALIGNED_OK) { + size = node->mn_dsize; + } else { + size = node->mn_lo | ((size_t)node->mn_hi << 16); + } + return size; +} + +/* Set the size of the data for a leaf node */ +static __inline void SETDSZ(MDBX_node *node, size_t size) { + assert(size < INT_MAX); + if (UNALIGNED_OK) { + node->mn_dsize = (uint32_t)size; + } else { + node->mn_lo = (uint16_t)size; + node->mn_hi = (uint16_t)(size >> 16); + } +} + +/* The size of a key in a node */ +#define NODEKSZ(node) ((node)->mn_ksize) + +/* The address of a key in a LEAF2 page. + * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. + * There are no node headers, keys are stored contiguously. */ +#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) + +/* Set the node's key into keyptr, if requested. */ +#define MDBX_GET_KEY(node, keyptr) \ + do { \ + if ((keyptr) != NULL) { \ + (keyptr)->iov_len = NODEKSZ(node); \ + (keyptr)->iov_base = NODEKEY(node); \ + } \ + } while (0) + +/* Set the node's key into key. */ +#define MDBX_GET_KEY2(node, key) \ + do { \ + key.iov_len = NODEKSZ(node); \ + key.iov_base = NODEKEY(node); \ + } while (0) + +#define MDBX_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDBX_VALID)) +/* mdbx_dbi_open() flags */ +#define VALID_FLAGS \ + (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ + MDBX_INTEGERDUP | MDBX_REVERSEDUP | MDBX_CREATE) + +/* max number of pages to commit in one writev() call */ +#define MDBX_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ +#undef MDBX_COMMIT_PAGES +#define MDBX_COMMIT_PAGES IOV_MAX +#endif + +/* Check txn and dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi, validity) \ + ((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) + +/* Check for misused dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +/* LY: fast enough on most systems + * + * / + * | -1, a < b + * cmp2int(a,b) = < 0, a == b + * | 1, a > b + * \ + */ +#if 1 +#define mdbx_cmp2int(a, b) (((b) > (a)) ? -1 : (a) > (b)) +#else +#define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) +#endif + +static __inline size_t pgno2bytes(const MDBX_env *env, pgno_t pgno) { + mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize); + return ((size_t)pgno) << env->me_psize2log; +} + +static __inline MDBX_page *pgno2page(const MDBX_env *env, pgno_t pgno) { + return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); +} + +static __inline pgno_t bytes2pgno(const MDBX_env *env, size_t bytes) { + mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1); + return (pgno_t)(bytes >> env->me_psize2log); +} + +static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { + assert(base <= MAX_PAGENO); + return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; +} + +static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { + assert(base >= MIN_PAGENO); + return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO; +} + +static __inline size_t pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) { + return mdbx_roundup2(pgno2bytes(env, pgno), env->me_os_psize); +} + +static __inline pgno_t pgno_align2os_pgno(const MDBX_env *env, pgno_t pgno) { + return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); +} |